In [1]:
# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split



In [2]:
import pandas as pd

url ="fraud_transactions.csv"

df_actual = pd.read_csv(url, sep=",")
df_actual.head()

Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD
0,0,0,2023-02-01 00:43:37,901,8047,82,1
1,1,1,2023-02-01 01:20:13,2611,7777,15,0
2,2,2,2023-02-01 01:22:52,4212,3336,53,0
3,3,3,2023-02-01 01:26:40,1293,7432,59,0
4,4,4,2023-02-01 01:52:23,2499,1024,25,0


In [3]:
df_transactions = df_actual[['CUSTOMER_ID','TERMINAL_ID','TX_AMOUNT','TX_FRAUD']]
#df_transactions=df_transactions.head(10)
df_transactions

Unnamed: 0,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD
0,901,8047,82,1
1,2611,7777,15,0
2,4212,3336,53,0
3,1293,7432,59,0
4,2499,1024,25,0
...,...,...,...,...
4557161,1465,7455,92,1
4557162,4009,3429,36,0
4557163,1336,3116,50,0
4557164,1611,3314,81,1


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

print('No Frauds', round(df_transactions['TX_FRAUD'].value_counts()[0]/len(df_transactions) * 100,2), '% of the dataset')
print('Frauds', round(df_transactions['TX_FRAUD'].value_counts()[1]/len(df_transactions) * 100,2), '% of the dataset')

X = df_transactions.drop('TX_FRAUD', axis=1)
y = df_transactions['TX_FRAUD']



No Frauds 75.71 % of the dataset
Frauds 24.29 % of the dataset


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Turn the values into an array for feeding the classification algorithms.
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

In [6]:
from sklearn.model_selection import cross_val_score

logreg = LogisticRegression(random_state=0)

In [7]:
logreg.fit(X_train, y_train)
training_score = cross_val_score(logreg, X_train, y_train, cv=2)
print('Logistic Regression Cross Validation Score: ', round(training_score.mean() * 100, 2).astype(str) + '%')


Logistic Regression Cross Validation Score:  100.0%


In [8]:
import numpy as np
np.sum(logreg.predict(X_test) == y_test)/X_test.shape[0]

1.0

In [9]:
logreg.intercept_[0], logreg.coef_[0]

(-1168.308115256604,
 array([-2.47724513e-05,  3.17749573e-06,  1.54748556e+01]))

In [10]:


data=[79,3115,78]
weights = [-2.47724513e-05,  3.17749573e-06,  1.54748556e+01]
intercept = -1168.308115256604 



In [11]:
def predict(data,coefficients,intercept):
    print(coefficients)
    yhat = intercept
    for i in range(len(data)):
        yhat += coefficients[i] * data[i]
    return 1.0 / (1.0 + np.exp(-yhat))

yhat = predict(data,weights,intercept)
yhat

[-2.47724513e-05, 3.17749573e-06, 15.4748556]


1.0

In [12]:
X_train[:10]

array([[4548, 8796,   17],
       [1141,   95,   52],
       [4740, 6652,   51],
       [3583, 5609,   70],
       [2329, 8399,    4],
       [3011, 2707,   70],
       [ 365, 7609,   43],
       [2749, 6011,   19],
       [3871, 1806,   83],
       [3521,  721,   95]])

In [13]:
y_train[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])

In [45]:
def predict(data,coefficients,intercept):
    yhat = intercept
    for i in range(len(data)):
        yhat += coefficients[i] * data[i]
    return 1.0 / (1.0 + np.exp(-yhat))

In [46]:
def final_gradients(gradients):
    length_grads=len(grads)
    avg_grads=[0,0,0]
    for i in range(0,length_grads):
        avg_grads[0]+=grads[i][0]
        avg_grads[1]+=grads[i][1]
        avg_grads[2]+=grads[i][2]
    avg_grads=[i/length_grads for i in avg_grads]
    
    return avg_grads
     

In [47]:
def sgd(train,y_train,l_rate, n_epoch):
    coef = [0,0,0]
    final_grads = [0,0,0]
    intercept = 0
    for epoch in range(n_epoch):
        predictions=[]
        gradients=[]
        sum_error = 0.0
        for i in range(len(train)):
            yhat = predict(train[i], coef,intercept)  
            predictions.append(yhat)
            error = y_train[i] - yhat
            sum_error += error**2
            intercept= intercept + l_rate * error * yhat * (1.0 - yhat)  ## intercept
            temp=train[i]
            for j in range(3):
                coef[j] = coef[j] + l_rate * error * yhat * (1.0 - yhat) * temp[j]  
            gradients.append(coef)
        final_grads = final_gradients(gradients)
        print('>epoch=%d, lrate=%.3f, error=%.3f, intercept=%.3f '% (epoch, l_rate, sum_error,intercept))
    return final_grads

In [48]:
l_rate = 0.24
n_epoch = 4
coef = sgd(X_train[:10],y_train[:10],l_rate, n_epoch)
print(coef)

>epoch=0, lrate=0.240, error=2.250, intercept=-0.030 
>epoch=1, lrate=0.240, error=2.000, intercept=-0.030 
>epoch=2, lrate=0.240, error=2.000, intercept=-0.030 
>epoch=3, lrate=0.240, error=2.000, intercept=-0.030 
[-136.44000000000003, -263.88000000000005, -0.5099999999999999]


  return 1.0 / (1.0 + np.exp(-yhat))


In [93]:
def clip(v, b):
    norm = np.linalg.norm(v, ord=2)

    if norm > b:
        return b * (v / norm)
    else:
        return v

print( clip([[4548, 8796,   17]],5.0) )


[[2.29645183 4.44142267 0.00858392]]


In [94]:
clip(X_train[:5], 5)

array([[1.35772596e+00, 2.62589215e+00, 5.07505304e-03],
       [3.40625619e-01, 2.83605905e-02, 1.55236917e-02],
       [1.41504420e+00, 1.98583840e+00, 1.52251591e-02],
       [1.06964206e+00, 1.67446897e+00, 2.08972772e-02],
       [6.95282267e-01, 2.50737474e+00, 1.19413013e-03]])

In [84]:
def dp_final_gradients(gradients):
    length_grads=len(grads)
    
    sensitivity = 1
    epsilon= 0.8
    noise = np.random.laplace(loc=0, scale=sensitivity/epsilon)
    
    noise_lenth = length_grads + noise
    
    avg_grads=[0,0,0]
    
    for i in range(0,length_grads):
        avg_grads[0]+=grads[i][0]
        avg_grads[1]+=grads[i][1]
        avg_grads[2]+=grads[i][2]
        
    avg_grads=[i/noise_lenth for i in avg_grads]
    
    return avg_grads
     

In [85]:
def dp_sgd(train,y_train,l_rate, n_epoch):
    train = clip(train, 5)
    coef = [0,0,0]
    final_grads = [0,0,0]
    intercept = 0
    for epoch in range(n_epoch):
        predictions=[]
        gradients=[]
        sum_error = 0.0
        for i in range(len(train)):
            yhat = predict(train[i], coef,intercept)  
            predictions.append(yhat)
            error = y_train[i] - yhat
            sum_error += error**2
            intercept= intercept + l_rate * error * yhat * (1.0 - yhat)  ## intercept
            temp=train[i]
            for j in range(3):
                coef[j] = coef[j] + l_rate * error * yhat * (1.0 - yhat) * temp[j]  
            gradients.append(coef)
        final_grads = dp_final_gradients(gradients)
        print('>epoch=%d, lrate=%.3f, error=%.3f, intercept=%.3f '% (epoch, l_rate, sum_error,intercept))
    return final_grads

In [88]:
l_rate = 0.24
n_epoch = 4

print("Gradients using Normal SGD ")
coef = sgd(X_train[:10],y_train[:10],l_rate, n_epoch)

print("Gradients using Differntially Private SGD ")
coef = dp_sgd(X_train[:10],y_train[:10],l_rate, n_epoch)
print(coef)

Gradients using Normal SGD 
>epoch=0, lrate=0.240, error=2.250, intercept=-0.030 
>epoch=1, lrate=0.240, error=2.000, intercept=-0.030 
>epoch=2, lrate=0.240, error=2.000, intercept=-0.030 
>epoch=3, lrate=0.240, error=2.000, intercept=-0.030 
Gradients using Differntially Private SGD 
>epoch=0, lrate=0.240, error=2.146, intercept=-0.127 
>epoch=1, lrate=0.240, error=1.654, intercept=-0.193 
>epoch=2, lrate=0.240, error=1.478, intercept=-0.229 
>epoch=3, lrate=0.240, error=1.396, intercept=-0.249 
[-115.01700212848986, -222.44713076565455, -0.42992283117509383]


  return 1.0 / (1.0 + np.exp(-yhat))
