In [1]:
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix

In [2]:
# Load the transaction dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("balanced_dataset_50_50.csv")

In [3]:
# Preview the first five rows to check which columns were loaded.
data.head()

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,TRANSFER,129526.58,C1786736565,0.0,0.0,C1189777296,2103690.88,2233217.46,0.0
1,CASH_IN,304210.97,C2138211913,6638620.67,6942831.64,C1766776273,513871.6,209660.63,0.0
2,CASH_OUT,57905.63,C393598530,188.0,0.0,C769319919,611506.02,669411.65,0.0
3,CASH_IN,83102.7,C807731926,2274365.81,2357468.51,C322340400,122204.31,39101.61,0.0
4,PAYMENT,14341.19,C603392505,459181.0,444839.81,M1570710058,0.0,0.0,0.0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
count,5070.0,5070.0,5070.0,5070.0,5070.0,5070.0
mean,730543.7,1087883.0,452310.1,778625.3,1165501.0,0.5
std,1605670.0,2672994.0,2136752.0,2252226.0,2612199.0,0.500049
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,33617.76,10119.47,0.0,0.0,0.0,0.0
50%,167715.5,113339.0,0.0,0.0,143878.9,0.5
75%,522343.8,798007.4,0.0,553355.6,1152234.0,1.0
max,10000000.0,37670040.0,36365350.0,32961660.0,34645700.0,1.0


In [5]:
# Drop the two account-name columns so they are not carried into the feature matrix.
# This cell rebinds `data` from itself, so re-running it without reloading the CSV
# would raise KeyError on the already-removed columns; errors='ignore' makes the
# cell safe to re-run (only labels that are still present are dropped).
data = data.drop(columns=['nameOrig', 'nameDest'], errors='ignore')

In [6]:
# One-hot encode the categorical 'type' column.
# drop='first' omits the first category's indicator column; sparse_output=False
# returns a dense array instead of a sparse matrix.
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Learn the categories, then produce the indicator columns for every row.
encoded_type = encoder.fit(data[['type']]).transform(data[['type']])

In [7]:
# Wrap the encoded array in a DataFrame that shares `data`'s index, so the two
# frames line up row-for-row when concatenated.
encoded_type_df = pd.DataFrame(
    data=encoded_type,
    index=data.index,
    columns=encoder.get_feature_names_out(['type']),
)

# Swap the raw 'type' column for its one-hot indicator columns.
data_encoded = pd.concat(
    [data.drop(columns='type'), encoded_type_df],
    axis='columns',
)

# Show the first rows as a sanity check on the encoded frame.
print(data_encoded.head())

      amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  newbalanceDest  \
0  129526.58           0.00            0.00      2103690.88      2233217.46   
1  304210.97     6638620.67      6942831.64       513871.60       209660.63   
2   57905.63         188.00            0.00       611506.02       669411.65   
3   83102.70     2274365.81      2357468.51       122204.31        39101.61   
4   14341.19      459181.00       444839.81            0.00            0.00   

   isFraud  type_CASH_OUT  type_DEBIT  type_PAYMENT  type_TRANSFER  
0      0.0            0.0         0.0           0.0            1.0  
1      0.0            0.0         0.0           0.0            0.0  
2      0.0            1.0         0.0           0.0            0.0  
3      0.0            0.0         0.0           0.0            0.0  
4      0.0            0.0         0.0           1.0            0.0  


In [8]:
# Target is the fraud flag; features are every other column.
y = data_encoded['isFraud']
X = data_encoded.drop(columns='isFraud')

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise the features for the scale-sensitive models (SVM and KNN).
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Decision Tree

In [9]:
# Fit a decision tree on the unscaled features; the seed makes the fit repeatable.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_dt = dt_model.predict(X_test)

# Per-class precision/recall/F1 followed by overall accuracy.
print(
    "Decision Tree Classification Report:\n",
    classification_report(y_test, y_pred_dt),
)
print(
    "Decision Tree Accuracy: ",
    accuracy_score(y_test, y_pred_dt),
)

Decision Tree Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.98      0.99       774
         1.0       0.98      0.99      0.99       747

    accuracy                           0.99      1521
   macro avg       0.99      0.99      0.99      1521
weighted avg       0.99      0.99      0.99      1521

Decision Tree Accuracy:  0.9861932938856016


In [11]:
# Confusion matrix from the hard predictions (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred_dt)

# ROC-AUC is scored on the predicted probability of the positive class (column 1).
dt_roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=dt_model.predict_proba(X_test)[:, 1],
)

print(f"Decision Tree ROC-AUC: {dt_roc_auc}")
print("Confusion Matrix:\n", conf_matrix)

Decision Tree ROC-AUC: 0.9863173624731485
Confusion Matrix:
 [[758  16]
 [  5 742]]


Logistic Regression

In [12]:
# Fit logistic regression on the standardised features, allowing up to 1000 solver iterations.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Predicted labels for the held-out split.
y_pred_lr = lr_model.predict(X_test_scaled)

# Per-class precision/recall/F1 followed by overall accuracy.
print(
    "Logistic Regression Classification Report:\n",
    classification_report(y_test, y_pred_lr),
)
print(
    "Logistic Regression Accuracy: ",
    accuracy_score(y_test, y_pred_lr),
)

Logistic Regression Classification Report:
               precision    recall  f1-score   support

         0.0       0.92      0.92      0.92       774
         1.0       0.92      0.92      0.92       747

    accuracy                           0.92      1521
   macro avg       0.92      0.92      0.92      1521
weighted avg       0.92      0.92      0.92      1521

Logistic Regression Accuracy:  0.9184746877054569


In [13]:
# Confusion matrix from the hard predictions (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred_lr)

# ROC-AUC is scored on the predicted probability of the positive class (column 1).
lr_roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=lr_model.predict_proba(X_test_scaled)[:, 1],
)

print(f"Logistic Regression ROC-AUC: {lr_roc_auc}")
print("Confusion Matrix:\n", conf_matrix)

Logistic Regression ROC-AUC: 0.9757894627606031
Confusion Matrix:
 [[711  63]
 [ 61 686]]


Support Vector Machine

In [14]:
# Linear-kernel SVM on the standardised features (the kernel type can be changed).
# probability=True enables predict_proba, which the ROC-AUC cell below relies on.
# Those probability estimates come from an internal cross-validation that shuffles
# the data, so random_state is fixed here to make them repeatable from run to run,
# matching the seed used for the train/test split and the decision tree.
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Predictions
y_pred_svm = svm_model.predict(X_test_scaled)

# Evaluation: per-class precision/recall/F1 followed by overall accuracy.
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))
print("SVM Accuracy: ", accuracy_score(y_test, y_pred_svm))

SVM Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.95      0.90       774
         1.0       0.94      0.84      0.89       747

    accuracy                           0.90      1521
   macro avg       0.90      0.90      0.90      1521
weighted avg       0.90      0.90      0.90      1521

SVM Accuracy:  0.8961209730440499


In [15]:
# Confusion matrix from the hard predictions (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred_svm)

# ROC-AUC is scored on the predicted probability of the positive class (column 1).
svm_roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=svm_model.predict_proba(X_test_scaled)[:, 1],
)

print(f"SVM ROC-AUC: {svm_roc_auc}")
print("Confusion Matrix:\n", conf_matrix)

SVM ROC-AUC: 0.9786674691876895
Confusion Matrix:
 [[736  38]
 [120 627]]


K-Nearest Neighbour

In [16]:
# k-nearest-neighbours on the standardised features; n_neighbors is a tunable choice.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Predicted labels for the held-out split.
y_pred_knn = knn_model.predict(X_test_scaled)

# Per-class precision/recall/F1 followed by overall accuracy.
print(
    "KNN Classification Report:\n",
    classification_report(y_test, y_pred_knn),
)
print(
    "KNN Accuracy: ",
    accuracy_score(y_test, y_pred_knn),
)

KNN Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      0.93      0.94       774
         1.0       0.93      0.95      0.94       747

    accuracy                           0.94      1521
   macro avg       0.94      0.94      0.94      1521
weighted avg       0.94      0.94      0.94      1521

KNN Accuracy:  0.9388560157790927


In [17]:
# Confusion matrix from the hard predictions (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred_knn)

# ROC-AUC is scored on the predicted probability of the positive class (column 1).
knn_roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=knn_model.predict_proba(X_test_scaled)[:, 1],
)

print(f"KNN ROC-AUC: {knn_roc_auc}")
print("Confusion Matrix:\n", conf_matrix)

KNN ROC-AUC: 0.9820150541874647
Confusion Matrix:
 [[721  53]
 [ 40 707]]


In [18]:
# Persist the fitted decision tree so it can be reloaded without retraining.
# Only the tree itself is written; the fitted OneHotEncoder used to build its
# input columns is not part of this file.
# (`pickle` is imported with the other imports in the first cell.)
with open('Fraud_detection_dtree.pkl', 'wb') as f:
    pickle.dump(dt_model, f)

In [19]:
# Reload the saved decision tree. The context manager closes the file handle as
# soon as the model has been read, instead of leaving it open until garbage collection.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by a trusted source (here, the cell above).
with open('Fraud_detection_dtree.pkl', 'rb') as f:
    load_model = pickle.load(f)

In [20]:
# Score the reloaded model on the held-out split; the value should equal the
# decision-tree accuracy printed earlier, confirming the pickle round-trip.
model_score = load_model.score(X=X_test, y=y_test)
model_score

0.9861932938856016