**Importing dataset and necessary libraries**

In [49]:
import pandas as pd

In [50]:
data=pd.read_csv("F:\Downloads\Retrieved_File.csv")

In [51]:
data.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,359,1,105733.45,105733.45,0.0,934471.58,1040205.03,1
1,625,1,249749.05,249749.05,0.0,0.0,249749.05,1
2,11,4,12461.0,12461.0,0.0,0.0,0.0,1
3,462,4,67921.66,67921.66,0.0,0.0,0.0,1
4,93,1,269545.63,269545.63,0.0,905690.55,1175236.17,1


In [52]:
print("Size of dataset: ", data.shape)

Size of dataset:  (16426, 8)


**Checking for Missing Values**

In [53]:
missing_values = data.isnull().sum()
print(missing_values)

step              0
type              0
amount            0
oldbalanceOrg     0
newbalanceOrig    0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64


In [54]:
has_missing = data.isnull().values.any()
print("\nAny missing values in DataFrame?:", has_missing)


Any missing values in DataFrame?: False


**Filling missing numerical values with mean**

In [55]:
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_columns] = data[numerical_columns].apply(lambda x: x.fillna(x.mean()))

**Normalizing Numerical features using StandardScalar**

In [56]:
from sklearn.preprocessing import StandardScaler
# Select only the numerical columns for normalization i.e, all the coloumns with float and int types.
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns

# Initialize the StandardScaler
scaler = StandardScaler()

# Apply the StandardScaler to the numerical columns  using fit_transform
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

In [57]:
data.head(10)

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,0.276483,-0.748453,-0.386318,-0.347741,-0.20833,0.032779,-0.060909,1.0
1,1.651365,-0.748453,-0.309358,-0.303809,-0.20833,-0.256064,-0.272165,1.0
2,-1.522234,1.278603,-0.436161,-0.376194,-0.20833,-0.256064,-0.338913,1.0
3,0.808862,1.278603,-0.406524,-0.359276,-0.20833,-0.256064,-0.338913,1.0
4,-1.098398,-0.748453,-0.298779,-0.29777,-0.20833,0.023883,-0.024821,1.0
5,1.351579,-0.748453,-0.325135,-0.379995,-0.20833,0.193456,0.108618,-1.0
6,0.462558,0.602917,-0.429339,-0.379995,-0.20833,-0.256064,-0.338913,-1.0
7,1.67204,-0.748453,-0.441939,-0.379492,-0.20833,-0.256064,-0.338472,1.0
8,1.336073,1.278603,-0.426658,-0.370769,-0.20833,-0.256064,-0.338913,1.0
9,-0.860637,-0.748453,-0.2089,-0.379995,-0.20833,0.540864,0.467134,-1.0


**Split the Data:**

Define the feature set (X) and the target variable (y)

In [58]:
#X contains all the columns except the target variable (isFraud).
#y contains only the target variable (isFraud).
X = data.drop(columns=['isFraud'])
y = data['isFraud']

# Convert target variable from -1 to 0 for binary classification
y = y.replace({-1: 0})

Split the data into training, validation, and test sets

In [59]:
from sklearn.model_selection import train_test_split
# Split the data into training (60%) and temporary sets (40%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
# Split the temporary set into validation (20%) and test sets (20%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

**Display splits**

In [60]:
# Display the sizes of the splits
print(f"\nTraining set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")


Training set size: 9855
Validation set size: 3285
Test set size: 3286


In [61]:
# Display the splits
print("\nTraining Set (X_train, y_train):")
print(X_train)
print(y_train)

print("\nValidation Set (X_val, y_val):")
print(X_val)
print(y_val)

print("\nTest Set (X_test, y_test):")
print(X_test)
print(y_test)


Training Set (X_train, y_train):
           step      type    amount  oldbalanceOrg  newbalanceOrig  \
6499   0.105915 -0.748453 -0.409745      -0.379995       -0.208330   
7914   1.491134 -0.748453 -0.234810      -0.261254       -0.208330   
9214  -0.111171 -0.748453 -0.047596      -0.154385       -0.208330   
7964  -0.385114 -0.748453 -0.228660      -0.257744       -0.208330   
16366 -0.855468  0.602917 -0.435071      -0.347148       -0.171638   
...         ...       ...       ...            ...             ...   
11284 -1.542908 -0.748453 -0.302579      -0.299940       -0.208330   
11964 -0.359270  1.278603  0.074814      -0.084508       -0.208330   
5390  -1.346497 -0.748453 -0.306229      -0.379995       -0.208330   
860   -0.292077  1.278603 -0.073779      -0.377183       -0.208330   
15795 -1.393015 -0.748453 -0.324758      -0.368671       -0.208330   

       oldbalanceDest  newbalanceDest  
6499        -0.039800       -0.135380  
7914        -0.177874       -0.167276  
9214 

In [62]:
print("Shapes of the splits:")
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_temp.shape)
print("Y_train shape:", y_train.shape)
print("Y_val shape:", y_val.shape)
print("Y_test shape:", y_temp.shape)
print()
print("The size of Training set is: ",X_train.shape)
print("The size of Validation set is: ",X_val.shape)
print("The size of Test set is: ",X_temp.shape)

Shapes of the splits:
X_train shape: (9855, 7)
X_val shape: (3285, 7)
X_test shape: (6571, 7)
Y_train shape: (9855,)
Y_val shape: (3285,)
Y_test shape: (6571,)

The size of Training set is:  (9855, 7)
The size of Validation set is:  (3285, 7)
The size of Test set is:  (6571, 7)


***Evaluating different models***

In [63]:
# List of models to evaluate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier

models = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('SVM', SVC(probability=True)),
    ('KNN', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('XGBoost', xgb.XGBClassifier()),
    ('LightGBM', lgb.LGBMClassifier()),
    ('CatBoost', CatBoostClassifier(verbose=0)),
    ('Neural Network', MLPClassifier())
]

# Evaluate each model
for name, model in models:
    # Load and fit model
    model.fit(X_train, y_train)
    
    # Validate model
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
    
    # Predict and evaluate accuracy and precision
    val_accuracy = accuracy_score(y_val, y_val_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    val_precision = precision_score(y_val, y_val_pred)
    test_precision = precision_score(y_test, y_test_pred)
    
    # Print results
    print(f"\nModel: {name}")
    print(f"Validation Accuracy: {val_accuracy:.2f}")
    print(f"Test Accuracy: {test_accuracy:.2f}")
    print(f"Validation Precision: {val_precision:.2f}")
    print(f"Test Precision: {test_precision:.2f}")


Model: Logistic Regression
Validation Accuracy: 0.89
Test Accuracy: 0.89
Validation Precision: 0.97
Test Precision: 0.97

Model: Decision Tree
Validation Accuracy: 0.99
Test Accuracy: 0.99
Validation Precision: 0.99
Test Precision: 0.99

Model: SVM
Validation Accuracy: 0.91
Test Accuracy: 0.90
Validation Precision: 0.96
Test Precision: 0.95

Model: KNN
Validation Accuracy: 0.93
Test Accuracy: 0.94
Validation Precision: 0.96
Test Precision: 0.95

Model: Naive Bayes
Validation Accuracy: 0.76
Test Accuracy: 0.76
Validation Precision: 0.90
Test Precision: 0.90

Model: Random Forest
Validation Accuracy: 0.99
Test Accuracy: 0.99
Validation Precision: 0.99
Test Precision: 0.98

Model: Gradient Boosting
Validation Accuracy: 0.99
Test Accuracy: 0.99
Validation Precision: 0.98
Test Precision: 0.98

Model: XGBoost
Validation Accuracy: 0.99
Test Accuracy: 0.99
Validation Precision: 0.99
Test Precision: 0.99
[LightGBM] [Info] Number of positive: 4861, number of negative: 4994
[LightGBM] [Info] Aut



In [64]:
for name, model in models:
    # Load and fit model
    model.fit(X_train, y_train)
    
    # Validate model
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test)
    
    # Predict and evaluate accuracy and precision
    val_accuracy = accuracy_score(y_val, y_val_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    val_precision = precision_score(y_val, y_val_pred)
    test_precision = precision_score(y_test, y_test_pred)
    
    # Print results
    print(f"\nModel: {name}")
    print(f"Classification Report (Validation):\n{classification_report(y_val, y_val_pred)}")
    print(f"Classification Report (Test):\n{classification_report(y_test, y_test_pred)}")


Model: Logistic Regression
Classification Report (Validation):
              precision    recall  f1-score   support

         0.0       0.83      0.98      0.90      1609
         1.0       0.97      0.81      0.89      1676

    accuracy                           0.89      3285
   macro avg       0.90      0.89      0.89      3285
weighted avg       0.90      0.89      0.89      3285

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.83      0.97      0.90      1610
         1.0       0.97      0.81      0.88      1676

    accuracy                           0.89      3286
   macro avg       0.90      0.89      0.89      3286
weighted avg       0.90      0.89      0.89      3286


Model: Decision Tree
Classification Report (Validation):
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      1609
         1.0       0.99      0.99      0.99      1676

    accuracy                  

