# Importing the requried libraries

In [90]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib

# loading the dataset

In [91]:
file_path='cleaned_dataset.csv'
df=pd.read_csv(file_path)

In [92]:
print("original Dataframe")
print(df.head())

original Dataframe
   type      amount  oldbalanceOrg  newbalanceOrig  isFraud
0     1  2949339.09     2949339.09            0.00        1
1     4    63156.44         918.00        64074.44        0
2     0     4468.49           0.00            0.00        0
3     2  9887819.06     9887819.06            0.00        1
4     1    79441.66       79441.66            0.00        1


# Check for missing values

In [93]:
print("\nMissing values in each columns:")
print(df.isnull().sum())


Missing values in each columns:
type              0
amount            0
oldbalanceOrg     0
newbalanceOrig    0
isFraud           0
dtype: int64


# Fill missing numerical values with the mean

In [94]:
numerical_cols = df.select_dtypes(include=['number']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())
print("\nMissing values in each column after filling numerical columns with mean:")
print(df.isnull().sum())


Missing values in each column after filling numerical columns with mean:
type              0
amount            0
oldbalanceOrg     0
newbalanceOrig    0
isFraud           0
dtype: int64


#  Convert categorical features to numerical representations

In [95]:
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])


In [96]:
print("\nDataFrame after converting categorical features:")
print(df.head())



DataFrame after converting categorical features:
   type      amount  oldbalanceOrg  newbalanceOrig  isFraud
0     1  2949339.09     2949339.09            0.00        1
1     4    63156.44         918.00        64074.44        0
2     0     4468.49           0.00            0.00        0
3     2  9887819.06     9887819.06            0.00        1
4     1    79441.66       79441.66            0.00        1


# Normalize Numerical Features

In [97]:
# List of numerical columns to normalize
numerical_cols = df.select_dtypes(include=['number']).columns
print("\nNumerical columns to normalize:")
print(numerical_cols)


Numerical columns to normalize:
Index(['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'isFraud'], dtype='object')


In [98]:
# Initialize the scaler
scaler = StandardScaler()

In [99]:
# Initialize the scaler
scaler = StandardScaler()

In [100]:
# Convert the numpy array back to DataFrame to keep column names
df[numerical_cols] = pd.DataFrame(df[numerical_cols], columns=numerical_cols)

#  DataFrame after preprocessing

In [101]:
print("\nDataFrame after preprocessing:")
print(df.head())


DataFrame after preprocessing:
   type      amount  oldbalanceOrg  newbalanceOrig  isFraud
0     1  2949339.09     2949339.09            0.00        1
1     4    63156.44         918.00        64074.44        0
2     0     4468.49           0.00            0.00        0
3     2  9887819.06     9887819.06            0.00        1
4     1    79441.66       79441.66            0.00        1


# save the objects using piclel

In [103]:
import os
import pickle

#create the folder name and scaler file name

scaler_filename = "scaler.pkl"

#create the file path
file_path = os.path.join( scaler_filename)



# Save the scaler object to the file
with open(file_path, 'wb') as f:
  pickle.dump(scaler, f)

print(f"Standard scaler saved to {file_path}")

Standard scaler saved to scaler.pkl


# Split the Data:

In [104]:
# Define the feature set (X) and the target variable (y)
target_variable = 'isFraud'
X = df.drop(columns=[target_variable])
y = df[target_variable]

In [105]:
# Ensure the target variable is categorical
if y.dtype == 'float64' or y.dtype == 'int64':
    y = pd.cut(y, bins=len(y.unique()), labels=range(len(y.unique())))

In [106]:
# Convert the target variable to integer type if it is not already
if y.dtype != 'int64':
    y = y.astype('int64')

In [107]:
# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [108]:
# Verify the shapes of the splits
print("\nShapes of the data splits:")
print(f"Training set: X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Validation set: X_val: {X_val.shape}, y_val: {y_val.shape}")
print(f"Test set: X_test: {X_test.shape}, y_test: {y_test.shape}")


Shapes of the data splits:
Training set: X_train: (1370, 4), y_train: (1370,)
Validation set: X_val: (457, 4), y_val: (457,)
Test set: X_test: (457, 4), y_test: (457,)


#  Initialize models

# Train and evaluate each model

In [109]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),  
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC()
}
for model_name, model in models.items():
    print(f"Training {model_name}...")
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_val_pred = model.predict(X_val)
    
    # Evaluate the model
    accuracy = accuracy_score(y_val, y_val_pred)*100
    report = classification_report(y_val, y_val_pred)
    
    # Display results
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.2f}%")
    print("Classification Report:")
    print(report)
    print("-" * 60)

Training Logistic Regression...
Model: Logistic Regression
Accuracy: 93.00%
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       222
           1       0.94      0.92      0.93       235

    accuracy                           0.93       457
   macro avg       0.93      0.93      0.93       457
weighted avg       0.93      0.93      0.93       457

------------------------------------------------------------
Training Random Forest...
Model: Random Forest
Accuracy: 97.16%
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       222
           1       0.97      0.97      0.97       235

    accuracy                           0.97       457
   macro avg       0.97      0.97      0.97       457
weighted avg       0.97      0.97      0.97       457

------------------------------------------------------------
Training Support Vector Machine...
Mod

# Implement, train, and save the final model (Random Forest)

In [110]:
final_model = RandomForestClassifier()
final_model.fit(X_train, y_train)

# load the model

In [111]:
joblib.load('final_model.joblib')
print("model loaded sucessfully")

model loaded sucessfully


# save the model

In [112]:
# Save the trained model using joblib
model_filename = 'final_model.joblib'
joblib.dump(final_model, model_filename)
print(f"Trained model saved as {model_filename}")

Trained model saved as final_model.joblib


# Predict on the test data and evaluate the model

In [113]:
y_test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)*100
test_report = classification_report(y_test, y_test_pred)

In [114]:
print("\nTest Set Evaluation")
print(f"Test Accuracy: {test_accuracy:.2f}%")
print("Test Classification Report:")
print(test_report)


Test Set Evaluation
Test Accuracy: 97.37%
Test Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       224
           1       0.99      0.96      0.97       233

    accuracy                           0.97       457
   macro avg       0.97      0.97      0.97       457
weighted avg       0.97      0.97      0.97       457

