In [None]:
# Step 1: Import Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier




In [None]:
# Step 2: Load Data
# Each frame is read from the file whose name matches it: the training CSV goes
# into train_df and the test CSV into test_df (the two paths were previously swapped).
train_df = pd.read_csv("/content/fraudTrain.csv")
test_df = pd.read_csv("/content/fraudTest.csv")



In [None]:
# Step 3: Combine train and test data for uniform preprocessing
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it,
# each source frame contributes its own 0-based labels and the resulting index
# contains duplicates, which makes label-based selection ambiguous.
df = pd.concat([train_df, test_df], ignore_index=True)



In [None]:
# Preview the first five rows of the combined frame as a quick sanity check.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497.0,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371817000.0,33.986391,-81.200714,0.0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302.0,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371817000.0,39.450498,-109.960431,0.0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496.0,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371817000.0,40.49581,-74.196111,0.0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767.0,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371817000.0,28.812398,-80.883061,0.0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126.0,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371817000.0,44.959148,-85.884734,0.0


In [None]:
# Step 4: Drop unnecessary columns
# These columns are excluded from the feature set before modelling.
columns_to_drop = ['Unnamed: 0', 'trans_num', 'dob', 'trans_date_trans_time', 'cc_num',
                   'first', 'last', 'street']
# Reassign rather than mutate in place, and skip labels that are already gone,
# so re-running this cell on an already-trimmed frame does not raise KeyError.
# NOTE: errors='ignore' also hides misspelled column names; double-check the list
# above when editing it.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Step 5: Encode categorical features
# Every listed column is replaced by the integer codes produced by its own
# LabelEncoder; the fitted encoders are kept in `label_encoders`, keyed by
# column name, so the codes can be mapped back to the original values later.
categorical_cols = ['merchant', 'category', 'gender', 'city', 'state', 'job']
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])


In [None]:
# Step 6: Prepare features and target
# Rows with a missing label are removed first; the label column then becomes
# `y` and all remaining columns become the feature matrix `X`.
target_col = 'is_fraud'
df.dropna(subset=[target_col], inplace=True)
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# Step 7: Create a hold-out split of the combined data
# 20% of the rows are held out for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)



In [None]:
# Step 8: Define models
# Maps a display name to an unfitted estimator; every value exposes
# fit / predict / predict_proba.
#
# - The tree and the forest get an explicit random_state so repeated runs
#   (e.g. Restart & Run All) produce the same models and metrics.
# - Logistic regression is wrapped in a pipeline that standardises the features
#   first. The feature columns span very different magnitudes (small amounts and
#   coordinates next to Unix timestamps and population counts), which hampers the
#   gradient-based solver. The scaler is fitted only on the data passed to fit(),
#   so no test-set statistics leak into training.
models = {
    "Logistic Regression": make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=1000),
    ),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
}



In [None]:
# Step 9: Train and Evaluate
# Fit each model on the training split, then report the confusion matrix,
# per-class precision/recall/F1 and ROC-AUC on the held-out split.
for name, model in models.items():
    print(f"\n===== {name} =====")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class.
    y_proba = model.predict_proba(X_test)[:, 1]

    # Evaluation
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report:")
    # zero_division=0 reports 0.0 for a class that is never predicted instead of
    # emitting an UndefinedMetricWarning for each affected metric.
    print(classification_report(y_test, y_pred, zero_division=0))

    print(f"ROC-AUC Score: {roc_auc_score(y_test, y_proba):.4f}")


===== Logistic Regression =====
Confusion Matrix:
[[107954      0]
 [   651      0]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00    107954
         1.0       0.00      0.00      0.00       651

    accuracy                           0.99    108605
   macro avg       0.50      0.50      0.50    108605
weighted avg       0.99      0.99      0.99    108605

ROC-AUC Score: 0.5660

===== Decision Tree =====


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Confusion Matrix:
[[107833    121]
 [   230    421]]

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    107954
         1.0       0.78      0.65      0.71       651

    accuracy                           1.00    108605
   macro avg       0.89      0.82      0.85    108605
weighted avg       1.00      1.00      1.00    108605

ROC-AUC Score: 0.9421

===== Random Forest =====
Confusion Matrix:
[[107927     27]
 [   392    259]]

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    107954
         1.0       0.91      0.40      0.55       651

    accuracy                           1.00    108605
   macro avg       0.95      0.70      0.78    108605
weighted avg       1.00      1.00      1.00    108605

ROC-AUC Score: 0.9744
