Import Libraries

In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


Load Dataset

In [11]:
# Load the transactions used for the rest of the notebook.
# Only fraudTrain.csv is read: a previous read of /content/fraudTest.csv into
# the same `df` name was overwritten immediately, so it contributed nothing
# except an extra file read (and a hard failure when that file was absent).
df = pd.read_csv("/content/sample_data/fraudTrain.csv")

# Preview the first rows to sanity-check the schema.
df.head()


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495.0,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376000.0,36.011293,-82.048315,0.0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149.0,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376000.0,49.159047,-118.186462,0.0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154.0,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376000.0,43.150704,-112.154481,0.0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939.0,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376000.0,47.034331,-112.561071,0.0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99.0,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376000.0,38.674999,-78.632459,0.0


Data Exploration (Important for Fraud Detection)

In [13]:
# Relative frequency of each `is_fraud` label (proportions, not raw counts).
fraud_labels = df["is_fraud"]
fraud_labels.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
is_fraud,Unnamed: 1_level_1
0.0,0.99008
1.0,0.00992


Feature Scaling

In [16]:
# Standardise the continuous columns with ONE scaler fitted on both at once.
# StandardScaler works column-by-column, so the values written back to `df`
# are the same as scaling each column separately, but `scaler` now keeps the
# mean/scale of *both* columns. Refitting the same object per column threw
# away the `amt` statistics, leaving it unusable for transforming new data.
scaled_columns = ["amt", "unix_time"]

scaler = StandardScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

# NOTE(review): the scaler is fitted on the whole frame, before the
# train/test split performed later in the notebook, so held-out rows
# contribute to the scaling statistics. For scale-sensitive models, fit on
# the training split only.

Train-Test Split

In [23]:
# Identifier, free-text and date columns that are not used as model inputs.
# Names that are absent from the frame are skipped (errors="ignore").
columns_to_drop = [
    "Unnamed: 0",
    "trans_date_trans_time",
    "cc_num",
    "merchant",
    "category",
    "first",
    "last",
    "gender",
    "street",
    "city",
    "state",
    "job",
    "dob",
    "trans_num",
]

# Rows without a label cannot be used for supervised learning.
df_cleaned = df.dropna(subset=["is_fraud"])
y = df_cleaned["is_fraud"]

# Features: everything except the target and the dropped columns,
# restricted to numeric dtypes.
X = (
    df_cleaned
    .drop("is_fraud", axis=1)
    .drop(columns=columns_to_drop, errors="ignore")
    .select_dtypes(include=np.number)
)

# 80/20 split, stratified on the label so both parts keep the same class
# proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

Train a Decision Tree Model

In [24]:
# Depth-limited decision tree with balanced class weights and a fixed seed.
dt = DecisionTreeClassifier(
    random_state=42,
    class_weight="balanced",
    max_depth=6,
)
dt.fit(X_train, y_train)

# Score for the second class (column 1 of predict_proba) feeds ROC-AUC;
# hard predictions feed the per-class report.
y_prob_dt = dt.predict_proba(X_test)[:, 1]
y_pred_dt = dt.predict(X_test)

dt_roc_auc = roc_auc_score(y_test, y_prob_dt)
dt_report = classification_report(y_test, y_pred_dt)

print("Decision Tree")
print("ROC-AUC:", dt_roc_auc)
print(dt_report)

Decision Tree
ROC-AUC: 0.9416355487218109
              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98      6168
         1.0       0.18      0.81      0.30        62

    accuracy                           0.96      6230
   macro avg       0.59      0.89      0.64      6230
weighted avg       0.99      0.96      0.97      6230

