In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kartik2112/fraud-detection")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'fraud-detection' dataset.
Path to dataset files: /kaggle/input/fraud-detection


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

In [3]:
data_file_path = path + "/fraudTest.csv"
df = pd.read_csv(data_file_path)

In [4]:
print("Data Head:")
print(df.head())


print("\nMissing Values Check")
print(df.isnull().sum().max())

Data Head:
   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2020-06-21 12:14:25  2291163933867244   
1           1   2020-06-21 12:14:33  3573030041201292   
2           2   2020-06-21 12:14:53  3598215285024754   
3           3   2020-06-21 12:15:15  3591919803438423   
4           4   2020-06-21 12:15:17  3526826139003047   

                               merchant        category    amt   first  \
0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28  Ashley   
3                     fraud_Haley Group        misc_pos  60.05   Brian   
4                 fraud_Johnston-Casper          travel   3.19  Nathan   

       last gender                       street  ...      lat      long  \
0   Elliott      M            351 Darlene Green  ...  33.9659  -80.9355   
1  Williams      F             3638 Marsh Union  ..

In [5]:
TARGET_COLUMN = 'is_fraud'
fraud_count = df[TARGET_COLUMN].sum()
total_count = len(df)
legit_count = total_count - fraud_count

print("\nClass Distribution ")
print(f"Total Transactions: {total_count}")
print(f"Legitimate (Class 0): {legit_count}")
print(f"Fraudulent (Class 1): {fraud_count}")
print(f"Fraud Rate: {(fraud_count / total_count) * 100:.4f}%")


Class Distribution 
Total Transactions: 555719
Legitimate (Class 0): 553574
Fraudulent (Class 1): 2145
Fraud Rate: 0.3860%


In [11]:
# FEATURE PREPARATION AND SCALING
X = df.drop(TARGET_COLUMN, axis=1)
y = df[TARGET_COLUMN]


features_to_keep = [
    'amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long', 'zip'
]


actual_features_to_keep = [col for col in features_to_keep if col in X.columns]
X = X[actual_features_to_keep]

scaler = StandardScaler()


if 'amt' in X.columns:
    X['amt'] = scaler.fit_transform(X[['amt']])
if 'unix_time' in X.columns:
    X['unix_time'] = scaler.fit_transform(X[['unix_time']])

In [7]:
# DATA SPLITTING

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")


Training set size: 444575 samples
Testing set size: 111144 samples


In [8]:
# MODEL TRAINING AND EVALUATION (MODEL 1: LOGISTIC REGRESSION)

print("       MODEL 1: LOGISTIC REGRESSION           ")

log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(X_train, y_train)
y_pred_lr = log_model.predict(X_test)


print(f"Accuracy Score: {accuracy_score(y_test, y_pred_lr):.4f}")
print("\nClassification Report (Focus on Class 1: Fraud):")
print(classification_report(y_test, y_pred_lr))


print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))

       MODEL 1: LOGISTIC REGRESSION           
Accuracy Score: 0.9959

Classification Report (Focus on Class 1: Fraud):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    110715
           1       0.00      0.00      0.00       429

    accuracy                           1.00    111144
   macro avg       0.50      0.50      0.50    111144
weighted avg       0.99      1.00      0.99    111144


Confusion Matrix:
[[110688     27]
 [   429      0]]


In [9]:
# MODEL TRAINING AND EVALUATION (MODEL 2: RANDOM FOREST)

print("         MODEL 2: RANDOM FOREST               ")

rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced')


rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy Score: {accuracy_rf:.4f}")
print("\nClassification Report (Focus on Class 1: Fraud):")
print(classification_report(y_test, y_pred_rf))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

         MODEL 2: RANDOM FOREST               
Accuracy Score: 0.9774

Classification Report (Focus on Class 1: Fraud):
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    110715
           1       0.12      0.79      0.21       429

    accuracy                           0.98    111144
   macro avg       0.56      0.88      0.60    111144
weighted avg       1.00      0.98      0.99    111144


Confusion Matrix:
[[108290   2425]
 [    92    337]]


In [10]:
# CONCLUSION

print("             PREDICTION SUMMARY               ")
print(f"Logistic Regression Recall (Class 1): {classification_report(y_test, y_pred_lr, output_dict=True)['1']['recall']:.4f}")
print(f"Random Forest Recall (Class 1):     {classification_report(y_test, y_pred_rf, output_dict=True)['1']['recall']:.4f}")

if classification_report(y_test, y_pred_rf, output_dict=True)['1']['recall'] > classification_report(y_test, y_pred_lr, output_dict=True)['1']['recall']:
    print("\nThe Random Forest model achieved a much higher Recall, which is better for catching actual fraud!")
else:
    print("\nThe Logistic Regression model performed better!")

             PREDICTION SUMMARY               
Logistic Regression Recall (Class 1): 0.0000
Random Forest Recall (Class 1):     0.7855

The Random Forest model achieved a much higher Recall, which is better for catching actual fraud!
