In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load Train & Test Data
train_df = pd.read_csv("fraudTrain.csv")
test_df = pd.read_csv("fraudTest.csv")

train_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [26]:
drop_cols = ["Unnamed: 0", "trans_date_trans_time", "first", "last", "street", "dob", "trans_num","city","state"]

In [27]:
train_df.drop(columns=drop_cols, inplace=True)
test_df.drop(columns=drop_cols, inplace=True)

In [28]:
train_df.head()

Unnamed: 0,cc_num,merchant,category,amt,gender,zip,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1325376018,36.011293,-82.048315,0
1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,99160,48.8878,-118.2105,149,Special educational needs teacher,1325376044,49.159047,-118.186462,0
2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,M,83252,42.1808,-112.262,4154,Nature conservation officer,1325376051,43.150704,-112.154481,0
3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,59632,46.2306,-112.1138,1939,Patent attorney,1325376076,47.034331,-112.561071,0
4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,M,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1325376186,38.674999,-78.632459,0


In [29]:
#Handling categorical columnns
categorical_cols = ["merchant", "category", "job", "gender"]

for col in categorical_cols:
    train_df[col], unique = pd.factorize(train_df[col])  
    test_df[col] = test_df[col].apply(lambda x: np.where(unique == x)[0][0] if x in unique else -1)

In [30]:
train_df.head()

Unnamed: 0,cc_num,merchant,category,amt,gender,zip,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud
0,2703186189652095,0,0,4.97,0,28654,36.0788,-81.1781,3495,0,1325376018,36.011293,-82.048315,0
1,630423337322,1,1,107.23,0,99160,48.8878,-118.2105,149,1,1325376044,49.159047,-118.186462,0
2,38859492057661,2,2,220.11,1,83252,42.1808,-112.262,4154,2,1325376051,43.150704,-112.154481,0
3,3534093764340240,3,3,45.0,1,59632,46.2306,-112.1138,1939,3,1325376076,47.034331,-112.561071,0
4,375534208663984,4,4,41.96,1,24433,38.4207,-79.4629,99,4,1325376186,38.674999,-78.632459,0


In [31]:
# numerical col 
scaler = StandardScaler()
numeric_cols = ["amt", "lat", "long", "city_pop", "unix_time", "merch_lat", "merch_long"]

train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])
test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])

In [32]:
X_train = train_df.drop(columns=["is_fraud"])
y_train = train_df["is_fraud"]
X_test = test_df.drop(columns=["is_fraud"])
y_test = test_df["is_fraud"]


In [33]:
# ML Algos
log=LogisticRegression()
forest=RandomForestClassifier(n_estimators=100, random_state=42)

In [34]:
log.fit(X_train,y_train)
forest.fit(X_train,y_train)

In [35]:
y_pred_log=log.predict(X_test)
y_pred_forest=forest.predict(X_test)

In [36]:
print("\nLogistic Regression:")
print(classification_report(y_test, y_pred_log))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))

print("\nRandom Forest:")
print(classification_report(y_test, y_pred_forest))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_forest))


Logistic Regression:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719

Confusion Matrix:
 [[553574      0]
 [  2145      0]]

Random Forest:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.75      0.46      0.57      2145

    accuracy                           1.00    555719
   macro avg       0.87      0.73      0.78    555719
weighted avg       1.00      1.00      1.00    555719

Confusion Matrix:
 [[553242    332]
 [  1162    983]]


In [39]:
x=accuracy_score(y_test,y_pred_log)
print("Accuracy of Log ",x)

Accuracy of Log  0.9961401355721147


In [40]:
x=accuracy_score(y_test,y_pred_forest)
print("Accuracy of Random Forest ",x)

Accuracy of Random Forest  0.9973115909299484
