In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [86]:
df=pd.read_csv("credit_card_fraud_dataset (1).csv")

In [87]:
df.head(15)

Unnamed: 0,TransactionID,TransactionDate,Amount,MerchantID,TransactionType,Location,IsFraud
0,1,2024-04-03 14:15:35.462794,4189.27,688,refund,San Antonio,0
1,2,2024-03-19 13:20:35.462824,2659.71,109,refund,Dallas,0
2,3,2024-01-08 10:08:35.462834,784.0,394,purchase,New York,0
3,4,2024-04-13 23:50:35.462850,3514.4,944,purchase,Philadelphia,0
4,5,2024-07-12 18:51:35.462858,369.07,475,purchase,Phoenix,0
5,6,2024-08-30 01:52:35.462867,3086.22,302,purchase,New York,0
6,7,2024-01-02 11:31:35.462875,1466.9,902,refund,Philadelphia,0
7,8,2024-05-12 12:25:35.462885,917.21,266,purchase,New York,0
8,9,2023-11-16 02:20:35.462894,1625.73,32,refund,Chicago,0
9,10,2023-10-24 06:51:35.462901,662.07,193,purchase,Dallas,0


In [88]:
df.columns

Index(['TransactionID', 'TransactionDate', 'Amount', 'MerchantID',
       'TransactionType', 'Location', 'IsFraud'],
      dtype='object')

In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   TransactionID    100000 non-null  int64  
 1   TransactionDate  100000 non-null  object 
 2   Amount           100000 non-null  float64
 3   MerchantID       100000 non-null  int64  
 4   TransactionType  100000 non-null  object 
 5   Location         100000 non-null  object 
 6   IsFraud          100000 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 5.3+ MB


In [90]:
df.isnull().sum()

TransactionID      0
TransactionDate    0
Amount             0
MerchantID         0
TransactionType    0
Location           0
IsFraud            0
dtype: int64

In [91]:
df.describe()

Unnamed: 0,TransactionID,Amount,MerchantID,IsFraud
count,100000.0,100000.0,100000.0,100000.0
mean,50000.5,2497.092666,501.67607,0.01
std,28867.657797,1442.415999,288.715868,0.099499
min,1.0,1.05,1.0,0.0
25%,25000.75,1247.955,252.0,0.0
50%,50000.5,2496.5,503.0,0.0
75%,75000.25,3743.5925,753.0,0.0
max,100000.0,4999.77,1000.0,1.0


In [92]:
df.columns=df.columns.str.strip()

In [93]:
df.drop(columns=["TransactionID"],inplace=True)

In [94]:
df.columns

Index(['TransactionDate', 'Amount', 'MerchantID', 'TransactionType',
       'Location', 'IsFraud'],
      dtype='object')

In [95]:
df['MerchantID'].value_counts()

MerchantID
461    149
188    134
192    129
517    128
820    126
      ... 
119     75
936     74
968     73
635     71
512     69
Name: count, Length: 1000, dtype: int64

In [96]:
df["IsFraud"].value_counts() 

IsFraud
0    99000
1     1000
Name: count, dtype: int64

In [97]:
from sklearn.preprocessing import LabelEncoder
from imblearn.combine import SMOTETomek

df_encoded = df.copy()

le = LabelEncoder()
df_encoded["TransactionType"] = le.fit_transform(df_encoded["TransactionType"])
df_encoded["Location"] = le.fit_transform(df_encoded["Location"])

X = df_encoded.drop(columns=["IsFraud", "TransactionDate"]) 
y = df_encoded["IsFraud"]

st = SMOTETomek(random_state=42)
X_resample, y_resample = st.fit_resample(X, y)

bal_df = pd.concat([pd.DataFrame(X_resample, columns=X.columns), pd.DataFrame(y_resample, columns=["IsFraud"])], axis=1)


In [98]:
bal_df["IsFraud"].value_counts()

IsFraud
0    96495
1    96495
Name: count, dtype: int64

In [99]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_resample, y_resample, test_size=0.2, random_state=42)
print(X_train.shape,X_test.shape)

(154392, 4) (38598, 4)


In [100]:
from sklearn.linear_model import LogisticRegression
lg=LogisticRegression(random_state=42,max_iter=1000)
lg.fit(X_train,y_train)
y_pred=lg.predict(X_test)

In [101]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.50      0.57     19283
           1       0.60      0.74      0.66     19315

    accuracy                           0.62     38598
   macro avg       0.63      0.62      0.61     38598
weighted avg       0.63      0.62      0.61     38598



In [102]:
lrg = LogisticRegression(C=0.5, class_weight='balanced', max_iter=2000, solver='liblinear',random_state=42)
lrg.fit(X_train,y_train)
y_pred=lrg.predict(X_test)

In [103]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.50      0.57     19283
           1       0.60      0.74      0.66     19315

    accuracy                           0.62     38598
   macro avg       0.63      0.62      0.61     38598
weighted avg       0.63      0.62      0.61     38598



In [104]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)

In [105]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.92      0.94     19283
           1       0.92      0.98      0.95     19315

    accuracy                           0.95     38598
   macro avg       0.95      0.95      0.95     38598
weighted avg       0.95      0.95      0.95     38598



In [106]:
import xgboost as xgb
from sklearn.metrics import classification_report
xgb = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.80      0.69      0.74     19283
           1       0.73      0.83      0.77     19315

    accuracy                           0.76     38598
   macro avg       0.76      0.76      0.76     38598
weighted avg       0.76      0.76      0.76     38598



In [None]:
# so RandomForest is Best

In [107]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='f1')

In [108]:
cv_scores

array([0.94180225, 0.94233005, 0.94111736, 0.93984373, 0.94127612])

In [110]:
print(cv_scores.mean())
print(cv_scores.std())

0.941273900978439
0.0008324095752953302
