# **Credit card Fraud Detection**

1. Build a machine learning model to identify fraudulent credit card
transactions.
2. Preprocess and normalize the transaction data, handle class imbalance issues, and split the dataset into training and testing sets.
3. Train a classification algorithm, such as logistic regression or random
forests, to classify transactions as fraudulent or genuine.
4. Evaluate the model's performance using metrics like precision, recall,
and F1-score, and consider techniques like oversampling or
undersampling for improving results.

**Dataset Link -**
https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud/code

# 1. Import Libraries and Load Data


In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, f1_score
from imblearn.over_sampling import SMOTE

In [26]:
data = pd.read_csv('/content/creditcard.csv')
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47623,43281,-0.696585,1.108944,1.413769,-0.127104,0.018555,-0.455078,0.587554,0.113729,-0.121544,...,-0.239606,-0.547457,0.045935,0.029361,-0.228841,0.108686,0.388269,0.173609,5.49,0.0
47624,43282,-0.282262,-1.212401,2.174897,-0.181501,-1.604027,-0.332694,-1.154018,0.223029,0.605392,...,-0.444623,-0.224011,0.728520,0.958983,-0.712869,0.908167,-0.054453,-0.107284,10.00,0.0
47625,43282,-1.513495,-0.206282,-0.702405,1.056423,1.058711,-0.708973,0.611571,0.249949,-0.844057,...,0.089904,0.723147,1.244035,-0.526450,-0.267320,-0.336504,0.398350,-0.017047,89.99,0.0
47626,43282,-0.659193,1.270953,0.946464,-0.370180,1.619207,-0.288297,1.588317,-0.492843,-0.829508,...,-0.060858,-0.028588,-0.704004,-0.971430,1.036058,-0.329102,-0.276970,-0.183521,0.99,0.0


# 2. Data Preprocessing and Normalization

In [27]:
print(data.isnull().sum)

<bound method NDFrame._add_numeric_operations.<locals>.sum of         Time     V1     V2     V3     V4     V5     V6     V7     V8     V9  \
0      False  False  False  False  False  False  False  False  False  False   
1      False  False  False  False  False  False  False  False  False  False   
2      False  False  False  False  False  False  False  False  False  False   
3      False  False  False  False  False  False  False  False  False  False   
4      False  False  False  False  False  False  False  False  False  False   
...      ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
47623  False  False  False  False  False  False  False  False  False  False   
47624  False  False  False  False  False  False  False  False  False  False   
47625  False  False  False  False  False  False  False  False  False  False   
47626  False  False  False  False  False  False  False  False  False  False   
47627  False  False  False  False  False  False  False  False  False 

In [33]:
scaler = StandardScaler()
data['Amount'] = scaler.fit_transform(data['Amount'].values.reshape(-1, 1))

In [34]:
data = data.drop(['Time'], axis=1)

In [35]:
X = data.drop('Class', axis=1)
y = data['Class']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [38]:
print(y_train.isnull().sum())

1


In [39]:
train_data = pd.concat([X_train, y_train], axis=1)

In [40]:
train_data = train_data.dropna(subset=['Class'])

In [41]:
X_train = train_data.drop('Class', axis=1)
y_train = train_data['Class']

In [42]:
print(y_train.isnull().sum())

0


# 3. Handle Class Imbalance

In [44]:
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print(f'Original dataset shape: {y_train.value_counts()}')
print(f'Resampled dataset shape: {y_train_res.value_counts()}')

Original dataset shape: Class
0.0    33238
1.0      100
Name: count, dtype: int64
Resampled dataset shape: Class
0.0    33238
1.0    33238
Name: count, dtype: int64


# 4. Train the Classification Algorithm

Logistic Regression

In [45]:
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_res, y_train_res)

In [46]:
y_pred_logreg = logreg.predict(X_test)

Random Forest

In [51]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_res, y_train_res)

# 5. Evaluate the Model's Performance

Logistic Regression

In [52]:
y_pred_rf = rf.predict(X_test)

print("Logistic Regression Performance:")
print(confusion_matrix(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg, target_names=['Genuine', 'Fraud']))

Logistic Regression Performance:
[[14109   134]
 [    3    43]]
              precision    recall  f1-score   support

     Genuine       1.00      0.99      1.00     14243
       Fraud       0.24      0.93      0.39        46

    accuracy                           0.99     14289
   macro avg       0.62      0.96      0.69     14289
weighted avg       1.00      0.99      0.99     14289



Random Forest

In [53]:
print("Random Forest Performance:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=['Genuine', 'Fraud']))

Random Forest Performance:
[[14242     1]
 [    7    39]]
              precision    recall  f1-score   support

     Genuine       1.00      1.00      1.00     14243
       Fraud       0.97      0.85      0.91        46

    accuracy                           1.00     14289
   macro avg       0.99      0.92      0.95     14289
weighted avg       1.00      1.00      1.00     14289



# 6. Summary of Evaluation Metrics

In [54]:
precision_logreg = precision_score(y_test, y_pred_logreg)
recall_logreg = recall_score(y_test, y_pred_logreg)
f1_logreg = f1_score(y_test, y_pred_logreg)

In [57]:
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print(f"Logistic Regression: Precision: {precision_logreg}, Recall: {recall_logreg}, F1-Score: {f1_logreg}")
print(f"Random Forest: Precision: {precision_rf}, Recall: {recall_rf}, F1-Score: {f1_rf}")

Logistic Regression: Precision: 0.24293785310734464, Recall: 0.9347826086956522, F1-Score: 0.3856502242152467
Random Forest: Precision: 0.975, Recall: 0.8478260869565217, F1-Score: 0.9069767441860466
