In [9]:
!pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
                                              0.0/235.6 kB ? eta -:--:--
     -                                        10.2/235.6 kB ? eta -:--:--
     ----                                  30.7/235.6 kB 435.7 kB/s eta 0:00:01
     ------                                41.0/235.6 kB 326.8 kB/s eta 0:00:01
     ------------                          81.9/235.6 kB 459.5 kB/s eta 0:00:01
     --------------                        92.2/235.6 kB 374.1 kB/s eta 0:00:01
     -----------------                    112.6/235.6 kB 386.4 kB/s eta 0:00:01
     -----------------                    112.6/235.6 kB 386.4 kB/s eta 0:00:01
     ------------------                   122.9/235.6 kB 343.4 kB/s eta 0:00:01
     ---------------------                143.4/235.6 kB 315.4 kB/s eta 0:00:01
     -------------------------            163.8/235.6 kB 339.1 kB/s eta 0:00:01
     -----------------------------       


[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE



In [13]:
# dataset taken from kaggle: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv("creditcard.csv")

# dataset information
credit_card_data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [14]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]



In [15]:
# This Dataset is highly unbalanced
# 0 --> Normal Transaction
# 1 --> Fraudulent transaction

# statistical measures of the data
legit.Amount.describe()
fraud.Amount.describe()

# compare the values for both transactions
credit_card_data.groupby('Class').mean()



Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [16]:
'''
Using SMOTE (Synthetic Minority Over-sampling Technique)
'''

# Split the data into Features & Targets
X = credit_card_data.drop(columns='Class', axis=1)
Y = credit_card_data['Class']



In [17]:
# Apply SMOTE to balance the dataset
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, Y_resampled = smote.fit_resample(X, Y)

# Split the resampled data into Features & Targets
X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=2)

print(X_resampled.shape, X_train.shape, X_test.shape)



(568630, 30) (454904, 30) (113726, 30)


In [18]:
# Model Training using Logistic Regression
model = LogisticRegression()

# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)



In [19]:
# Model Evaluation

# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy on Training data : ', training_data_accuracy)



Accuracy on Training data :  0.957736137734555


In [20]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score on Test Data : ', test_data_accuracy)


Accuracy score on Test Data :  0.9575734660499797
