# Step 1: Import the dependencies

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Step 2: Load and explore the data

In [2]:
# load the dataset
df = pd.read_csv("../data/creditcard_2023.csv")

In [3]:
#  first five rows of the dataset
df.head()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0
3,3,-0.152152,-0.508959,1.74684,-1.090178,0.249486,1.143312,0.518269,-0.06513,-0.205698,...,-0.146927,-0.038212,-0.214048,-1.893131,1.003963,-0.51595,-0.165316,0.048424,5384.44,0
4,4,-0.20682,-0.16528,1.527053,-0.448293,0.106125,0.530549,0.658849,-0.21266,1.049921,...,-0.106984,0.729727,-0.161666,0.312561,-0.414116,1.071126,0.023712,0.419117,14278.97,0


In [4]:
# last five rows of dataset
df.tail()

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
568625,568625,-0.833437,0.061886,-0.899794,0.904227,-1.002401,0.481454,-0.370393,0.189694,-0.938153,...,0.167503,0.419731,1.288249,-0.900861,0.560661,-0.006018,3.308968,0.081564,4394.16,1
568626,568626,-0.670459,-0.202896,-0.068129,-0.267328,-0.13366,0.237148,-0.016935,-0.147733,0.483894,...,0.031874,0.388161,-0.154257,-0.846452,-0.153443,1.961398,-1.528642,1.704306,4653.4,1
568627,568627,-0.311997,-0.004095,0.137526,-0.035893,-0.042291,0.121098,-0.070958,-0.019997,-0.122048,...,0.140788,0.536523,-0.2111,-0.448909,0.540073,-0.755836,-0.48754,-0.268741,23572.85,1
568628,568628,0.636871,-0.51697,-0.300889,-0.14448,0.131042,-0.294148,0.580568,-0.207723,0.893527,...,-0.060381,-0.195609,-0.175488,-0.554643,-0.099669,-1.434931,-0.159269,-0.076251,10160.83,1
568629,568629,-0.795144,0.433236,-0.64914,0.374732,-0.244976,-0.603493,-0.347613,-0.340814,0.253971,...,0.534853,-0.291514,0.157303,0.93103,-0.349423,-1.090974,-1.575113,0.722936,21493.92,1


In [None]:
# dataset information 
df.info()

In [5]:
# checking for null values
df.isna().sum()

id        0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

so, we dont have any missing values

## distribution of the legit and fraudulent transactions
to check whether a dataset is unbalanced or not.

In [6]:
df["Class"].value_counts()

Class
0    284315
1    284315
Name: count, dtype: int64

### separate the dataset for analysis
0 --> normal transaction\
1 --> fraudulent transaction

In [7]:
legit = df[df.Class == 0]
fraud = df[df.Class == 1]

In [8]:
# check the shapes
print(legit.shape)
print(fraud.shape)

(284315, 31)
(284315, 31)


In [9]:
# statistical measures of 'Amount' column of the data
legit.Amount.describe()

count    284315.000000
mean      12026.313506
std        6929.500715
min          50.120000
25%        6034.540000
50%       11996.900000
75%       18040.265000
max       24039.930000
Name: Amount, dtype: float64

* 25% (also known as the 25th percentile or lower quartile): This represents the value below which 25% of the data points fall. In other words, 25% of the data are less than this value. For this dataset, the 25th percentile is 6034.540000.

* 50% (also known as the median or second quartile): This value splits the dataset into two halves, with 50% of the data points below and 50% above. The median for this dataset is 11996.900000.

* 75% (also known as the 75th percentile or upper quartile): This represents the value below which 75% of the data points fall. In other words, 75% of the data are less than this value. The 75th percentile for this dataset is 18040.265000.

In [10]:
fraud.Amount.describe()

count    284315.000000
mean      12057.601763
std        6909.750891
min          50.010000
25%        6074.640000
50%       12062.450000
75%       18033.780000
max       24039.930000
Name: Amount, dtype: float64

In [11]:
# compare the valued for both transactions ( legit and fraud)
df.groupby("Class").mean()

Unnamed: 0_level_0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,142442.987714,0.505761,-0.491878,0.682095,-0.735981,0.338639,0.435088,0.491234,-0.144294,0.585522,...,-0.179851,-0.10964,-0.014098,-0.010255,0.130107,-0.061847,-0.071052,-0.214002,-0.102024,12026.313506
1,426186.012286,-0.505761,0.491878,-0.682095,0.735981,-0.338639,-0.435088,-0.491234,0.144294,-0.585522,...,0.179851,0.10964,0.014098,0.010255,-0.130107,0.061847,0.071052,0.214002,0.102024,12057.601763


# Step 3: Prepare the data

In [25]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [26]:
print(X.shape, y.shape)

(568630, 30) (568630,)


## scale the features
`Equal Feature Contribution`: When the features are on different scales, the optimization algorithm can be biased towards features with larger values.\
This can result in slow convergence or failure to converge. Scaling helps to give all features equal importance.

In [34]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## split the data into training and testing sets

In [35]:
# used stratify to make random distribution along y 
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, stratify = y, random_state =2)

In [36]:
print(X_train.shape, X_test.shape)

(454904, 30) (113726, 30)


# Step 4: train the logistic regression model

In [37]:
model = LogisticRegression() 

In [38]:
# fit the dato to the model
model.fit(X_train, y_train)

# Step 5: Make predictions 

In [41]:
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Step 6: Evaluate the model

In [43]:
# Accuracy Score of traint set
train_accuracy = accuracy_score(y_pred_train, y_train)
# Accurary for test set
test_accuracy = accuracy_score(y_pred_test, y_test)

In [44]:
print(f"Accuracy on trainind data: {train_accuracy}")
print(f"Accuracy on test data: {test_accuracy}")

Accuracy on trainind data: 0.9984040588783567
Accuracy on test data: 0.9982677663858748
