### Importing the necessary libraries.

In [1]:
import numpy as np
import pandas as pd

### Calling the dataset

In [2]:
data = pd.read_csv("Data/creditcard.csv")

data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
fraud_data = data[data["Class"] == 1]
not_fraud_data = data[data["Class"] == 0]

In [4]:
fraud_data.shape

(492, 31)

In [5]:
not_fraud_data.shape

(284315, 31)

Not to create too much synthetic data i will just be using a small amount of `not_fraud_data`.

As the number of rows in the `fraud_data` is 492, I will be selecting almost 800-900 of the data from the `not_fraud_data` and then SMOTE technique to balance the data.

In [6]:
not_fraud_data_copy = not_fraud_data.sample(n = 900).reset_index(drop = True).copy()

not_fraud_data_copy.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,33914.0,-1.16243,-0.515632,1.28108,-0.863549,1.147358,-1.298972,-0.111304,0.120593,-0.336653,...,0.300579,0.459128,0.001485,0.071313,-0.313706,0.894794,-0.01183,0.139288,12.3,0
1,163838.0,-0.838066,0.378655,0.525694,-0.339326,0.242388,-0.849044,-0.096197,0.305521,0.300952,...,-0.03599,-0.137936,0.151155,-0.047959,-1.299941,-0.411042,-0.184632,-0.03007,0.77,0
2,135453.0,0.035969,0.71648,-0.789469,-0.818106,0.567122,-0.136554,0.370656,0.524739,-0.181881,...,-0.130111,-0.591133,0.257792,0.036999,-1.038187,0.019466,-0.077722,0.020097,15.26,0
3,75831.0,-3.276723,2.802009,0.002695,-2.028004,-0.8163,-0.780616,0.203361,0.468559,2.255668,...,-0.567904,-0.357601,0.000489,-0.024737,0.154943,0.58419,-0.435045,-1.155967,1.46,0
4,33710.0,1.143119,-1.262769,0.347648,-1.862221,-1.606407,-1.041869,-0.499474,-0.266222,0.505258,...,-0.298084,-0.416984,-0.122111,0.442487,0.407319,-0.014352,0.029548,0.047321,141.33,0


In [7]:
data_new = pd.concat([fraud_data, not_fraud_data_copy], axis = 0).reset_index(drop = True)

In [8]:
data_new.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,0.0,1
1,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.0,1
2,4462.0,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029,239.93,1
3,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.0,1
4,7519.0,1.234235,3.01974,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.0,1


In [9]:
data_new.drop("Time", axis = 1, inplace = True)

In [10]:
X = data_new.drop("Class", axis = 1)
y = data_new["Class"]

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [14]:
from imblearn.over_sampling import SMOTE

In [15]:
smote = SMOTE(sampling_strategy = "auto", random_state = 42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [16]:
X_train_resampled.shape, y_train_resampled.shape

((1438, 29), (1438,))

In [19]:
y_train_resampled.value_counts()

Class
0    719
1    719
Name: count, dtype: int64

In [20]:
from sklearn.linear_model import LogisticRegression

In [25]:
lr = LogisticRegression(max_iter = 5000)

In [26]:
lr.fit(X_train_resampled, y_train_resampled)

In [27]:
pred = lr.predict(X_test)

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
accuracy_score(y_test, pred)

0.931899641577061