In [None]:
# Importing Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Load the credit-card transactions CSV into a DataFrame
dataset_path = '/content/drive/MyDrive/credit_data.csv'
cc_dataset = pd.read_csv(dataset_path)

In [None]:
# Preview the first five rows of the loaded dataset
cc_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
# (number of rows, number of columns) of the full dataset
cc_dataset.shape

(284807, 31)

In [None]:
# Tally the rows per Class label to see how the labels are distributed
class_labels = cc_dataset['Class']
class_labels.value_counts()

0    284315
1       492
Name: Class, dtype: int64

The above stats indicate that the dataset is highly imbalanced

In [None]:
# Separating the data for analysis: one frame per Class value
legit_mask = cc_dataset['Class'] == 0
fraud_mask = cc_dataset['Class'] == 1

legit = cc_dataset.loc[legit_mask]
fraud = cc_dataset.loc[fraud_mask]

In [None]:
# (rows, columns) of the Class == 0 subset
print(legit.shape)

(284315, 31)


In [None]:
# (rows, columns) of the Class == 1 subset
print(fraud.shape)

(492, 31)


In [None]:
# Summary statistics of the transaction Amount for the legit subset
legit_amounts = legit['Amount']
legit_amounts.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [None]:
# Summary statistics of the transaction Amount for the fraud subset
fraud['Amount'].describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

Under Sampling

In [None]:
# Building a sample dataset from the original dataset
# The sample takes as many legit transactions as there are fraudulent transactions, so both classes are equally represented

In [None]:
# Undersample the legit transactions down to the number of fraud rows.
# - n is derived from `fraud` instead of a hard-coded 492, so the classes stay
#   balanced even if the underlying data changes.
# - random_state pins the draw; without it every run picks different rows and
#   all downstream accuracy figures change from run to run.
legit_sample = legit.sample(n=len(fraud), random_state=2)

Concatenating the two Data Frames

In [None]:
# Stack the sampled legit rows on top of the fraud rows.
# axis=0 appends row-wise, so the column layout is unchanged.
balanced_parts = [legit_sample, fraud]
new_dataset = pd.concat(balanced_parts, axis=0)

In [None]:
# Preview the first five rows of the combined dataset
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
178938,123850.0,1.816752,-0.283535,-1.962424,0.447233,0.162073,-1.00285,0.24347,-0.197998,0.500635,...,0.28287,0.64827,-0.15972,-0.422931,0.188231,-0.109656,-0.034984,-0.027042,117.0,0
236503,148839.0,2.112102,-0.052044,-1.34925,0.251715,0.211779,-0.813731,0.183281,-0.302346,0.557713,...,-0.332637,-0.809191,0.252759,-0.649112,-0.198992,0.238152,-0.070195,-0.06659,1.29,0
247954,153743.0,-1.336058,0.312551,2.495012,-1.138258,-1.603778,0.113928,-0.936666,0.364849,0.128153,...,0.185246,0.726227,-0.303867,0.39661,0.288858,-0.006238,-0.484031,-0.132803,1.0,0
203636,134898.0,2.170718,-0.299227,-1.896208,-0.313972,-0.003933,-1.642849,0.479411,-0.671197,-1.02714,...,-0.222421,-0.029899,0.038487,0.065063,0.339609,-0.174164,-0.025948,-0.058307,29.97,0
77814,57225.0,-22.435671,-21.510079,-3.711944,6.479684,2.177843,-0.276111,3.935704,-4.871645,5.305156,...,-6.245874,1.770421,6.146762,0.369772,3.119632,1.659289,-4.327341,15.374949,318.25,0


In [None]:
# (number of rows, number of columns) of the combined dataset
new_dataset.shape

(984, 31)

In [None]:
# Confirm how many rows of each Class label the combined dataset holds
balanced_labels = new_dataset['Class']
balanced_labels.value_counts()

0    492
1    492
Name: Class, dtype: int64

Splitting the data into features and target/label

In [None]:
# Features: every column except the label; target: the label column itself
target_column = 'Class'
X = new_dataset.drop(columns=[target_column])
Y = new_dataset[target_column]

In [None]:
# Show the feature frame followed by the target series, one after the other
print(X, Y, sep="\n")

            Time         V1         V2        V3        V4        V5  \
178938  123850.0   1.816752  -0.283535 -1.962424  0.447233  0.162073   
236503  148839.0   2.112102  -0.052044 -1.349250  0.251715  0.211779   
247954  153743.0  -1.336058   0.312551  2.495012 -1.138258 -1.603778   
203636  134898.0   2.170718  -0.299227 -1.896208 -0.313972 -0.003933   
77814    57225.0 -22.435671 -21.510079 -3.711944  6.479684  2.177843   
...          ...        ...        ...       ...       ...       ...   
279863  169142.0  -1.927883   1.125653 -4.518331  1.749293 -1.566487   
280143  169347.0   1.378559   1.289381 -5.004247  1.411850  0.442581   
280149  169351.0  -0.676143   1.126366 -2.213700  0.468308 -1.120541   
281144  169966.0  -3.113832   0.585864 -5.399730  1.817092 -0.840618   
281674  170348.0   1.991976   0.158476 -2.583441  0.408670  1.151147   

              V6        V7        V8        V9  ...        V20       V21  \
178938 -1.002850  0.243470 -0.197998  0.500635  ...  -0.021

Train and Test data

In [None]:
# Hold out 20% of the rows for testing. stratify=Y keeps the class ratio the
# same in both splits, and random_state fixes the shuffle so the split repeats.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Shapes of the full feature set, the training split and the test split
print(*(frame.shape for frame in (X, X_train, X_test)))

(984, 30) (787, 30) (197, 30)


Model Training

Logistic Regression

In [None]:
# Logistic-regression classifier.
# With the default iteration cap the solver stopped before converging on these
# unscaled features ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the fitted
# coefficients were not a converged solution. Raise the cap so the optimiser
# can finish.
# NOTE(review): standardising the features before fitting is the other remedy
# the warning suggests and would likely converge much faster — consider it.
model = LogisticRegression(max_iter=10000)

In [None]:
# Fit the classifier on the training split only
model.fit(X=X_train, y=Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

Model Evaluation

In [None]:
# Accuracy on the training split: predict the rows the model was fitted on
# and compare the predictions with their true labels.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_prediction,
)

In [None]:
# Report the training-split accuracy
training_label = "Accuracy on training data is: "
print(training_label, training_data_accuracy)

Accuracy on training data is:  0.9428208386277002


In [None]:
# Accuracy on the held-out test split: predict rows the model has not seen
# and compare the predictions with their true labels.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_prediction,
)

In [None]:
# Report the test-split accuracy
test_label = "Accuracy on test data is: "
print(test_label, test_data_accuracy)

Accuracy on test data is:  0.9238578680203046


Prediction Model

In [None]:
# One raw transaction to classify: 30 values, one per feature column.
# NOTE(review): presumably ordered Time, V1..V28, Amount like the training
# features — confirm against X.columns.
input_data = (
    123525,
    -5.90492080575337, 4.43991138307409, -8.63180221299529, 7.78868440738042,
    -4.98957980058454, -1.20014396478769, -7.6740599401754, 4.12576101681016,
    -5.31577802250603, -4.89115620289902, 4.22041859352387, -6.40830067684822,
    1.32800326739361, -5.853545070898, -0.928337066900246, -7.08579817642873,
    -12.62385610867, -4.74522383798077, 2.79789150183086, -0.505607961764415,
    1.77502979034156, 1.26644115239936, -0.199409588840669, 0.0149600028407991,
    -1.90822653976519e-05, 0.544209892335915, -1.16756581297999, -1.20635432583345,
    45.51,
)

In [None]:
# Shape the single transaction as one row: predict() expects 2-D input
# of shape (n_samples, n_features).
input_nparray = np.asarray(input_data)
reshaped_array = input_nparray.reshape(1, -1)

# The model was fitted on a DataFrame, so hand it the same column labels.
# Passing the bare array makes scikit-learn warn "X does not have valid feature
# names". Building the frame also raises a ValueError if the number of values
# does not match the number of training columns.
input_frame = pd.DataFrame(reshaped_array, columns=X_train.columns)

prediction = model.predict(input_frame)
print(prediction)

# Class 0 is a legit transaction; any other predicted label is reported as fraud.
if prediction[0] == 0:
  print("Transaction is Legit")
else:
  print("Transaction is Fraud")

[1]
Transaction is Fraud


  "X does not have valid feature names, but"
