In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# To get score for the accuracy
from sklearn.metrics import accuracy_score 

In [5]:
# Read the credit-card transactions CSV into a DataFrame.
credit_data = pd.read_csv(
    filepath_or_buffer='../Data Manipulation/creditcard.csv'
)

In [6]:
# Preview the first five rows of the dataset.
credit_data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [7]:
# Print a structural summary of the frame: index range, column names,
# non-null counts and dtypes.
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [10]:
# Count the missing values in every column (isna is the same check as isnull).
credit_data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [11]:
# Number of rows per label: legitimate vs. fraudulent transactions.
credit_data.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

This dataset is highly imbalanced.

0 - Normal Transaction
1 - Fraudulent Transaction

In [12]:
# Partition the transactions by label: Class 0 -> legit, Class 1 -> fraud.
legit = credit_data.loc[credit_data['Class'] == 0]
fraud = credit_data.loc[credit_data['Class'] == 1]

In [13]:
# Show (rows, columns) of each subset, one per line.
print(legit.shape, fraud.shape, sep='\n')

(284315, 31)
(492, 31)


In [14]:
# Summary statistics of the Amount column for legitimate transactions:
#   count       - number of legitimate transactions
#   mean        - average transaction amount (about 88 in this cell's output)
#   25%/50%/75% - quartiles, e.g. 25% of the amounts are at or below the 25% value
legit['Amount'].describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [15]:
# Same summary for fraudulent transactions; in this cell's output the mean
# amount is about 122, i.e. higher than the legitimate mean of about 88.
fraud['Amount'].describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [16]:
# Per-class mean of every column, to compare legitimate and fraudulent rows.
credit_data.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


Under-Sampling

Build a sample dataset containing a similar distribution of normal
and fraudulent transactions, by randomly sampling
the normal (majority) class down to the size of the fraudulent class.

In [18]:
# Under-sample the majority class: draw exactly as many legitimate rows as
# there are fraud rows, so the two classes end up balanced.
# random_state pins the draw so Restart & Run All reproduces the same sample
# (and therefore the same downstream split and accuracy figures).
legit_sample = legit.sample(n=len(fraud), random_state=2)

Concatenating the two DataFrames

In [21]:
# Stack the two frames row-wise: the fraud rows are appended below the
# sampled legitimate rows.
new_dataset = pd.concat(objs=[legit_sample, fraud], axis='index')

In [22]:
# First five rows of the balanced dataset (these come from the legit sample).
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
19290,30150.0,1.171659,-0.115815,-0.324692,0.167111,-0.370738,-1.414698,0.462023,-0.353034,0.013859,...,-0.01401,-0.247007,-0.140111,0.453731,0.484069,1.06555,-0.135696,0.00214,84.95,0
230981,146563.0,0.699057,1.785158,-3.347184,4.389804,2.485139,5.366735,-2.76574,-4.207392,-2.025248,...,-2.698638,0.267023,0.249773,0.402444,0.10749,0.37057,0.019633,0.065897,98.87,0
152514,97398.0,1.929077,-0.432233,0.159215,0.349952,-0.670575,0.27463,-1.077094,0.138555,2.281403,...,-0.055895,0.189939,0.344788,0.749888,-0.614865,0.383762,-0.043959,-0.04414,15.95,0
106747,70104.0,1.298767,-0.811443,1.111912,-0.552368,-1.423069,0.153369,-1.163611,0.167881,-0.362288,...,-0.440877,-0.621347,0.08122,0.061458,0.039439,0.960169,-0.006524,0.010262,10.9,0
237757,149377.0,1.613477,-2.066645,-2.593614,-2.853373,-0.082514,0.173646,-0.035059,0.000511,0.693167,...,0.001701,0.167422,-0.183338,-1.646751,0.06452,-0.612706,0.022833,-0.040129,279.05,0


In [23]:
# Last five rows of the balanced dataset (these come from the fraud rows).
new_dataset.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.88285,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.29268,0.147968,390.0,1
280143,169347.0,1.378559,1.289381,-5.004247,1.41185,0.442581,-1.326536,-1.41317,0.248525,-1.127396,...,0.370612,0.028234,-0.14564,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.2137,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.65225,...,0.751826,0.834108,0.190944,0.03207,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.39973,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.2537,245.0,1
281674,170348.0,1.991976,0.158476,-2.583441,0.40867,1.151147,-0.096695,0.22305,-0.068384,0.577829,...,-0.16435,-0.295135,-0.072173,-0.450261,0.313267,-0.289617,0.002988,-0.015309,42.53,1


In [24]:
# Confirm that both classes now have the same number of rows.
new_dataset.Class.value_counts()

0    492
1    492
Name: Class, dtype: int64

In [25]:
# Per-class column means of the balanced dataset, for comparison with the
# per-class means of the full dataset.
new_dataset.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,97142.29065,0.104046,0.017272,-0.101441,0.018857,0.023581,0.006362,-0.055909,0.041517,0.07102,...,0.016018,-0.029817,0.010292,-0.022511,0.008205,-0.011245,0.048082,-0.00712,0.003087,84.727764
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


Splitting data into Features and Targets

In [26]:
# Features: every column except the label. `columns=` already names the axis,
# so the redundant `axis=1` is dropped.
X = new_dataset.drop(columns=['Class'])
# Target: the Class label (0 = normal, 1 = fraudulent).
Y = new_dataset['Class']

In [27]:
# Print the feature matrix as plain text (pandas truncates long frames).
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
19290    30150.0  1.171659 -0.115815 -0.324692  0.167111 -0.370738 -1.414698   
230981  146563.0  0.699057  1.785158 -3.347184  4.389804  2.485139  5.366735   
152514   97398.0  1.929077 -0.432233  0.159215  0.349952 -0.670575  0.274630   
106747   70104.0  1.298767 -0.811443  1.111912 -0.552368 -1.423069  0.153369   
237757  149377.0  1.613477 -2.066645 -2.593614 -2.853373 -0.082514  0.173646   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [28]:
# Print the target labels (the Class column) as plain text.
print(Y)

19290     0
230981    0
152514    0
106747    0
237757    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


Split the data into Training data and testing data


In [35]:
# Hold out 20% of the rows for testing (X_test with labels Y_test); the
# remaining 80% (X_train with labels Y_train) is used for fitting.
# stratify=Y is passed so the split is stratified on the labels, and
# random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [36]:
# Shapes of the full feature matrix and of its train / test parts.
print(*(part.shape for part in (X, X_train, X_test)))

(984, 30) (787, 30) (197, 30)


Model Training - Logistic Regression (generally for binary classification)

In [40]:
# Instantiate a logistic-regression classifier with scikit-learn's default settings.
# NOTE(review): the features are not scaled (Time and Amount are far larger in
# magnitude than V1..V28), so fitting may stop at the default iteration limit —
# confirm fit() raises no ConvergenceWarning, or scale the features first.
model = LogisticRegression()

In [41]:
# Fit the classifier on the training features and their labels.
model.fit(X=X_train, y=Y_train)

In [39]:
# Sanity check: the training features and labels have the same number of rows.
print(X_train.shape[0], Y_train.shape[0])


787 787


Model Evaluation

In [42]:
# Accuracy on the training data.
# accuracy_score is documented as accuracy_score(y_true, y_pred): ground-truth
# labels go first, predictions second.
X_train_pred = model.predict(X_train)
training_data_acc = accuracy_score(Y_train, X_train_pred)

In [43]:
# Show the accuracy measured on the training split.
print('Accuracy on training data: ',training_data_acc)

Accuracy on training data:  0.928843710292249


In [44]:
# Accuracy on the held-out test data.
# accuracy_score is documented as accuracy_score(y_true, y_pred): ground-truth
# labels go first, predictions second.
X_test_pred = model.predict(X_test)
test_data_acc = accuracy_score(Y_test, X_test_pred)

In [45]:
# Report the accuracy measured on the held-out test split.
print('Accuracy on test data: ',test_data_acc)

Accuracy on training data:  0.883248730964467


Here the training accuracy is somewhat higher than the testing accuracy, hence there might be some overfitting. That is, more data is required so that the model can learn more from the data and generalize better.