In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [3]:
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


0 means Normal transaction

1 means fraud

In [4]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
39697,39927,-1.466679,2.425732,0.877724,3.950765,0.762477,1.162748,0.575625,-0.288354,-1.35799,...,0.186051,0.57101,-0.107517,-0.754306,-0.752828,0.113179,-2.079421,-0.809173,1.5,0.0
39698,39927,-0.523165,-0.100021,0.892966,-1.900405,-0.15687,-0.783894,0.917683,-0.308345,-1.305284,...,-0.082504,-0.414677,-0.063392,-0.087455,-0.303383,-0.682889,-0.178417,-0.137169,100.92,0.0
39699,39928,-2.768425,-1.007072,2.151127,0.117797,1.283178,1.869731,-0.56224,0.820374,0.348797,...,-0.182963,0.77821,0.904077,-1.288631,0.212441,0.483975,-0.027614,-0.582813,11.99,0.0
39700,39928,1.201327,0.158614,-0.325263,0.471667,0.086446,-0.770357,0.422151,-0.205277,-0.451865,...,0.027664,-0.018485,-0.199382,0.053605,0.683829,0.428416,-0.077342,-0.006394,45.0,0.0
39701,39929,1.097669,-1.315782,0.659681,-0.683915,-1.342612,0.332629,-1.1109,0.194811,-0.248825,...,,,,,,,,,,


In [5]:
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39702 entries, 0 to 39701
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    39702 non-null  int64  
 1   V1      39702 non-null  float64
 2   V2      39702 non-null  float64
 3   V3      39702 non-null  float64
 4   V4      39702 non-null  float64
 5   V5      39702 non-null  float64
 6   V6      39702 non-null  float64
 7   V7      39702 non-null  float64
 8   V8      39702 non-null  float64
 9   V9      39702 non-null  float64
 10  V10     39702 non-null  float64
 11  V11     39702 non-null  float64
 12  V12     39702 non-null  float64
 13  V13     39701 non-null  float64
 14  V14     39701 non-null  float64
 15  V15     39701 non-null  float64
 16  V16     39701 non-null  float64
 17  V17     39701 non-null  float64
 18  V18     39701 non-null  float64
 19  V19     39701 non-null  float64
 20  V20     39701 non-null  float64
 21  V21     39701 non-null  float64
 22

In [7]:
#checking missing values
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [8]:
credit_card_data.isnull().mean()*100

Time      0.000000
V1        0.000000
V2        0.000000
V3        0.000000
V4        0.000000
V5        0.000000
V6        0.000000
V7        0.000000
V8        0.000000
V9        0.000000
V10       0.000000
V11       0.000000
V12       0.000000
V13       0.002519
V14       0.002519
V15       0.002519
V16       0.002519
V17       0.002519
V18       0.002519
V19       0.002519
V20       0.002519
V21       0.002519
V22       0.002519
V23       0.002519
V24       0.002519
V25       0.002519
V26       0.002519
V27       0.002519
V28       0.002519
Amount    0.002519
Class     0.002519
dtype: float64

In [10]:
credit_card_data['Class'].value_counts()

Class
0.0    39597
1.0      104
Name: count, dtype: int64

This dataset is highly unbalenced...

In [11]:
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [12]:
print(legit.shape)
print(fraud.shape)

(39597, 31)
(104, 31)


In [13]:
legit.Amount.describe()

count    39597.000000
mean        87.420700
std        234.514383
min          0.000000
25%          7.420000
50%         23.500000
75%         79.000000
max       7879.420000
Name: Amount, dtype: float64

In [14]:
fraud.Amount.describe()

count     104.000000
mean       97.070769
std       255.012160
min         0.000000
25%         1.000000
50%         3.775000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [15]:
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,25514.725686,-0.19431,0.036967,0.744162,0.174545,-0.217414,0.107151,-0.093182,0.031783,0.222434,...,0.046047,-0.031386,-0.111706,-0.03947,0.007851,0.135895,0.022779,0.005987,0.00392,87.4207
1.0,20683.201923,-7.697309,5.766703,-10.853666,5.86585,-5.424996,-2.275717,-7.641867,3.827809,-2.924282,...,0.663205,0.626795,-0.345972,-0.342944,-0.230514,0.299242,0.176522,0.811431,0.099914,97.070769


#Under sampling ;
building a sample dataset containing similar distribution of normal and fraud transaction

No of fraud --> 104

In [16]:
legit_sample = legit.sample(n=104)

##concatneting two data frames

In [17]:
new_dataset = pd.concat([legit_sample,fraud], axis=0)

In [18]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
9521,14205,1.1911,-0.181677,1.331599,0.530762,-1.089966,-0.358657,-0.722752,-0.065079,2.219014,...,-0.253923,-0.295291,0.055707,0.425282,0.10756,0.91611,-0.05724,0.014227,20.65,0.0
3301,2847,1.189434,-0.489833,0.9935,-0.771899,-1.362919,-0.623058,-0.774331,0.152964,1.725876,...,0.191295,0.748742,-0.107897,0.547122,0.535526,-0.557624,0.085313,0.022689,1.1,0.0
24097,33067,1.109128,0.200176,0.525949,1.321606,-0.149608,0.03137,0.002317,0.089938,-0.055488,...,-0.031503,0.134624,-0.054947,0.245375,0.622119,-0.343829,0.039716,0.008706,7.63,0.0
624,472,1.040781,0.109569,0.357987,1.118998,-0.105373,-0.056837,0.055026,0.045165,-0.350573,...,0.188378,0.487631,-0.147081,0.036691,0.565457,-0.275489,0.023386,0.019021,59.88,0.0
14260,25316,1.475619,-1.197544,0.700634,-1.324846,-1.629251,-0.144181,-1.327625,0.113917,-1.473272,...,-0.504216,-0.982746,0.143941,-0.054336,0.193463,-0.420313,0.046053,0.010667,4.24,0.0


In [19]:
new_dataset['Class'].value_counts()

Class
0.0    104
1.0    104
Name: count, dtype: int64

In [20]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,24694.548077,-0.064125,-0.001338,0.778883,0.100254,-0.558989,0.384686,-0.027232,-0.153814,0.273388,...,-0.053321,-0.005366,-0.181592,-0.016573,0.039415,0.182051,0.012575,0.055768,0.004858,129.163558
1.0,20683.201923,-7.697309,5.766703,-10.853666,5.86585,-5.424996,-2.275717,-7.641867,3.827809,-2.924282,...,0.663205,0.626795,-0.345972,-0.342944,-0.230514,0.299242,0.176522,0.811431,0.099914,97.070769


In [21]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [22]:
print(X)
print(Y)

        Time        V1        V2        V3        V4        V5        V6  \
9521   14205  1.191100 -0.181677  1.331599  0.530762 -1.089966 -0.358657   
3301    2847  1.189434 -0.489833  0.993500 -0.771899 -1.362919 -0.623058   
24097  33067  1.109128  0.200176  0.525949  1.321606 -0.149608  0.031370   
624      472  1.040781  0.109569  0.357987  1.118998 -0.105373 -0.056837   
14260  25316  1.475619 -1.197544  0.700634 -1.324846 -1.629251 -0.144181   
...      ...       ...       ...       ...       ...       ...       ...   
30473  35942 -4.194074  4.382897 -5.118363  4.455230 -4.812621 -1.224645   
30496  35953 -4.844372  5.649439 -6.730396  5.252842 -4.409566 -1.740767   
31002  36170 -5.685013  5.776516 -7.064977  5.902715 -4.715564 -1.755633   
33276  37167 -7.923891 -5.198360 -3.000024  4.420666  2.272194 -3.394483   
39183  39729 -0.964567 -1.643541 -0.187727  1.158253 -2.458336  0.852222   

             V7        V8        V9  ...       V20       V21       V22  \
9521  -0.7227

#Splitting the data in train and test data

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [24]:
print(X.shape, X_train.shape, X_test.shape)

(208, 30) (166, 30) (42, 30)


###Model training : Logistic Regression

In [25]:
model = LogisticRegression()

In [26]:
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


##model evaluation


In [28]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [29]:
print('Accuracy on training data : ', training_data_accuracy)

Accuracy on training data :  0.9819277108433735


In [30]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [31]:
print('Accuracy score on test data:' , test_data_accuracy)

Accuracy score on test data: 0.9523809523809523
