In [61]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [62]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/credit_data.csv')

In [34]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [35]:
# last 5 rows of the dataset
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
190234,128749.0,2.041966,0.174409,-1.711336,0.533765,0.156949,-1.447242,0.244206,-0.354569,0.63399,...,0.200018,0.739193,-0.012997,-0.084867,0.227029,-0.097686,-0.001726,-0.031748,6.68,0.0
190235,128749.0,-0.217502,0.913453,0.020849,-0.835175,0.54112,-0.592428,0.887042,-0.004344,-0.013578,...,-0.272937,-0.593312,0.026502,-0.429509,-0.432099,0.151735,0.354753,0.137631,13.99,0.0
190236,128750.0,2.068935,0.013847,-2.275414,0.30517,0.938897,0.151034,-0.079624,0.02415,0.625386,...,-0.057606,0.07464,-0.084997,-0.479828,0.230444,0.654925,-0.051699,-0.043789,10.0,0.0
190237,128750.0,1.991961,-2.015101,-0.926487,-1.540712,-1.580846,-0.30551,-1.254461,-0.005932,-1.073746,...,-0.052382,-0.037013,0.079387,-0.54176,-0.347793,-0.176883,-0.013358,-0.037445,158.0,0.0
190238,128750.0,1.810118,-0.7664,,,,,,,,...,,,,,,,,,,


In [63]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221880 entries, 0 to 221879
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    221880 non-null  float64
 1   V1      221880 non-null  float64
 2   V2      221880 non-null  float64
 3   V3      221880 non-null  float64
 4   V4      221880 non-null  float64
 5   V5      221880 non-null  float64
 6   V6      221880 non-null  float64
 7   V7      221880 non-null  float64
 8   V8      221880 non-null  float64
 9   V9      221880 non-null  float64
 10  V10     221880 non-null  float64
 11  V11     221880 non-null  float64
 12  V12     221880 non-null  float64
 13  V13     221880 non-null  float64
 14  V14     221880 non-null  float64
 15  V15     221880 non-null  float64
 16  V16     221880 non-null  float64
 17  V17     221880 non-null  float64
 18  V18     221880 non-null  float64
 19  V19     221880 non-null  float64
 20  V20     221880 non-null  float64
 21  V21     22

In [37]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,1
V4,1
V5,1
V6,1
V7,1
V8,1
V9,1


In [38]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,189867
1.0,371


0 --> Transaction Normal/Non Frauduleuse

1 --> Transaction Frauduleuse

In [64]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [65]:
print(legit.shape)
print(fraud.shape)

(221470, 31)
(409, 31)


In [41]:
# statistical measures of the data
legit.Amount.describe()  # en montant de chaque transaction

Unnamed: 0,Amount
count,189867.0
mean,89.230652
std,248.469236
min,0.0
25%,5.99
50%,22.81
75%,79.0
max,19656.53


In [42]:
fraud.Amount.describe()   # en montant de chaque transaction

Unnamed: 0,Amount
count,371.0
mean,115.161968
std,247.521805
min,0.0
25%,1.0
50%,11.4
75%,102.75
max,2125.87


In [43]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,67566.989724,-0.111366,0.002216,0.387288,0.077709,-0.123697,0.053608,-0.046554,0.020314,0.01749,...,0.022476,-0.019611,-0.058489,-0.017631,0.005603,0.070216,0.007815,0.001469,0.002,89.230652
1.0,58686.078167,-5.903085,4.314198,-7.85673,4.821406,-4.260383,-1.41373,-6.868933,0.714288,-2.82081,...,0.392939,0.812833,-0.027889,-0.041286,-0.067589,0.080066,0.040481,0.173826,0.05532,115.161968


Under-Sampling

Créez un échantillon de données contenant une distribution similaire de transactions normales et de transactions frauduleuses

Number of Fraudulent Transactions --> 492

In [92]:
legit_sample = legit.sample(n=409) # il faut faire le même nombre de transaction non frauduleuse que transaction frauduleuse
# on a fait çà à cause du déséquilibre dans la base de données entre les transactions normaleset les transactions frauduleuses

In [93]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)
# axis=0 pour joindre les transactions normales puis les transactions frauduleuses

In [94]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
88022,61950.0,-1.670257,1.285661,0.161664,-0.027209,0.211349,0.546477,0.250062,1.054296,-0.146733,...,-0.088211,-0.347612,-0.351968,-1.367172,0.605055,-0.37238,-0.000263,-0.054217,66.72,0.0
193647,130227.0,-0.326123,0.415902,0.947526,-1.585367,0.278824,-0.116364,0.455816,-0.08141,-2.186662,...,0.502875,1.366006,-0.579834,0.801357,0.852944,0.320539,0.01823,0.043297,15.0,0.0
142353,84665.0,-10.861828,-0.540573,0.126576,-1.030664,0.682637,0.923672,4.443706,-4.107546,8.085073,...,-3.550374,0.393551,-0.348379,-0.302154,1.737326,-0.595189,-2.586083,3.018514,2.37,0.0
170359,120144.0,-2.055088,1.083054,-1.1313,-3.466641,0.215101,-0.852692,-0.57208,-2.853374,1.239028,...,3.408957,0.058522,-0.117914,-0.478904,-0.193037,-0.28706,0.427916,0.044752,15.17,0.0
62560,50317.0,-0.379474,1.153319,1.062073,0.255905,1.356841,-0.330336,1.768379,-0.744261,-0.743575,...,-0.056096,0.187469,-0.548637,-0.591952,0.561483,-0.311998,-0.384768,-0.357081,24.19,0.0


In [95]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
219025,141565.0,0.114965,0.766762,-0.494132,0.116772,0.868169,-0.477982,0.438496,0.063073,-0.186207,...,-0.284413,-0.706865,0.131405,0.600742,-0.604264,0.262938,0.099145,0.01081,4.49,1.0
219892,141925.0,0.120301,1.974141,-0.434087,5.390793,1.289684,0.28059,0.221963,0.067827,-1.387054,...,-0.03869,0.204554,-0.167313,0.791547,-0.223675,0.473223,-0.160202,0.065039,0.76,1.0
220725,142280.0,-1.169203,1.863414,-2.515135,5.463681,-0.297971,1.364918,0.759219,-0.118861,-2.293921,...,-0.39309,-0.708692,0.471309,-0.078616,-0.544655,0.014777,-0.24093,-0.781055,324.59,1.0
221018,142394.0,-3.36777,0.099249,-6.148487,3.401955,0.458307,-1.57163,-1.358708,0.672409,-3.188001,...,0.861308,1.249301,1.850627,-0.117471,1.219815,0.000251,1.036011,0.004367,320.01,1.0
221041,142409.0,-1.172183,1.661713,-3.049637,2.555058,3.669035,-3.162998,-5.98564,-2.179935,-1.120292,...,-0.812098,-0.295361,-5.988806,0.714381,-1.600024,-0.634783,0.822713,0.494375,2.0,1.0


In [96]:
new_dataset['Class'].value_counts()
# maintenant en aura une distribution uniforme

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,409
1.0,409


In [97]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,72355.354523,-0.229074,-0.088093,0.36114,0.077908,-0.061295,0.030916,-0.108306,-0.036909,0.14048,...,-0.038505,-0.011392,-0.005963,-0.022161,0.071842,0.071456,-0.015011,-0.020105,0.042468,86.662543
1.0,65815.268949,-5.435534,4.0008,-7.507748,4.668153,-3.834575,-1.359844,-6.326941,0.653534,-2.699718,...,0.38729,0.772642,0.009933,-0.054624,-0.082548,0.051201,0.041523,0.194965,0.060572,126.098924


In [98]:
# maintenant en fait une séparation de données en caractéristique et classe
X = new_dataset.drop(columns='Class', axis=1) # Caractéristiques
Y = new_dataset['Class'] # Classe

In [99]:
print(X) # Les caractéristiques

            Time         V1        V2        V3        V4        V5        V6  \
88022    61950.0  -1.670257  1.285661  0.161664 -0.027209  0.211349  0.546477   
193647  130227.0  -0.326123  0.415902  0.947526 -1.585367  0.278824 -0.116364   
142353   84665.0 -10.861828 -0.540573  0.126576 -1.030664  0.682637  0.923672   
170359  120144.0  -2.055088  1.083054 -1.131300 -3.466641  0.215101 -0.852692   
62560    50317.0  -0.379474  1.153319  1.062073  0.255905  1.356841 -0.330336   
...          ...        ...       ...       ...       ...       ...       ...   
219025  141565.0   0.114965  0.766762 -0.494132  0.116772  0.868169 -0.477982   
219892  141925.0   0.120301  1.974141 -0.434087  5.390793  1.289684  0.280590   
220725  142280.0  -1.169203  1.863414 -2.515135  5.463681 -0.297971  1.364918   
221018  142394.0  -3.367770  0.099249 -6.148487  3.401955  0.458307 -1.571630   
221041  142409.0  -1.172183  1.661713 -3.049637  2.555058  3.669035 -3.162998   

              V7        V8 

In [100]:
print(Y) # Les classes

88022     0.0
193647    0.0
142353    0.0
170359    0.0
62560     0.0
         ... 
219025    1.0
219892    1.0
220725    1.0
221018    1.0
221041    1.0
Name: Class, Length: 818, dtype: float64


In [101]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [102]:
print(X.shape, X_train.shape, X_test.shape)

(818, 30) (654, 30) (164, 30)


Training modèle

In [103]:
model = LogisticRegression()

In [104]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


évaluation du modèle

Accuracy Score

In [105]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [106]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9480122324159022


In [107]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [108]:
print('Accuracy score on Test data : ', test_data_accuracy)

Accuracy score on Test data :  0.9451219512195121


On a eu accuracy score proche entre training data et test data, cela signifie qu'on a eu une bonne généralisation du modèle (càd que notre modèle est un bon modèle)