Importing the Dependencies

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# Read the raw transactions CSV into a pandas DataFrame.
# NOTE(review): absolute path, presumably a Colab upload location - confirm before running elsewhere.
credit_card_data = pd.read_csv(filepath_or_buffer='/content/card_transdata.csv')

In [6]:
# peek at the first five transactions
credit_card_data.head(n=5)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [13]:
# peek at the last five transactions
credit_card_data.tail(n=5)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
96219,9.121923,5.163314,2.392847,1.0,0.0,0.0,1.0,0.0
96220,8.617792,1.541935,1.373312,1.0,0.0,0.0,0.0,0.0
96221,57.543861,4.597567,3.706197,1.0,0.0,0.0,1.0,0.0
96222,16.597802,9.05234,2.127253,1.0,0.0,0.0,0.0,0.0
96223,4.872391,1.026769,1.034156,1.0,0.0,0.0,1.0,0.0


In [12]:
# dataset information: index range, per-column non-null counts, dtypes and memory usage
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96224 entries, 0 to 96223
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   distance_from_home              96224 non-null  float64
 1   distance_from_last_transaction  96224 non-null  float64
 2   ratio_to_median_purchase_price  96224 non-null  float64
 3   repeat_retailer                 96224 non-null  float64
 4   used_chip                       96224 non-null  float64
 5   used_pin_number                 96224 non-null  float64
 6   online_order                    96224 non-null  float64
 7   fraud                           96224 non-null  float64
dtypes: float64(8)
memory usage: 6.6 MB


In [11]:
# count the missing values column by column (isna is the same check as isnull)
credit_card_data.isna().sum(axis=0)

distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
repeat_retailer                   0
used_chip                         0
used_pin_number                   0
online_order                      0
fraud                             0
dtype: int64

In [10]:
# Discard every row that contains at least one missing value.
# NOTE(review): the execution counts suggest this cell ran before the null check
# and info() cells shown above it - confirm the intended cell order.
credit_card_data = credit_card_data.dropna(axis=0, how='any')


In [14]:
# how many legitimate (0) vs. fraudulent (1) transactions are there?
credit_card_data.loc[:, 'fraud'].value_counts()

0.0    87858
1.0     8366
Name: fraud, dtype: int64

This dataset is highly imbalanced (87,858 legitimate vs. 8,366 fraudulent transactions).

0 --> Normal Transaction

1 --> fraudulent transaction

In [15]:
# split the rows by label: fraud == 0 -> legit, fraud == 1 -> fraud
legit = credit_card_data.loc[credit_card_data['fraud'].eq(0)]
fraud = credit_card_data.loc[credit_card_data['fraud'].eq(1)]

In [16]:
# (rows, columns) of each class subset, one per line
print(legit.shape, fraud.shape, sep='\n')

(87858, 8)
(8366, 8)


In [18]:
# per-class mean of every feature, to see how fraud differs from legit
credit_card_data.groupby(by='fraud').mean()

Unnamed: 0_level_0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,22.917466,4.290983,1.421893,0.882572,0.360081,0.112659,0.622482
1.0,66.476197,12.773924,5.970805,0.88011,0.255319,0.003466,0.948841


Under-Sampling

Build a sample dataset containing an equal number of normal and fraudulent transactions.

Number of Fraudulent Transactions --> 8366

In [21]:
# Under-sample the legitimate class down to the size of the fraud class.
# The size is taken from `fraud` itself instead of a hard-coded 8366, so the
# cell stays correct if the input data changes, and random_state is fixed
# (same seed as the train/test split) so a re-run draws the same rows.
legit_sample = legit.sample(n=len(fraud), random_state=2)

Concatenating two DataFrames

In [22]:
# stack the sampled legit rows on top of the fraud rows (row-wise concat)
new_dataset = pd.concat((legit_sample, fraud))

In [23]:
# first five rows of the balanced dataset
new_dataset.head(n=5)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
82038,5.310315,0.189874,0.39487,1.0,0.0,1.0,1.0,0.0
31125,42.741602,0.163824,0.408985,1.0,0.0,1.0,1.0,0.0
10445,2.569892,0.712338,1.340959,1.0,0.0,0.0,1.0,0.0
46713,41.475058,0.850411,0.295428,1.0,0.0,0.0,0.0,0.0
76222,2.822281,0.527031,1.007472,1.0,1.0,0.0,0.0,0.0


In [24]:
# last five rows of the balanced dataset
new_dataset.tail(n=5)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
96192,6.921222,13.957453,4.469542,1.0,0.0,0.0,1.0,1.0
96197,3.923266,0.025559,5.001054,1.0,1.0,0.0,1.0,1.0
96208,88.324468,1.528228,4.699893,1.0,0.0,0.0,1.0,1.0
96213,21.088694,0.247122,12.619517,1.0,1.0,0.0,1.0,1.0
96215,11.142679,0.956866,4.105859,1.0,0.0,0.0,1.0,1.0


In [25]:
# class counts after under-sampling
new_dataset.loc[:, 'fraud'].value_counts()

0.0    8366
1.0    8366
Name: fraud, dtype: int64

In [26]:
# per-class feature means on the balanced dataset
new_dataset.groupby(by='fraud').mean()

Unnamed: 0_level_0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,22.325955,4.031687,1.397465,0.885011,0.359551,0.109969,0.624313
1.0,66.476197,12.773924,5.970805,0.88011,0.255319,0.003466,0.948841


Splitting the data into Features & Targets

In [27]:
# features: every column except the label; target: the 'fraud' label column
X = new_dataset.drop(columns=['fraud'])
Y = new_dataset.loc[:, 'fraud']

In [28]:
# plain-text preview of the feature matrix (every column except 'fraud')
print(X)

       distance_from_home  distance_from_last_transaction  \
82038            5.310315                        0.189874   
31125           42.741602                        0.163824   
10445            2.569892                        0.712338   
46713           41.475058                        0.850411   
76222            2.822281                        0.527031   
...                   ...                             ...   
96192            6.921222                       13.957453   
96197            3.923266                        0.025559   
96208           88.324468                        1.528228   
96213           21.088694                        0.247122   
96215           11.142679                        0.956866   

       ratio_to_median_purchase_price  repeat_retailer  used_chip  \
82038                        0.394870              1.0        0.0   
31125                        0.408985              1.0        0.0   
10445                        1.340959              1.0      

In [29]:
# plain-text preview of the target labels (the 'fraud' column)
print(Y)

82038    0.0
31125    0.0
10445    0.0
46713    0.0
76222    0.0
        ... 
96192    1.0
96197    1.0
96208    1.0
96213    1.0
96215    1.0
Name: fraud, Length: 16732, dtype: float64


Split the data into Training data & Testing Data

In [30]:
# 80/20 split, stratified on the label so both parts keep the same class ratio;
# the seed is fixed so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [31]:
# shapes of the full, training and test feature matrices, in that order
print(*(part.shape for part in (X, X_train, X_test)))

(16732, 7) (13385, 7) (3347, 7)


Model Training

Logistic Regression

In [32]:
# Logistic regression classifier created with the library's default settings.
# NOTE(review): the features are not scaled before fitting; if fit() reports a
# convergence warning, consider scaling the inputs or raising max_iter - confirm.
model = LogisticRegression()

In [33]:
# fit the classifier on the training features and labels
model.fit(X=X_train, y=Y_train)

Model Evaluation

Accuracy Score

In [34]:
# Accuracy on the training data.
# accuracy_score's documented signature is (y_true, y_pred), so the true labels
# go first. Accuracy is symmetric, so the number itself is unchanged, but the
# correct order matters as soon as an asymmetric metric is swapped in.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [35]:
# report the fraction of training rows classified correctly
print(
    'Accuracy on Training data : ',
    training_data_accuracy,
)

Accuracy on Training data :  0.942846469929025


In [36]:
# Accuracy on the held-out test data.
# accuracy_score's documented signature is (y_true, y_pred), so the true labels
# go first. Accuracy is symmetric, so the number itself is unchanged, but the
# correct order matters as soon as an asymmetric metric is swapped in.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [37]:
# report the fraction of held-out test rows classified correctly
print(
    'Accuracy score on Test Data : ',
    test_data_accuracy,
)

Accuracy score on Test Data :  0.943232745742456
