In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

The Dataset used in this Documentation is available on kaggle. Link:https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud

In [2]:
cc_data = pd.read_csv('creditcard.csv')
cc_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
cc_data.shape

(284807, 31)

In [4]:
cc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [5]:
#Checking is there any null values in the dataset.
cc_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
#Finding out the number of Fraud and Not Fraud Transaction in the dataset.
cc_data['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

Classifying Fraud and Not Fraud Transaction.
If (Class==0) Transaction is Not Fraud Transaction.
If (Class==1) Transaction is Fraud Transaction.

In [7]:
Not_Fraud=cc_data[cc_data.Class==0]
Fraud= cc_data[cc_data.Class==1]

In [8]:
Not_Fraud.shape

(284315, 31)

In [9]:
Fraud.shape

(492, 31)

Here in this Section new dataset used train the models.  Where Not_Fraud 1000 data sample and Fraud 492(All) data sample are taken.

In [10]:
Sample_Not_Fraud= Not_Fraud.sample(n=1000)

In [11]:
new_dataset=pd.concat([Sample_Not_Fraud,Fraud], axis=0)

In [12]:
new_dataset

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
176869,122946.0,2.178873,-1.831393,-0.103750,-1.527062,-1.965567,-0.024450,-1.948237,0.185846,-0.390533,...,-0.102893,0.060853,0.370048,0.600208,-0.613641,-0.255657,0.048023,-0.021005,44.91,0
268862,163415.0,0.121484,0.932940,-0.545011,-0.745542,1.132315,-0.237121,0.810265,0.115323,-0.203780,...,-0.314623,-0.808472,0.048673,0.102885,-0.436162,0.124102,0.218187,0.068752,1.98,0
30937,36145.0,-1.663711,-0.698590,1.516574,1.579598,0.142501,-0.740503,0.986895,-0.085230,-0.850870,...,0.379549,0.296816,0.490967,0.344873,0.609362,-0.107302,-0.053919,0.119883,292.64,0
240784,150750.0,2.045637,0.302323,-1.717522,0.545435,0.270031,-1.518286,0.367206,-0.465299,0.409318,...,0.207374,0.820172,-0.040007,0.005915,0.289468,-0.108479,0.005434,-0.025248,9.19,0
78041,57340.0,0.749273,-1.675480,0.646818,-0.453367,-1.858335,-0.536703,-0.676041,0.044141,-0.601833,...,0.351223,0.147487,-0.187921,0.527364,0.052592,-0.355261,-0.038771,0.067757,307.80,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00,1
280143,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,...,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,...,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00,1


In [13]:
new_dataset.shape

(1492, 31)

In [14]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Logistics Regression 

In [16]:
model_Log=LogisticRegression(max_iter=1000)
# training the Logistic Regression Model with Training Data
model_Log.fit(X_train, Y_train)

# accuracy on training data
X_train_prediction = model_Log.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy on Training data : ', training_data_accuracy)

# accuracy on test data
X_test_prediction = model_Log.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy on Training data :  0.9522212908633697
Accuracy score on Test Data :  0.9765886287625418


# Support Vector Machine (SVM)

In [17]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# Create an SVM classifier
svm_model = SVC(kernel='linear')  

# Train the SVM model
svm_model.fit(X_train, Y_train)

# Make predictions on training data
y_train_pred = svm_model.predict(X_train)
train_accuracy = accuracy_score(Y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy}")

# Make predictions on test data
y_test_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(Y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

Training Accuracy: 0.9170159262363788
Test Accuracy: 0.9364548494983278


# Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42) 

# Train the Random Forest model
rf_model.fit(X_train, Y_train)

# Make predictions on training data
y_train_pred = rf_model.predict(X_train)
train_accuracy = accuracy_score(Y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy}")

# Make predictions on test data
y_test_pred = rf_model.predict(X_test)
test_accuracy = accuracy_score(Y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

Training Accuracy: 1.0
Test Accuracy: 0.9698996655518395


# Neural Network

In [21]:
import tensorflow as tf


model_nn = tf.keras.Sequential([
  tf.keras.layers.Dense(10, activation="relu", input_shape=(X_train.shape[1],)),  # First hidden layer with 10 units and ReLU activation
  tf.keras.layers.Dense(1, activation="sigmoid")  # Output layer with sigmoid activation for binary classification
])

# Compile the model
model_nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
model_nn.fit(X_train, Y_train, epochs=30)

# Use the model for prediction
predictions = model_nn.predict(X_test)  

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In this section, the whole Dataset used to train and test different models. Where Not_Fraud 284315 data sample and Fraud 492 data sample are taken. 

In [22]:
A = cc_data.drop(columns='Class', axis=1)
B = cc_data['Class']

In [23]:
A_train, A_test, B_train, B_test = train_test_split(A, B, test_size=0.2, stratify=B, random_state=2)

# Logistics Regression 

In [24]:
model_Log=LogisticRegression(max_iter=1000)
# training the Logistic Regression Model with Training Data
model_Log.fit(A_train, B_train)

# accuracy on training data
A_train_prediction = model_Log.predict(A_train)
training_data_accuracy = accuracy_score(A_train_prediction, B_train)
print('Accuracy on Training data : ', training_data_accuracy)

# accuracy on test data
A_test_prediction = model_Log.predict(A_test)
test_data_accuracy = accuracy_score(A_test_prediction, B_test)
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy on Training data :  0.999188044503939
Accuracy score on Test Data :  0.9993153330290369


# Support Vector Machine (SVM)

In [25]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Create an SVM classifier
svm_model = SVC(kernel='linear')  

# Train the SVM model
svm_model.fit(A_train, B_train)

# Make predictions on training data
y_train_pred = svm_model.predict(A_train)
train_accuracy = accuracy_score(B_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy}")

# Make predictions on test data
y_test_pred = svm_model.predict(A_test)
test_accuracy = accuracy_score(B_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

Training Accuracy: 0.9986657596172837
Test Accuracy: 0.998735999438222


# Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Create a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest model
rf_model.fit(A_train, B_train)

# Make predictions on training data
y_train_pred = rf_model.predict(A_train)
train_accuracy = accuracy_score(B_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy}")

# Make predictions on test data
y_test_pred = rf_model.predict(A_test)
test_accuracy = accuracy_score(B_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy}")

Training Accuracy: 1.0
Test Accuracy: 0.9995435553526912


# Neural Network

In [27]:
import tensorflow as tf


model_nn = tf.keras.Sequential([
  tf.keras.layers.Dense(10, activation="relu", input_shape=(X_train.shape[1],)),  # First hidden layer with 10 units and ReLU activation
  tf.keras.layers.Dense(1, activation="sigmoid")  # Output layer with sigmoid activation for binary classification
])

# Compile the model
model_nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
model_nn.fit(A_train, B_train, epochs=10)

# Use the model for prediction
predictions = model_nn.predict(A_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
