In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Load the credit card transaction dataset
data = pd.read_csv('creditcard.csv')

In [3]:
# Explore the dataset
data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [4]:
data.shape

(284807, 31)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [6]:
# checking the number of missing values in each column
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [7]:
# distribution of legit transactions & genuine transactions
data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [8]:
# separating the data for analysis
genuine = data[data.Class == 0]
fraud = data[data.Class == 1]
     
print(genuine.shape)
print(fraud.shape)
     

(284315, 31)
(492, 31)


In [9]:
# statistical measures of the data
genuine.Amount.describe()
     

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [10]:
# Check for class imbalance
fraudulent_count = data['Class'].value_counts()
print("Class Distribution:\n", fraudulent_count)


Class Distribution:
 0    284315
1       492
Name: Class, dtype: int64


In [11]:
# Calculate the mean values for each feature, grouped by the 'Class' (fraudulent or genuine)
class_means = data.groupby('Class').mean()

In [12]:
# Calculate the difference in mean values between the two classes
mean_diff = class_means.diff().iloc[1]

In [13]:
# Check if the mean differences are significantly different
if (mean_diff > 0).any():
    print("The dataset may require undersampling (Class 0 has larger mean values).")
elif (mean_diff < 0).any():
    print("The dataset may require oversampling (Class 1 has larger mean values).")
else:
    print("The mean values for both classes are similar, no resampling may be needed.")

The dataset may require undersampling (Class 0 has larger mean values).


In [14]:
# Preprocessing and Feature Scaling
X = data.drop('Class', axis=1)  # Features
y = data['Class']  # Target variable

In [15]:
# Count class distribution before resampling
class_distribution_before = y.value_counts()

In [16]:
# Apply RandomUnderSampler for undersampling
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

In [17]:
# Count class distribution after resampling
class_distribution_after = y_resampled.value_counts()

# Compare class distributions
print("Class Distribution Before Resampling:\n", class_distribution_before)
print("\nClass Distribution After Resampling:\n", class_distribution_after)

Class Distribution Before Resampling:
 0    284315
1       492
Name: Class, dtype: int64

Class Distribution After Resampling:
 0    492
1    492
Name: Class, dtype: int64


In [18]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [19]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

In [21]:
# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

# Create a confusion matrix
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)

Precision: 0.8636363636363636
Recall: 0.5816326530612245
F1-Score: 0.6951219512195121
Confusion Matrix:
[[56855     9]
 [   41    57]]


In [22]:
# accuracy on training data
from sklearn.metrics import accuracy_score
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)
print(f'Accuracy: { training_data_accuracy * 100:.2f}%')

Accuracy: 99.92%


In [23]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)
print(f'Accuracy: { test_data_accuracy * 100:.2f}%')

Accuracy: 99.91%
