**1. Loading the dataset**

In [None]:
import pandas as pd # For data manipulation

# Read the credit-card transactions CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/creditcard.csv')

**Inspecting the dataset**

In [None]:
# Summarise the dataset's structure: column names, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Left as the cell's last expression so Jupyter renders it as a table rather than
# the line-wrapped plain-text form that print() produces.
df.describe()

                Time            V1            V2            V3            V4  \
count  284807.000000  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean    94813.859575  1.168375e-15  3.416908e-16 -1.379537e-15  2.074095e-15   
std     47488.145955  1.958696e+00  1.651309e+00  1.516255e+00  1.415869e+00   
min         0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00   
25%     54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01   
50%     84692.000000  1.810880e-02  6.548556e-02  1.798463e-01 -1.984653e-02   
75%    139320.500000  1.315642e+00  8.037239e-01  1.027196e+00  7.433413e-01   
max    172792.000000  2.454930e+00  2.205773e+01  9.382558e+00  1.687534e+01   

                 V5            V6            V7            V8            V9  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean   9.604066e-16  1.487313e-15 -5.556467e-16  1.213481e-16 -2.406331e-15   
std    1.380247e+00  1.332271e+00  1.23709

**2. Data Preprocessing**

In [None]:
# Count the rows in each class to see how skewed the target is
print("Class distribution before preprocessing:", df['Class'].value_counts(), sep="\n")

Class distribution before preprocessing:
Class
0    284315
1       492
Name: count, dtype: int64


**Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler  # For feature scaling

# Initialise the scaler
scaler = StandardScaler()

# Standardise the 'Amount' and 'Time' columns to bring them to a common scale.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaled features -- consider
# fitting on the training split only.
df['Scaled_Amount'] = scaler.fit_transform(df[['Amount']])  # Scale the 'Amount' feature
df['Scaled_Time'] = scaler.fit_transform(df[['Time']])  # Scale the 'Time' feature

# Drop the original 'Amount' and 'Time' columns now that scaled versions exist.
# DataFrame.drop returns a new frame instead of modifying df in place, so the
# result has to be assigned back; otherwise the raw columns stay in df and end
# up in the feature matrix next to their scaled copies.
# NOTE: this cell consumes 'Amount' and 'Time', so re-running it requires
# reloading df first.
df = df.drop(columns=['Amount', 'Time'])

# Preview the first rows rather than rendering the whole frame
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,Scaled_Amount,Scaled_Time
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,0.244964,-1.996583
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,0,-0.342475,-1.996583
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,1.160686,-1.996562
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,0.140534,-1.996562
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,0,-0.073403,-1.996541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0,-0.350151,1.641931
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,0,-0.254117,1.641952
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,0,-0.081839,1.641974
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,0,-0.313249,1.641974


**Splitting the data into training and testing sets**

In [None]:
from sklearn.model_selection import train_test_split  # For splitting into training and testing set

# Separate the features (X) from the target variable (y)
X = df.drop(columns='Class')  # X: every column except 'Class' (the features)
y = df['Class']  # y: the 'Class' column (target variable: 0 = legitimate, 1 = fraud)

# Hold out 20% of the rows for testing and train on the remaining 80%.
# stratify=y keeps the fraud ratio the same in both splits; with so few positive
# rows, a purely random split can leave either split with a skewed share of them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

**Handling imbalanced data**

**1. Under-Sampling: Reducing the number of legitimate transactions to match the number of fraudulent transactions**

In [None]:
from imblearn.under_sampling import RandomUnderSampler # For Under-sampling

# Balance the training split by randomly under-sampling the legitimate
# transactions (seed fixed so the same rows are kept on every run)
under_sampler = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

# Confirm the resampled training labels are now balanced
print('Class distribution after under-sampling:', pd.Series(y_train_under).value_counts(), sep='\n')

Class distribution after under-sampling:
Class
0    394
1    394
Name: count, dtype: int64


**2. Over-Sampling: Generating synthetic fraudulent transactions to match the number of legitimate transactions using SMOTE (Synthetic Minority Over-sampling Technique), which interpolates between existing minority samples rather than duplicating them**

In [None]:
from imblearn.over_sampling import SMOTE # For over-sampling

# Over-sample the minority class of the training split with SMOTE (fixed seed)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Compare the label counts before and after resampling
print('Class distribution before SMOTE:', y_train.value_counts())
print('Class distribution after SMOTE:', pd.Series(y_train_smote).value_counts(), sep='\n')

Class distribution before SMOTE: Class
0    227451
1       394
Name: count, dtype: int64
Class distribution after SMOTE:
Class
0    227451
1    227451
Name: count, dtype: int64


**3. Class Weights:- Class Weights in Logistic Regression (Used later during model training)**

**3. Model Training using Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression # For Logistic Regression model
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Build a class-weighted logistic regression and fit it on the original
# (non-resampled) training split in one step; fit() returns the estimator itself
log_reg = LogisticRegression(class_weight='balanced', random_state=42).fit(X_train, y_train)

# Predict class labels for the held-out test set
y_pred = log_reg.predict(X_test)

**4. Evaluation**

**Classification Report: Provides a detailed report of the model’s performance, including precision, recall, and F1-score.**

In [None]:
# Per-class precision, recall and F1-score on the test set
print("Logistic Regression Classification Report:", classification_report(y_true=y_test, y_pred=y_pred), sep="\n")

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56864
           1       0.06      0.93      0.11        98

    accuracy                           0.97     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.97      0.99     56962



**Confusion Matrix: Shows the counts of correct and incorrect predictions for each class, which helps visualize the performance of the classification model.**

In [None]:
# Rows are the actual classes, columns are the predicted classes
print("Confusion Matrix:", confusion_matrix(y_true=y_test, y_pred=y_pred), sep="\n")

Confusion Matrix:
[[55423  1441]
 [    7    91]]


**ROC-AUC Score: Calculates the area under the ROC curve, which measures the model’s ability to distinguish between the classes.**


In [None]:
# Score the predicted probability of the positive class (column 1) against the true labels
roc_auc = roc_auc_score(y_true=y_test, y_score=log_reg.predict_proba(X_test)[:, 1])
print(f"Logistic Regression AUC_ ROC:{roc_auc}")

Logistic Regression AUC_ ROC:0.9823244576389925
