In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [31]:
# Step 1: Load and Explore Data
# Load the credit card fraud dataset
data = pd.read_csv('/content/creditcard.csv')

In [32]:
# Step 2: Feature Selection and Data Splitting
X = data.drop(['Time', 'Class'], axis=1)
y = data['Class']


In [33]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
# Step 3: Data Normalization
scaler = StandardScaler()

# Check if 'Time' and 'Amount' columns exist in the dataset before applying normalization
if 'Time' in X_train.columns and 'Amount' in X_train.columns:
    X_train[['Time', 'Amount']] = scaler.fit_transform(X_train[['Time', 'Amount']])
    X_test[['Time', 'Amount']] = scaler.transform(X_test[['Time', 'Amount']])
else:
    print("The 'Time' and 'Amount' columns are not present in the dataset.")

The 'Time' and 'Amount' columns are not present in the dataset.


In [35]:
# Step 4: Handle Class Imbalance
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

In [36]:
# Step 5: Model Selection and Training
model = LogisticRegression(random_state=42, solver='liblinear')
model.fit(X_train_resampled, y_train_resampled)

In [37]:
from sklearn.impute import SimpleImputer

# Step 6: Data Imputation for Missing Values
imputer = SimpleImputer(strategy='mean')
X_test_imputed = imputer.fit_transform(X_test)

# Step 7: Model Evaluation
y_pred = model.predict(X_test_imputed)



In [38]:
# Step 8: Preprocess y_test to remove missing labels
y_test = y_test[~np.isnan(y_test)]

In [39]:
# Inspect the columns
print(data.columns)

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')


In [40]:

y_pred = model.predict(X_test[~np.isnan(X_test['Amount'])])

In [41]:
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)


In [42]:
print("Confusion Matrix:\n", confusion)
print("\nClassification Report:\n", report)

Confusion Matrix:
 [[55488  1376]
 [    8    90]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     56864
           1       0.06      0.92      0.12        98

    accuracy                           0.98     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.98      0.99     56962

