In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = '/content/creditcard.csv'
df = pd.read_csv(file_path)

In [None]:
# Check for missing values
print("Missing values:\n", df.isnull().sum())

# Drop rows where 'Class' is NaN
df = df.dropna(subset=['Class'])

# Fill or handle any other NaN values in features if necessary
# For this example, we'll drop rows with NaN values in any feature column
df = df.dropna()

# Separate features and target variable
X = df.drop('Class', axis=1)
y = df['Class']

Missing values:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [None]:
# Normalize the 'Time' and 'Amount' columns
scaler = StandardScaler()
X[['Time', 'Amount']] = scaler.fit_transform(X[['Time', 'Amount']])

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Resample the data
X_resampled, y_resampled = smote.fit_resample(X, y)

print("Original dataset shape:", X.shape, y.shape)
print("Resampled dataset shape:", X_resampled.shape, y_resampled.shape)

Original dataset shape: (67433, 30) (67433,)
Resampled dataset shape: (134528, 30) (134528,)


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Random Forest Classifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Logistic Regression Evaluation
y_pred_log_reg = log_reg.predict(X_test)
print("Logistic Regression Evaluation")
print(confusion_matrix(y_test, y_pred_log_reg))
print(classification_report(y_test, y_pred_log_reg))

# Random Forest Evaluation
y_pred_rf_clf = rf_clf.predict(X_test)
print("Random Forest Evaluation")
print(confusion_matrix(y_test, y_pred_rf_clf))
print(classification_report(y_test, y_pred_rf_clf))


Logistic Regression Evaluation
[[13231   240]
 [  410 13025]]
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.98     13471
         1.0       0.98      0.97      0.98     13435

    accuracy                           0.98     26906
   macro avg       0.98      0.98      0.98     26906
weighted avg       0.98      0.98      0.98     26906

Random Forest Evaluation
[[13467     4]
 [    1 13434]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     13471
         1.0       1.00      1.00      1.00     13435

    accuracy                           1.00     26906
   macro avg       1.00      1.00      1.00     26906
weighted avg       1.00      1.00      1.00     26906



In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Undersampling the majority class
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Train-test split again
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Re-train the model with undersampled data
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
print("Logistic Regression with Undersampling Evaluation")
print(confusion_matrix(y_test, y_pred_log_reg))
print(classification_report(y_test, y_pred_log_reg))


Logistic Regression with Undersampling Evaluation
[[35  3]
 [ 3 27]]
              precision    recall  f1-score   support

         0.0       0.92      0.92      0.92        38
         1.0       0.90      0.90      0.90        30

    accuracy                           0.91        68
   macro avg       0.91      0.91      0.91        68
weighted avg       0.91      0.91      0.91        68

