In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Load the dataset
# Assuming the dataset is in a CSV file called 'creditcard.csv'
data = pd.read_csv('/creditcard.csv')

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Check for missing values in the target variable and handle them (e.g., remove rows with missing values)
y = y.dropna()
X = X.loc[y.index]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle class imbalance using SMOTE (oversampling the minority class)
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)


# Train a Logistic Regression model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_res, y_train_res)

# Train a Random Forest Classifier model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_res, y_train_res)

# Evaluate the models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(f'Precision: {precision_score(y_test, y_pred)}')
    print(f'Recall: {recall_score(y_test, y_pred)}')
    print(f'F1-Score: {f1_score(y_test, y_pred)}')
    print(classification_report(y_test, y_pred))

print("Logistic Regression Model Performance:")
evaluate_model(lr_model, X_test_scaled, y_test)

print("Random Forest Model Performance:")
evaluate_model(rf_model, X_test_scaled, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Model Performance:
Precision: 0.1165644171779141
Recall: 0.9047619047619048
F1-Score: 0.20652173913043476
              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99      7920
         1.0       0.12      0.90      0.21        21

    accuracy                           0.98      7941
   macro avg       0.56      0.94      0.60      7941
weighted avg       1.00      0.98      0.99      7941

Random Forest Model Performance:
Precision: 0.95
Recall: 0.9047619047619048
F1-Score: 0.9268292682926829
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      7920
         1.0       0.95      0.90      0.93        21

    accuracy                           1.00      7941
   macro avg       0.97      0.95      0.96      7941
weighted avg       1.00      1.00      1.00      7941

