In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the raw credit-card transactions from the CSV in the working directory.
csv_path = 'creditcard.csv'
df = pd.read_csv(csv_path)

# Per the message below, every column except Time and Amount is already PCA-transformed.
print("This is the unscaled data set. ALl are PCA scaled except time and amount.")

In [None]:
# Feature engineering: ratio of the transaction Amount to its Time value.
# Adding 1 to Time keeps the denominator non-zero for rows where Time is 0.
# NOTE: run this before the scaling cell, which overwrites Amount and Time in place.
df['amount-time'] = df['Amount'].div(df['Time'].add(1))



In [None]:
# Scaling.
# Per the earlier EDA there are no missing values, so nothing is imputed first.
#
# The scaler is fitted on the TRAINING rows only and then applied to every row.
# Fitting on the whole frame would let test-set statistics (mean / std) leak into
# the training features, because the train/test split happens after this cell.
#
# The split here is used only to pick the rows to fit on. Its arguments mirror the
# train/test split cell (test_size=0.2, stratify on Class, random_state=42) so both
# calls select the same rows -- keep the two in sync if either is changed.
scaled_columns = ['Amount', 'amount-time', 'Time']
fit_rows, holdout_rows = train_test_split(
    df[scaled_columns],
    test_size=0.2,
    stratify=df['Class'],
    random_state=42,
)

scaler = StandardScaler()
for column in scaled_columns:
    # Double brackets select a one-column DataFrame, i.e. the 2-D input the scaler expects.
    scaler.fit(fit_rows[[column]])
    # transform() returns an (n_rows, 1) array; ravel() flattens it for column assignment.
    df[column] = scaler.transform(df[[column]]).ravel()

print("Now the Time and Amount columns are also scaled.")

In [None]:
# Train / test split.
# Features are every column except the Class label.
x = df.drop(columns=['Class'])
y = df['Class']

# train_test_split returns the feature splits first and the label splits second:
# x_train, x_test, y_train, y_test.
# The data set is heavily imbalanced (fraud is the rare class), so stratify=y is used
# to keep the fraud / non-fraud proportion the same in the train and test splits.
# (These notes are comments rather than a trailing string literal so that Jupyter
# does not echo them as the cell's output.)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# SMOTE (Synthetic Minority Over-sampling Technique): balances the classes by
# creating new synthetic samples of the minority (fraud) class.
from imblearn.over_sampling import SMOTE

# A fixed random_state makes the synthetic samples reproducible.
smote = SMOTE(random_state=42)

# fit_resample learns the minority-class feature space and then generates new
# minority samples; it is applied to the training split only.
x_train_over, y_train_over = smote.fit_resample(x_train, y_train)

# Class counts before and after resampling.
for caption, labels in (("before smot", y_train), ("after smot", y_train_over)):
    print(caption, labels.value_counts())


In [None]:
# Logistic regression trained on the SMOTE-oversampled training data.
# max_iter caps how many iterations the solver may run while fitting.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(x_train_over, y_train_over)

# Predictions are made on the untouched test split.
y_pred = model.predict(x_test)
print("Logistic Regression Model")

# Evaluation: per-class precision / recall / f1 plus the raw confusion matrix.
print("Classification Report:")
cr = classification_report(y_test, y_pred)
print(cr)

print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Confusion-matrix heatmap.
#   annot=True  -> write the count inside each box
#   fmt='d'     -> format those counts as integers
#   tick labels -> name the rows (actual) and columns (predicted)
class_names = ['Not Fraud', 'Fraud']
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Random forest trained on the SMOTE-oversampled training data.
rf_smote = RandomForestClassifier(random_state=42)
rf_smote.fit(x_train_over, y_train_over)

# Predict on the untouched test split.
y_pred_rf_smote = rf_smote.predict(x_test)

print("Random Forest Classifier Model with SMOTE\n")

# Evaluation: confusion matrix first, then the per-class report.
cm_rf_smote = confusion_matrix(y_test, y_pred_rf_smote)
print("Confusion Matrix for Random Forest of SMOTE:")
print(cm_rf_smote)

cr_rf_smote = classification_report(y_test, y_pred_rf_smote)
print("\nClassification Report for Random Forest of SMOTE:")
print(cr_rf_smote)


In [None]:
# Random under-sampling: shrinks the majority class in the training data.
from imblearn.under_sampling import RandomUnderSampler

# Fixed random_state so the same majority rows are kept on every run.
rus = RandomUnderSampler(random_state=42)
x_train_under, y_train_under = rus.fit_resample(x_train, y_train)

# Class counts before and after resampling.
for caption, labels in (
    ("before under sampling", y_train),
    ("after under sampling", y_train_under),
):
    print(caption, labels.value_counts())

In [None]:
# Logistic regression trained on the under-sampled training data.
modell = LogisticRegression(max_iter=1000, random_state=42)
modell.fit(x_train_under, y_train_under)

# Predict on the untouched test split.
y_pred_under = modell.predict(x_test)

print("\nLogistic Regression Model with Under Sampling")

# Evaluation. Note that cm and cr overwrite the values computed for the
# SMOTE logistic-regression cell.
cm = confusion_matrix(y_test, y_pred_under)
print("Confusion Matrix:")
print(cm)

cr = classification_report(y_test, y_pred_under)
print("\nClassification Report:")
print(cr)

In [None]:
# Random forest classifier trained on the under-sampled training data.
print('\nNow Lets do Random Forest Classifier for undersampling.')

rf = RandomForestClassifier(random_state=42)
rf.fit(x_train_under, y_train_under)

# Predict on the untouched test split.
y_pred_rf = rf.predict(x_test)

# Evaluation: confusion matrix first, then the per-class report.
cm_rf = confusion_matrix(y_test, y_pred_rf)
print("\nConfusion Matrix for Random Forest of undersampling:")
print(cm_rf)

cr_rf = classification_report(y_test, y_pred_rf)
print("\nClassification Report for Random Forest of undersampling:")
print(cr_rf)

In [None]:
# SMOTETomek and SMOTEENN: combined over- and under-sampling.
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN

# Both resamplers are run on a fixed-size random subsample of the training split
# (not on the whole of it). Labels are looked up by the sampled rows' index so that
# features and labels stay aligned.
sample = 10000
x_Small = x_train.sample(n=sample, random_state=42)
y_Small = y_train.loc[x_Small.index]

# SMOTETomek -- applied to training data only.
smT = SMOTETomek(random_state=42)
x_st, y_St = smT.fit_resample(x_Small, y_Small)
# "before" must report the subsample that was actually resampled (y_Small);
# printing y_train here would compare against data the resampler never saw.
print("before smote tomek", y_Small.value_counts())
print("after smote tomek", y_St.value_counts())

# SMOTEENN -- same subsample, same reporting.
smenn = SMOTEENN(random_state=42)
x_st_enn, y_st_enn = smenn.fit_resample(x_Small, y_Small)
print("before smote enn", y_Small.value_counts())
print("after smote enn", y_st_enn.value_counts())


In [None]:
# Random forest classifiers for the SMOTETomek and SMOTEENN training sets.

# --- Random forest on the SMOTETomek data ---------------------------------
# Note: rf overwrites the under-sampling random forest defined in an earlier cell.
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
rf.fit(x_st, y_St)

# Lowered decision threshold: a row is flagged as fraud when the predicted
# probability of class 1 is at least `threshold`.
y_prob_St = rf.predict_proba(x_test)[:, 1]
threshold = 0.3
y_pred_rf = (y_prob_St >= threshold).astype(int)

print("Random Forest Classifier smote tomek by reducing threshold.\n")
print('\n')

# Evaluation. The first header names the classification report that follows it
# (it previously read "Confusion report").
print("Classification report for the random forest classifier")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix for Random Forest Classifier")
cm_rf = confusion_matrix(y_test, y_pred_rf)
print("rf", cm_rf)


# --- Random forest on the SMOTEENN data, tuned with GridSearchCV ----------
print('\n Random forest for the smoteen')

# Hyper-parameter grid searched below.
param = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],  # None means no limit on tree depth
    'min_samples_split': [2, 5],
}

# NOTE(review): the CV folds are drawn from data that has already been resampled,
# so best_score_ is probably optimistic -- consider resampling inside each fold
# (e.g. an imblearn Pipeline) and confirm.
grid_rf_enn = GridSearchCV(
    # A fresh, unfitted estimator with the same settings as rf above.
    estimator=RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_grid=param,
    scoring='recall',  # select the parameters that maximise recall (key for fraud detection)
    cv=3,
    n_jobs=1,   # runs the search in a single job; n_jobs=-1 would use all CPU cores
    verbose=1,  # print progress while the parameter combinations are evaluated
)

# Fit the search on the SMOTEENN-resampled data.
grid_rf_enn.fit(x_st_enn, y_st_enn)

# Best model found by the search.
best_rf_enn = grid_rf_enn.best_estimator_

# Same lowered threshold as for the SMOTETomek model above.
y_prob_st_enn = best_rf_enn.predict_proba(x_test)[:, 1]
y_pred_rf_enn = (y_prob_st_enn >= threshold).astype(int)

# Search results.
print("GridSearchCv results: \n")
print("best parameters:", grid_rf_enn.best_params_)
print("best cv recall score: ", grid_rf_enn.best_score_)

# Test-set evaluation of the tuned model.
print("classification report for Random Forest Classifier with SMOTEENN")
print(classification_report(y_test, y_pred_rf_enn))

print("Confusion Matrix for Random Forest Classifier with SMOTEENN")
cm_rf_enn = confusion_matrix(y_test, y_pred_rf_enn)
print("rf enn", cm_rf_enn)



In [None]:
# XGBoost models, tried as a stronger learner to raise recall.
import xgboost as xgb

# Settings shared by both XGBoost models below.
# NOTE(review): use_label_encoder appears to be deprecated in newer xgboost
# releases -- confirm against the installed version.
xgb_settings = dict(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Lowered decision threshold applied to both models' class-1 probabilities.
threshold_xgb = 0.3

# --- XGBoost on the SMOTETomek data ----------------------------------------
xgb_model = xgb.XGBClassifier(**xgb_settings)
xgb_model.fit(x_st, y_St)

y_prob_xgb = xgb_model.predict_proba(x_test)[:, 1]
y_pred_xgb = (y_prob_xgb >= threshold_xgb).astype(int)

print("\nXGBoost Classifier Model with SMOTE and Tomek Links\n")

cm_xgb = confusion_matrix(y_test, y_pred_xgb)
print("Confusion Matrix for XGBoost Classifier with SMOTE and Tomek Links:\n")
print(cm_xgb)

cr_xgb = classification_report(y_test, y_pred_xgb)
print("\nClassification Report for XGBoost Classifier with SMOTE and Tomek Links:")
print(cr_xgb)


# --- XGBoost on the SMOTEENN data ------------------------------------------
xgb_model_enn = xgb.XGBClassifier(**xgb_settings)
xgb_model_enn.fit(x_st_enn, y_st_enn)

y_prob_xgb_enn = xgb_model_enn.predict_proba(x_test)[:, 1]
y_pred_xgb_enn = (y_prob_xgb_enn >= threshold_xgb).astype(int)

print("\nXGBoost Classifier Model with SMOTEENN\n")

cm_xgb_enn = confusion_matrix(y_test, y_pred_xgb_enn)
print("Confusion Matrix for XGBoost Classifier with SMOTEENN:\n")
print(cm_xgb_enn)

cr_xgb_enn = classification_report(y_test, y_pred_xgb_enn)
print("\nClassification Report for XGBoost Classifier with SMOTEENN:")
print(cr_xgb_enn)

