In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns


In [None]:
# Read the credit-card CSV that sits next to the notebook and preview
# the first rows as the cell output.
DATA_PATH = './creditcard.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print pandas' concise summary of `df` (DataFrame.info).
df.info()

In [None]:
# Display pandas' descriptive statistics for `df` (DataFrame.describe).
df.describe()

In [None]:
# Count missing entries over the whole frame: per-column totals first,
# then the grand total. `isnull` is pandas' alias of `isna`, so both
# figures are expected to match.
nan_per_column = df.isna().sum()
null_per_column = df.isnull().sum()

number_of_nan_values = nan_per_column.sum()
number_of_null_values = null_per_column.sum()

print(f'NaN count: {number_of_nan_values}\nNull count: {number_of_null_values}')

In [None]:
# Bar chart of the share (in percent) of rows per `Class` value,
# written to the file 'class_count' via the current pyplot figure.
sns.set_style(style="whitegrid")
class_count_axes = sns.countplot(x='Class', data=df, stat='percent')
plt.savefig('class_count')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fixed seed so the split (and every score computed from it) is reproducible
# under Restart Kernel -> Run All.
RANDOM_STATE = 42

# Features are every column except the last; the target is the `Class` column.
# NOTE(review): this assumes `Class` is the last column of the CSV - confirm.
X = df[df.columns[:-1]].values
y = df['Class'].values

# Split BEFORE scaling so that no statistic of the held-out rows influences
# preprocessing. `stratify=y` keeps the class proportions equal in both
# parts, so the held-out set cannot end up with a very different share of
# positives by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Learn mean/variance on the training rows only, then apply the same
# transform to the test rows. The transformed arrays are assigned explicitly
# instead of relying on in-place mutation of the input array.
std_scaler = StandardScaler()
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Class 1 errors are penalised ten times as heavily as class 0 errors.
weights = {0: 1, 1: 10}

# `fit` returns the estimator itself, so construction and training can be
# chained; the bare name on the last line keeps the estimator as cell output.
weighted_log_reg = LogisticRegression(class_weight=weights).fit(X_train, y_train)
weighted_log_reg


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Predictions are made on the TRAINING rows, so this matrix describes the fit
# on seen data, not generalisation performance.
y_predicted = weighted_log_reg.predict(X_train)
conf_matrix = confusion_matrix(y_true=y_train, y_pred=y_predicted, labels=weighted_log_reg.classes_)

# Express each cell as a fraction of all predicted samples.
conf_matrix_fraction = conf_matrix / len(y_predicted)

heatmap_axes = sns.heatmap(conf_matrix_fraction, cmap='coolwarm', annot=True, fmt='0.2%', linewidths=1)
# confusion_matrix puts true labels on rows and predicted labels on columns;
# label the axes so the figure can be read on its own.
heatmap_axes.set(
    xlabel='Predicted class',
    ylabel='True class',
    title='Confusion matrix on training data (before grid search)',
)
plt.savefig('corelation_matrix_before_grid_search')

In [None]:
from sklearn.model_selection import cross_val_score

# Single source of truth for the fold count: it is passed to cross_val_score
# and the averages below are taken with `.mean()`, so changing it can never
# leave a stale hard-coded divisor behind.
cv_folds_before_grid_search = 3

# Per-fold F1 and recall of the hand-weighted model on the training data.
F1_score_before_grid_search = cross_val_score(
    estimator=weighted_log_reg, X=X_train, y=y_train,
    scoring='f1', cv=cv_folds_before_grid_search, n_jobs=-1,
)
recall_score_before_grid_search = cross_val_score(
    estimator=weighted_log_reg, X=X_train, y=y_train,
    scoring='recall', cv=cv_folds_before_grid_search, n_jobs=-1,
)
print(f"""F1_score_before_grid_search: {F1_score_before_grid_search}    average: {F1_score_before_grid_search.mean()}
recall_score_before_grid_search: {recall_score_before_grid_search}    average: {recall_score_before_grid_search.mean()}""")

In [None]:
# Display the parameters currently set on the weighted logistic regression.
weighted_log_reg.get_params()

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Search settings.
grid_cv = 3
n_jobs = -1
scoring = 'f1'

# 500 evenly spaced candidate weights for class 1 between 1 and 100;
# class 0 always keeps weight 1.
positive_class_weights = np.linspace(1, 100, 500)
param_grid = {
    'class_weight': [{0: 1, 1: candidate} for candidate in positive_class_weights]
}

# One fit per candidate and fold (500 x 3), scored by F1.
grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring=scoring,
    cv=grid_cv,
    n_jobs=n_jobs,
)
grid_search.fit(X_train, y_train)

In [None]:
# Estimator configured with the best class weights found by the grid search.
best_estimator = grid_search.best_estimator_

# The fold count is stated explicitly (5 is what cross_val_score uses when
# `cv` is omitted) and the average is taken with `.mean()`, so the divisor
# can never drift away from the number of folds actually run.
# NOTE(review): the pre-grid-search scores in this notebook use cv=3, so the
# two sets of averages are not computed on the same splits - consider aligning.
cv_folds_after_grid_search = 5

f1_after_grid_search = cross_val_score(
    estimator=best_estimator, X=X_train, y=y_train,
    scoring='f1', cv=cv_folds_after_grid_search,
).mean()
recall_after_grid_search = cross_val_score(
    estimator=best_estimator, X=X_train, y=y_train,
    scoring='recall', cv=cv_folds_after_grid_search,
).mean()

print(f"""f1_after_grid_search: {f1_after_grid_search}
recal_after_grid_search: {recall_after_grid_search}""")

In [None]:
from sklearn.metrics import f1_score, recall_score

# Evaluate the model that was trained on the training set against the
# held-out rows. cross_val_score would instead clone the estimator and re-fit
# it on folds of the test data, which scores freshly trained small models
# rather than the selected one.
y_test_predicted = best_estimator.predict(X_test)

f1_test = f1_score(y_test, y_test_predicted)
recall_test = recall_score(y_test, y_test_predicted)

print(f"""f1_test: {f1_test}
recall_test: {recall_test}""")