In [None]:
import numpy as np
import pandas as pd
import logisticRegression as lgr
import importlib
importlib.reload(lgr)

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import confusion_matrix,f1_score, roc_auc_score, average_precision_score, precision_score,recall_score 




In [None]:
# drop the duplicated rows
def drop_duplicates(dataframe):
    dataframe.drop_duplicates(inplace=True)
    return dataframe

# separate the data frame into features and target
def create_feature_target(dataframe, column=''):
    features = dataframe.drop(column, axis=1)
    target = dataframe[column]
    return features, target

# label encode the target
def label_encoding_target_helper(target, cols=[]):
    le = LabelEncoder()
    target = le.fit_transform(target.values.ravel())
    target = pd.DataFrame(target, columns=cols)
    return target

# label encode the feature
def label_encoding_features_helper(features):
    # Label Encode binary columns
    binary_cols = [c for c in features.columns if features[c].dtype not in ['int64', 'float64'] and features[c].nunique() == 2]
    binary_cols
    le = LabelEncoder()

    for c in binary_cols:
        features[c] = le.fit_transform(features[c])

    return features

# one hot encoding
def one_hot_helper(features):
    # OneHot encoding
    object_cols = features.select_dtypes(include=['object']).columns


    # chage the datatype to category
    for c in object_cols:
        features[c] = features[c].astype('category')

    features = pd.get_dummies(features)
    features = features.astype('int')
    return features

# scaling helper
def scaling_helper(features, scaler='minmax'):
    # we separate the binary cols
    binary_cols = [c for c in features.columns if features[c].nunique() == 2]

    binary_features = features[binary_cols]
    continous_features = features.drop(columns=binary_cols)

    if scaler == 'standard':
        scaler = StandardScaler()
    else:
        scaler = MinMaxScaler()
    
    scaled_features = scaler.fit_transform(continous_features)
    continous_features = pd.DataFrame(scaled_features, columns=continous_features.columns)

    # reset the index
    binary_features.reset_index(drop=True, inplace=True)

    #concat the two matrices
    features = pd.concat([binary_features,continous_features], axis=1)
    return features

# correlation helper
def correlation_helper(features, target, target_col_name):
    target_series = target[target_col_name]
    correlations = features.corrwith(target_series)
    return pd.DataFrame({'Feature': correlations.index, 'Correlation': correlations.values})

def information_gain_helper(features, target):
    mi = mutual_info_classif(features,target)
    return pd.DataFrame({'Feature': features.columns, 'Information Gain': mi})

def get_top_features(correlations, info_gain, top=40):
    # Combine DataFrames
    combined_df = info_gain.merge(correlations, on='Feature', how='inner')
    
    # Calculate a combined score based on absolute values
    combined_df['Absolute Info Gain'] = combined_df['Information Gain'].abs()
    combined_df['Absolute Correlation'] = combined_df['Correlation'].abs()
    combined_df['Combined Score'] = combined_df['Absolute Info Gain'] + combined_df['Absolute Correlation']
    
    # Sort by combined score
    combined_df = combined_df.sort_values(by='Combined Score', ascending=False)
    
    #return the top features
    return combined_df.head(top)['Feature'] 


# Data splitter
def split_data(features, target, test_size=0.2, random_state=42):
    
    # Check if the target is a pandas DataFrame/Series
    if isinstance(target, pd.DataFrame) or isinstance(target, pd.Series):
        target = target.values.ravel()  # Convert to 1D numpy array
    
    # If it's already a numpy array and has more than 1 dimension, flatten it
    elif isinstance(target, np.ndarray) and target.ndim > 1:
        target = target.ravel()
    
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=test_size, random_state=random_state)
    

    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train = pd.Series(y_train).reset_index(drop=True).to_numpy().ravel()
    y_test = pd.Series(y_test).reset_index(drop=True).to_numpy().ravel()

    return X_train, X_test, y_train, y_test


In [28]:
dataframe = pd.read_csv('../Program_Data/creditcard.csv')

#  no null value in the cells


# drop the duplicated rows
# since no positive values in the duplicated rows, they can also be deleted
dataframe = drop_duplicates(dataframe)


# take 20,000 randomly from the negative class and take all the positive class data
negative_class = dataframe[dataframe['Class']==0]
positive_class = dataframe[dataframe['Class']==1]

negative_class = negative_class.sample(20000, random_state=42)

# concat the two class and reshuffle them to mix the classes
dataframe = pd.concat([negative_class, positive_class])
dataframe = dataframe.sample(frac=1, random_state=42).reset_index(drop=True)


# separate into features and target
features, target = create_feature_target(dataframe, column='Class')


# scale the features
features = scaling_helper(features, scaler='minmax')

# feature selection
# since there are only 30 columns, we avoid the feature selection part

# split the dataset to train and test
X_train, X_test, y_train, y_test = split_data(features, target)

# create the validation set
X_train, X_validation, y_train, y_validation = split_data(X_train, y_train)



In [51]:

weight_0 = 0.0929
weight_1 = 0.9081

class_weight = {0:weight_0, 1:weight_1}

print("\n**********Before fitting")
print(f'X_train.shape = {X_train.shape}   y_train.shape = {y_train.shape}')
print(f'X_validation.shape = {X_validation.shape}     y_validation.shape = {y_validation.shape}')
print(f'X_test.shape = {X_test.shape}     y_test.shape = {y_test.shape}')
print(f'0s in train:{np.sum(y_train == 0)}  1s in train: {np.sum(y_train==1)}' )
print(f'0s in test:{np.sum(y_test == 0)}  1s in test: {np.sum(y_test==1)}' )
print(f'class_weight = {class_weight}')


model = lgr.CustomLogisticRegression(class_weight=class_weight)
model.custom_fit(X_train, y_train)
y_pred = model.custom_predict(X_test, threshold=0.4095)
print(np.sum(y_pred == 0))
print(np.sum(y_pred == 1))


accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auroc = roc_auc_score(y_test, y_pred)  # Use predicted probabilities for AUROC
aupr = average_precision_score(y_test, y_pred)  # Precision-Recall AUC
conf_matrix = confusion_matrix(y_test, y_pred)
tp, fp, fn, tn = conf_matrix.ravel()
sensitivity = tp/(tp+fn) if (tp+fn)>0 else 0
specificity = tn/(tn+fp) if (tn+fp)>0 else 0


print(f'The accuracy of skLearn.LogisticRegression : {accuracy:.2f}')
print(f'The precision of skLearn.LogisticRegression : {precision:.2f}')
print(f'The recall of skLearn.LogisticRegression : {recall:.2f}')
print(f'The f1 of skLearn.LogisticRegression : {f1:.2f}')
print(f'The auroc of skLearn.LogisticRegression : {auroc:.2f}')
print(f'The aupr of skLearn.LogisticRegression : {aupr:.2f}')
print(f'The sensitivity of skLearn.LogisticRegression : {sensitivity:.2f}')
print(f'The specificity of skLearn.LogisticRegression : {specificity:.2f}')



**********Before fitting
X_train.shape = (13102, 30)   y_train.shape = (13102,)
X_validation.shape = (3276, 30)     y_validation.shape = (3276,)
X_test.shape = (4095, 30)     y_test.shape = (4095,)
0s in train:12793  1s in train: 309
0s in test:3988  1s in test: 107
class_weight = {0: 0.0929, 1: 0.9081}
3995
100
The accuracy of skLearn.LogisticRegression : 0.99
The precision of skLearn.LogisticRegression : 0.76
The recall of skLearn.LogisticRegression : 0.71
The f1 of skLearn.LogisticRegression : 0.73
The auroc of skLearn.LogisticRegression : 0.85
The aupr of skLearn.LogisticRegression : 0.55
The sensitivity of skLearn.LogisticRegression : 0.99
The specificity of skLearn.LogisticRegression : 0.76
