In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings

# Notebook-wide configuration.

# NOTE: this silences *every* warning for the rest of the session, including
# any convergence or undefined-metric warnings emitted while fitting/scoring.
warnings.filterwarnings('ignore')

# Never truncate wide DataFrames when they are displayed.
pd.set_option('display.max_columns', None)

# Seed NumPy's global RNG so the stochastic steps further down (resampling,
# SMOTE, MLP weight initialisation), which fall back on the global RNG when
# they are not given an explicit random_state, produce the same numbers on
# every "Restart Kernel -> Run All".
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

In [12]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# function to train-test-split data and treat it

def split_and_treat_data(X, y, randomstate=0):
    """Train/test-split ``X`` and ``y``, then scale numericals and one-hot encode categoricals.

    The scaler and the encoder are fitted on the training split only and then
    applied to both splits, so the test split never influences preprocessing.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Columns picked up by ``select_dtypes(np.number)`` are
        min-max scaled, columns of dtype ``object`` are one-hot encoded
        (first level dropped). Columns of any other dtype are not carried
        over into the result.
    y : pandas.Series or pandas.DataFrame
        Target, aligned row-by-row with ``X``.
    randomstate : int, default 0
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train_treated, X_test_treated, y_train, y_test)``. All four
        objects carry a fresh ``0..n-1`` index, so each X/y pair stays aligned
        for index-based operations such as ``pd.concat(..., axis=1)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=randomstate)

    # --- numericals: min-max scaling learned on the training split ---------
    nums_train = X_train.select_dtypes(np.number)
    nums_test = X_test.select_dtypes(np.number)

    if nums_train.shape[1] > 0:
        scaler = MinMaxScaler().fit(nums_train)
        # transform() returns a bare ndarray, so the frames are rebuilt with
        # the original column names and a default 0..n-1 index
        X_train_norm = pd.DataFrame(scaler.transform(nums_train), columns=nums_train.columns)
        X_test_norm = pd.DataFrame(scaler.transform(nums_test), columns=nums_train.columns)
    else:
        # no numeric columns: contribute zero columns instead of letting the
        # scaler fail on an empty frame
        X_train_norm = pd.DataFrame(index=pd.RangeIndex(len(X_train)))
        X_test_norm = pd.DataFrame(index=pd.RangeIndex(len(X_test)))

    # --- categoricals: one-hot encoding learned on the training split ------
    cats_train = X_train.select_dtypes(object)
    cats_test = X_test.select_dtypes(object)

    if cats_train.shape[1] > 0:
        # handle_unknown='ignore': levels seen only in the test split do not
        # raise at transform time
        encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(cats_train)
        cols_cats = encoder.get_feature_names_out(input_features=cats_train.columns)

        # the dummy columns are kept as object dtype, exactly as before, so
        # downstream code sees the same dtypes
        onehot_encoded_cats_train = pd.DataFrame(
            encoder.transform(cats_train).toarray(), columns=cols_cats
        ).astype(object)
        onehot_encoded_cats_test = pd.DataFrame(
            encoder.transform(cats_test).toarray(), columns=cols_cats
        ).astype(object)
    else:
        # no categorical columns: contribute zero columns instead of letting
        # the encoder fail on an empty frame
        onehot_encoded_cats_train = pd.DataFrame(index=pd.RangeIndex(len(X_train)))
        onehot_encoded_cats_test = pd.DataFrame(index=pd.RangeIndex(len(X_test)))

    # --- put numericals and categoricals back together ---------------------
    # every piece has a 0..n-1 index, so the column-wise concat lines up
    # row by row and the result keeps that index
    X_train_treated = pd.concat([X_train_norm, onehot_encoded_cats_train], axis=1)
    X_test_treated = pd.concat([X_test_norm, onehot_encoded_cats_test], axis=1)

    # the targets still carry the shuffled original index from the split;
    # reset BOTH so they match their feature frames
    return (
        X_train_treated,
        X_test_treated,
        y_train.reset_index(drop=True),
        y_test.reset_index(drop=True),
    )

In [13]:
from sklearn.metrics import r2_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# function to fit and evaluate a model

def build_eval_model(X_train_treated, X_test_treated, y_train, y_test, model, decimals=5):
    """Fit ``model`` on the training data and print its scores on the test data.

    Prints one line with accuracy, r2, precision, recall and f1 (each rounded
    to ``decimals`` places) followed by the confusion matrix. Precision,
    recall and f1 use scikit-learn's default binary averaging.

    Parameters
    ----------
    X_train_treated, y_train
        Training features and target passed to ``model.fit``.
    X_test_treated, y_test
        Held-out features and target used for evaluation.
    model
        Unfitted estimator exposing ``fit`` and ``predict``.
    decimals : int, default 5
        Number of decimal places shown for every metric.

    Returns
    -------
    None
        Results are only printed.
    """
    # function-scope import: accuracy_score is not among the notebook's
    # cell-level sklearn.metrics imports
    from sklearn.metrics import accuracy_score

    # predict y_test
    model = model.fit(X_train_treated, y_train)
    pred = model.predict(X_test_treated)

    # evaluate predictions
    # accuracy is computed from the predictions we already have; calling
    # model.score() would run a second full predict() over the test set
    # zero_division=0: a model that never predicts (or never sees) the
    # positive class scores 0.0 without depending on warnings being silenced
    print(
        "accuracy:", round(accuracy_score(y_test, pred), decimals),
        "  r2:", round(r2_score(y_test, pred), decimals),
        "  precision:", round(precision_score(y_test, pred, zero_division=0), decimals),
        "  recall:", round(recall_score(y_test, pred, zero_division=0), decimals),
        "  f1:", round(f1_score(y_test, pred, zero_division=0), decimals), "\n"
        )
    # rows = true class, columns = predicted class; end='' lets the caller
    # append text on the matrix' last line
    print(pd.DataFrame(confusion_matrix(y_test, pred)), end='')

In [24]:
from sklearn.utils import resample

# function for manually resampling to a size between a majority and a minority (only 2 targets possible)

def resample_treated(X_train_treated, X_test_treated, y_train, y_test, resample_size, over=1, under=0, show_dists=False, random_state=None):
    """Balance a binary training set by bringing both classes to ``resample_size`` rows.

    The class labelled ``under`` is sampled without replacement, the class
    labelled ``over`` is sampled with replacement. The test data is passed
    through untouched so the result can be unpacked straight into a
    fit/evaluate call.

    Parameters
    ----------
    X_train_treated : pandas.DataFrame
        Training features; must share its index with ``y_train``.
    X_test_treated, y_test
        Returned unchanged.
    y_train : pandas.Series
        Named training target.
    resample_size : int
        Number of rows each class has after resampling.
    over, under
        Labels of the class to over-sample and of the class to under-sample.
    show_dists : bool, default False
        Print the class sizes before and after resampling.
    random_state : int, numpy RandomState/Generator or None, default None
        Seed for the row sampling; ``None`` draws from NumPy's global RNG.

    Returns
    -------
    tuple
        ``(X_train_resampled, X_test_treated, y_train_resampled, y_test)``.
        The resampled rows keep their original index labels, ``under`` rows
        first, then ``over`` rows.

    Raises
    ------
    ValueError
        If the ``over`` class has no rows, or if ``resample_size`` exceeds
        the number of ``under`` rows (which cannot be drawn without
        replacement).
    """
    # concat back input and target of training data so rows are sampled together
    train_data = pd.concat([X_train_treated, y_train], axis=1)
    target = y_train.name

    # split the two classes
    to_shrink = train_data[train_data[target] == under]
    to_grow = train_data[train_data[target] == over]

    if len(to_grow) == 0:
        raise ValueError(f'no rows with {target} == {over!r} to over-sample')
    if resample_size > len(to_shrink):
        raise ValueError(
            f'resample_size={resample_size} exceeds the {len(to_shrink)} rows '
            f'with {target} == {under!r}; cannot under-sample without replacement'
        )

    # resample the classes
    shrunk = to_shrink.sample(n=resample_size, replace=False, random_state=random_state)
    grown = to_grow.sample(n=resample_size, replace=True, random_state=random_state)

    # concat both classes back together
    train_data = pd.concat([shrunk, grown], axis=0)

    # split input and target
    X_train_resampled = train_data.drop(columns=[target])
    y_train_resampled = train_data[target]

    # show information if flag is set to True
    if show_dists:
        # report the sizes of the classes actually selected via under/over
        print(f'Resampled from: {len(to_shrink)}/{len(to_grow)} to {resample_size}/{resample_size}')

    return X_train_resampled, X_test_treated, y_train_resampled, y_test

In [15]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# function for automatic resampling using SMOTE and RandomUnderSampler (with the defaults, both classes end up at 0.5 x the original majority size)

def smote_rnd_treated(X_train_treated, X_test_treated, y_train, y_test, smote_strat=0.5, rnd_strat=1.0, show_dists=False, random_state=None):
    """Balance the training set with SMOTE followed by random under-sampling.

    SMOTE is applied first with ``sampling_strategy=smote_strat``, then
    RandomUnderSampler with ``sampling_strategy=rnd_strat`` is applied to the
    SMOTE output. The test data is passed through untouched so the result can
    be unpacked straight into a fit/evaluate call.

    Parameters
    ----------
    X_train_treated, y_train
        Training features and target to resample.
    X_test_treated, y_test
        Returned unchanged.
    smote_strat : default 0.5
        ``sampling_strategy`` handed to ``SMOTE``.
    rnd_strat : default 1.0
        ``sampling_strategy`` handed to ``RandomUnderSampler``.
    show_dists : bool, default False
        Print the class sizes before and after resampling.
    random_state : int, RandomState or None, default None
        Seed handed to both samplers so a run can be repeated exactly.

    Returns
    -------
    tuple
        ``(X_train_resampled, X_test_treated, y_train_resampled, y_test)``.
    """
    # step 1: over-sample with SMOTE
    oversampler = SMOTE(sampling_strategy=smote_strat, random_state=random_state)
    X_train_SMOTE, y_train_SMOTE = oversampler.fit_resample(X_train_treated, y_train)

    # step 2: randomly under-sample the SMOTE output
    undersampler = RandomUnderSampler(sampling_strategy=rnd_strat, random_state=random_state)
    X_train_RND, y_train_RND = undersampler.fit_resample(X_train_SMOTE, y_train_SMOTE)

    # show information if flag is set to True
    if show_dists:
        # class sizes ordered by label, so this works for any label values
        # (for labels 0/1 the output reads "<n0>/<n1>", as before)
        before = '/'.join(str(count) for count in y_train.value_counts().sort_index())
        after = '/'.join(str(count) for count in y_train_RND.value_counts().sort_index())
        print(f'Resampled from: {before} to {after}')

    return X_train_RND, X_test_treated, y_train_RND, y_test

In [16]:
# read in data with different feature selections

# Categorical features are forced to object dtype right after loading.
cats = pd.read_csv('data/categorical.csv').astype(object)

# Three alternative numeric feature selections, loaded in the same order as
# before (kbest, rfe, var).
nums_kbest, nums_rfe, nums_var = (
    pd.read_csv(csv_path)
    for csv_path in ('data/kbest_nums.csv', 'data/rfe_nums.csv', 'data/var_nums.csv')
)

# Target table.
targets = pd.read_csv('data/target.csv')

In [17]:
# create dict for X for different numericals, set target y

# One feature table per numeric selection: the shared categoricals on the
# left, the selection's numericals on the right (keys: 'kbest', 'rfe', 'var').
X = {
    selection: pd.concat([cats, nums], axis=1)
    for selection, nums in (('kbest', nums_kbest), ('rfe', nums_rfe), ('var', nums_var))
}

# Target column used by every experiment below.
y = targets['TARGET_B']

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [22]:
# loop through the X with different features using unbalanced data

# Baseline: logistic regression on each feature set, no class balancing.
for k, features in X.items():
    print('######################################### ', k, ' #########################################\n')

    # a, b = treated train/test features; c, d = train/test targets
    a, b, c, d = split_and_treat_data(features, y)
    build_eval_model(a, b, c, d, LogisticRegression())

    # two line breaks after the confusion matrix
    print('\n')

#########################################  kbest  #########################################

accuracy: 0.94739   r2: -0.05633   precision: 0.0   recall: 0.0   f1: 0.0 

       0  1
0  22598  1
1   1254  0

#########################################  rfe  #########################################

accuracy: 0.94743   r2: -0.05549   precision: 0.0   recall: 0.0   f1: 0.0 

       0  1
0  22599  0
1   1254  0

#########################################  var  #########################################

accuracy: 0.94743   r2: -0.05549   precision: 0.0   recall: 0.0   f1: 0.0 

       0  1
0  22599  0
1   1254  0



In [18]:
# loop through the X with different features 
# using balanced data (SMOTE/RandomUnder) and Logistic Regression

# Logistic regression on each feature set after SMOTE + random under-sampling.
for k, features in X.items():
    print('######################################### ', k, ' #########################################\n')

    # a, b = treated train/test features; c, d = train/test targets
    a, b, c, d = split_and_treat_data(features, y)
    # w, s = resampled training features/target; x, z = test data passed through
    w, x, s, z = smote_rnd_treated(a, b, c, d)

    build_eval_model(w, x, s, z, LogisticRegression())
    # appended to the last line of the confusion matrix
    print(f'\t\t\tResampled from: {c.value_counts()[0]}/{c.value_counts()[1]} to {s.value_counts()[0]}/{s.value_counts()[1]}\n')

#########################################  kbest  #########################################

accuracy: 0.61309   r2: -6.76803   precision: 0.06887   recall: 0.50797   f1: 0.1213 

       0     1
0  13987  8612
1    617   637			Resampled from: 67970/3589 to 33985/33985

#########################################  rfe  #########################################

accuracy: 0.61523   r2: -6.7251   precision: 0.06803   recall: 0.49761   f1: 0.1197 

       0     1
0  14051  8548
1    630   624			Resampled from: 67970/3589 to 33985/33985

#########################################  var  #########################################

accuracy: 0.61204   r2: -6.78907   precision: 0.06915   recall: 0.51196   f1: 0.12184 

       0     1
0  13957  8642
1    612   642			Resampled from: 67970/3589 to 33985/33985



In [26]:
# loop through the X with different features 
# using balanced data (manual resample) and KNeighborsClassifier

# Rows per class after the manual resampling.
size = 20000

# k-nearest-neighbours on each feature set after manual resampling.
for k, features in X.items():
    print('######################################### ', k, ' #########################################\n')

    # a, b = treated train/test features; c, d = train/test targets
    a, b, c, d = split_and_treat_data(features, y)
    # 4-tuple (X_train, X_test, y_train, y_test), unpacked into the evaluator
    treated_resampled = resample_treated(a, b, c, d, size)

    build_eval_model(*treated_resampled, KNeighborsClassifier())
    # appended to the last line of the confusion matrix
    print(f'\t\t\tResampled from: {c.value_counts()[0]}/{c.value_counts()[1]} to {size}/{size}\n')

#########################################  kbest  #########################################

accuracy: 0.64185   r2: -6.19063   precision: 0.05894   recall: 0.38836   f1: 0.10234 

       0     1
0  14823  7776
1    767   487			Resampled from: 67970/3589 to 20000/20000

#########################################  rfe  #########################################

accuracy: 0.64021   r2: -6.22345   precision: 0.05781   recall: 0.38198   f1: 0.10042 

       0     1
0  14792  7807
1    775   479			Resampled from: 67970/3589 to 20000/20000

#########################################  var  #########################################

accuracy: 0.64554   r2: -6.11656   precision: 0.05936   recall: 0.38676   f1: 0.10292 

       0     1
0  14913  7686
1    769   485			Resampled from: 67970/3589 to 20000/20000



In [19]:
# loop through the X with different features 
# using balanced data (SMOTE/RandomUnder) and MLPClassifier

# MLP classifier on each feature set after SMOTE + random under-sampling.
for k, features in X.items():
    print('######################################### ', k, ' #########################################\n')

    # a, b = treated train/test features; c, d = train/test targets
    a, b, c, d = split_and_treat_data(features, y)
    # w, s = resampled training features/target; x, z = test data passed through
    w, x, s, z = smote_rnd_treated(a, b, c, d)

    build_eval_model(w, x, s, z, MLPClassifier())
    # appended to the last line of the confusion matrix
    print(f'\t\t\tResampled from: {c.value_counts()[0]}/{c.value_counts()[1]} to {s.value_counts()[0]}/{s.value_counts()[1]}\n')

#########################################  kbest  #########################################

accuracy: 0.86966   r2: -1.61684   precision: 0.07159   recall: 0.1236   f1: 0.09067 

       0     1
0  20589  2010
1   1099   155			Resampled from: 67970/3589 to 33985/33985

#########################################  rfe  #########################################

accuracy: 0.87197   r2: -1.57055   precision: 0.07061   recall: 0.11802   f1: 0.08836 

       0     1
0  20651  1948
1   1106   148			Resampled from: 67970/3589 to 33985/33985

#########################################  var  #########################################

accuracy: 0.87209   r2: -1.56802   precision: 0.06657   recall: 0.11005   f1: 0.08296 

       0     1
0  20664  1935
1   1116   138			Resampled from: 67970/3589 to 33985/33985

