In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings

# Silence every warning category for the whole session. NOTE: this is global,
# so warnings raised by libraries during model evaluation are hidden as well.
warnings.filterwarnings('ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [92]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# function to train-test-split data and treat it

def split_and_treat_data(X, y, encode_cats=True, scale_nums=True, randomstate=0):
    """Split features/target into train and test sets and optionally treat the features.

    Numerical columns (``select_dtypes(np.number)``) can be min-max scaled and
    object columns (``select_dtypes(object)``) can be one-hot encoded. Both
    transformers are fitted on the training part only and then applied to the
    training and the test part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; columns that are neither numeric nor object dtype are dropped.
    y : pandas.Series
        Target aligned row-wise with ``X``.
    encode_cats : bool, default True
        One-hot encode the object columns (``drop='first'``, ``handle_unknown='ignore'``).
        The encoded columns are cast to object dtype.
    scale_nums : bool, default True
        Scale the numerical columns with ``MinMaxScaler``.
    randomstate : int, default 0
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train_treated, X_test_treated, y_train, y_test)``. Both feature frames
        and ``y_train`` carry a fresh positional index; ``y_test`` keeps the row
        labels it had in ``y``.
    """
    # splitting
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=randomstate)

    # The split shuffles the rows. Every frame built below must share one index,
    # otherwise pd.concat(axis=1) aligns on labels and mixes rows (or adds NaN
    # rows) whenever only one of the two treatments is switched on.
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)

    nums_train = X_train.select_dtypes(np.number)
    nums_test = X_test.select_dtypes(np.number)
    cats_train = X_train.select_dtypes(object)
    cats_test = X_test.select_dtypes(object)

    # transforming numericals (skipped when there is nothing to scale)
    if scale_nums and nums_train.shape[1] > 0:
        transformer = MinMaxScaler().fit(nums_train)

        X_train_norm = pd.DataFrame(transformer.transform(nums_train), columns=nums_train.columns, index=X_train.index)
        X_test_norm = pd.DataFrame(transformer.transform(nums_test), columns=nums_train.columns, index=X_test.index)
    else:
        X_train_norm = nums_train
        X_test_norm = nums_test

    # encoding categoricals (skipped when there is nothing to encode)
    if encode_cats and cats_train.shape[1] > 0:
        encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(cats_train)

        encoded_train = encoder.transform(cats_train).toarray()
        encoded_test = encoder.transform(cats_test).toarray()

        cols_cats = encoder.get_feature_names_out(input_features=cats_train.columns)

        onehot_encoded_cats_train = pd.DataFrame(encoded_train, columns=cols_cats, index=X_train.index).astype(object)
        onehot_encoded_cats_test = pd.DataFrame(encoded_test, columns=cols_cats, index=X_test.index).astype(object)
    else:
        onehot_encoded_cats_train = cats_train
        onehot_encoded_cats_test = cats_test

    # concat nums + cats back together (numericals first, as before)
    X_train_treated = pd.concat([X_train_norm, onehot_encoded_cats_train], axis=1)
    X_test_treated = pd.concat([X_test_norm, onehot_encoded_cats_test], axis=1)

    return X_train_treated, X_test_treated, y_train.reset_index(drop=True), y_test

In [93]:
from sklearn.metrics import r2_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# function to fit and evaluate a model

def build_eval_model(X_train_treated, X_test_treated, y_train, y_test, model, decimals=5):
    """Fit ``model`` on the training data, print its test metrics and return them.

    Parameters
    ----------
    X_train_treated, y_train
        Training features and target passed to ``model.fit``.
    X_test_treated, y_test
        Test features and target used for every metric.
    model
        Estimator exposing ``fit``, ``predict`` and ``score``.
    decimals : int, default 5
        Number of decimals every metric is rounded to.

    Returns
    -------
    dict
        Rounded ``accuracy`` (``model.score``), ``precision``, ``recall`` and ``f1``.
        r2 and the confusion matrix are printed only.
    """
    # predict y_test
    model = model.fit(X_train_treated, y_train)
    pred = model.predict(X_test_treated)

    # Compute every metric exactly once and reuse it for both the printout and
    # the returned dict; model.score runs a full prediction pass on each call.
    accuracy = round(model.score(X_test_treated, y_test), decimals)
    r2 = round(r2_score(y_test, pred), decimals)
    precision = round(precision_score(y_test, pred), decimals)
    recall = round(recall_score(y_test, pred), decimals)
    f1 = round(f1_score(y_test, pred), decimals)

    # evaluate predictions
    print(
        "accuracy:", accuracy,
        "  r2:", r2,
        "  precision:", precision,
        "  recall:", recall,
        "  f1:", f1, "\n"
        )
    # end='' keeps the cursor on the matrix' last line so a caller can append to it
    print(pd.DataFrame(confusion_matrix(y_test, pred)), end='')

    # return scores-dict
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
        }

In [94]:
from sklearn.utils import resample

# function for manually resampling to a size between a majority and a minority (only 2 targets possible)

def resample_treated(X_train_treated, X_test_treated, y_train, y_test, resample_size, show_dists=False, random_state=None):
    """Resample a binary training set so that both classes have ``resample_size`` rows.

    The more frequent class is drawn without replacement (with replacement only
    when ``resample_size`` exceeds its row count); all remaining rows are treated
    as the minority and are always drawn with replacement. The test data is
    passed through untouched.

    Parameters
    ----------
    X_train_treated : pandas.DataFrame
        Training features; must share its index with ``y_train``.
    X_test_treated, y_test
        Returned unchanged.
    y_train : pandas.Series
        Named training target.
    resample_size : int
        Number of rows to draw per class.
    show_dists : bool, default False
        Print the class sizes before and after resampling.
    random_state : int or None, default None
        Forwarded to ``sklearn.utils.resample`` for reproducible draws.

    Returns
    -------
    tuple
        ``(X_train_resampled, X_test_treated, y_train_resampled, y_test)``.
    """
    # concat back input and target of training data
    train_data = pd.concat([X_train_treated, y_train], axis=1)

    # split majority/minority by the most frequent target VALUE
    # (the first index label says nothing about class frequencies)
    majority_label = y_train.value_counts().idxmax()
    is_majority = train_data[y_train.name] == majority_label
    category_0 = train_data[is_majority]
    category_1 = train_data[~is_majority]

    # resample the classes; drawing without replacement is impossible once the
    # requested size exceeds the majority's row count
    category_0_undersampled = resample(
        category_0,
        replace=resample_size > len(category_0),
        n_samples=resample_size,
        random_state=random_state,
    )
    category_1_oversampled = resample(
        category_1,
        replace=True,
        n_samples=resample_size,
        random_state=random_state,
    )

    # concat majority/minority back together
    train_data = pd.concat([category_0_undersampled, category_1_oversampled], axis=0)

    # split input and target
    X_train_resampled = train_data.drop([y_train.name], axis=1)
    y_train_resampled = train_data[y_train.name]

    # show information if flag is set to True (majority/minority)
    if show_dists:
        print(f'Resampled from: {len(category_0)}/{len(category_1)} to {resample_size}/{resample_size}')

    return X_train_resampled, X_test_treated, y_train_resampled, y_test

In [95]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# function for automatic resampling using SMOTE and RandomUnderSampler (SMOTE grows the minority to about `size` rows, then the majority is under-sampled to the same count)

def smote_rnd_treated(X_train_treated, X_test_treated, y_train, y_test, size=100, show_dists=False, random_state=None):
    """Balance a binary training set with SMOTE followed by random under-sampling.

    SMOTE is asked for a minority/majority ratio of ``size / majority_count``;
    afterwards ``RandomUnderSampler(sampling_strategy=1.0)`` shrinks the majority
    to the minority's new row count. The test data is passed through untouched.

    Parameters
    ----------
    X_train_treated, y_train
        Training features and target to resample.
    X_test_treated, y_test
        Returned unchanged.
    size : int, default 100
        Targeted number of rows per class.
        NOTE(review): imblearn is expected to reject ratios that would shrink the
        minority or exceed 1, i.e. ``size`` should lie between the minority and the
        majority count - confirm against the installed imblearn version.
    show_dists : bool, default False
        Print the class sizes (majority/minority) before and after resampling.
    random_state : int or None, default None
        Forwarded to both samplers for reproducible results.

    Returns
    -------
    tuple
        ``(X_train_RND, X_test_treated, y_train_RND, y_test)``.
    """
    # remember the ORIGINAL distribution before anything is resampled
    counts_before = pd.Series(y_train).value_counts()

    # strategy is a fraction of the majority's row count, whatever its label is
    majority_count = counts_before.max()
    strat = size / majority_count

    X_train_smote, y_train_smote = SMOTE(sampling_strategy=strat, random_state=random_state).fit_resample(X_train_treated, y_train)
    X_train_RND, y_train_RND = RandomUnderSampler(sampling_strategy=1.0, random_state=random_state).fit_resample(X_train_smote, y_train_smote)

    # show information if flag is set to True
    if show_dists:
        counts_after = pd.Series(y_train_RND).value_counts()
        print(f'Resampled from: {counts_before.max()}/{counts_before.min()} to {counts_after.max()}/{counts_after.min()}')

    return X_train_RND, X_test_treated, y_train_RND, y_test

In [96]:
# pipeline function to combine multiple inputs and models

def pipeline(input_data, keys, y, models, encode_cats=True, scale_nums=True, sampling=None, size=100, may_val=1, min_val=0):
    """Train and evaluate every model on every selected dataset.

    For each ``(model_name, model)`` pair and each key the data is split and
    treated, optionally resampled, then fitted and scored.

    Parameters
    ----------
    input_data : mapping
        Maps a dataset key to its feature DataFrame.
    keys : iterable
        Keys of ``input_data`` to evaluate.
    y : pandas.Series
        Target shared by all datasets; the progress printout looks up the class
        labels ``0`` and ``1`` in it.
    models : iterable of (str, estimator)
        Display name and estimator instance; the same instance is re-fitted for
        every dataset.
    encode_cats, scale_nums : bool
        Forwarded to ``split_and_treat_data``.
    sampling : {None, 'resample', 'smote_rnd'}
        Resampling applied to the training data; any falsy value disables it.
    size : int, default 100
        Per-class target size forwarded to the chosen resampler.
    may_val, min_val
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    dict
        ``'<ModelNameWithoutSpaces>_<key>'`` mapped to the metrics dict of
        ``build_eval_model`` plus a ``'resampling'`` entry.

    Raises
    ------
    ValueError
        If ``sampling`` is a non-empty value other than the supported ones.
    """
    # Fail fast: an unknown strategy used to fall through to "no resampling"
    # while still being recorded and printed as if the data had been resampled.
    if sampling and sampling not in ('resample', 'smote_rnd'):
        raise ValueError(f"unknown sampling strategy {sampling!r}; expected None, 'resample' or 'smote_rnd'")

    scores = {}
    for model_name, model in models:
        for key in keys:
            print('#############   Model |', model_name, '  #############   Data |', key, '  #############\n')
            a,b,c,d = split_and_treat_data(input_data[key], y, encode_cats=encode_cats, scale_nums=scale_nums)

            # evaluate sampling-flag (c keeps the original target for the report below)
            if sampling == 'resample':
                a,b,c2,d = resample_treated(a,b,c,d, size)
            elif sampling == 'smote_rnd':
                a,b,c2,d = smote_rnd_treated(a,b,c,d, size)
            else:
                c2 = c

            # create scores-dict and add a sampling info
            model_id = model_name.replace(' ','')
            identif = f'{model_id}_{key}'
            scores[identif] = build_eval_model(a,b,c2,d, model)
            scores[identif]['resampling'] = sampling

            # class sizes of the training target before (and after) resampling
            before = c.value_counts()
            if sampling:
                after = c2.value_counts()
                print(f'\t\tTraining data resampled from: {before[0]}/{before[1]} to {after[0]}/{after[1]}\n')
            else:
                print(f'\t\tTraining data sample sizes: {before[0]}/{before[1]}')

    return scores

In [97]:
# read in data with different features

# folder holding all CSV inputs of this notebook
DATA_DIR = 'files_for_lab'

# categoricals are cast to object so that select_dtypes(object) treats them as categories
cats = pd.read_csv(f'{DATA_DIR}/categorical.csv').astype(object)
targets = pd.read_csv(f'{DATA_DIR}/target.csv')

# create dict for X for different numericals, set target y
X = {}
y = targets['TARGET_B']
methods = ['kbest', 'rfe', 'var', 'pca']

# read in feature selected data with and without categorical
# ('only_<method>' = selected numericals alone, '<method>' = categoricals + selected numericals)
for method in methods:
    dict_key = f'only_{method}'
    X[dict_key] = pd.read_csv(f'{DATA_DIR}/{method}_nums.csv')
    X[method] = pd.concat([cats, X[dict_key]], axis=1)

# read original data without feature selection (the file is parsed only once)
numerical = pd.read_csv(f'{DATA_DIR}/numerical.csv')
X['all'] = pd.concat([cats, numerical], axis=1)
X['only_nums'] = numerical

In [98]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [87]:
# Baseline: a single Random Forest with default settings, no resampling.
models = [
    ('Random Forest Classifier', RandomForestClassifier())
    ]

# Only the numerical features without any feature selection.
inputs = ['only_nums']

# The recorded output below shows every test row predicted as class 0.
scores = pipeline(X, inputs, y, models)

#############   Model | Random Forest Classifier   #############   Data | only_nums   #############

accuracy: 0.94743   r2: -0.05549   precision: 0.0   recall: 0.0   f1: 0.0 

       0  1
0  22599  0
1   1254  0		Training data sample sizes: 67970/3589


In [68]:
# SMOTE + random under-sampling; size=67970 equals the class-0 training count
# printed by the un-resampled run (67970/3589). Reuses `models` and `inputs`
# from an earlier cell.
scores = pipeline(X, inputs, y, models, sampling='smote_rnd', size=67970)

#############   Model | Random Forest Classifier   #############   Data | only_nums   #############

accuracy: 0.94437   r2: -0.11693   precision: 0.08046   recall: 0.00558   f1: 0.01044 

       0   1
0  22519  80
1   1247   7		Training data resampled from: 67970/3589 to 67970/67970



In [69]:
# Manual resampling to 67970 rows per class (the class-0 training count), so
# only class 1 is actually enlarged. Reuses `models` and `inputs` from an earlier cell.
scores = pipeline(X, inputs, y, models, sampling='resample', size=67970)

#############   Model | Random Forest Classifier   #############   Data | only_nums   #############

accuracy: 0.9455   r2: -0.09421   precision: 0.11667   recall: 0.00558   f1: 0.01065 

       0   1
0  22546  53
1   1247   7		Training data resampled from: 67970/3589 to 67970/67970



In [89]:
# Manual resampling down to 1000 rows per class; the recorded output shows
# recall rising to ~0.54 while accuracy drops to ~0.57.
scores = pipeline(X, inputs, y, models, sampling='resample', size=1000)

#############   Model | Random Forest Classifier   #############   Data | only_nums   #############

accuracy: 0.56555   r2: -7.72252   precision: 0.0642   recall: 0.53509   f1: 0.11465 

       0     1
0  12819  9780
1    583   671		Training data resampled from: 67970/3589 to 1000/1000



In [99]:
%%time

# Three classifiers with default settings, compared on every dataset stored in X.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree Classifier', DecisionTreeClassifier()),
    ('Random Forest Classifier', RandomForestClassifier())
    ]

# SMOTE + random under-sampling with 67970 rows per class (the class-0 training count).
scores = pipeline(X, X.keys(), y, models, sampling='smote_rnd', size=67970)

# author's note: 22 min (presumably the wall time of this cell)

#############   Model | Logistic Regression   #############   Data | only_kbest   #############

accuracy: 0.59883   r2: -7.05421   precision: 0.0717   recall: 0.55502   f1: 0.127 

       0     1
0  13588  9011
1    558   696		Training data resampled from: 67970/3589 to 67970/67970

#############   Model | Logistic Regression   #############   Data | kbest   #############

accuracy: 0.6169   r2: -6.69144   precision: 0.06974   recall: 0.50957   f1: 0.1227 

       0     1
0  14076  8523
1    615   639		Training data resampled from: 67970/3589 to 67970/67970

#############   Model | Logistic Regression   #############   Data | only_rfe   #############

accuracy: 0.57754   r2: -7.48179   precision: 0.06754   recall: 0.54944   f1: 0.1203 

       0     1
0  13087  9512
1    565   689		Training data resampled from: 67970/3589 to 67970/67970

#############   Model | Logistic Regression   #############   Data | rfe   #############

accuracy: 0.6208   r2: -6.61316   precision: 0.06817   reca

In [100]:
import json

# Persist the scores of the last pipeline run as pretty-printed JSON
# (overwrites scores.json in the current working directory).
with open('scores.json', 'w') as f:
    json.dump(scores, f, indent=4)