In [34]:
import pandas as pd
import numpy as np
import warnings

# silence every Python warning for the rest of the session (library warnings included)
warnings.filterwarnings('ignore')
# show all columns when a DataFrame is displayed instead of truncating wide frames
pd.set_option('display.max_columns', None)

In [35]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import pickle

# function to train-test-split data and treat it

def split_and_treat_data(X, y, encode_cats=True, save_encoder=False, scale_nums=True, save_scaler=False, randomstate=None):
    """Train/test-split X and y, then scale the numerical and one-hot encode the categorical columns.

    The scaler and the encoder are fitted on the training part only and then
    applied to both parts.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features. Columns selected by ``select_dtypes(np.number)`` are
        treated as numericals, columns selected by ``select_dtypes(object)``
        as categoricals; columns of any other dtype are dropped.
    y : pandas.Series
        Target, split alongside ``X``.
    encode_cats : bool
        One-hot encode the categoricals (``drop='first'``,
        ``handle_unknown='ignore'``). The encoded columns are cast to object
        dtype. If False the categoricals are passed through unchanged.
    save_encoder : bool
        Pickle the fitted encoder to ``encoder.sav`` in the working directory.
    scale_nums : bool
        Min-max scale the numericals. If False they are passed through unchanged.
    save_scaler : bool
        Pickle the fitted scaler to ``scaler.sav`` in the working directory.
    randomstate : int or None
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train_treated, X_test_treated, y_train, y_test)``. Both X frames
        and ``y_train`` carry a fresh 0..n-1 index; ``y_test`` keeps the index
        labels it had in ``y`` (row order still matches ``X_test_treated``).

    Raises
    ------
    ValueError
        If ``save_scaler`` is requested without ``scale_nums``, or
        ``save_encoder`` without ``encode_cats`` (nothing would be fitted).
    """
    # fail early and clearly instead of hitting an unbound local further down
    if save_scaler and not scale_nums:
        raise ValueError('save_scaler=True requires scale_nums=True: no scaler is fitted otherwise')
    if save_encoder and not encode_cats:
        raise ValueError('save_encoder=True requires encode_cats=True: no encoder is fitted otherwise')

    # splitting
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=randomstate)

    # The split shuffles the rows. Give both parts a positional index right away so
    # that pass-through columns and freshly built (scaled/encoded) frames line up
    # row by row when they are concatenated below.
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)

    nums_train = X_train.select_dtypes(np.number)
    nums_test = X_test.select_dtypes(np.number)
    cats_train = X_train.select_dtypes(object)
    cats_test = X_test.select_dtypes(object)

    # transforming numericals
    if scale_nums:
        transformer = MinMaxScaler().fit(nums_train)
        cols_nums = nums_train.columns

        X_train_norm = pd.DataFrame(transformer.transform(nums_train), columns=cols_nums)
        X_test_norm = pd.DataFrame(transformer.transform(nums_test), columns=cols_nums)

        if save_scaler:
            with open('scaler.sav', 'wb') as scaler_file:
                pickle.dump(transformer, scaler_file)
    else:
        X_train_norm = nums_train
        X_test_norm = nums_test

    # encoding categoricals
    if encode_cats:
        encoder = OneHotEncoder(drop='first', handle_unknown='ignore').fit(cats_train)
        cols_cats = encoder.get_feature_names_out(input_features=cats_train.columns)

        onehot_encoded_cats_train = pd.DataFrame(encoder.transform(cats_train).toarray(), columns=cols_cats).astype(object)
        onehot_encoded_cats_test = pd.DataFrame(encoder.transform(cats_test).toarray(), columns=cols_cats).astype(object)

        if save_encoder:
            with open('encoder.sav', 'wb') as encoder_file:
                pickle.dump(encoder, encoder_file)
    else:
        onehot_encoded_cats_train = cats_train
        onehot_encoded_cats_test = cats_test

    # concat cats + nums back together (all pieces share the same positional index)
    X_train_treated = pd.concat([X_train_norm, onehot_encoded_cats_train], axis=1)
    X_test_treated = pd.concat([X_test_norm, onehot_encoded_cats_test], axis=1)

    return X_train_treated, X_test_treated, y_train.reset_index(drop=True), y_test

In [36]:
from sklearn.utils import resample

# function for manually resampling to a size between a majority and a minority (only 2 targets possible)

def resample_treated(X_train_treated, X_test_treated, y_train, y_test, resample_size, show_dists=False, random_state=None):
    """Bring both classes of a binary training set to ``resample_size`` rows each.

    The most frequent class is under-sampled without replacement, all remaining
    rows (the minority) are over-sampled with replacement. Test data is passed
    through untouched.

    Parameters
    ----------
    X_train_treated : pandas.DataFrame
        Training features; must share its index with ``y_train`` because the two
        are joined column-wise.
    X_test_treated : pandas.DataFrame
        Test features, returned unchanged.
    y_train : pandas.Series
        Named training target; its ``name`` is used as the column name.
    y_test : pandas.Series
        Test target, returned unchanged.
    resample_size : int
        Number of rows per class after resampling. Must not exceed the size of
        the majority class, since that class is drawn without replacement.
    show_dists : bool
        Print the class sizes before and after resampling.
    random_state : int or None
        Forwarded to ``sklearn.utils.resample`` for reproducible draws.

    Returns
    -------
    tuple
        ``(X_train_resampled, X_test_treated, y_train_resampled, y_test)``.
    """
    target_col = y_train.name

    # concat back input and target of training data
    train_data = pd.concat([X_train_treated, y_train], axis=1)

    # split majority/minority by actual class frequency
    majority_label = y_train.value_counts().idxmax()
    is_majority = train_data[target_col] == majority_label
    category_0 = train_data[is_majority]
    category_1 = train_data[~is_majority]

    # resample the classes
    category_0_undersampled = resample(category_0, replace=False, n_samples=resample_size, random_state=random_state)
    category_1_oversampled = resample(category_1, replace=True, n_samples=resample_size, random_state=random_state)

    # concat majority/minority back together
    train_data = pd.concat([category_0_undersampled, category_1_oversampled], axis=0)

    # split input and target
    X_train_resampled = train_data.drop([target_col], axis=1)
    y_train_resampled = train_data[target_col]

    # show information if flag is set to True
    if show_dists:
        print(f'Resampled from: {len(category_0)}/{len(category_1)} to {resample_size}/{resample_size}')

    return X_train_resampled, X_test_treated, y_train_resampled, y_test

In [37]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# function for automatic resampling using SMOTE and RandomUnderSampler (both classes end up with roughly `size` samples each; default size=100)

def smote_rnd_treated(X_train_treated, X_test_treated, y_train, y_test, size=100, show_dists=False):
    """Balance a binary training set with SMOTE followed by random under-sampling.

    SMOTE is asked for a minority/majority ratio of ``size / majority_count``.
    If that ratio is below 1, the majority class is afterwards under-sampled
    down to the (new) minority size. Test data is passed through untouched.

    Parameters
    ----------
    X_train_treated : pandas.DataFrame
        Training features (must be numeric-convertible for SMOTE).
    X_test_treated : pandas.DataFrame
        Test features, returned unchanged.
    y_train : pandas.Series
        Training target.
    y_test : pandas.Series
        Test target, returned unchanged.
    size : int
        Target number of samples per class.
    show_dists : bool
        Print the class sizes before and after resampling.

    Returns
    -------
    tuple
        ``(X_train_resampled, X_test_treated, y_train_resampled, y_test)``.
    """
    before = pd.Series(y_train).value_counts()

    # strategy is a fraction of the majority-class size (taken by frequency, not by label)
    majority_count = before.max()
    strat = size / majority_count

    X_train_treated, y_train = SMOTE(sampling_strategy=strat).fit_resample(X_train_treated, y_train)
    if strat < 1.0:
        # minority now has about `size` rows; cut the majority down to the same size
        X_train_treated, y_train = RandomUnderSampler(sampling_strategy=1.0).fit_resample(X_train_treated, y_train)

    after = pd.Series(y_train).value_counts()

    # show information if flag is set to True
    if show_dists:
        print(f'Resampled from: {before.max()}/{before.min()} to {after.max()}/{after.min()}')

    return X_train_treated, X_test_treated, y_train, y_test

In [38]:
from sklearn.metrics import r2_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# function to fit and evaluate a model

def build_eval_model(X_train_treated, X_test_treated, y_train, y_test, model, save_model=False, decimals=5):
    """Fit ``model`` on the training data, score it on the test data and print a short report.

    Parameters
    ----------
    X_train_treated, y_train
        Training features and target passed to ``model.fit``.
    X_test_treated, y_test
        Test features and target used for all metrics.
    model
        Estimator exposing ``fit``, ``predict`` and ``score``.
    save_model : bool
        Pickle the fitted model to ``<name>.sav`` in the working directory,
        where ``<name>`` is the text before the first ``(`` in ``str(model)``.
    decimals : int
        Number of decimals the scores are rounded to.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``false_positives`` and ``true_positives``.
    """
    # fit, then predict y_test
    model = model.fit(X_train_treated, y_train)
    pred = model.predict(X_test_treated)

    acc = round(model.score(X_test_treated, y_test), decimals)
    prec = round(precision_score(y_test, pred), decimals)
    recall = round(recall_score(y_test, pred), decimals)
    f1 = round(f1_score(y_test, pred), decimals)

    # evaluate predictions: rows are true classes, columns predicted classes;
    # the second row/column is taken as the positive class
    conf = confusion_matrix(y_test, pred)
    f_p = float(conf[0][1])
    t_p = float(conf[1][1])

    print(model)

    if save_model:
        model_name = str(model).split('(')[0]
        # context manager guarantees the file is flushed and closed
        with open(f'{model_name}.sav', 'wb') as model_file:
            pickle.dump(model, model_file)

    print("accuracy:", acc, "  precision:", prec, "  recall:", recall, "  f1:", f1, "\n")
    # no trailing newline: the caller continues on the same output line
    print(pd.DataFrame(conf), end='')

    # return scores-dict
    return {"accuracy": acc, "precision": prec, "recall": recall, "f1": f1, "false_positives": f_p, "true_positives": t_p}

In [39]:
import json

# pipeline function to combine multiple inputs and models

def pipeline(
    categoricals, 
    numericals_path:str, 
    y, 
    models, 
    save_models=False, 
    include_cats=True, 
    encode_cats=True, 
    save_encoder=False, 
    scale_nums=True, 
    save_scaler=False, 
    save_scores=False, 
    sampling=None, 
    size=1, 
    may_val=1, min_val=0):
    """Prepare one feature-selected data set and fit/evaluate several models on it.

    The data is read, split, treated and (optionally) resampled once; every
    model is then trained and evaluated on that same prepared data, so the
    scores are comparable and a saved scaler/encoder matches every saved model.

    Parameters
    ----------
    categoricals : pandas.DataFrame
        Categorical features, joined column-wise to the numericals when
        ``include_cats`` is True.
    numericals_path : str
        Path to the CSV with the numerical features. The file name up to
        ``_nums`` is used as the data label in logs, score keys and file names.
    y : pandas.Series
        Target.
    models : iterable of (str, estimator)
        Display name and estimator for each model to evaluate.
    save_models : bool
        Forwarded to ``build_eval_model`` as ``save_model``.
    include_cats : bool
        Whether to add ``categoricals`` to the features.
    encode_cats, save_encoder, scale_nums, save_scaler : bool
        Forwarded to ``split_and_treat_data``.
    save_scores : bool
        Write the collected scores as JSON into the ``scores/`` directory.
    sampling : {None, 'resample', 'smote_rnd'}
        Resampling applied to the training data. Any other value performs no
        resampling but is still recorded in the scores and the file name.
    size : int
        Target class size handed to the resampling function.
    may_val, min_val
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    dict
        ``{f'{model_name_without_spaces}_{selection}': scores_dict}`` with the
        scores from ``build_eval_model`` plus a ``'resampling'`` entry.
    """
    import os  # local import: only needed for path handling in this function

    # set selection string, e.g. 'files_for_lab/kbest_nums.csv' -> 'kbest'
    selection = os.path.basename(numericals_path).split('_nums')[0]
    if include_cats:
        selection += '_with_cats'

    # set X data (read once, not once per model)
    numericals = pd.read_csv(numericals_path)
    if include_cats:
        X = pd.concat([categoricals, numericals], axis=1)
    else:
        X = numericals

    X_train, X_test, y_train, y_test = split_and_treat_data(
        X, y,
        encode_cats=encode_cats, save_encoder=save_encoder,
        scale_nums=scale_nums, save_scaler=save_scaler)

    # evaluate sampling-flag
    if sampling == 'resample':
        X_train, X_test, y_train_final, y_test = resample_treated(X_train, X_test, y_train, y_test, size)
    elif sampling == 'smote_rnd':
        X_train, X_test, y_train_final, y_test = smote_rnd_treated(X_train, X_test, y_train, y_test, size)
    else:
        y_train_final = y_train

    # class sizes (labels 0 and 1) before/after resampling, computed once
    counts_before = y_train.value_counts()
    counts_after = y_train_final.value_counts()
    n0_before, n1_before = counts_before.get(0, 0), counts_before.get(1, 0)
    n0_after, n1_after = counts_after.get(0, 0), counts_after.get(1, 0)

    scores = {}
    for model_name, model in models:
        print('#############   Model |', model_name, '  #############   Data |', selection, '  #############\n')

        # create scores-dict and add a sampling info
        model_id = model_name.replace(' ','')
        identif = f'{model_id}_{selection}'
        scores[identif] = build_eval_model(X_train, X_test, y_train_final, y_test, model, save_model=save_models)
        scores[identif]['resampling'] = sampling

        # print log sampling
        if sampling:
            print(f'\t\tTraining data resampled from: {n0_before}/{n1_before} to {n0_after}/{n1_after}\n')
        else:
            print(f'\t\tTraining data sample sizes: {n0_before}/{n1_before}')

    # save if flag is set
    if save_scores:
        if sampling:
            file_name = f'scores/scores_{len(scores)}models_features_{selection}_{sampling}_{size}.json'
        else:
            file_name = f'scores/scores_{len(scores)}models_features_{selection}_{n0_before}_{n1_before}.json'

        # make sure the target directory exists before writing
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, 'w') as f:
            json.dump(scores, f, indent=4)

    return scores

In [40]:
# load the categorical features (forced to object dtype) and the target table
cats = pd.read_csv('files_for_lab/categorical.csv')
cats = cats.astype(object)
targets = pd.read_csv('files_for_lab/target.csv')

# the column used as target y in every run
y = targets['TARGET_B']

# one numericals CSV per feature-selection method
methods = ['kbest', 'rfe', 'var', 'pca', 'all']

# only the paths are built here; the files themselves are read later
num_paths = [f'files_for_lab/{method}_nums.csv' for method in methods]

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [47]:
# candidate classifiers as (display name, estimator) pairs
models = [
    ('Logistic Regression', LogisticRegression()),
    ('KNeighbors Classifier', KNeighborsClassifier(n_neighbors=5, n_jobs=-1)),
    ('Decision Tree Classifier Depth 5', DecisionTreeClassifier(max_depth=5)),
    (
        'Random Forest Classifier Depth 10',
        RandomForestClassifier(
            max_depth=10,
            min_samples_split=20,
            min_samples_leaf =20,
            max_samples=0.2,
            n_jobs=-1,
        ),
    ),
]

# every feature selection is evaluated twice: first with, then without the categoricals
# (other sizes / sampling methods can be explored by looping over them the same way)
for path in num_paths:
    for with_cats in (True, False):
        pipeline(cats, path, y, models, include_cats=with_cats, save_scores=True, sampling='smote_rnd', size=67970)

# 22 min

#############   Model | Logistic Regression   #############   Data | kbest_with_cats   #############

accuracy: 0.61791   precision: 0.06974   recall: 0.50797   f1: 0.12264 

       0     1
0  14102  8497
1    617   637		Training data resampled from: 67970/3589 to 67970/67970

#############   Model | KNeighbors Classifier   #############   Data | kbest_with_cats   #############

accuracy: 0.45919   precision: 0.05759   recall: 0.60447   f1: 0.10516 

       0      1
0  10195  12404
1    496    758		Training data resampled from: 67970/3589 to 67970/67970

#############   Model | Decision Tree Classifier Depth 5   #############   Data | kbest_with_cats   #############

accuracy: 0.84715   precision: 0.09458   recall: 0.22249   f1: 0.13273 

       0     1
0  19928  2671
1    975   279		Training data resampled from: 67970/3589 to 67970/67970

#############   Model | Random Forest Classifier Depth 10   #############   Data | kbest_with_cats   #############

accuracy: 0.88379   precision: 0

In [33]:
# single decision-tree run on the variance-selected features plus categoricals;
# the fitted model, encoder, scaler and the scores are all written to disk
models = [
    ('Decision Tree Classifier Depth 5', DecisionTreeClassifier(max_depth=5)),
]
path = 'files_for_lab/var_nums.csv'

pipeline(
    cats,
    path,
    y,
    models,
    include_cats=True,
    save_models=True,
    save_encoder=True,
    save_scaler=True,
    save_scores=True,
    sampling='smote_rnd',
    size=66666,
)

#############   Model | Decision Tree Classifier Depth 5   #############   Data | var_with_cats   #############

DecisionTreeClassifier(max_depth=5)
accuracy: 0.84182   precision: 0.07396   recall: 0.19052   f1: 0.10656 

       0     1
0  19855  2817
1    956   225		Training data resampled from: 67897/3662 to 66666/66666



{'DecisionTreeClassifierDepth5_var_with_cats': {'accuracy': 0.84182,
  'precision': 0.07396,
  'recall': 0.19052,
  'f1': 0.10656,
  'false_positives': 2817.0,
  'true_positives': 225.0,
  'resampling': 'smote_rnd'}}

In [45]:
# single logistic-regression run on the variance-selected numericals only;
# the fitted model, encoder, scaler and the scores are all written to disk
models = [
    ('Logistic Regression', LogisticRegression()),
]
path = 'files_for_lab/var_nums.csv'

pipeline(
    cats,
    path,
    y,
    models,
    include_cats=False,
    save_models=True,
    save_encoder=True,
    save_scaler=True,
    save_scores=True,
    sampling='smote_rnd',
    size=67776,
)

#############   Model | Logistic Regression   #############   Data | var   #############

LogisticRegression()
accuracy: 0.61929   precision: 0.07169   recall: 0.51618   f1: 0.1259 

       0     1
0  14118  8468
1    613   654		Training data resampled from: 67983/3576 to 67776/67776



{'LogisticRegression_var': {'accuracy': 0.61929,
  'precision': 0.07169,
  'recall': 0.51618,
  'f1': 0.1259,
  'false_positives': 8468.0,
  'true_positives': 654.0,
  'resampling': 'smote_rnd'}}