## Functions

In [23]:
#! pip install nbimporter
import nbimporter
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, recall_score, precision_score, auc, roc_auc_score, precision_recall_curve, confusion_matrix, roc_curve, plot_roc_curve
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV, KFold, cross_val_score
import nbimporter
#For upsampling data
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
from tqdm import tqdm
#!pip install pickle-mixin
import pickle

### Preprocessing

In [24]:
def adapt_writing(df):
    '''Lowercase all text values and normalise the column labels of a dataframe.

    Every column of dtype object is lowercased through the pandas string accessor. Afterwards the
    column labels are lowercased and each space or hyphen in them is replaced by an underscore.
    The dataframe is modified in place and the same object is returned.
    Input and output: Dataframe '''
    #lowercase the values of text columns to avoid label differences due to capitalization
    text_columns = df.select_dtypes('object').columns
    for text_col in text_columns:
        df[text_col] = df[text_col].str.lower()
    #translation table: whitespace and hyphen both become an underscore
    separators = str.maketrans({' ': '_', '-': '_'})
    #build the cleaned labels and assign them to the dataframe in one step
    df.columns = [label.translate(separators).lower() for label in df.columns]
    return df

In [25]:
#Function for all prior preprocessing steps to be applied to validation and test data
def adapt_writing_test(df):
    '''Apply all prior preprocessing steps to validation or test data.

    The dataframe is modified in place and also returned. Steps:
    1. Normalise text values and column labels via adapt_writing.
    2. Drop stammnummer and anruf_id if no (stammnummer, anruf_id) pair occurs more than once;
       otherwise print the rows with a repeated pair and keep both columns.
    3. Create letzte_kampagne from ergebnis_letzte_kampagne, set it to 'kein kontakt' where
       anzahl_kontakte_letzte_kampagne is 0, then drop both source columns.
    4. Fill missing values of tage_seit_letzter_kampagne with 0.
    5. Collapse art_der_anstellung into the coarser feature erwerbstaetigkeit and drop the original.
    6. Replace 'unbekannt' and missing values in schulabschluß, kontaktart and letzte_kampagne
       by the mode of the respective column.

    All row-wise logic uses boolean masks, so the function does not depend on the dataframe
    having a 0..n-1 index.
    Input and output: Dataframe'''
    #Remove whitespace, special characters, make everything lowercase
    adapt_writing(df)

    #If there are no duplicated id pairs, remove stammnummer and anruf id from df
    id_cols = ['stammnummer', 'anruf_id']
    duplicated_ids = df.duplicated(subset = id_cols)
    if not duplicated_ids.any():
        df.drop(id_cols, axis = 1, inplace = True)
    else:
        #print the rows that triggered the check (same subset as the condition above)
        print(df[duplicated_ids])

    #Compute new feature
    df['letzte_kampagne'] = df['ergebnis_letzte_kampagne']
    #If there was no prior contact, letzte Kampagne is set to 'kein kontakt'
    no_prior_contact = df['anzahl_kontakte_letzte_kampagne'] == 0
    df.loc[no_prior_contact, 'letzte_kampagne'] = 'kein kontakt'
    #drop the original features
    df.drop(['anzahl_kontakte_letzte_kampagne', 'ergebnis_letzte_kampagne'], axis = 1, inplace = True)

    #Fill in all NaNs of tage_seit_letzter_kampagne with 0.
    #Assigned back instead of an inplace call on the selected column, which may act on a copy.
    df['tage_seit_letzter_kampagne'] = df['tage_seit_letzter_kampagne'].fillna(0)

    #Reduce number of classes in art_der_anstellung; every label not listed here counts as 'angestellt'
    employment_groups = {
        'hausfrau': 'nicht erwerbstätig',
        'arbeitslos': 'nicht erwerbstätig',
        'rentner': 'nicht erwerbstätig',
        'student': 'student',
        'selbständig': 'selbständig',
        'gründer': 'selbständig',
    }
    df['erwerbstaetigkeit'] = df['art_der_anstellung'].map(employment_groups).fillna('angestellt')
    #drop original feature
    df.drop('art_der_anstellung', axis = 1, inplace = True)

    #missings/ unbekannt labels are replaced by the column's mode
    #NOTE(review): the mode is taken from the dataframe passed in, not from the training data - confirm this is intended
    for col in ['schulabschluß', 'kontaktart', 'letzte_kampagne']:
        cleaned = df[col].replace('unbekannt', np.nan)
        modes = cleaned.mode()
        #mode() is empty if the column holds no valid value at all; then nothing can be imputed
        if not modes.empty:
            cleaned = cleaned.fillna(modes.iloc[0])
        df[col] = cleaned

    return df

### Modelling

In [26]:
#Preprocessing: Rescale numeric data, encode categorical data and make a list with feature names (needed for feature importance)
def preprocessing(X_train, X_val, y_train, y_val, train_only = False):
    '''Rescale numeric features, one-hot encode categorical features and oversample the training data.

    The transformer is fitted on the training features only and then applied to the training and,
    optionally, the validation features. Targets are dummy-encoded with the first level dropped.
    The transformed training data is additionally upsampled using SMOTENC.

    Input:
    X_train: training features as pd.DataFrame
    y_train: training target as pd.DataFrame
    X_val: validation features as pd.DataFrame (only used if train_only is False)
    y_val: validation target as pd.DataFrame (only used if train_only is False)
    train_only: Bool, if False, training and validation are to be transformed, if True, only training data is transformed

    Output:
    train_only = True:  X_sm, y_sm, feature_names
    train_only = False: X_sm, X_val, y_sm, y_val, feature_names
    X and y are returned as np.array. feature_names is a list in the column order of the transformed
    features: numeric columns, encoded categorical columns, then columns that were passed through.'''

    #List of categorical features
    cat = list(X_train.select_dtypes('object'))

    #numerical features
    num = list(X_train.select_dtypes('number'))

    #columns that are neither numeric nor object are handed through unchanged (remainder = 'passthrough')
    passthrough = [col for col in X_train.columns if col not in num and col not in cat]

    #Pipeline for feature transformation: rescale numerical features, numerically encode categorical features
    #sparse_threshold = 0 forces a dense result so that it can be wrapped in a DataFrame
    preprocessor = ColumnTransformer([
        ('scale_numceric', RobustScaler(), num),
        ('encode_cat', OneHotEncoder(drop = 'first'), cat)],
        remainder = 'passthrough',
        sparse_threshold = 0)

    #fit to train data
    preprocessor.fit(X_train)

    #transform train data
    X_train = pd.DataFrame(preprocessor.transform(X_train))
    y_train = pd.get_dummies(y_train, drop_first = True)

    #get feature names; get_feature_names was replaced by get_feature_names_out in newer scikit-learn versions
    encoder = preprocessor.named_transformers_['encode_cat']
    if hasattr(encoder, 'get_feature_names_out'):
        encoded_names = list(encoder.get_feature_names_out(cat))
    else:
        encoded_names = list(encoder.get_feature_names(cat))
    feature_names = num + encoded_names + passthrough

    #Indices of the categorical data in the TRANSFORMED matrix: the transformer puts the scaled numeric
    #columns first and the dummy columns directly after them, so positions in the original dataframe no longer apply
    first_encoded = len(num)
    cat_indices = list(range(first_encoded, first_encoded + len(encoded_names)))

    #Oversampling train data via smotenc
    #NOTE(review): every dummy column is declared categorical on its own; SMOTENC is not told which dummies
    #belong to the same original feature - confirm this is acceptable
    sampler = SMOTENC(categorical_features = cat_indices, random_state = 42, sampling_strategy = 'minority')
    X_sm, y_sm = sampler.fit_resample(X_train, y_train)

    if train_only:
        return np.array(X_sm), np.array(y_sm), feature_names

    #transform validation data
    X_val = pd.DataFrame(preprocessor.transform(X_val))
    y_val = pd.get_dummies(y_val, drop_first = True)
    return np.array(X_sm), np.array(X_val), np.array(y_sm), np.array(y_val), feature_names

In [27]:
def plot_roc_curve(y_train, y_proba_train, y_test, y_proba):
    '''Print ROC AUC scores and plot the ROC curves of training and validation data.

    Note: this definition replaces the plot_roc_curve name imported from sklearn.metrics at the top of the file.

    Input:
    y_train: true labels of the training data
    y_proba_train: predicted scores of the positive class for the training data
    y_test: true labels of the validation data
    y_proba: predicted scores of the positive class for the validation data
    Output: None, the scores are printed and the figure is shown'''
    #reference score: a constant prediction for every validation sample
    baseline = [1 for _ in range(len(y_test))]
    print(f'Baseline ROC AUC: {roc_auc_score(y_test, baseline)}')
    print(f'Train ROC AUC Score: {roc_auc_score(y_train, y_proba_train)}')
    print(f'Validation ROC AUC  Score: {roc_auc_score(y_test, y_proba)}')

    #only the arguments are used here, no variables from the notebook namespace
    FPR, TPR, Thresholds = roc_curve(y_test, y_proba)
    FPRt, TPRt, Thresholdst = roc_curve(y_train, y_proba_train)

    plt.plot(FPR, TPR,'b-',label = 'validation')
    plt.plot(FPRt, TPRt,'r-',label = 'train')
    #diagonal = random guessing, upper left corner = perfect classifier
    plt.plot([0,1],[0,1],'k--', label = 'random')
    plt.plot([0,0,1,1],[0,1,1,1],'g--',label = 'perfect')
    plt.legend()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

In [28]:
#Precision and recall as a function of the decision threshold
def define_thresholds(y_test, y_proba):
    '''Plot precision and recall against the decision threshold to support choosing a threshold.

    Input:
    y_test: true labels
    y_proba: predicted probability of the positive class, e.g. the second column of a model's predict_proba output
    Output: None, the curves are drawn on the current matplotlib figure'''
    #use the labels passed in, not a variable from the notebook namespace
    precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
    plt.title("Precision-Recall vs Threshold Chart")
    #precision and recall are cut by their last entry to line up with thresholds
    plt.plot(thresholds, precision[: -1], "b--", label = "Precision")
    plt.plot(thresholds, recall[: -1], "r--", label = "Recall")
    plt.ylabel("Precision, Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="lower left")
    plt.ylim([0,1])

In [29]:
#Feature Importance
#Since it is biased towards continuous or high-cardinality features, I do not use sklearn's feature importance but use a drop-column-approach.
#This is computationally more expensive but gives better results.
def dc_importance(model, X_train, y_train, X_val, y_val, feature_names):
    '''Get feature importance using a drop-column-approach: Importance is calculated based on score changes after removing the feature.

    The model is fitted once on all features to get a benchmark ROC AUC on the validation data. Then, for every
    column, an unfitted copy of the model is trained without that column; the importance of the column is
    benchmark score minus the score without it. The model passed in stays fitted on all features.

    Input:
    model: estimator with fit and predict_proba that can be copied with sklearn's clone
    X_train, X_val: features as pd.DataFrame or array with the same column order
    y_train, y_val: targets
    feature_names: list with one name per column of X_train, in column order
    Output: pd.DataFrame with the columns feature and importance, sorted by importance in descending order
    Raises: ValueError if the number of feature names differs from the number of columns'''
    #wrap once so that columns can be dropped by label, also for arrays
    train_df = pd.DataFrame(X_train)
    val_df = pd.DataFrame(X_val)
    if len(feature_names) != train_df.shape[1]:
        raise ValueError(f'Got {len(feature_names)} feature names for {train_df.shape[1]} columns')

    #Fit model with all features
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_val)
    benchmark_score = roc_auc_score(y_val, y_proba[:,1])

    importances = []
    for col in tqdm(train_df.columns):
        #fit a fresh copy so that the model passed in is not overwritten by a fit on reduced data
        reduced_model = clone(model)
        reduced_model.fit(train_df.drop(col, axis = 1), y_train)
        y_proba_d = reduced_model.predict_proba(val_df.drop(col, axis = 1))
        drop_score = roc_auc_score(y_val, y_proba_d[:,1])
        importances.append(benchmark_score - drop_score)

    #report the drop-column importances computed above
    importances_df = pd.DataFrame({'feature': list(feature_names), 'importance': importances})
    return importances_df.sort_values('importance', ascending = False)

In [30]:
#Precision recall curve
def prec_recall(y, y_proba):
    '''Draw precision against recall on the current axes and show the figure.

    Input:
    y: true labels, passed to precision_recall_curve
    y_proba: predicted scores, passed to precision_recall_curve
    Output: None'''
    #the thresholds returned as third element are not needed for this plot
    prec_values, rec_values, _ = precision_recall_curve(y, y_proba)
    axes = plt.gca()
    axes.plot(rec_values, prec_values)
    axes.set_xlabel('Recall')
    axes.set_ylabel('Precision')
    plt.show()