# Test over many different random samples. Used for hyperparameter tuning and comparing models.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
import math

In [2]:
# Function that calculates the sum of avg_cost_min with the highest predicted probabilities
# prediction is the predicted probabilities from the model.
# The argument testSet should be the unstandardized testset

def avgCostSum(prediction, y_testSet, X_testSet):
    #Get an array of sorted predictions in descending order
    index = np.argsort(prediction)[::-1]
    #Select the sorted avg_cost_min column from the unstandardized dataframe, then get the 20 highest
    avg_cost_min = X_testSet.iloc[index,].join(y_testSet.iloc[index])
    avg_cost_min = avg_cost_min[["average cost min", "target"]][0:20]
    #Return the sum of the top 20 for those that were correctly predicted
    return avg_cost_min[avg_cost_min["target"]==1]["average cost min"].sum()

In [3]:
df = pd.read_csv("../datasets/assignment1_train.csv")

In [4]:
# We can drop the column id: it is unique to each customer and has no predictive value
assert len(df.id.unique()) == len(df)
df_clean = df.drop(["id"], axis=1)

# Convert Connect date into number of days
date_col = pd.to_datetime(df_clean["Connect_Date"], format = '%d/%m/%y')
date_col.astype('int64')
df_clean["today"] = pd.Timestamp.today()
df_clean["Days_since_connected"] = df_clean["today"]-date_col
df_clean["Days_since_connected"] = df_clean["Days_since_connected"].dt.days
df_clean.drop(["today", "Connect_Date"], axis=1, inplace=True)

# Set the target as categorical
#df_clean['target'] = df_clean['target'].map({0:'retained', 1:'churned'}).astype('object')



In [5]:
y = df_clean['target']
X = df_clean.drop(['target'], axis=1, inplace=False)

continuous = df_clean._get_numeric_data().columns.drop('target').tolist()
categorical = df_clean.select_dtypes(include=['object', 'bool', 'category']).columns.tolist()


## The Loop

In [40]:
results=[]

for i in range(10):
    print("iteration", i)
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=4*i)
    
    
    # Save untransformed data for later
    X_train_raw = X_train.copy()
    X_test_raw = X_test.copy()
    
    # Continuous Transformations
    cts_pipe = Pipeline([
        ('ImputeContinuous', SimpleImputer(strategy="median")),
        #('StandardScaler', StandardScaler())
        ])
    
    # Categorical Transformations
    cat_pipe = Pipeline([
        ('OneHot', OneHotEncoder(handle_unknown='ignore'))
        ])
    
    # Apply to columns
    t = ColumnTransformer(
        [
        ("cts", cts_pipe, continuous),
        ("cat", cat_pipe, categorical)
        ])
    
    # All trasnformations. Add any here that would apply to both continuous and categorical
    final_pipeline = Pipeline([
        ('columns', t),
        #('PCA', PCA())
        ])
    
    
    X_train = final_pipeline.fit_transform(X_train)
    X_test= final_pipeline.transform(X_test)
    
    
    
    
    #maximum possible value for top 20 score
    x = X_test_raw.join(y_test)
    #Save the top 20 threshhold for later
    top_20_thresh = (x[x["target"] ==1 ][["average cost min", "target"]].sort_values(by = "average cost min", ascending = False)[0:20].min().iloc[0])
    #Save the sum of the top 20
    x = x[x["target"] ==1 ][["average cost min", "target"]].sort_values(by = "average cost min", ascending = False)[0:20].sum()
    max_target=(x.iloc[0])
    print("Actual thresh:", top_20_thresh)
    
    
    #######
    percentile = 90
    avg_cost_thresh = np.percentile(X_train_raw["average cost min"], percentile)
    print("predicted thresh:", avg_cost_thresh)
   
    
    ###############
    high_model_weight = 0.8
    reg_model_weight = 1-high_model_weight
    y_train_high = y_train*(X_train_raw["average cost min"]>=avg_cost_thresh)
    
    #################
    
    #combining the above with oversampling improves top 20 score
    oversample = SMOTE(random_state=42)
    X_train_high, y_train_high = oversample.fit_resample(X_train, y_train_high)
    X_train, y_train = oversample.fit_resample(X_train, y_train)
    
    
    
    
    #restrict to fewer models for faster testing
    models = []
    models.append(('RFW ', RandomForestClassifier(class_weight="balanced",
                                                  n_estimators=50,
                                                  max_features="sqrt",
                                                 criterion="entropy",
                                                  random_state=42)))

    models.append(('XGBW', XGBClassifier(scale_pos_weight=6, learning_rate=0.3)))   
    
    models.append(('catboost', CatBoostClassifier(iterations=100, 
                                        learning_rate=0.2,  
                                        depth=6, 
                                        loss_function='Logloss', 
                                        eval_metric='AUC',  
                                        random_state=42, verbose=False, allow_writing_files=False)))
    
    
    
    for name, model in models:
        model.fit(X_train, y_train)  # Fit model to regular target
        y_pred_reg = model.predict_proba(X_test)[:,1]
        model.fit(X_train_high, y_train_high)  #Fit model to 
        y_pred_high = model.predict_proba(X_test)[:,1]
        y_pred = reg_model_weight*y_pred_reg + high_model_weight*y_pred_high
        auc = roc_auc_score(y_test, y_pred)
        top20 = avgCostSum(y_pred, y_test, X_test_raw)
        results.append(
        [name, auc, top20,top20/max_target])


resdf = pd.DataFrame(results)
resdf.columns = ['name', 'auc', 'top20' ,'%oftop20']
resdf.groupby('name')[["auc","top20","%oftop20"]].mean()

iteration 0
Actual thresh: 0.217383
predicted thresh_high: 0.32293410000000006
predicted thresh_med: 0.18559240000000002
iteration 1
Actual thresh: 0.233411
predicted thresh_high: 0.32506285000000007
predicted thresh_med: 0.1858592
iteration 2
Actual thresh: 0.250189
predicted thresh_high: 0.3514979000000006
predicted thresh_med: 0.18635300000000002
iteration 3
Actual thresh: 0.222783
predicted thresh_high: 0.3349249
predicted thresh_med: 0.1858592
iteration 4
Actual thresh: 0.231511
predicted thresh_high: 0.33462590000000003
predicted thresh_med: 0.1860228


Unnamed: 0_level_0,auc,top20,%oftop20
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RFW,0.637625,5.824797,0.769553
XGBW,0.774897,5.403261,0.714785
catboost,0.761704,5.592907,0.735807


<h4> Things that were tested:</h4>

- Standardization: models perform better without standardizing
- Standardizing + using PCA: Performs much worse
- Adjusting the percentile: 90 works very well
- Changing the weight of the high-value model (`high_model_weight`): couldn't find anything that works better than 0.8
- RFW, XGB, and catboost all perform similarly.
- Hard to find anything that pushes the %oftop20 above 0.7 on average