In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style 
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from datetime import datetime

# Load the raw training data (path is relative to the notebook's working directory)
df = pd.read_csv("train.csv")

<h4>Create Days Since Connected Column</h4>

In [2]:
# Parse the connection dates, which are stored as day/month/two-digit-year strings.
date_col = pd.to_datetime(df["Connect_Date"], format='%d/%m/%y')

# Tenure in whole days, measured from the moment the notebook is run.
# NOTE: because the reference point is "today", this feature (and everything
# trained on it) shifts by one for every day between runs.
df["Days_since_connected"] = (pd.Timestamp.today() - date_col).dt.days

# The raw date string is no longer needed once the numeric feature exists.
df = df.drop(columns=["Connect_Date"])

In [3]:
# The row identifier carries no predictive signal, so drop it from the frame
df.drop(columns="id", inplace=True)

<h4>Create continuous and categorical columns</h4>

In [4]:
# Numeric feature columns, excluding the label.
# select_dtypes is the public API for this; unlike the private
# _get_numeric_data it does not pick up bool columns, which are collected by
# the categorical list instead, so no column ends up in both groups.
continuous = [
    col
    for col in df.select_dtypes(include="number").columns
    if col != 'target'
]

In [5]:
# Columns stored as strings, booleans or pandas categoricals
categorical = list(
    df.select_dtypes(include=['object', 'bool', 'category']).columns
)

<h4> Create test and training sets</h4>

In [6]:
# Separate the features from the label column
X = df.drop(columns='target')
y = df['target']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



<h4>Column Transformer</h4>

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA



# IDEA TO TRY: train only on rows above a certain cost threshold
#ind = X_train["average cost min"] >= 0.134453
#X_train = X_train[ind]
#y_train = y_train[ind]

# Keep untransformed copies: the top-20 metric needs the original
# "average cost min" values, which the pipeline below replaces.
X_train_raw, X_test_raw = X_train.copy(), X_test.copy()

# Continuous columns: fill gaps with the median, then standardize
cts_pipe = Pipeline(steps=[
    ('ImputeContinuous', SimpleImputer(strategy="median")),
    ('StandardScaler', StandardScaler()),
])

# Categorical columns: one-hot encode; categories unseen during fit are ignored
cat_pipe = Pipeline(steps=[
    ('OneHot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own sub-pipeline
t = ColumnTransformer(transformers=[
    ("cts", cts_pipe, continuous),
    ("cat", cat_pipe, categorical),
])

# All transformations. Steps that apply to both continuous and categorical
# columns (e.g. PCA) would be appended here.
final_pipeline = Pipeline(steps=[
    ('columns', t),
    #('PCA', PCA())
])

# Fit on the training rows only, then apply the fitted transform to the test rows
X_train = final_pipeline.fit_transform(X_train)
X_test = final_pipeline.transform(X_test)







<h4>Function to compute top 20 Metric</h4>

In [8]:
def avgCostSum(prediction, y_testSet, X_testSet, top_n=20, cost_col="average cost min"):
    """Sum the cost column over the true positives among the top-ranked rows.

    Rows are ranked by ``prediction`` in descending order, the first ``top_n``
    are kept, and ``cost_col`` is summed over those kept rows whose label
    equals 1.

    The three inputs are matched by position (row i of each belongs together),
    so the result does not depend on index labels being unique, nor on
    ``y_testSet`` being a Series with a particular name.

    Parameters
    ----------
    prediction : array-like
        Predicted probability (or any score) per row; higher ranks first.
    y_testSet : array-like
        True labels, in the same row order as ``prediction``.
    X_testSet : pandas.DataFrame
        Unstandardized feature frame containing ``cost_col``, in the same
        row order as ``prediction``.
    top_n : int, optional
        Number of highest-scoring rows to consider (default 20).
    cost_col : str, optional
        Name of the column to sum (default "average cost min").

    Returns
    -------
    float
        Sum of ``cost_col`` over the selected rows; missing values are
        skipped and the result is 0.0 when no row qualifies.

    Raises
    ------
    ValueError
        If the three inputs do not have the same number of rows.
    """
    scores = np.asarray(prediction).ravel()
    labels = np.asarray(y_testSet).ravel()
    costs = np.asarray(X_testSet[cost_col], dtype=float)

    if not (len(scores) == len(labels) == len(costs)):
        raise ValueError(
            "prediction, y_testSet and X_testSet must have the same number of rows "
            "(got %d, %d and %d)" % (len(scores), len(labels), len(costs))
        )

    # Positions of the highest scores first, truncated to the requested count
    top = np.argsort(scores)[::-1][:top_n]

    # Only rows that really are positives contribute to the metric
    hits = labels[top] == 1

    # nansum keeps the skip-missing behaviour of a pandas Series sum
    return np.nansum(costs[top][hits])



<h4>Comparing Models</h4>

In [9]:
#My version test
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Candidate classifiers, keyed by a fixed-width label used in the printed table.
# The tree ensembles are seeded so the metrics below are repeatable across runs.
models = [
    ('LR  ', LogisticRegression(solver='lbfgs', max_iter=1000)),
    ('LRW ', LogisticRegression(solver='lbfgs', max_iter=1000, class_weight="balanced")),
    ('RF  ', RandomForestClassifier(random_state=42)),
    ('KNN ', KNeighborsClassifier()),
    ('GB  ', GradientBoostingClassifier(random_state=42)),
    ('XGB ', XGBClassifier(random_state=42)),
    ('XGBW', XGBClassifier(scale_pos_weight=5, random_state=42)),
]

results = []
names = []
print("---------AUC----------Top20------Accuracy----Precision----Recall")
for name, model in models:
    model.fit(X_train, y_train)

    # Probability of the positive class for every test row
    y_pred = model.predict_proba(X_test)[:, 1]
    # Hard labels at the 0.5 threshold (a probability of exactly 0.5 maps to 0)
    cat_preds = (y_pred > 0.5).astype(int)

    auc = roc_auc_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, cat_preds)
    precision = precision_score(y_test, cat_preds)
    recall = recall_score(y_test, cat_preds)
    # The top-20 metric needs the untransformed test frame for the cost column
    top20 = avgCostSum(y_pred, y_test, X_test_raw)

    results.append(auc)
    names.append(name)

    msg = "%s:    %f    %f    %f    %f    %f" % (name, auc, top20, accuracy, precision, recall)
    print(msg)

---------AUC----------Top20------Accuracy----Precision----Recall
LR  :    0.917071    3.264576    0.921705    0.756944    0.712418
LRW :    0.921821    3.161221    0.913776    0.679348    0.816993
RF  :    0.918900    2.652866    0.915758    0.746377    0.673203
KNN :    0.747240    3.060730    0.863231    0.641509    0.222222
GB  :    0.935870    2.428153    0.915758    0.739437    0.686275
XGB :    0.940115    2.429886    0.920714    0.748299    0.718954
XGBW:    0.940375    2.190504    0.921705    0.734177    0.758170


<h4>Try Oversampling</h4>

In [10]:
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE

# Synthesize minority-class rows with SMOTE using a 0.5 sampling strategy.
# Only the training data is resampled; the test set is left untouched.
oversample = SMOTE(
    sampling_strategy=0.5,
    random_state=42,
)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [11]:
# Re-fit every candidate on the oversampled training data and report the
# same metrics as before against the unchanged test set.
results, names = [], []
print("---------AUC----------Top20------Accuracy----Precision----Recall")
for name, model in models:
    model.fit(X_train, y_train)

    # Positive-class probabilities, then hard labels by rounding each one
    y_pred = model.predict_proba(X_test)[:, 1]
    cat_preds = list(map(round, y_pred))

    auc = roc_auc_score(y_test, y_pred)
    top20 = avgCostSum(y_pred, y_test, X_test_raw)
    accuracy = accuracy_score(y_test, cat_preds)
    precision = precision_score(y_test, cat_preds)
    recall = recall_score(y_test, cat_preds)

    names.append(name)
    results.append(auc)

    msg = "%s:    %f    %f    %f    %f    %f" % (name, auc, top20, accuracy, precision, recall)
    print(msg)

---------AUC----------Top20------Accuracy----Precision----Recall
LR  :    0.922073    3.371483    0.922696    0.730061    0.777778
LRW :    0.923340    3.371483    0.914767    0.687151    0.803922
RF  :    0.920912    2.381582    0.921705    0.750000    0.725490
KNN :    0.760728    3.125176    0.802775    0.392523    0.549020
GB  :    0.934969    2.616514    0.922696    0.741935    0.751634
XGB :    0.936511    2.609524    0.917740    0.743056    0.699346
XGBW:    0.935213    2.435331    0.927651    0.743902    0.797386
