In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style 
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from datetime import datetime

# Load the training data from train.csv (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="train.csv")

<h4>Create Days Since connected Column</h4>

In [2]:
# Parse the connection dates using the day/month/two-digit-year format.
date_col = pd.to_datetime(df["Connect_Date"], format='%d/%m/%y')
# Whole days elapsed between each connection date and the moment this cell runs.
# The reference point is the current timestamp, so the feature value depends on
# the day the notebook is executed.
df["Days_since_connected"] = (pd.Timestamp.today() - date_col).dt.days
# The raw date column is replaced by the numeric feature computed above.
df.drop(["Connect_Date"], axis=1, inplace=True)

In [3]:
# Drop the "id" column from the frame
df.drop(columns=["id"], inplace=True)

<h4>Create continuous and categorical columns</h4>

In [4]:
# Numeric feature columns, selected through the public select_dtypes API.
# "number" leaves out bool columns, so a column listed here cannot also be
# picked up by a dtype-based object/bool/category selection.
# The label column 'target' is not a feature and is filtered out.
continuous = [
    col for col in df.select_dtypes(include="number").columns
    if col != 'target'
]

In [5]:
# Names of the columns whose dtype is object, bool or category
categorical = list(df.select_dtypes(include=['object', 'bool', 'category']).columns)

<h4> Create test and training sets</h4>

In [6]:
# Separate the feature matrix from the label column
X = df.drop(columns=['target'])
y = df['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

<h4>Column Transformer</h4>

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Keep copies of the untransformed splits before X_train / X_test are overwritten below
X_train_raw, X_test_raw = X_train.copy(), X_test.copy()

# Continuous columns: fill missing values with the median, then standardize
cts_pipe = Pipeline(steps=[
    ('ImputeContinuous', SimpleImputer(strategy="median")),
    ('StandardScaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode (categories not seen during fit are ignored at transform time)
cat_pipe = Pipeline(steps=[
    ('ImputeCategorical', SimpleImputer(strategy="most_frequent")),
    ('OneHot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline
t = ColumnTransformer(transformers=[
    ("cts", cts_pipe, continuous),
    ("cat", cat_pipe, categorical),
])

# Fit on the training split only, then apply the fitted transformer to the test split
X_train = t.fit_transform(X_train)
X_test = t.transform(X_test)




<h4>Function to compute top 20 Metric</h4>

In [8]:
def avgCostSum(prediction, y_testSet, X_testSet, top_n=20, cost_col="average cost min"):
    """Sum a cost column over the true positives among the highest-scored rows.

    Rows are ranked by predicted probability (highest first), the first
    ``top_n`` are kept, and ``cost_col`` is summed over those kept rows whose
    label equals 1.

    Rows are matched by position, not by index label, so ``prediction``,
    ``y_testSet`` and ``X_testSet`` must share the same row order. Index labels
    are never consulted and may contain duplicates.

    Parameters
    ----------
    prediction : array-like
        Predicted probability of the positive class, one value per row.
    y_testSet : pandas.Series or array-like
        True labels, one per row.
    X_testSet : pandas.DataFrame
        Unstandardized feature frame; must contain ``cost_col``.
    top_n : int, default 20
        Number of highest-scored rows to consider.
    cost_col : str, default "average cost min"
        Name of the column in ``X_testSet`` whose values are summed.

    Returns
    -------
    number
        Sum of ``cost_col`` over the selected rows with label 1. NaN costs are
        skipped; the result is 0 when no selected row has label 1.
    """
    # Positions of the rows in descending order of predicted probability,
    # truncated to the requested number of rows.
    ranked = np.argsort(np.asarray(prediction))[::-1][:top_n]
    # Pull cost and label by position so that index labels play no role.
    costs = X_testSet[cost_col].to_numpy()[ranked]
    is_positive = np.asarray(y_testSet)[ranked] == 1
    # nansum skips missing costs instead of propagating NaN.
    return np.nansum(costs[is_positive])



<h4>Comparing Models</h4>

In [13]:
#My version test
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Candidate classifiers, as (label, estimator) pairs.
# The random forest and gradient boosting models are seeded (same value as the
# train/test split) so that a fresh Restart & Run All reproduces their scores.
models = [
    ('LRW', LogisticRegression(solver='lbfgs', max_iter=1000, class_weight="balanced")),
    ('LR', LogisticRegression(solver='lbfgs', max_iter=1000)),
    ('RF', RandomForestClassifier(random_state=123)),
    ('KNN', KNeighborsClassifier()),
    ('GB', GradientBoostingClassifier(random_state=123)),
]

results = []  # test-set AUC per model, in the same order as `names`
names = []
print("---------AUC----------Top20")
# Not read anywhere in this cell; kept because a later cell may rely on it.
scoring = 'roc_auc'
for name, model in models:
    model.fit(X_train, y_train)
    # Predicted probability of the positive class (second column of predict_proba)
    y_pred = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred)
    # The top-20 metric reads the cost column from the untransformed test frame
    top20 = avgCostSum(y_pred, y_test, X_test_raw)
    results.append(auc)
    names.append(name)

    msg = "%s:    %f    %f" % (name, auc, top20)
    print(msg)

---------AUC----------Top20
LRW:    0.930947    3.849698
LR:    0.923934    3.592699
RF:    0.922640    2.739427
KNN:    0.734829    3.372555
GB:    0.937690    2.633399


<h4>Same thing using k-fold cross-validation (omitted to save time)</h4>