In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style 
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from datetime import datetime

#Get the data
df = pd.read_csv("train.csv")

<h4>Create Days Since connected Column</h4>

In [2]:
date_col = pd.to_datetime(df["Connect_Date"], format = '%d/%m/%y')
date_col.astype('int64')
df["today"] = pd.Timestamp.today()
df["Days_since_connected"] = df["today"]-date_col
df["Days_since_connected"] = df["Days_since_connected"].dt.days
df.drop(["today", "Connect_Date"], axis=1, inplace=True)

In [3]:
#Remove id
df.drop(["id"], axis=1, inplace=True)

<h4>Create continuous and categorical columns</h4>

In [4]:
continuous=df._get_numeric_data().columns.tolist()
continuous.remove('target')

In [5]:
categorical=df.select_dtypes(include=['object', 'bool', 'category']).columns.tolist()

<h4> Create test and training sets</h4>

In [6]:
y=df['target']
X = df.drop(['target'], axis=1, inplace=False)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)


<h4> Preprocessing Pipeline

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA


#Something Worked!!! This leads to significant improvement in top 20 metric
#This creates a weigting array that assigns much higher weight 
#to the instances that churn and have average cost min higher than a certain
#threshold. sample_weights are the used to fit the model.
#The threshold 0.179141 is the top quartile of average cost min
#750 is just arbitraray and can be tuned.
ind1 = np.array(X_train["average cost min"] >= 0.179141)
ind2 = np.array(y_train)
ind = ind1*ind2
sample_weights = (ind*750)+1




#Save untransformed data for later
X_train_raw = X_train.copy()
X_test_raw = X_test.copy()

#Cotinuous Transformations
cts_pipe = Pipeline([
    ('ImputeContinuous', SimpleImputer(strategy="median")),
    ('StandardScaler', StandardScaler())
    ])

#Categorical Transformations
cat_pipe = Pipeline([
    ('OneHot', OneHotEncoder(handle_unknown='ignore'))
    ])

#Apply to columns
t = ColumnTransformer(
    [
    ("cts", cts_pipe, continuous),
    ("cat", cat_pipe, categorical)
    ])

# All trasnformations. Add any here that would apply to both continuous and categorical
final_pipeline = Pipeline([
    ('columns', t),
    #('PCA', PCA())
    ])


X_train = final_pipeline.fit_transform(X_train)
X_test= final_pipeline.transform(X_test)







<h4>Function to compute top 20 Metric</h4>

In [8]:
#Function that calculates the sum of avg_cost_min with the highest predicted probabilities
#prediction is the predicted probabilities from the model.
#The argument testSet should be the unstandardized testset
def avgCostSum(prediction, y_testSet, X_testSet):
    #Get an array of sorted predictions in descending order
    index = np.argsort(prediction)[::-1]
    #Select the sorted avg_cost_min column from the unstandardized dataframe, then get the 20 highest
    avg_cost_min = X_testSet.iloc[index,].join(y_testSet.iloc[index])
    avg_cost_min = avg_cost_min[["average cost min", "target"]][0:20]
    #Return the sum of the top 20 for those that were correctly predicted
    return avg_cost_min[avg_cost_min["target"]==1]["average cost min"].sum()



<h4> Comparing Models

In [29]:
#My version test
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

models = []
models.append(('LR  ', LogisticRegression(solver='lbfgs', max_iter=1000)))
models.append(('LRW ', LogisticRegression(solver='lbfgs', max_iter=1000, class_weight="balanced")))
models.append(('RF  ', RandomForestClassifier()))
models.append(('RFW ', RandomForestClassifier(class_weight="balanced")))
models.append(('DT  ', DecisionTreeClassifier(class_weight="balanced")))
models.append(('DTW ', DecisionTreeClassifier(class_weight="balanced")))
models.append(('GB  ', GradientBoostingClassifier(n_estimators = 100, learning_rate =0.2, max_depth=3, random_state=42)))
models.append(('XGB ', XGBClassifier()))
models.append(('XGBW', XGBClassifier(scale_pos_weight=6)))
models.append(('GNB', GaussianNB()))
models.append(('CNNs', MLPClassifier(hidden_layer_sizes=(200,), max_iter=1000, alpha=1e-6,
                        solver='adam', tol=1e-6, random_state=1,
                        learning_rate_init=.1)))
models.append(('SVM', SVC(kernel='rbf', C=5.0, probability=True)))
b_c=[
    ('RFW ', RandomForestClassifier(class_weight="balanced")),
    ('SVM', SVC(kernel='rbf', C=5.0, probability=True)),
    ('XGBW', XGBClassifier(scale_pos_weight=6))
]
m_c= LogisticRegression()
models.append(('Stacking', StackingClassifier(estimators=b_c, final_estimator=m_c, cv=5)))

results = []
names = []
print("---------AUC----------Top20------Accuracy----Precision----Recall")
for name, model in models:
    if hasattr(model, 'fit') and 'sample_weight' in model.fit.__code__.co_varnames:
        model.fit(X_train, y_train, sample_weight=sample_weights)
    else:
        model.fit(X_train, y_train)  # Fit without sample weights
    
    #model.fit(X_train, y_train, sample_weight = sample_weights)
    y_pred = model.predict_proba(X_test)[:,1]
    cat_preds = [round(value) for value in y_pred]
    auc = roc_auc_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, cat_preds)
    precision = precision_score(y_test,cat_preds)
    recall = recall_score(y_test,cat_preds)
    top20 = avgCostSum(y_pred, y_test, X_test_raw)
    results.append(auc)
    names.append(name)
    
    msg = "%s:    %f    %f    %f    %f    %f" % (name, auc, top20, accuracy, precision, recall)
    print(msg)

---------AUC----------Top20------Accuracy----Precision----Recall
LR  :    0.836281    3.837880    0.731417    0.336111    0.790850
LRW :    0.857851    3.924467    0.648167    0.290456    0.915033
RF  :    0.908512    1.902645    0.895937    0.740000    0.483660
RFW :    0.915063    2.373663    0.887017    0.746835    0.385621
DT  :    0.783115    2.130154    0.887017    0.625806    0.633987
DTW :    0.775063    2.164788    0.887017    0.630872    0.614379
GB  :    0.867074    3.820324    0.845391    0.493671    0.764706
XGB :    0.939596    2.139042    0.909812    0.731343    0.640523
XGBW:    0.936229    2.219334    0.920714    0.732484    0.751634
GNB:    0.674711    4.647400    0.731417    0.295139    0.555556
CNNs:    0.872839    2.445988    0.900892    0.680272    0.653595
SVM:    0.909940    2.605811    0.907830    0.717391    0.647059
Stacking:    0.927440    2.703068    0.151635    0.151635    1.000000


<h4> Try oversampling Left out for now. Don't know how to implement with the sample_weights.
Might not be necesarry since most algorithms have a class_weight or scale_pos_weight option