## Predict callers to retention

### read csv

In [1]:
#import libraries
import os
import re
import time
import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
import numpy as np
import pickle
from google.cloud import storage
from google.cloud import bigquery
from sklearn.model_selection import train_test_split

# build model
import xgboost as xgb
from sklearn.metrics import roc_auc_score

file_bucket = 'divg-josh-pr-d1cc3a-default' 
folder_name = 'promo_expiry_analysis'

df = pd.read_csv('gs://{}/{}/data.csv'.format(file_bucket, folder_name))

### preprocess

- Tenure Group: cat
- PROV: cat
- Pcount: cat
- Price Plan Grouping: cat
- Technology Group: cat
- demographics: cat
- CampaignFlag: cat
- TOTALCalls: remove


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112460 entries, 0 to 112459
Data columns (total 55 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   BAN                                    112460 non-null  int64  
 1   Tenure Group                           112460 non-null  object 
 2   PROV                                   112460 non-null  object 
 3   Pcount                                 112460 non-null  object 
 4   Product Count                          112460 non-null  int64  
 5   HasHSIA                                112460 non-null  int64  
 6   HasHP                                  112460 non-null  int64  
 7   HasTV                                  112460 non-null  int64  
 8   HasSMHM                                112460 non-null  int64  
 9   productMix_product_mix_all             112460 non-null  int64  
 10  productMix_hsic_count                  112460 non-null  

In [3]:
df.columns

Index(['BAN', 'Tenure Group', 'PROV', 'Pcount', 'Product Count', 'HasHSIA',
       'HasHP', 'HasTV', 'HasSMHM', 'productMix_product_mix_all',
       'productMix_hsic_count', 'productMix_sing_count',
       'productMix_ttv_count', 'productMix_shs_count',
       'productMix_new_hsic_ind', 'productMix_new_sing_ind',
       'productMix_new_ttv_ind', 'productMix_new_smhm_ind',
       'Price Plan Grouping', 'Technology Group', 'TOTAL_CHARGE',
       'HSIA_CHARGE', 'HP_CHARGE', 'TV_CHARGE', 'SMHM_CHARGE', 'tot_disc_amt',
       'hsic_disc_amt', 'sing_disc_amt', 'ttv_disc_amt', 'smhm_disc_amt',
       'TOTAL_CHARGE_NO_DISC', 'HSIC_CHARGE_NO_DISC', 'SING_CHARGE_NO_DISC',
       'TTV_CHARGE_NO_DISC', 'SMHS_CHARGE_NO_DISC', 'total_disc_pct',
       'hsic_disc_pct', 'sing_disc_pct', 'ttv_disc_pct', 'smhm_disc_pct',
       'hsiaUsage_hs_tot_gb_avg', 'demographics_demo_avg_income',
       'troubleTickets_number_tickets', 'troubleTickets_ticket_hsia_affected',
       'troubleTickets_ticket_tv_affecte

In [4]:
cols_to_dummy = ['Tenure Group', 'PROV', 'Pcount', 'Price Plan Grouping', 'Technology Group', 'demographics', 'CampaignFlag']

for col in cols_to_dummy: 

    # Create dummy variables for the Country column
    df = pd.get_dummies(df, columns=[col], drop_first=True, prefix=None, dtype="int64")

# reorder the df columns so that 'target' comes last
df_processed = df[[c for c in df if c not in ['target']] 
       + ['target']]

df_processed.columns = df_processed.columns.str.replace('<', 'less_than_')
df_processed.columns = df_processed.columns.str.replace(' ', '_')

df_processed.head()

Unnamed: 0,BAN,Product_Count,HasHSIA,HasHP,HasTV,HasSMHM,productMix_product_mix_all,productMix_hsic_count,productMix_sing_count,productMix_ttv_count,...,Price_Plan_Grouping_Other,Technology_Group_Fibre,demographics_rural_family,demographics_unassigned,demographics_urban,demographics_urban_family,demographics_urban_young,CampaignFlag_Reached,CampaignFlag_Targeted,target
0,603512534,2,1,0,1,0,2,1,0,1,...,0,0,0,0,0,1,0,0,0,1
1,225975121,2,1,1,0,0,2,1,1,0,...,0,1,0,0,0,1,0,0,0,1
2,234048777,3,1,1,1,0,4,1,1,1,...,0,1,0,0,0,1,0,1,0,1
3,604553605,2,1,0,1,0,2,1,0,1,...,0,1,0,0,0,1,0,0,0,1
4,604260803,1,1,0,0,0,1,1,0,0,...,0,1,0,0,0,1,0,1,0,0


In [5]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112460 entries, 0 to 112459
Data columns (total 66 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   BAN                                             112460 non-null  int64  
 1   Product_Count                                   112460 non-null  int64  
 2   HasHSIA                                         112460 non-null  int64  
 3   HasHP                                           112460 non-null  int64  
 4   HasTV                                           112460 non-null  int64  
 5   HasSMHM                                         112460 non-null  int64  
 6   productMix_product_mix_all                      112460 non-null  int64  
 7   productMix_hsic_count                           112460 non-null  int64  
 8   productMix_sing_count                           112460 non-null  int64  
 9   productMix_ttv_count      

### register lift function 

In [16]:
def get_lift(prob, y_test, q):
    result = pd.DataFrame(columns=['Prob', 'CallToRetention'])
    result['Prob'] = prob
    result['CallToRetention'] = y_test
    result['Decile'] = pd.qcut(result['Prob'], q, labels=[i for i in range(q, 0, -1)])
    add = pd.DataFrame(result.groupby('Decile')['CallToRetention'].mean()).reset_index()
    add.columns = ['Decile', 'avg_real_call_rate']
    add2 = pd.DataFrame(result.groupby('Decile')['CallToRetention'].count()).reset_index()
    add2.columns = ['Decile', 'ban_count']
    result = result.merge(add, on='Decile', how='left')
    result = result.merge(add2, on='Decile', how='left')
    result.sort_values('Decile', ascending=True, inplace=True)
    lg = pd.DataFrame(result.groupby('Decile')['Prob'].mean()).reset_index()
    lg.columns = ['Decile', 'avg_model_pred_call_rate']
    lg.sort_values('Decile', ascending=False, inplace=True)
    lg['avg_call_rate_total'] = result['CallToRetention'].mean()
    lg = lg.merge(add, on='Decile', how='left')
    lg = lg.merge(add2, on='Decile', how='left')
    lg['lift'] = lg['avg_real_call_rate'] / lg['avg_call_rate_total']

    return lg

### set X_train, X_test, y_train, y_test

In [7]:
features = [col for col in df_processed.columns if col not in ["BAN", "target"]]

X, y = df_processed[[col for col in df_processed.columns if col != "target"]], df_processed["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, shuffle=True)

ban_train = X_train['BAN']
X_train = X_train[features]
y_train = np.squeeze(y_train.values)

ban_test = X_test['BAN']
X_test = X_test[features]
y_test = np.squeeze(y_test.values)


### set up xgb and train the model

In [8]:

# build model and fit in training data
import xgboost as xgb
from sklearn.metrics import roc_auc_score

xgb_model = xgb.XGBClassifier(
    learning_rate=0.01,
    n_estimators=80,
    max_depth=8,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1
    # seed=27
)

xgb_model.fit(X_train, y_train)
print('xgb training done')


xgb training done


### make predictions on X_train

In [21]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = xgb_model.predict_proba(X_train, ntree_limit=xgb_model.best_iteration)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_train = ban_train.to_frame()
df_train_exp = df_ban_train.join(X_train) 
df_train_exp['y_test'] = y_train
df_train_exp['y_pred_proba'] = pred_prb
df_train_exp['y_pred'] = (df_train_exp['y_pred_proba'] > 0.5).astype(int)
df_train_exp['decile'] = pd.qcut(df_train_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_train, q)

lg



Unnamed: 0,Decile,avg_model_pred_call_rate,avg_call_rate_total,avg_real_call_rate,ban_count,lift
0,1,0.004197,0.470945,0.791551,8995,1.680771
1,2,0.003893,0.470945,0.672186,8999,1.427312
2,3,0.003664,0.470945,0.609537,8997,1.294283
3,4,0.003467,0.470945,0.531458,8996,1.128493
4,5,0.003317,0.470945,0.474269,8997,1.007058
5,6,0.003186,0.470945,0.428143,8997,0.909114
6,7,0.003028,0.470945,0.388506,8996,0.824949
7,8,0.002903,0.470945,0.329999,8997,0.700716
8,9,0.002764,0.470945,0.299989,8997,0.636993
9,10,0.002546,0.470945,0.183839,8997,0.390362


### make predictions on X_test

In [22]:
from sklearn.preprocessing import normalize

#predictions on X_test
pred_prb = xgb_model.predict_proba(X_test, ntree_limit=xgb_model.best_iteration)[:, 1]
pred_prb = np.array(normalize([pred_prb]))[0]

#join ban_test, X_test, y_test and pred_prb and print to csv
#CHECK THE SIZE OF EACH COMPONENT BEFORE JOINING
q=10
df_ban_test = ban_test.to_frame()
df_test_exp = df_ban_test.join(X_test) 
df_test_exp['y_test'] = y_test
df_test_exp['y_pred_proba'] = pred_prb
df_test_exp['y_pred'] = (df_test_exp['y_pred_proba'] > 0.5).astype(int)
df_test_exp['decile'] = pd.qcut(df_test_exp['y_pred_proba'], q, labels=[i for i in range(q, 0, -1)])

lg = get_lift(pred_prb, y_test, q)

lg



Unnamed: 0,Decile,avg_model_pred_call_rate,avg_call_rate_total,avg_real_call_rate,ban_count,lift
0,1,0.008361,0.474302,0.735556,2250,1.550817
1,2,0.00777,0.474302,0.619386,2249,1.30589
2,3,0.007324,0.474302,0.574544,2247,1.211346
3,4,0.006937,0.474302,0.529542,2251,1.116467
4,5,0.006644,0.474302,0.483713,2241,1.019841
5,6,0.006382,0.474302,0.450598,2257,0.950024
6,7,0.006063,0.474302,0.424189,2249,0.894343
7,8,0.005818,0.474302,0.355714,2249,0.749973
8,9,0.005542,0.474302,0.321476,2249,0.677788
9,10,0.005115,0.474302,0.248444,2250,0.523811


### get lift