In [5]:
import numpy as np
import pandas as pd

In [6]:
df_org = pd.read_csv('telecom_customer_churn.csv')

In [7]:
df_org

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.60,593.30,0.00,0,381.51,974.81,Stayed,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.00,542.40,38.33,10,96.21,610.28,Stayed,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.90,280.85,0.00,0,134.60,415.45,Churned,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.00,1237.85,0.00,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.90,267.40,0.00,0,22.14,289.54,Churned,Dissatisfaction,Network reliability
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,9987-LUTYD,Female,20,No,0,La Mesa,91941,32.759327,-116.997260,0,...,Credit Card,55.15,742.90,0.00,0,606.84,1349.74,Stayed,,
7039,9992-RRAMN,Male,40,Yes,0,Riverbank,95367,37.734971,-120.954271,1,...,Bank Withdrawal,85.10,1873.70,0.00,0,356.40,2230.10,Churned,Dissatisfaction,Product dissatisfaction
7040,9992-UJOEL,Male,22,No,0,Elk,95432,39.108252,-123.645121,0,...,Credit Card,50.30,92.75,0.00,0,37.24,129.99,Joined,,
7041,9993-LHIEB,Male,21,Yes,0,Solana Beach,92075,33.001813,-117.263628,5,...,Credit Card,67.85,4627.65,0.00,0,142.04,4769.69,Stayed,,


In [8]:
df = df_org.copy()
df['client_churned'] = df['Customer Status'] == 'Churned'
df = df[df['Customer Status'] != 'Joined']
columns_to_drop = ['City', 'Latitude', 'Longitude', 'Churn Category', 'Churn Reason', 'Customer Status', 'Customer ID']
df = df.drop(columns_to_drop, axis=1) #Drop All columns with nans
df.dropna(axis=1, how='any', inplace=True)

In [9]:
df

Unnamed: 0,Gender,Age,Married,Number of Dependents,Zip Code,Number of Referrals,Tenure in Months,Offer,Phone Service,Internet Service,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,client_churned
0,Female,37,Yes,0,93225,2,9,,Yes,Yes,One Year,Yes,Credit Card,65.60,593.30,0.00,0,381.51,974.81,False
1,Male,46,No,0,91206,0,9,,Yes,Yes,Month-to-Month,No,Credit Card,-4.00,542.40,38.33,10,96.21,610.28,False
2,Male,50,No,0,92627,0,4,Offer E,Yes,Yes,Month-to-Month,Yes,Bank Withdrawal,73.90,280.85,0.00,0,134.60,415.45,True
3,Male,78,Yes,0,94553,1,13,Offer D,Yes,Yes,Month-to-Month,Yes,Bank Withdrawal,98.00,1237.85,0.00,0,361.66,1599.51,True
4,Female,75,Yes,0,93010,3,3,,Yes,Yes,Month-to-Month,Yes,Credit Card,83.90,267.40,0.00,0,22.14,289.54,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7037,Female,36,No,0,92028,0,4,,Yes,No,Month-to-Month,No,Bank Withdrawal,20.95,85.50,0.00,0,8.04,93.54,True
7038,Female,20,No,0,91941,0,13,Offer D,Yes,Yes,One Year,No,Credit Card,55.15,742.90,0.00,0,606.84,1349.74,False
7039,Male,40,Yes,0,95367,1,22,Offer D,Yes,Yes,Month-to-Month,Yes,Bank Withdrawal,85.10,1873.70,0.00,0,356.40,2230.10,True
7041,Male,21,Yes,0,92075,5,67,Offer A,Yes,Yes,Two Year,No,Credit Card,67.85,4627.65,0.00,0,142.04,4769.69,False


## Leaving some data for Batch Mode

In [10]:
from sklearn.model_selection import train_test_split

df_curr, df_future = train_test_split(df, test_size=0.2)

In [11]:
df_future.to_csv('future.csv')

## Train, val, test split 

In [12]:
df_train, df_test = train_test_split(df, test_size=0.4, stratify=df['client_churned'])
df_test, df_val = train_test_split(df_test, test_size=0.5, stratify=df_test['client_churned'])

In [13]:
df_val.to_csv('val.csv')
df_train.to_csv('train.csv')
df_test.to_csv('test.csv')

# Training

## Preprocessing

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer

In [15]:
categorical = ['Gender', 'Zip Code', 'Offer', 'Contract', 'Phone Service', 'Internet Service', 'Paperless Billing',
              'Payment Method']
numerical = ['Age', 'Number of Dependents', 'Number of Referrals', 'Tenure in Months', 'Monthly Charge',
            'Total Charges', 'Total Refunds', 'Total Extra Data Charges', 'Total Revenue']
target = 'client_churned'

def preprocess_data(df, dv=None, scaler=None, train=True):
    
    df = df.copy()
    target_vec = df[target]
    df.drop(target, axis=1, inplace=True)
    df[categorical] = df[categorical].astype('str')
    if train:
        scaler = StandardScaler()
        df[numerical] = scaler.fit_transform(df[numerical])
        
        df_dict = df.to_dict(orient='records')
        dv = DictVectorizer()
        X = dv.fit_transform(df_dict)
        return scaler, dv, X, target_vec
    else:
        df[numerical] = scaler.transform(df[numerical])
        df_dict = df.to_dict(orient='records')
        X = dv.transform(df_dict)
        return X, target_vec

## Training

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
scaler, dv, X_train, y_train = preprocess_data(df_train)

In [18]:
def train(X_train, y_train):

    lr = LogisticRegression(solver='liblinear')
    lr.fit(X_train, y_train)
    
    return lr 

In [19]:
lr = train(X_train, y_train)

## Inference

In [20]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score

In [21]:
def predict(X, lr):
    return lr.predict(X)

In [22]:
balanced_accuracy_score(y_train, predict(X_train, lr))

0.8677798840321143

In [23]:
fbeta_score(y_train, predict(X_train, lr), beta=0)

0.8305555555555556

In [24]:
X_val, y_val = preprocess_data(df_val, dv, scaler, train=False)

In [25]:
balanced_accuracy_score(y_val, predict(X_val, lr))

0.8192213133327291

In [26]:
fbeta_score(y_val, predict(X_val, lr), beta=0)

0.7374005305039788

In [27]:
import xgboost as xgb


In [28]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [45]:
params = {
    'max_depth': 3,
    'objective': 'binary:logistic', 
}

In [46]:
booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )

[0]	validation-logloss:0.54952
[1]	validation-logloss:0.46856
[2]	validation-logloss:0.41636
[3]	validation-logloss:0.38347
[4]	validation-logloss:0.35878
[5]	validation-logloss:0.34137
[6]	validation-logloss:0.32549
[7]	validation-logloss:0.31677
[8]	validation-logloss:0.30179
[9]	validation-logloss:0.29589
[10]	validation-logloss:0.29051
[11]	validation-logloss:0.28730
[12]	validation-logloss:0.28405
[13]	validation-logloss:0.28167
[14]	validation-logloss:0.27962
[15]	validation-logloss:0.27695
[16]	validation-logloss:0.27520
[17]	validation-logloss:0.27335
[18]	validation-logloss:0.27252
[19]	validation-logloss:0.26885
[20]	validation-logloss:0.26673
[21]	validation-logloss:0.26587
[22]	validation-logloss:0.26538
[23]	validation-logloss:0.26406
[24]	validation-logloss:0.26255
[25]	validation-logloss:0.26164
[26]	validation-logloss:0.26067
[27]	validation-logloss:0.26054
[28]	validation-logloss:0.26067
[29]	validation-logloss:0.25962
[30]	validation-logloss:0.25903
[31]	validation-lo

In [51]:
def predict_binary(probs):
    return (probs >= 0.5).astype('int')

In [52]:
probs = booster.predict(valid)

In [53]:
predict_binary(probs)

array([0, 1, 0, ..., 0, 1, 0])

In [54]:
fbeta_score(y_val, predict_binary(probs), beta=0.2)

0.8202080237741455