In [39]:
import pandas as pd
import numpy as np


from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import pickle 
import requests

In [40]:
from sklearn.preprocessing import StandardScaler

def printest(args, value):
    """Print a label followed by a value on the next line.

    Parameters
    ----------
    args : object
        Label shown before the colon.
    value : object
        Object shown on the line after the label.

    Returns
    -------
    None
        Nothing is returned; the text is written to standard output.
    """
    message = f"{args} : \n {value} \n"
    print(message)
    return None
def train(df, y, C=1.0):
    """Fit a DictVectorizer and a logistic-regression classifier.

    The feature columns are taken from the module-level ``categorical`` and
    ``numerical`` lists, which must be defined before this is called.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``categorical + numerical``.
    y : array-like
        Target labels handed to ``LogisticRegression.fit``.
    C : float, default 1.0
        Passed through unchanged as ``LogisticRegression``'s ``C`` argument.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted vectorizer and the fitted classifier.
    """
    feature_columns = categorical + numerical
    records = df[feature_columns].to_dict(orient='records')

    vectorizer = DictVectorizer(sparse=False)
    design_matrix = vectorizer.fit_transform(records)

    classifier = LogisticRegression(solver='liblinear', C=C)
    classifier.fit(design_matrix, y)

    return vectorizer, classifier


def predict(df, dv, model):
    """Score every row of ``df`` with an already fitted vectorizer and model.

    The feature columns are taken from the module-level ``categorical`` and
    ``numerical`` lists, which must be defined before this is called.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``categorical + numerical``.
    dv : object
        Fitted vectorizer; its ``transform`` receives a list of row dicts.
    model : object
        Fitted classifier exposing ``predict_proba``.

    Returns
    -------
    array
        Second column of ``model.predict_proba`` (one value per row of ``df``).
    """
    records = df[categorical + numerical].to_dict(orient='records')
    features = dv.transform(records)

    # Keep only column 1 of the probability matrix.
    return model.predict_proba(features)[:, 1]

def predict_single(customer, dv, model):
    """Score one customer given as a plain dict of feature values.

    Parameters
    ----------
    customer : dict
        Feature name -> value mapping for a single customer.
    dv : object
        Fitted vectorizer; its ``transform`` receives a one-element list.
    model : object
        Fitted classifier exposing ``predict_proba``.

    Returns
    -------
    scalar
        The column-1 probability for that single customer.
    """
    # The vectorizer works on a batch, so wrap the record in a list.
    features = dv.transform([customer])
    second_column = model.predict_proba(features)[:, 1]
    return second_column[0]

# Model to Deploy
## Load and arrange the dataset for training

In [41]:
# Load the dataset
df = pd.read_csv('./Data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Standardize column names: lowercase and replace spaces with underscores
# (regex=False makes the literal, non-regex replacement explicit).
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

# Identify all text columns. Checking `dtype == 'object'` only works while
# pandas stores strings as `object`; with the dedicated string dtype (the
# default for text in pandas 3) that comparison matches nothing, the
# normalization below is skipped and `churn == 'yes'` silently becomes all
# False. `is_string_dtype` applied to the dtype accepts both representations.
categorical_columns = [
    column
    for column, dtype in df.dtypes.items()
    if pd.api.types.is_string_dtype(dtype)
]

# Normalize all string values: lowercase and replace spaces with underscores
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_', regex=False)

# Convert the 'totalcharges' column to numeric (some entries may be invalid).
# Invalid parsing results in NaN, which is then replaced with 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Convert 'churn' column to binary: 1 if 'yes', else 0
df['churn'] = (df['churn'] == 'yes').astype(int)

## Split the data into train and test parts (validation was done previously)

In [42]:

# Hold out 20% of the rows as the test set; the remaining 80% is the
# combined train+validation set. A fixed random_state keeps the split stable.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Give both parts a fresh 0..n-1 index instead of the shuffled original one.
df_train_full, df_test = (
    part.reset_index(drop=True) for part in (df_train_full, df_test)
)

# Training part: pull the labels out first, then drop the target column so
# only input features remain in the frame.
y_train_full = df_train_full['churn'].values
del df_train_full['churn']

# Test part: same treatment.
y_test = df_test['churn'].values
del df_test['churn']


In [43]:
# Flag the text-valued columns. The check uses `is_string_dtype` on each
# column's dtype instead of `dtype == 'object'`, so it recognises both legacy
# `object` columns and the dedicated string dtype (the default for text in
# pandas 3); with the old comparison the list would come out empty there and
# the `.remove('customerid')` below would raise.
categorical_mask = pd.Series(
    [pd.api.types.is_string_dtype(dtype) for dtype in df_train_full.dtypes],
    index=df_train_full.columns,
)

# All categorical columns except 'customerid'
categorical = list(categorical_mask[categorical_mask].index)
categorical.remove('customerid')

# Manually add 'seniorcitizen' because it's an int boolean (0 or 1)
categorical.append('seniorcitizen')
printest('categorical', categorical)

# All remaining (non-text) columns except 'seniorcitizen', which was moved to
# the categorical list above
numerical_mask = ~categorical_mask
numerical = list(numerical_mask[numerical_mask].index)
numerical.remove('seniorcitizen')
printest('numerical', numerical)

# Display the numerical feature columns as the cell output
df_train_full[numerical]

categorical : 
 ['gender', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod', 'seniorcitizen'] 

numerical : 
 ['tenure', 'monthlycharges', 'totalcharges'] 



Unnamed: 0,tenure,monthlycharges,totalcharges
0,12,19.70,258.35
1,42,73.90,3160.55
2,71,65.15,4681.75
3,71,85.45,6300.85
4,30,70.40,2044.75
...,...,...,...
5629,9,100.50,918.60
5630,60,19.95,1189.90
5631,28,105.70,2979.50
5632,2,54.40,114.10


In [44]:
# Fit the vectorizer and classifier on the full training part (C=0.5),
# then score the held-out test part.
dv, model = train(df_train_full, y_train_full, C=0.5)
y_pred = predict(df_test, dv, model)

# Report the ROC AUC on the test set, rounded to three decimals.
auc = roc_auc_score(y_test, y_pred)
print(f'auc = {auc:.3f}')


auc = 0.858


**Saving the model**

In [47]:
# Persist the vectorizer and the model together as one pickled tuple
# ('wb' = write binary).
with open('churn_model.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out)

# Read the tuple back ('rb' = read binary) into fresh names so the reloaded
# objects can be checked independently of the in-memory ones.
with open('churn_model.bin', 'rb') as f_in:
    dv2, model2 = pickle.load(f_in)

# Sanity check: score the test set with the reloaded objects and print the
# resulting AUC for comparison with the value obtained before saving.
y_pred2 = predict(df_test, dv2, model2)
auc = roc_auc_score(y_test, y_pred2)
print(f'auc = {auc:.3f}')


auc = 0.858


This script prints the response from the server: a dictionary containing the churn probability and the churn prediction for the provided customer data.

In [50]:
# Example customer record, using the same lowercase/underscore formatting as
# the training data.
customer = {
  'customerid': '8879-zkjof',
  'gender': 'female',
  'seniorcitizen': 0,
  'partner': 'no',
  'dependents': 'no',
  'tenure': 41,
  'phoneservice': 'yes',
  'multiplelines': 'no',
  'internetservice': 'dsl',
  'onlinesecurity': 'yes',
  'onlinebackup': 'no',
  'deviceprotection': 'yes',
  'techsupport': 'yes',
  'streamingtv': 'yes',
  'streamingmovies': 'yes',
  'contract': 'one_year',
  'paperlessbilling': 'yes',
  'paymentmethod': 'bank_transfer_(automatic)',
  'monthlycharges': 79.85,
  'totalcharges': 3320.75
}
url = 'http://localhost:9696/predict'

# Send the record as JSON. Without a timeout, requests waits indefinitely, so
# an unreachable or stalled service would hang the notebook; 10 seconds is
# ample for a local service.
response = requests.post(url, json=customer, timeout=10)

# Surface HTTP errors (4xx/5xx) directly instead of failing later while
# trying to decode an error page as JSON.
response.raise_for_status()

result = response.json()
print(result)

{'churn': False, 'churn_probability': 0.05960570371897223}
