In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [3]:
import tensorflow as tf
from tensorflow.keras import layers

print(tf.version.VERSION)
print(tf.keras.__version__)

1.13.1
2.2.4-tf


# Objective:

Implement a MLP Neural Network for predicting churn from a telecom dataset. 

#### Content

Each row represents a customer, each column contains customer’s attributes described on the column Metadata.

The data set includes information about:

* Customers who left within the last month – the column is called Churn
* Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
* Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly  charges, and total charges
* Demographic info about customers – gender, age range, and if they have partners and dependents


### Columns 
* **customerID**: Customer ID
* **gender**: Whether the customer is a male or a female
* **SeniorCitizen**: Whether the customer is a senior citizen or not (1, 0)
* **Partner**: Whether the customer has a partner or not (Yes, No)
* **Dependents**: Whether the customer has dependents or not (Yes, No)
* **tenureNumber**: of months the customer has stayed with the company
* **PhoneService**: Whether the customer has a phone service or not (Yes, No)
* **MultipleLines**: Whether the customer has multiple lines or not (Yes, No, No phone service)
* **InternetService**: Customer’s internet service provider (DSL, Fiber optic, No)
* **OnlineSecurity**: Whether the customer has online security or not (Yes, No, No internet service)
* **OnlineBackup**: Whether the customer has online backup or not (Yes, No, No internet service)
* **DeviceProtection**: Whether the customer has device protection or not (Yes, No, No internet service)
* **TechSupport**: Whether the customer has tech support or not (Yes, No, No internet service)
* **StreamingTV**: Whether the customer has streaming TV or not (Yes, No, No internet service)
* **StreamingMovies**: Whether the customer has streaming movies or not (Yes, No, No internet service)
* **Contract**: The contract term of the customer (Month-to-month, One year, Two year)
* **PaperlessBilling**: Whether the customer has paperless billing or not (Yes, No)
* **PaymentMethod**: The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
* **MonthlyCharges**: The amount charged to the customer monthly
* **TotalCharges**: The total amount charged to the customer
* **Churn**: Whether the customer churned or not (Yes or No)

In [252]:
df = pd.read_csv("telco-customer-churn\churn_df.csv")

In [253]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Quick exploration of variables 

In [254]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [255]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), obj

# Data Preprocessing 

To do's:

* Encode text variables in order to feed them to a neural network.
* Drop customerID as it is of no use 

In [257]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
encoder = preprocessing.LabelEncoder()
onehotencoder = OneHotEncoder(categorical_features = [0])
scaler = MinMaxScaler()

In [258]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],errors='coerce')
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(0.0)
df[['TotalCharges', 'MonthlyCharges']]= scaler.fit_transform(df[['TotalCharges', 'MonthlyCharges']])
df['MonthlyCharges'] = pd.to_numeric(df['MonthlyCharges'],errors='coerce')
df['Churn_encoded'] = encoder.fit_transform(df['Churn'])
df['gender_encoded'] = encoder.fit_transform(df['gender'])
df['Partner_encoded'] = encoder.fit_transform(df['Partner'])
df['Dependents_encoded'] = encoder.fit_transform(df['Dependents'])
df['PhoneService_encoded'] = encoder.fit_transform(df['PhoneService'])
df['PaperlessBilling_encoded'] = encoder.fit_transform(df['PaperlessBilling'])

In [259]:
def dummy_creator(col_name,dataset):
    dummies = pd.get_dummies(dataset[col_name]).rename(columns=lambda x: col_name + '_'+ str(x))
    #dataset = pd.concat([dataset, dummies], axis=1)
    return dummies



In [260]:
df = pd.concat([df, dummy_creator('TechSupport',df)], axis=1)
df = pd.concat([df, dummy_creator('MultipleLines',df)], axis=1)
df = pd.concat([df, dummy_creator('InternetService',df)], axis=1)
df = pd.concat([df, dummy_creator('OnlineSecurity',df)], axis=1)
df = pd.concat([df, dummy_creator('OnlineBackup',df)], axis=1)
df = pd.concat([df, dummy_creator('DeviceProtection',df)], axis=1)
df = pd.concat([df, dummy_creator('TechSupport',df)], axis=1)
df = pd.concat([df, dummy_creator('StreamingTV',df)], axis=1)
df = pd.concat([df, dummy_creator('StreamingMovies',df)], axis=1)
df = pd.concat([df, dummy_creator('Contract',df)], axis=1)
df = pd.concat([df, dummy_creator('PaymentMethod',df)], axis=1)

In [261]:
to_drop= ['customerID','gender','Churn','Partner','Dependents','PhoneService','PaperlessBilling',
          'TechSupport','MultipleLines','InternetService','OnlineSecurity',
          'OnlineBackup','DeviceProtection','TechSupport',
          'StreamingTV','StreamingMovies','Contract','PaymentMethod']

df.drop(to_drop,axis=1).columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
       'Churn_encoded', 'gender_encoded', 'Partner_encoded',
       'Dependents_encoded', 'PhoneService_encoded',
       'PaperlessBilling_encoded', 'TechSupport_No',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'MultipleLines_No', 'MultipleLines_No phone service',
       'MultipleLines_Yes', 'InternetService_DSL',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No', 'OnlineSecurity_No internet service',
       'OnlineSecurity_Yes', 'OnlineBackup_No',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No', 'DeviceProtection_No internet service',
       'DeviceProtection_Yes', 'TechSupport_No',
       'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes'

In [262]:
df = df.drop(to_drop,axis= 1)

In [263]:
df.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn_encoded,gender_encoded,Partner_encoded,Dependents_encoded,PhoneService_encoded,PaperlessBilling_encoded,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0.115423,0.003437,0,0,1,0,0,1,...,1,0,0,1,0,0,0,0,1,0
1,0,34,0.385075,0.217564,0,1,0,0,1,0,...,1,0,0,0,1,0,0,0,0,1
2,0,2,0.354229,0.012453,1,1,0,0,1,1,...,1,0,0,1,0,0,0,0,0,1
3,0,45,0.239303,0.211951,0,1,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
4,0,2,0.521891,0.017462,1,0,0,0,1,1,...,1,0,0,1,0,0,0,0,1,0


## Model implementation


In [264]:
labels = df['Churn_encoded']
labels.head()
df_features = df.drop(['Churn_encoded'],axis=1)

In [265]:
df_features.shape

(7043, 43)

In [268]:
### Splitting the datsets

from sklearn.model_selection import train_test_split
X_train, X_test_validation, y_train, y_test_validation = train_test_split(df_features, labels,
                                                    stratify=labels, 
                                                    test_size=0.4)

In [269]:
X_train

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_encoded,Partner_encoded,Dependents_encoded,PhoneService_encoded,PaperlessBilling_encoded,TechSupport_No,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
7034,0,67,0.842786,0.792908,0,0,0,1,1,1,...,1,0,0,1,0,0,0,1,0,0
3438,0,18,0.760697,0.194357,1,1,0,1,1,1,...,0,0,1,1,0,0,0,0,1,0
5223,0,61,0.071642,0.177160,1,1,0,1,0,0,...,0,1,0,0,0,1,1,0,0,0
532,0,2,0.613930,0.020087,1,0,0,1,1,1,...,1,0,0,1,0,0,0,0,1,0
1094,0,41,0.500000,0.327002,1,1,1,1,0,1,...,1,0,0,0,1,0,1,0,0,0
5295,1,50,0.900498,0.625449,0,1,0,1,0,1,...,0,0,1,1,0,0,1,0,0,0
5114,1,50,0.648259,0.473667,0,0,0,1,1,1,...,1,0,0,1,0,0,0,0,1,0
2565,0,13,0.067164,0.038285,0,1,1,1,1,0,...,0,1,0,0,0,1,1,0,0,0
4378,0,67,0.025871,0.152842,0,1,0,1,1,0,...,0,1,0,0,1,0,1,0,0,0
3202,0,1,0.368159,0.006362,1,0,0,1,1,1,...,0,0,1,1,0,0,0,0,1,0


In [214]:
score

[nan, 0.7345635]

In [274]:
model = tf.keras.Sequential()
model.add(layers.Dense(64, input_dim=43, activation='sigmoid'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(64, activation='sigmoid'))
#model.add(layers.Dense(64, activation='sigmoid'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='linear'))
sgd = tf.keras.optimizers.SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='binary_crossentropy',
              #optimizer = tf.train.AdamOptimizer(0.001),
              optimizer='rmsprop',
              #optimizer = sgd,
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=100,
          batch_size=32)
score = model.evaluate(X_test_validation, y_test_validation, batch_size=32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [216]:
model.compile(optimizer=tf.train.AdamOptimizer(0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [217]:
model.fit(df_features, labels, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x145417ca940>

In [250]:
print("No. of missing values: \n",df.isnull().sum())


No. of missing values: 
 SeniorCitizen                               0
tenure                                      0
MonthlyCharges                              0
TotalCharges                               11
Churn_encoded                               0
gender_encoded                              0
Partner_encoded                             0
Dependents_encoded                          0
PhoneService_encoded                        0
PaperlessBilling_encoded                    0
TechSupport_No                              0
TechSupport_No internet service             0
TechSupport_Yes                             0
MultipleLines_No                            0
MultipleLines_No phone service              0
MultipleLines_Yes                           0
InternetService_DSL                         0
InternetService_Fiber optic                 0
InternetService_No                          0
OnlineSecurity_No                           0
OnlineSecurity_No internet service          0
OnlineSec

In [266]:
df.isnull().values.any()

False