In [23]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [24]:
import tensorflow as tf
from tensorflow.keras import layers

# Report the TensorFlow version and the version of the Keras API bundled with it,
# so the environment this notebook ran in is recorded alongside the results.
print(tf.version.VERSION)
print(tf.keras.__version__)

1.14.0
2.2.4-tf


# Objective:

Implement an MLP neural network that predicts customer churn from a telecom dataset.

#### Content

Each row represents a customer; each column contains one of the customer’s attributes, described in the Columns section below.

The data set includes information about:

* Customers who left within the last month – the column is called Churn
* Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
* Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges
* Demographic info about customers – gender, age range, and if they have partners and dependents


### Columns 
* **customerID**: Customer ID
* **gender**: Whether the customer is a male or a female
* **SeniorCitizen**: Whether the customer is a senior citizen or not (1, 0)
* **Partner**: Whether the customer has a partner or not (Yes, No)
* **Dependents**: Whether the customer has dependents or not (Yes, No)
* **tenure**: Number of months the customer has stayed with the company
* **PhoneService**: Whether the customer has a phone service or not (Yes, No)
* **MultipleLines**: Whether the customer has multiple lines or not (Yes, No, No phone service)
* **InternetService**: Customer’s internet service provider (DSL, Fiber optic, No)
* **OnlineSecurity**: Whether the customer has online security or not (Yes, No, No internet service)
* **OnlineBackup**: Whether the customer has online backup or not (Yes, No, No internet service)
* **DeviceProtection**: Whether the customer has device protection or not (Yes, No, No internet service)
* **TechSupport**: Whether the customer has tech support or not (Yes, No, No internet service)
* **StreamingTV**: Whether the customer has streaming TV or not (Yes, No, No internet service)
* **StreamingMovies**: Whether the customer has streaming movies or not (Yes, No, No internet service)
* **Contract**: The contract term of the customer (Month-to-month, One year, Two year)
* **PaperlessBilling**: Whether the customer has paperless billing or not (Yes, No)
* **PaymentMethod**: The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
* **MonthlyCharges**: The amount charged to the customer monthly
* **TotalCharges**: The total amount charged to the customer
* **Churn**: Whether the customer churned or not (Yes or No)

In [25]:
# Load the raw churn table. A forward slash is used as the path separator because
# it works on every OS; the previous "\c" was an invalid string escape that only
# happened to resolve to a directory separator on Windows.
df = pd.read_csv("telco-customer-churn/churn_df.csv")

In [26]:
# Preview the first rows to check that the CSV was parsed as expected.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Quick exploration of variables 

In [27]:
# Summary statistics. Only the numeric columns appear in the result, so every
# object-typed column (including TotalCharges) is absent from this table.
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [28]:
# Column dtypes and non-null counts. TotalCharges is reported as `object` rather
# than a numeric dtype, which is why it is coerced with pd.to_numeric during
# preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), obj

# Data Preprocessing 

To do's:

* Encode text variables in order to feed them to a neural network.
* Drop customerID as it is of no use 

In [29]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Shared preprocessing objects used by the cells below.

# Integer-codes the Yes/No style text columns (refit for each column it is applied to).
encoder = LabelEncoder()

# `categorical_features` was deprecated in scikit-learn 0.20 and removed in 0.22,
# so passing it fails on current releases; the argument is therefore dropped.
# NOTE(review): no cell shown in this notebook uses `onehotencoder` (one-hot
# encoding is done with pandas.get_dummies) - confirm before relying on it.
onehotencoder = OneHotEncoder()

# Rescales the charge columns to MinMaxScaler's default [0, 1] range.
scaler = MinMaxScaler()

In [30]:
# TotalCharges is read as text: coerce it to float. Entries that cannot be parsed
# become NaN and are then replaced by 0.0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0.0)

# Rescale both charge columns with the shared MinMaxScaler. The result is already
# float, so no further numeric conversion of MonthlyCharges is needed.
# NOTE(review): the scaler is fitted on the full table before the train/test split,
# and `tenure` is left on its original scale - confirm both are intended.
charge_cols = ['TotalCharges', 'MonthlyCharges']
df[charge_cols] = scaler.fit_transform(df[charge_cols])

# Integer-encode the target and the two-valued text columns into `<name>_encoded`.
# The shared encoder is refit for every column, so afterwards it only remembers the
# classes of the last column it was fitted on.
for col in ['Churn', 'gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']:
    df[col + '_encoded'] = encoder.fit_transform(df[col])

In [31]:
def dummy_creator(col_name,dataset):
    """Return one-hot indicator columns for a single categorical column.

    Parameters
    ----------
    col_name : str
        Name of the column in ``dataset`` to encode.
    dataset : pandas.DataFrame
        Frame containing ``col_name``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One indicator column per distinct value, named ``<col_name>_<value>``
        and sharing ``dataset``'s index. Joining the result back onto
        ``dataset`` is left to the caller.
    """
    # `prefix` builds the "<col_name>_<value>" names directly (default separator "_"),
    # replacing the manual rename of every generated column.
    return pd.get_dummies(dataset[col_name], prefix=col_name)



In [32]:
# One-hot encode each multi-valued text column and append its indicator columns to
# df, one column at a time and in the order listed.
# NOTE(review): 'TechSupport' is listed twice, so its indicator columns are appended
# twice (the duplicated names show up in the column listing further down). The entry
# is kept because the model cells hard-code input_dim=43, which counts those
# duplicates; remove the second entry together with that constant.
for column in ['TechSupport', 'MultipleLines', 'InternetService', 'OnlineSecurity',
               'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
               'StreamingMovies', 'Contract', 'PaymentMethod']:
    df = pd.concat([df, dummy_creator(column, df)], axis=1)

In [33]:
# Columns to remove: the customerID identifier plus every raw text column that now
# has an encoded or one-hot counterpart. Each name is listed once.
to_drop = [
    'customerID', 'gender', 'Churn', 'Partner', 'Dependents', 'PhoneService',
    'PaperlessBilling', 'TechSupport', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod',
]

# Preview the column names that would remain; df itself is not reassigned here.
df.drop(columns=to_drop).columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
       'Churn_encoded', 'gender_encoded', 'Partner_encoded',
       'Dependents_encoded', 'PhoneService_encoded',
       'PaperlessBilling_encoded', 'TechSupport_No',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'MultipleLines_No', 'MultipleLines_No phone service',
       'MultipleLines_Yes', 'InternetService_DSL',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No', 'OnlineSecurity_No internet service',
       'OnlineSecurity_Yes', 'OnlineBackup_No',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No', 'DeviceProtection_No internet service',
       'DeviceProtection_Yes', 'TechSupport_No',
       'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes'

In [34]:
# Apply the drop for real: from here on df holds only numeric model columns.
# Re-running this cell without re-running the cells above raises a KeyError,
# because the listed columns are already gone.
df = df.drop(columns=to_drop)

In [35]:
# Preview the processed frame: scaled charges, integer-encoded binary columns and
# the one-hot indicator columns.
df.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn_encoded,gender_encoded,Partner_encoded,Dependents_encoded,PhoneService_encoded,PaperlessBilling_encoded,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0.115423,0.003437,0,0,1,0,0,1,...,1,0,0,1,0,0,0,0,1,0
1,0,34,0.385075,0.217564,0,1,0,0,1,0,...,1,0,0,0,1,0,0,0,0,1
2,0,2,0.354229,0.012453,1,1,0,0,1,1,...,1,0,0,1,0,0,0,0,0,1
3,0,45,0.239303,0.211951,0,1,0,0,0,0,...,1,0,0,0,1,0,1,0,0,0
4,0,2,0.521891,0.017462,1,0,0,0,1,1,...,1,0,0,1,0,0,0,0,1,0


## Model implementation


In [36]:
# Separate the encoded target from the model inputs.
labels = df['Churn_encoded']
df_features = df.drop(columns=['Churn_encoded'])

In [37]:
# (rows, feature columns) of the model input table; the column count is the
# input width the first Dense layer has to match.
df_features.shape

(7043, 43)

In [38]:
### Splitting the datasets

from sklearn.model_selection import train_test_split

# Hold out 40% of the rows. Stratifying on the label keeps the churn ratio the same
# in both partitions, and the fixed seed makes the split (and every score computed
# from it) reproducible across kernel restarts.
# NOTE(review): the hold-out is named "test_validation" but is not split further
# into separate validation and test sets in the cells shown.
X_train, X_test_validation, y_train, y_test_validation = train_test_split(
    df_features, labels,
    stratify=labels,
    test_size=0.4,
    random_state=42)

In [39]:
# Show a small sample of the training features instead of the whole frame.
X_train.head(10)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_encoded,Partner_encoded,Dependents_encoded,PhoneService_encoded,PaperlessBilling_encoded,TechSupport_No,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
1349,0,51,0.312438,0.294002,0,1,0,0,1,1,...,0,0,1,1,0,0,0,0,1,0
4150,1,7,0.703980,0.069714,1,1,0,1,1,1,...,0,0,1,1,0,0,0,0,1,0
3710,0,10,0.021891,0.027911,1,1,1,1,0,0,...,0,1,0,0,1,0,0,0,0,1
1191,0,35,0.506468,0.286725,1,1,1,1,1,0,...,1,0,0,0,0,1,0,1,0,0
6625,1,12,0.726866,0.126025,0,0,0,1,1,1,...,1,0,0,1,0,0,0,0,1,0
4133,0,56,0.799502,0.642623,1,0,0,1,0,1,...,0,0,1,1,0,0,0,0,1,0
1354,0,18,0.697512,0.188755,0,0,0,1,1,0,...,1,0,0,1,0,0,0,0,1,0
215,0,57,0.402985,0.395801,0,1,0,0,1,1,...,0,0,1,1,0,0,1,0,0,0
6127,0,56,0.910945,0.703488,0,1,0,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4142,0,67,0.420398,0.445606,0,0,0,1,0,0,...,1,0,0,0,1,0,0,1,0,0


In [40]:
# Check which container type the training labels are held in.
type(y_train)

pandas.core.series.Series

In [41]:
# Baseline MLP: two 64-unit sigmoid hidden layers, each followed by 50% dropout,
# and a single sigmoid output unit whose value in (0, 1) is compared against the
# 0/1 churn label by the binary cross-entropy loss.
n_features = X_train.shape[1]  # input width taken from the data, not hard-coded

model = tf.keras.Sequential()
model.add(layers.Dense(64, input_dim=n_features, activation='sigmoid'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(64, activation='sigmoid'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=100,
          batch_size=32)

# evaluate() returns the loss followed by the compiled metrics; ending the cell
# with `score` displays it instead of silently discarding the result.
score = model.evaluate(X_test_validation, y_test_validation, batch_size=32)
score

InternalError: GPU sync failed

In [22]:
# MLP with two 64-unit sigmoid hidden layers and 50% dropout after each.
n_features = X_train.shape[1]  # input width taken from the data, not hard-coded

model = tf.keras.Sequential()
model.add(layers.Dense(64, input_dim=n_features, activation='sigmoid'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(64, activation='sigmoid'))
model.add(layers.Dropout(0.5))
# The output must be a sigmoid: 'binary_crossentropy' (and the accuracy metric's 0.5
# threshold) expect a probability in [0, 1], whereas the previous linear output is
# unbounded and made both the loss and the reported accuracy meaningless.
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=100,
          batch_size=32)

# evaluate() returns the loss followed by the compiled metrics; ending the cell
# with `score` displays it instead of silently discarding the result.
score = model.evaluate(X_test_validation, y_test_validation, batch_size=32)
score

InternalError: GPU sync failed

In [None]:
# Same architecture trained with Adam (learning rate 0.01) for 30 epochs.
n_features = X_train.shape[1]  # input width taken from the data, not hard-coded

model = tf.keras.Sequential()
model.add(layers.Dense(64, input_dim=n_features, activation='sigmoid'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(64, activation='sigmoid'))
model.add(layers.Dropout(0.5))
# The output must be a sigmoid: 'binary_crossentropy' (and the accuracy metric's 0.5
# threshold) expect a probability in [0, 1], whereas the previous linear output is
# unbounded and made both the loss and the reported accuracy meaningless.
model.add(layers.Dense(1, activation='sigmoid'))

# tf.keras.optimizers.Adam replaces the TF1-only tf.train.AdamOptimizer so the
# cell also runs on TensorFlow 2.x.
model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(0.01),
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=30,
          batch_size=32)

# evaluate() returns the loss followed by the compiled metrics; ending the cell
# with `score` displays it instead of silently discarding the result.
score = model.evaluate(X_test_validation, y_test_validation, batch_size=32)
score

In [None]:
# Re-compile the existing model with a smaller Adam learning rate (0.001).
# tf.keras.optimizers.Adam replaces the TF1-only tf.train.AdamOptimizer so the
# cell also runs on TensorFlow 2.x.
model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Continue training the current model on the complete feature table.
# NOTE(review): df_features/labels include the rows held out as
# X_test_validation/y_test_validation, so after this cell that split no longer
# gives an unbiased evaluation - confirm this final fit on all data is intended.
model.fit(df_features, labels, epochs=10, batch_size=64)

In [None]:
# Count the missing values in each column of the processed frame and print them
# under a short header.
missing_per_column = df.isna().sum()
print("No. of missing values: \n", missing_per_column)


In [None]:
# True if at least one cell anywhere in the processed frame is missing.
df.isna().values.any()