In [18]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from sklearn.model_selection import train_test_split

In [19]:
df = pd.read_csv('E:\SnehaWork\AIProject\Dataset\Customer-Churn.csv')

In [17]:
Churned = df[df.Churn == 'Yes']
NonChurned = df[df.Churn == 'No']

number_of_Churned = len(Churned)
number_of_NonChurned = len(NonChurned)

Churned_percentage = round((number_of_Churned / (number_of_Churned + number_of_NonChurned)) * 100, 2)
valid_percentage = round((number_of_NonChurned / (number_of_Churned + number_of_NonChurned)) * 100, 2)
print(number_of_NonChurned,number_of_Churned)
df.columns.tolist()

5174 1869


['SeniorCitizen',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'TechSupport',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn']

In [6]:
#df.dtypes
#count yes and no on base data, train and test

In [21]:
list_of_cols = ['customerID',
 'gender',
 'Partner',
 'Dependents',
 'OnlineBackup',
 'DeviceProtection',
 'StreamingTV',
 'StreamingMovies',
 'MonthlyCharges',
 'TotalCharges']
df.drop(list_of_cols,axis='columns',inplace=True)

KeyError: "['customerID' 'gender' 'Partner' 'Dependents' 'OnlineBackup'\n 'DeviceProtection' 'StreamingTV' 'StreamingMovies' 'MonthlyCharges'\n 'TotalCharges'] not found in axis"

In [16]:
#error='coerce' ignores the errors. Conversion for all except for error values i.e. empty spaces. 
#.isnull() gives columns which have empty spaces
#shows the data which has TotalCharges as null. Good to drop these

df[pd.to_numeric(df.TotalCharges,errors='coerce').isnull()].shape

AttributeError: 'DataFrame' object has no attribute 'TotalCharges'

In [9]:
#Drop rows with TotalCharges as null

df1 = df[df.TotalCharges!=' ']

In [11]:
df1.TotalCharges = pd.to_numeric(df1.TotalCharges)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [34]:
df.replace('No internet service','No',inplace=True)
df.replace('No phone service','No',inplace=True)

In [43]:
#replace Yes/No to 1/0 since ML doesn't understand text

yes_no_columns = ['PhoneService','MultipleLines','OnlineSecurity',
                  'TechSupport','PaperlessBilling','Churn']

for col in yes_no_columns:
    df[col].replace({'Yes': 1,'No': 0},inplace=True)
    
#df['MultipleLines'].replace({'Yes': 1,'No': 0},inplace=True)   

In [18]:
df1['gender'].replace({'Male':1,'Female':0},inplace=True)


In [24]:
#One hot encoding for multiclass i.e. InternetService, Contract, PaymentMethod

multiclass_col=['InternetService', 'Contract', 'PaymentMethod']
df = pd.get_dummies(data=df,columns=multiclass_col)
 

In [36]:
#scaling the values between 1 and 0 for below
cols_to_scale = ['tenure']
                 #,'MonthlyCharges','TotalCharges']

#MinMaxScaler scales the data between 0 and 1
scaler = MinMaxScaler()

df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])


In [25]:
def print_unique_col_values(df):
    for column in df:
        print(f'{column} : {df[column].unique()}')

In [44]:
print_unique_col_values(df)

SeniorCitizen : [0 1]
tenure : [0.01388889 0.47222222 0.02777778 0.625      0.11111111 0.30555556
 0.13888889 0.38888889 0.86111111 0.18055556 0.22222222 0.80555556
 0.68055556 0.34722222 0.95833333 0.72222222 0.98611111 0.29166667
 0.16666667 0.41666667 0.65277778 1.         0.23611111 0.375
 0.06944444 0.63888889 0.15277778 0.97222222 0.875      0.59722222
 0.20833333 0.83333333 0.25       0.91666667 0.125      0.04166667
 0.43055556 0.69444444 0.88888889 0.77777778 0.09722222 0.58333333
 0.48611111 0.66666667 0.40277778 0.90277778 0.52777778 0.94444444
 0.44444444 0.76388889 0.51388889 0.5        0.56944444 0.08333333
 0.05555556 0.45833333 0.93055556 0.31944444 0.79166667 0.84722222
 0.19444444 0.27777778 0.73611111 0.55555556 0.81944444 0.33333333
 0.61111111 0.26388889 0.75       0.70833333 0.36111111 0.
 0.54166667]
PhoneService : [0 1]
MultipleLines : [0 1]
OnlineSecurity : [0 1]
TechSupport : [0 1]
PaperlessBilling : [1 0]
Churn : [0 1]
InternetService_DSL : [1 0]
InternetServ

In [45]:
#Drop churn 
x = df.drop('Churn',axis=1)  #check axis=0/1
y = df['Churn']
y.value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

In [46]:
#test and train split. 80% for training and 20% for test

X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=5)

In [47]:
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.2,random_state=5)

In [52]:
y_train.value_counts()

0    3329
1    1171
Name: Churn, dtype: int64

In [53]:
y_val.value_counts()

0    835
1    290
Name: Churn, dtype: int64

In [48]:
y_test.value_counts()
num_of_cols = len(x.columns)
num_of_cols

17

In [49]:
num_of_cols = len(x.columns)
X_train=X_train.values.reshape(X_train.shape[0],num_of_cols, 1).astype('float32') 
X_test=X_test.values.reshape(X_test.shape[0],num_of_cols, 1).astype('float32')
X_val=X_val.values.reshape(X_val.shape[0],num_of_cols, 1).astype('float32')

#sometimes works as X_val.values.reshape(X_val.shape[0],26, 1).astype('float32')

In [56]:
X_test.shape

(1407, 26, 1)

In [57]:
X_val.shape

(1125, 26, 1)

In [58]:
X_train.shape

(4500, 26, 1)

In [51]:
Name = "CNN_{}".format(int(time.time())) 
log_dir="E:\\SnehaWork\\AIProject\\TF_Logs\\{}".format(Name)
cnn = models.Sequential([
    layers.Conv1D(filters=128, kernel_size=3, activation='relu', input_shape=(num_of_cols,1)),  
    layers.MaxPooling1D(pool_size=3),
    
    layers.Conv1D(filters=128, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=3),
    
    layers.Flatten(),
    layers.Dense(20, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

#cnn.summary()
tb_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,histogram_freq=0)

cnn.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])

cnn.fit(X_train, y_train, validation_data=(X_val,y_val), batch_size=375,epochs=70, callbacks=[tb_callback])

Train on 4507 samples, validate on 1127 samples
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70


Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


<tensorflow.python.keras.callbacks.History at 0x20576505088>

In [52]:
cnn.evaluate(X_test,y_test,batch_size=375)



[0.4450469503934033, 0.7835344]

In [21]:
#conda install tensorboard

In [88]:
%load_ext tensorboard
%tensorboard --logdir log_dir
#The tensorboard extension is already loaded. To reload it, use:
#%reload_ext tensorboard
#%tensorboard --logdir log_dir

#taskkill /F /PID 15092

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 15092), started 2 days, 22:58:06 ago. (Use '!kill 15092' to kill it.)

ValueError: Found array with dim 3. Estimator expected <= 2.

In [52]:
X_train

array([[[1.],
        [0.],
        [1.],
        ...,
        [0.],
        [1.],
        [0.]],

       [[0.],
        [0.],
        [1.],
        ...,
        [0.],
        [1.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [1.]],

       ...,

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [1.]],

       [[0.],
        [0.],
        [1.],
        ...,
        [0.],
        [0.],
        [0.]],

       [[1.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]]], dtype=float32)