In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

In [2]:
# loadind the dataset
data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Preprocessing data: dropping irrelevant features
data = data.drop(['customerID'],axis=1)
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
# Since we are explicitly modeling churn only for internet users we can replace "No internet service" with "No"
data.replace("No internet service", "No", inplace=True)

# similarly no phone service with no
data.replace("No phone service", "No", inplace=True)

In [5]:
# Observation: there are missing values in Total Charges column : Handling missing values using median
# Step 1: Replace empty strings or spaces with NaN
data['TotalCharges'] = data['TotalCharges'].replace(r'^\s*$', float("nan"), regex=True)

# Step 2: Convert to float
data['TotalCharges'] = data['TotalCharges'].astype(float)

# Step 3: Handle missing values using median
data['TotalCharges'].fillna(data['TotalCharges'].median(), inplace=True)  # Filling NaN with median value

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['TotalCharges'].fillna(data['TotalCharges'].median(), inplace=True)  # Filling NaN with median value


In [6]:
# Columns to encode
columns_to_encode = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 
                     'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 
                     'PaperlessBilling', 'Churn']

# Dictionary to store LabelEncoders for each column
label_encoders = {}

# Apply Label Encoding
for column in columns_to_encode:
    le = LabelEncoder()  # Create a new LabelEncoder for each column
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le  # Store the encoder for future use

In [7]:
# To keep track of the mappings (important for decoding later):
label_mappings = {}
for column in columns_to_encode:
    label_mappings[column] = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_mappings) # To see the encoded values and their original values.

{'gender': {'No': 0, 'Yes': 1}, 'SeniorCitizen': {'No': 0, 'Yes': 1}, 'Partner': {'No': 0, 'Yes': 1}, 'Dependents': {'No': 0, 'Yes': 1}, 'PhoneService': {'No': 0, 'Yes': 1}, 'MultipleLines': {'No': 0, 'Yes': 1}, 'OnlineSecurity': {'No': 0, 'Yes': 1}, 'OnlineBackup': {'No': 0, 'Yes': 1}, 'DeviceProtection': {'No': 0, 'Yes': 1}, 'TechSupport': {'No': 0, 'Yes': 1}, 'StreamingTV': {'No': 0, 'Yes': 1}, 'StreamingMovies': {'No': 0, 'Yes': 1}, 'PaperlessBilling': {'No': 0, 'Yes': 1}, 'Churn': {'No': 0, 'Yes': 1}}


In [8]:
print(data.head())

   gender  SeniorCitizen  Partner  Dependents  tenure  PhoneService  \
0       0              0        1           0       1             0   
1       1              0        0           0      34             1   
2       1              0        0           0       2             1   
3       1              0        0           0      45             0   
4       0              0        0           0       2             1   

   MultipleLines InternetService  OnlineSecurity  OnlineBackup  \
0              0             DSL               0             1   
1              0             DSL               1             0   
2              0             DSL               1             1   
3              0             DSL               1             0   
4              0     Fiber optic               0             0   

   DeviceProtection  TechSupport  StreamingTV  StreamingMovies  \
0                 0            0            0                0   
1                 1            0            

In [9]:
# One hot encoding for columns where there is more than 2 categories 
from sklearn.preprocessing import OneHotEncoder

# List of categorical columns to One-Hot Encode
categorical_columns = ['InternetService', 'Contract', 'PaymentMethod']

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(sparse_output=False, drop=None)  # drop='first' avoids dummy variable trap

# Apply OHE
encoded_data = onehot_encoder.fit_transform(data[categorical_columns])

In [10]:
# Convert encoded data to DataFrame
encoded_df = pd.DataFrame(encoded_data, columns=onehot_encoder.get_feature_names_out(categorical_columns))

# Drop original categorical columns and concatenate encoded data
data = data.drop(columns=categorical_columns).reset_index(drop=True)
data = pd.concat([data, encoded_df], axis=1)

In [11]:
# Display the updated dataset
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,0,0,1,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,0,0,0,34,1,0,1,0,1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1,0,0,0,2,1,0,1,1,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,0,0,0,45,0,0,1,0,1,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0,0,0,0,2,1,0,0,0,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
# Saving Label Encoders as a .pkl file
with open("label_encoders.pkl", "wb") as file:
    pickle.dump(label_encoders, file)

# Save the OneHotEncoder as a .pkl file
with open("onehot_encoder.pkl", "wb") as file:
    pickle.dump(onehot_encoder, file)

In [13]:
print(data.dtypes)

gender                                       int32
SeniorCitizen                                int64
Partner                                      int32
Dependents                                   int32
tenure                                       int64
PhoneService                                 int32
MultipleLines                                int32
OnlineSecurity                               int32
OnlineBackup                                 int32
DeviceProtection                             int32
TechSupport                                  int32
StreamingTV                                  int32
StreamingMovies                              int32
PaperlessBilling                             int32
MonthlyCharges                             float64
TotalCharges                               float64
Churn                                        int32
InternetService_DSL                        float64
InternetService_Fiber optic                float64
InternetService_No             

In [14]:
# Dividing the dataset into independent and dependent feature
X = data.drop('Churn',axis=1)
y = data['Churn'] # ensure 1d array

# Splitting data into training and testing dataset
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# scaling the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
X_train

array([[-1.02516569, -0.4377492 , -0.96957859, ..., -0.52765585,
        -0.70964983,  1.84247002],
       [-1.02516569, -0.4377492 , -0.96957859, ..., -0.52765585,
        -0.70964983, -0.54274967],
       [ 0.97545208, -0.4377492 ,  1.03137591, ..., -0.52765585,
         1.40914569, -0.54274967],
       ...,
       [ 0.97545208, -0.4377492 ,  1.03137591, ..., -0.52765585,
         1.40914569, -0.54274967],
       [ 0.97545208,  2.28441306, -0.96957859, ..., -0.52765585,
         1.40914569, -0.54274967],
       [ 0.97545208, -0.4377492 , -0.96957859, ...,  1.89517467,
        -0.70964983, -0.54274967]])

In [16]:
X_test

array([[-1.02516569, -0.4377492 ,  1.03137591, ..., -0.52765585,
         1.40914569, -0.54274967],
       [ 0.97545208, -0.4377492 , -0.96957859, ..., -0.52765585,
        -0.70964983, -0.54274967],
       [-1.02516569, -0.4377492 ,  1.03137591, ..., -0.52765585,
        -0.70964983,  1.84247002],
       ...,
       [ 0.97545208, -0.4377492 ,  1.03137591, ..., -0.52765585,
        -0.70964983, -0.54274967],
       [-1.02516569, -0.4377492 ,  1.03137591, ...,  1.89517467,
        -0.70964983, -0.54274967],
       [ 0.97545208, -0.4377492 , -0.96957859, ..., -0.52765585,
        -0.70964983,  1.84247002]])

In [17]:
with open("scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)

Conclusion: 
1. Drop Irrelevant features (Customer_ID)
2. We are explicitly modeling churn only for internet users hence replaced "No internet service" with "No" similarly "No phone service" with "No"
3. Handled missing values in Total_charges feature using median
4. Did label encoding for features with 2 class values
5. Did one hot encoding for features greater than 2 class values
6. Created pickle files for the same

## ANN Implementation

In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime

In [21]:
(X_train.shape[1],) # this is the input feature size this many no of nodes will be there at input layer

(26,)

In [44]:
# Building ANN model
model = Sequential([
    Dense(64,activation='relu',input_shape=(X_train.shape[1],)), # HL1 connected to input layer
    Dense(32,activation='relu'),#hl2
    Dense(1,activation='sigmoid') # output layer
]
)

In [45]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_13 (Dense)            (None, 64)                1728      
                                                                 
 dense_14 (Dense)            (None, 32)                2080      
                                                                 
 dense_15 (Dense)            (None, 1)                 33        
                                                                 
Total params: 3841 (15.00 KB)
Trainable params: 3841 (15.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [46]:
import tensorflow
opt = tensorflow.keras.optimizers.Adam(learning_rate=0.01)
loss = tensorflow.keras.losses.BinaryCrossentropy()
loss

<keras.src.losses.BinaryCrossentropy at 0x2752dafe250>

In [47]:
# compile the model
model.compile(optimizer="adam",loss="binary_crossentropy",metrics=['accuracy'])

In [63]:
# set up the tensorboard
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorflow_callback = TensorBoard(log_dir=log_dir,histogram_freq=1)

In [64]:
# set up Early stopping - loss value not decreasing then stop training
early_stopping_callback = EarlyStopping(monitor='val_loss',patience=10,restore_best_weights=True)

In [65]:
# training the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    callbacks=[early_stopping_callback, tensorflow_callback]  # adding early stopping and tensorboard callbacks
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


In [66]:
model.save('model.h5')

  saving_api.save_model(


In [67]:
# Load Tensorboard Extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [68]:
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6006 (pid 4060), started 0:05:23 ago. (Use '!kill 4060' to kill it.)

Prediction with trained ANN model