In [1]:
import numpy as np
import pandas as pd
import seaborn as sb

In [2]:
df = pd.read_csv('Dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
# Customer ID is removed, because it is a unique identifier of customer and no conclusion can be drawn from it
df = df.drop(['customerID'], axis=1)

In [5]:
# Check for null values
df.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
# Categorical values in dataset need to be replaced by numerical values (currently manual replacement is done, needs to be switched in favour of encoding):
# gender : male, female -> (0, 1)

df['gender'] = df['gender'].replace(['Male', 'Female'], [0, 1])

In [7]:
# Partner : yes, no -> (0, 1)
df['Partner'] = df['Partner'].replace(['No', 'Yes'], [0, 1])

In [8]:
# Dependents : No, Yes -> 0, 1
df['Dependents'] = df['Dependents'].replace(['No', 'Yes'], [0, 1])

In [9]:
# PhoneService : No, Yes -> 0, 1
df['PhoneService'] = df['PhoneService'].replace(['No', 'Yes'], [0, 1])

In [10]:
# MultipleLines : No, Yes, No Phone Service -> 0, 1, 2
df['MultipleLines'] = df['MultipleLines'].replace(['No', 'Yes', 'No phone service'], [0, 1, 2])

In [11]:
df['InternetService'].value_counts()

# InternetService : Fiber optic, DSL, no -> 0, 1, 2
df['InternetService'] = df['InternetService'].replace(['Fiber optic', 'DSL', 'no'], [0, 1, 2])

In [12]:
# OnlineSecurity : No, Yes, No internet service -> 0, 1, 2
# Online backup : No, Yes, No internet service -> 0, 1, 2
# Device Protected : No, Yes, No internet service -> 0, 1, 2
# TechSupport : No, Yes, No internet service -> 0, 1, 2
# StreamingTV : No, Yes, No internet service -> 0, 1, 2
# StreamingMovies : No, Yes, No internet service -> 0, 1, 2
df['OnlineSecurity'] = df['OnlineSecurity'].replace(['No', 'Yes', 'No internet service'], [0, 1, 2])
df['OnlineBackup'] = df['OnlineBackup'].replace(['No', 'Yes', 'No internet service'], [0, 1, 2])
df['DeviceProtection'] = df['OnlineBackup'].replace(['No', 'Yes', 'No internet service'], [0, 1, 2])
df['TechSupport'] = df['OnlineBackup'].replace(['No', 'Yes', 'No internet service'], [0, 1, 2])
df['StreamingTV'] = df['OnlineBackup'].replace(['No', 'Yes', 'No internet service'], [0, 1, 2])
df['StreamingMovies'] = df['OnlineBackup'].replace(['No', 'Yes', 'No internet service'], [0, 1, 2])

In [13]:
df['Contract'].value_counts()

# Contract : Month-to-month, One year, Two year -> 0, 1, 2
df['Contract'] = df['Contract'].replace(['Month-to-month', 'One year', 'Two year'], [0, 1, 2])

In [14]:
df['PaperlessBilling'].value_counts()

# PaperlessBilling : No, Yes -> 0, 1
df['PaperlessBilling'] = df['PaperlessBilling'].replace(['No', 'Yes'], [0, 1])

In [15]:
df['PaymentMethod'].value_counts()

# PaymentMethod : Electronic Check, Mailed check, Bank transfer (automatic), Credit card -> 0, 1, 2, 3
df['PaymentMethod'] = df['PaymentMethod'].replace(['Electronic check', 'Mailed check', 'Bank transfer (automatic)', 'Credit card (automatic)'], [0, 1, 2, 3])

In [16]:
# Churn : No, Yes -> 0, 1
df['Churn'] = df['Churn'].replace(['No', 'Yes'], [0, 1])

In [17]:
# Now that all the values are numerical, we can check the correlation coefficients.
df.corrwith(df["Churn"]).abs().sort_values()

gender              0.008612
PhoneService        0.011942
MultipleLines       0.019423
Partner             0.150448
SeniorCitizen       0.150889
Dependents          0.164221
PaperlessBilling    0.191825
MonthlyCharges      0.193356
PaymentMethod       0.262818
StreamingMovies     0.291449
StreamingTV         0.291449
OnlineBackup        0.291449
DeviceProtection    0.291449
TechSupport         0.291449
OnlineSecurity      0.332819
tenure              0.352229
Contract            0.396713
Churn               1.000000
dtype: float64

In [18]:
# Here, gender, phone service and multiple lines do not contribute much towards the target variable, as thier correlation coeffs are extremely low.
# Hence, removing these columns from the data frame.
df = df.drop(['gender', 'PhoneService', 'MultipleLines'], axis=1)

In [19]:
# Data frame after basic data processing.
df

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,1,0,1,1,0,1,1,1,1,1,0,1,0,29.85,29.85,0
1,0,0,0,34,1,1,0,0,0,0,0,1,0,1,56.95,1889.5,0
2,0,0,0,2,1,1,1,1,1,1,1,0,1,1,53.85,108.15,1
3,0,0,0,45,1,1,0,0,0,0,0,1,0,2,42.30,1840.75,0
4,0,0,0,2,0,0,0,0,0,0,0,0,1,0,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,1,24,1,1,0,0,0,0,0,1,1,1,84.80,1990.5,0
7039,0,1,1,72,0,0,1,1,1,1,1,1,1,3,103.20,7362.9,0
7040,0,1,1,11,1,1,0,0,0,0,0,0,1,0,29.60,346.45,0
7041,1,1,0,4,0,0,0,0,0,0,0,0,1,1,74.40,306.6,1
