# Importing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

%matplotlib inline

In [2]:
churn_data = pd.read_csv('/content/drive/MyDrive/Dataset/churn_data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
churn_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Description about the data.
There are only three feature that are in numeric form

In [4]:
churn_data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


Total 21 columns, where 17 of them are categorical (object data type) columns excluding the 'customrID' column, as it doesn't have any importance. 3 numeric columns 2 of the are in 'int64' data type and 1 in 'float64' data type. The 'churn' column is the target variable. A categorical column consiting 'yes' as the customer churned (left the company) and 'no' as the customer did not churn (stays in the company) 

In [5]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


unique values that each feature holds, this was mainly done for the catgorical columns to observe how many categories each holds

In [6]:
for col in churn_data.columns[1:]:
    print("Unique values in '{}':\n".format(col), churn_data[col].unique())

Unique values in 'gender':
 ['Female' 'Male']
Unique values in 'SeniorCitizen':
 [0 1]
Unique values in 'Partner':
 ['Yes' 'No']
Unique values in 'Dependents':
 ['No' 'Yes']
Unique values in 'tenure':
 [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
Unique values in 'PhoneService':
 ['No' 'Yes']
Unique values in 'MultipleLines':
 ['No phone service' 'No' 'Yes']
Unique values in 'InternetService':
 ['DSL' 'Fiber optic' 'No']
Unique values in 'OnlineSecurity':
 ['No' 'Yes' 'No internet service']
Unique values in 'OnlineBackup':
 ['Yes' 'No' 'No internet service']
Unique values in 'DeviceProtection':
 ['No' 'Yes' 'No internet service']
Unique values in 'TechSupport':
 ['No' 'Yes' 'No internet service']
Unique values in 'StreamingTV':
 ['No' 'Yes' 'No internet service']
Unique values in 'StreamingMovies':
 ['No' 'Yes

No null values.

In [7]:
churn_data.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [8]:
categoricals = []
numericals = []

for col in churn_data.columns[1:]:
    if churn_data[col].dtype == 'float64' or churn_data[col].dtype == 'int64':
        numericals.append(col)
    else:
        categoricals.append(col)

In [9]:
# only three numeic features

numericals

['SeniorCitizen', 'tenure', 'MonthlyCharges']

In [10]:
categoricals

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'TotalCharges',
 'Churn']

In [11]:
# 17 categorical features, excluding the customerID

len(categoricals)

17

In [12]:
binary_cols = [col for col in categoricals if churn_data[col].nunique() == 2]
three_some_cols = [col for col in categoricals if churn_data[col].nunique() == 3]

In [13]:
three_some_cols 

['MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract']

In [14]:
# 9 categorical features that has 3 categories

len(three_some_cols)

9

In [15]:
binary_cols

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'PaperlessBilling',
 'Churn']

In [16]:
more_some_cols = [col for col in categoricals if churn_data[col].nunique() > 3]

In [17]:
more_some_cols

['PaymentMethod', 'TotalCharges']

In [18]:
# the TotalcCharges is mistyped as a categorical variable. need to fix that

churn_data['TotalCharges'].nunique()

6531

The TotalCharges column had numerical data mistyped as strings, it also had several rows without any values. In order to fix those we need their poistions and put some values.

In [19]:
indices = churn_data[churn_data['TotalCharges'] == ' '].index

In [20]:
for i in indices:
    churn_data['TotalCharges'][i] = '0' 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  churn_data['TotalCharges'][i] = '0'


In [21]:
churn_data[churn_data['TotalCharges'] == ' ']

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


In [22]:
churn_data['TotalCharges'] = churn_data['TotalCharges'].astype('float')

In [23]:
churn_data[['TotalCharges']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   TotalCharges  7043 non-null   float64
dtypes: float64(1)
memory usage: 55.1 KB


In [24]:
indices = churn_data[churn_data['TotalCharges'] == 0].index

These are the indices where we replaced the empty rows of TotalCharges with the value 0

In [25]:
indices

Int64Index([488, 753, 936, 1082, 1340, 3331, 3826, 4380, 5218, 6670, 6754], dtype='int64')

In [26]:


# for x in churn_data['TotalCharges']:
#     print(x)

In [27]:
churn_data['PaymentMethod'].nunique()

4