In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
# lets load the data in df variable

In [23]:
df = pd.read_csv("/content/Telco-Customer-Churn.csv")

In [24]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [25]:
df.shape

(7043, 21)

In [26]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

- The dataset contains 7043 customers and 21 features.
- each row represent a unique customer with various features


In [27]:
df.dtypes

Unnamed: 0,0
customerID,object
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
MultipleLines,object
InternetService,object
OnlineSecurity,object


- since 'TotalCharges' is stored as object, which means it represent a numerical values but may be showing an error. it may contain missing values or be formated incorrectly . so letts check.

In [28]:
df['TotalCharges'].isnull().sum()

np.int64(0)

In [29]:
# so there are no missing values - lets change it to numeric

In [30]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors = "coerce")

In [31]:
df['TotalCharges'].dtypes

dtype('float64')

In [32]:
#lets check for missing values in dataset

In [33]:
df.isnull().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


- only one columns 'TotalCharges' has missing values (11 entries)
- possible ways to handle missing values
- fill them with median
- fill them with zero
- drop these 11 rows
-  so better way we replace with median

In [34]:
df['TotalCharges'].describe()

Unnamed: 0,TotalCharges
count,7032.0
mean,2283.300441
std,2266.771362
min,18.8
25%,401.45
50%,1397.475
75%,3794.7375
max,8684.8


In [35]:
df['TotalCharges'].median()

1397.475

In [36]:
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace = True)


In [37]:
df['TotalCharges'].isnull().sum()

np.int64(0)

In [38]:
#lets check for senior setizens also

In [39]:
df['SeniorCitizen'].unique()

array([0, 1])

In [40]:
df['SeniorCitizen'] = df['SeniorCitizen'].replace({1:"Yes", 0:"No"})

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   object 
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [42]:
df.duplicated().sum()

np.int64(0)

In [43]:
# lets check all the object columns for is there extra spaces
#, qutations are avaliable or not

In [44]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [45]:
categorical_column = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
      'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod',  'Churn']

In [48]:
NAME = 'Rahul'
age = 33
print(NAME,"age is",age)
print(f"{NAME} age is {age}")

Rahul age is 33
Rahul age is 33


In [51]:
for i in categorical_column:
  print(f"Unique Values in {i} is", df[i].unique(),"\n")

Unique Values in gender is ['Female' 'Male'] 

Unique Values in SeniorCitizen is ['No' 'Yes'] 

Unique Values in Partner is ['Yes' 'No'] 

Unique Values in Dependents is ['No' 'Yes'] 

Unique Values in PhoneService is ['No' 'Yes'] 

Unique Values in MultipleLines is ['No phone service' 'No' 'Yes'] 

Unique Values in InternetService is ['DSL' 'Fiber optic' 'No'] 

Unique Values in OnlineSecurity is ['No' 'Yes' 'No internet service'] 

Unique Values in OnlineBackup is ['Yes' 'No' 'No internet service'] 

Unique Values in DeviceProtection is ['No' 'Yes' 'No internet service'] 

Unique Values in TechSupport is ['No' 'Yes' 'No internet service'] 

Unique Values in StreamingTV is ['No' 'Yes' 'No internet service'] 

Unique Values in StreamingMovies is ['No' 'Yes' 'No internet service'] 

Unique Values in Contract is ['Month-to-month' 'One year' 'Two year'] 

Unique Values in PaperlessBilling is ['Yes' 'No'] 

Unique Values in PaymentMethod is ['Electronic check' 'Mailed check' 'Bank transfer