# Importing the data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the raw dataset from 'data.csv' (path is relative to the working
# directory) and preview its first rows.
data = pd.read_csv('data.csv')
data.head()

In [None]:
data.tail()

In [None]:
data.info()

In [None]:
data.describe()

# Missing data

In [None]:
# Checking for missing data
data.isnull().sum()

In [None]:
# Checking column 'OnlineSecurity' as it shows 17 counts in isnull
data['OnlineSecurity']

In [None]:
data['OnlineSecurity'].value_counts()

##### Column OnlineSecurity is categorical and 50% of the people have 'No' as its value, so the missing values are replaced with 'No', i.e. the mode of the column is used for data imputation.

In [None]:
# Fill the gaps in OnlineSecurity with the column's most frequent value (its mode).
data = data.fillna({'OnlineSecurity': data['OnlineSecurity'].mode()[0]})

In [None]:
data.isnull().sum()

#### We have no more missing values!

# Checking for duplicate entries in data

In [None]:
data.duplicated().value_counts()

##### We have found some duplicate rows in the data, so we remove them with drop_duplicates()

In [None]:
# Removing the duplicate rows: fully identical rows are dropped and `data`
# itself is modified (inplace=True), so nothing is returned.
data.drop_duplicates(inplace=True)

In [None]:
data.duplicated().value_counts()

In [None]:
data.duplicated().value_counts()

##### All duplicate rows removed!

# Changing Datatype of columns 

In [None]:
data.info()

In [None]:
# Currently the TotalCharges column has the 'object' datatype.
# Try the direct cast to float first. It is expected to fail because some
# entries are non-numeric strings; the failure is reported instead of raised so
# that "Run All" can continue to the cells that repair the column.
try:
    data['TotalCharges'] = data['TotalCharges'].astype('float64')
except ValueError as err:
    print('Cannot cast TotalCharges to float64: {}'.format(err))
data.info()

#### This means TotalCharges contains some erroneous entries that are strings instead of numeric values
#### Fixing that

# Fixing inconsistencies in data

In [None]:
data['TotalCharges'].map(type).value_counts()

In [None]:
# Parse TotalCharges as numbers. errors='coerce' turns every entry that cannot
# be parsed into NaN instead of raising, and downcast='float' stores the result
# in the smallest float dtype pandas considers safe.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce', downcast='float')


In [None]:
data['TotalCharges'].map(type).value_counts()

In [None]:
data.head()

# Outlier removal

In [None]:
data.head()

In [None]:
data.info()

In [None]:
sns.pairplot(data=data)

In [None]:
# We have observed outliers in:
# tenure vs MonthlyCharges
# MonthlyCharges vs TotalCharges

In [None]:
# MonthlyCharges seems to be contributing to the outlier entries:
# every entry with a monthly charge of 300 or more is treated as an outlier
# and removed. (`>= 300` selects exactly the rows that
# `between(300, <column max>)` selected.)
outliers = data['MonthlyCharges'] >= 300

# Filter and drop the incomplete rows in one chained expression so that `data`
# is rebound to a fresh frame. Calling dropna(inplace=True) on the filtered
# selection can raise SettingWithCopyWarning.
data = data[~outliers].dropna(axis=0)
data.isnull().sum()

In [None]:
sns.pairplot(data=data)

In [None]:
data.info()

In [None]:
data.isnull().sum()

In [None]:
# outliers removed!

# Addressing Inconsistencies in Categorical Data: Category Merge

##### There are multiple columns that have redundant category names which can be merged into one. This will reduce the complexity of the data to some extent.

In [None]:
data.head()

#### Column "MultipleLines" Category Merge

In [None]:
# Each column below is handled by a pair of cells: the first selects the column
# and shows its category counts, the second folds the redundant "no service"
# label into plain 'No' and shows the counts again.
# NOTE: the second cell of every pair reads the `column` variable set by the
# first one, so the cells must be run in order.
column = 'MultipleLines'
data[column].value_counts()

In [None]:
# 'No phone service' carries the same information as 'No' for this column.
data.loc[data[column] == 'No phone service', column] = 'No'
data[column].value_counts()

#### Column "OnlineSecurity" Category Merge

In [None]:
column = 'OnlineSecurity'
data[column].value_counts()

In [None]:
# 'No internet service' is merged into 'No'.
data.loc[data[column] == 'No internet service', column] = 'No'
data[column].value_counts()

#### Column "OnlineBackup" Category Merge

In [None]:
column = 'OnlineBackup'
data[column].value_counts()

In [None]:
# 'No internet service' is merged into 'No'.
data.loc[data[column] == 'No internet service', column] = 'No'
data[column].value_counts()

#### Column "DeviceProtection" Category Merge

In [None]:
column = 'DeviceProtection'
data[column].value_counts()

In [None]:
# 'No internet service' is merged into 'No'.
data.loc[data[column] == 'No internet service', column] = 'No'
data[column].value_counts()

#### Column "TechSupport" Category Merge

In [None]:
column = 'TechSupport'
data[column].value_counts()

In [None]:
# 'No internet service' is merged into 'No'.
data.loc[data[column] == 'No internet service', column] = 'No'
data[column].value_counts()

#### Column "StreamingTV" Category Merge

In [None]:
column = 'StreamingTV'
data[column].value_counts()

In [None]:
# 'No internet service' is merged into 'No'.
data.loc[data[column] == 'No internet service', column] = 'No'
data[column].value_counts()

#### Column "StreamingMovies" Category Merge

In [None]:
column = 'StreamingMovies'
data[column].value_counts()

In [None]:
# 'No internet service' is merged into 'No'.
data.loc[data[column] == 'No internet service', column] = 'No'
data[column].value_counts()

In [None]:
data.head()

# Label Encoding Categorical Columns

#### Column: Partner

In [None]:
# Every column in this section is encoded by a pair of cells: the first picks
# the column, defines the Yes/No -> 1/0 mapping and shows the current counts;
# the second applies the mapping and shows the counts again.
# NOTE: Series.map() yields NaN for any value that is not a key of `mapping`,
# so a column that still holds other labels would end up with missing values.
# NOTE: the second cell of each pair reads `column` and `mapping` from the
# first one, so the cells must be run in order.
column = 'Partner'
mapping = {'Yes': 1, 'No': 0}
data[column].value_counts()

In [None]:
# Apply the Yes/No -> 1/0 mapping to the selected column.
data[column] = data[column].map(mapping)
data[column].value_counts()

#### Column: Dependents

In [None]:
column = 'Dependents'
mapping = {'Yes': 1, 'No': 0}
data[column].value_counts()

In [None]:
data[column] = data[column].map(mapping)
data[column].value_counts()

#### Column: PhoneService

In [None]:
column = 'PhoneService'
mapping = {'Yes': 1, 'No': 0}
data[column].value_counts()

In [None]:
data[column] = data[column].map(mapping)
data[column].value_counts()

#### Column: MultipleLines

In [None]:
column = 'MultipleLines'
mapping = {'Yes': 1, 'No': 0}
data[column].value_counts()

In [None]:
data[column] = data[column].map(mapping)
data[column].value_counts()

#### Column: OnlineSecurity

In [None]:
column = 'OnlineSecurity'
mapping = {'Yes': 1, 'No': 0}
data[column].value_counts()

In [None]:
data[column] = data[column].map(mapping)
data[column].value_counts()

#### Column: OnlineBackup

In [None]:
column = 'OnlineBackup'
mapping = {'Yes': 1, 'No': 0}
data[column].value_counts()

In [None]:
data[column] = data[column].map(mapping)
data[column].value_counts()

#### Column: DeviceProtection

In [None]:
column = 'DeviceProtection'
mapping = {'Yes': 1, 'No': 0}
data[column].value_counts()

In [None]:
data[column] = data[column].map(mapping)
data[column].value_counts()

#### Column: TechSupport

In [None]:
column = 'TechSupport'
mapping = {'Yes': 1, 'No': 0}
data[column].value_counts()

In [None]:
data[column] = data[column].map(mapping)
data[column].value_counts()

#### Column: StreamingTV

In [None]:
column = 'StreamingTV'
mapping = {'Yes': 1, 'No': 0}
data[column].value_counts()

In [None]:
data[column] = data[column].map(mapping)
data[column].value_counts()

#### Column: StreamingMovies

In [None]:
column = 'StreamingMovies'
mapping = {'Yes': 1, 'No': 0}
data[column].value_counts()

In [None]:
data[column] = data[column].map(mapping)
data[column].value_counts()

#### Column: PaperlessBilling

In [None]:
column = 'PaperlessBilling'
mapping = {'Yes': 1, 'No': 0}
data[column].value_counts()

In [None]:
data[column] = data[column].map(mapping)
data[column].value_counts()

#### Column: Churn

In [None]:
column = 'Churn'
mapping = {'Yes': 1, 'No': 0}
data[column].value_counts()

In [None]:
data[column] = data[column].map(mapping)
data[column].value_counts()

In [None]:
data.head()

# Label Encoding for gender:

##### Binary-encoding gender so that the column effectively answers "is male?" (Male = 1, Female = 0)

In [None]:
# Male -> 1, Female -> 0. Series.map() turns any value that is not a key of
# `mapping` into NaN.
mapping = {'Male': 1, 'Female': 0}
data['gender'].value_counts()

In [None]:
# Apply the mapping (reads `mapping` from the previous cell) and re-check the counts.
data['gender'] = data['gender'].map(mapping)
data['gender'].value_counts()

# One-hot encoding for categorical features

#### One-hot encoding for column InternetService

In [None]:
# Converting categorical to numerical: Fetching categories of Internet Service
data['InternetService'].value_counts()

In [None]:
# Performing one-hot encoding
source_column = 'InternetService'
# Remember where the original column sits so the indicator columns can take its
# place, instead of relying on a hard-coded index and a fixed category count.
position = data.columns.get_loc(source_column)
n_kept = data.shape[1] - 1  # every column except the one being encoded
# dtype='uint8' keeps the indicators numeric (0/1) on every pandas version;
# recent releases would otherwise produce booleans.
data = pd.get_dummies(data, columns=[source_column], prefix=source_column, dtype='uint8')

# get_dummies appends the indicator columns at the end of the frame:
# move them back to the position of the original column
columns = data.columns.tolist()
kept, dummies = columns[:n_kept], columns[n_kept:]
columns = kept[:position] + dummies + kept[position:]
data = data[columns]

# Renaming newly created columns
mapping = {'InternetService_DSL': 'IntrntSrvc_DSL',
           'InternetService_Fiber optic': 'IntrntSrvc_FiberOptic',
           'InternetService_No': 'IntrntSrvc_No'}
data = data.rename(columns=mapping)

# New columns after one hot encoding
data.iloc[:5, position:position + len(dummies)]

#### One-hot encoding for column Contract

In [None]:
# Convert categorical to numerical: Fetching categories of Contract
data['Contract'].value_counts()

In [None]:
# Performing one-hot encoding
source_column = 'Contract'
# Remember where the original column sits so the indicator columns can take its
# place, instead of relying on a hard-coded index and a fixed category count.
position = data.columns.get_loc(source_column)
n_kept = data.shape[1] - 1  # every column except the one being encoded
# dtype='uint8' keeps the indicators numeric (0/1) on every pandas version;
# recent releases would otherwise produce booleans.
data = pd.get_dummies(data, columns=[source_column], prefix=source_column, dtype='uint8')

# get_dummies appends the indicator columns at the end of the frame:
# move them back to the position of the original column
columns = data.columns.tolist()
kept, dummies = columns[:n_kept], columns[n_kept:]
columns = kept[:position] + dummies + kept[position:]
data = data[columns]

# Renaming newly created columns
mapping = {'Contract_Month-to-month': 'Contract_Monthly',
           'Contract_One year': 'Contract_OneYear',
           'Contract_Two year': 'Contract_TwoYear'}
data = data.rename(columns=mapping)

# New columns after one hot encoding
data.iloc[:5, position:position + len(dummies)]

#### One-hot encoding for column PaymentMethod

In [None]:
# Convert categorical to numerical: Fetching categories of Payment Method
data['PaymentMethod'].value_counts()

In [None]:
# Performing one-hot encoding
source_column = 'PaymentMethod'
# Remember where the original column sits so the indicator columns can take its
# place, instead of relying on a hard-coded index and a fixed category count.
position = data.columns.get_loc(source_column)
n_kept = data.shape[1] - 1  # every column except the one being encoded
# dtype='uint8' keeps the indicators numeric (0/1) on every pandas version;
# recent releases would otherwise produce booleans.
data = pd.get_dummies(data, columns=[source_column], prefix=source_column, dtype='uint8')

# get_dummies appends the indicator columns at the end of the frame:
# move them back to the position of the original column
columns = data.columns.tolist()
kept, dummies = columns[:n_kept], columns[n_kept:]
columns = kept[:position] + dummies + kept[position:]
data = data[columns]

# Renaming newly created columns
mapping = {'PaymentMethod_Bank transfer (automatic)': 'PayMthd_BankTransfer',
           'PaymentMethod_Credit card (automatic)': 'PayMthd_CreditCard',
           'PaymentMethod_Electronic check': 'PayMthd_ElectronicCheck',
           'PaymentMethod_Mailed check': 'PayMthd_MailedCheck'}
data = data.rename(columns=mapping)

# New columns after one hot encoding
data.iloc[:5, position:position + len(dummies)]

In [None]:
data.head()

# Typecasting Categorical columns to data type 'category'

##### Since many of our columns are categorical, it is poor practice to store them with a generic dtype (object or plain integers), so we convert them to the 'category' dtype

In [None]:
data.info()

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
data.columns

In [None]:
cols = ['OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies','PaperlessBilling']
data[cols]

In [None]:
# Columns holding a small, fixed set of values; each is listed exactly once
# ('OnlineSecurity' used to appear twice and was converted twice).
categorical_columns = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
                       'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                       'PaperlessBilling', 'Churn']
# Convert all of them to the 'category' dtype in a single astype call.
data = data.astype({col: 'category' for col in categorical_columns})
data.info()

# Datatype downcasting eg. int 64 to int8 etc

##### Storing data as int64,float64 etc. not only requires more space but it also increases processing times. In such scenarios, downcasting will stop wastage of space and improve data processing times during training phases of our predictive models.

In [None]:
# Let pandas pick the smallest integer dtype that can hold every tenure value.
# A forced astype('int8') can silently wrap values outside -128..127, whereas
# the downcast yields int8 only when all values fit.
data['tenure'] = pd.to_numeric(data['tenure'], downcast='integer')
data.info()

In [None]:
# float32 halves the storage of float64 while keeping about 7 significant
# digits. float16 keeps only about 3, so it cannot hold currency amounts to the
# cent and rounds larger totals by whole units before they are written out.
data['MonthlyCharges'] = data['MonthlyCharges'].astype('float32')
data['TotalCharges'] = data['TotalCharges'].astype('float32')
# customerID is an identifier, stored with pandas' dedicated string dtype.
data['customerID'] = data['customerID'].astype('string')
data.info()

# Saving cleaned data into new csv file for EDA and Model Training

In [None]:
# Write the cleaned frame to 'clean_data.csv' (relative to the working
# directory) without the index column.
# NOTE: CSV is plain text, so the dtypes assigned earlier in this notebook are
# not stored in the file and have to be re-applied after loading it.
filename = 'clean_data.csv'
data.to_csv(filename, index=False)

In [None]:
data.head()

In [None]:
# Importing the data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Read the raw dataset and take a first look at it
data = pd.read_csv('data.csv')
data.head()
data.tail()
data.info()
data.describe()

# Missing data
# Count the missing entries of every column
data.isna().sum()
# Look at column 'OnlineSecurity', as it shows 17 counts in the null summary
data['OnlineSecurity']
data['OnlineSecurity'].value_counts()
# OnlineSecurity is categorical and 50% of the people have 'No' as its value,
# so the missing values are imputed with the mode of the column.
data = data.fillna({'OnlineSecurity': data['OnlineSecurity'].mode()[0]})
data.isna().sum()
# We have no more missing values!

# Checking for duplicate entries in data
data.duplicated().value_counts()
# Some rows are duplicated, so remove them with drop_duplicates()
data.drop_duplicates(inplace=True)
data.duplicated().value_counts()
data.duplicated().value_counts()
##### All duplicate rows removed!
# Changing Datatype of columns 
data.info()
# TotalCharges is stored with the 'object' dtype because some of its entries
# are non-numeric strings. A plain astype('float64') raises on those entries
# and would abort the whole run, so the column is parsed with to_numeric
# instead: errors='coerce' turns unparseable entries into NaN and
# downcast='float' stores the result in the smallest safe float dtype.
# (Bare inspection expressions were dropped here: outside the last line of a
# notebook cell their results are discarded.)
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce', downcast='float')
data.info()
# Outlier removal
data.info()
sns.pairplot(data=data)
# Outliers were observed in:
#   tenure vs MonthlyCharges
#   MonthlyCharges vs TotalCharges
# MonthlyCharges seems to be contributing to the outlier entries: every entry
# with a monthly charge of 300 or more is treated as an outlier and removed.
# (`>= 300` selects exactly the rows that `between(300, <column max>)` selected.)
outliers = data['MonthlyCharges'] >= 300

# Filter and drop the incomplete rows in one chained expression so that `data`
# is rebound to a fresh frame. Calling dropna(inplace=True) on the filtered
# selection can raise SettingWithCopyWarning.
data = data[~outliers].dropna(axis=0)

# Plot and summarise again to confirm that the outliers are gone.
sns.pairplot(data=data)
data.info()
# Addressing Inconsistencies in Categorical Data: Category Merge
##### There are multiple columns that have redundant category names which can be merged into one. This will reduce the complexity of the data to some extent.
# Category merge: several service columns carry a redundant "no service" label
# that means the same as 'No'. Each (column, redundant label) pair below is
# folded into plain 'No', in the same order as the original stanzas.
# (The bare value_counts()/head() inspections were dropped: outside the last
# line of a notebook cell their results are discarded.)
redundant_labels = [
    ('MultipleLines', 'No phone service'),
    ('OnlineSecurity', 'No internet service'),
    ('OnlineBackup', 'No internet service'),
    ('DeviceProtection', 'No internet service'),
    ('TechSupport', 'No internet service'),
    ('StreamingTV', 'No internet service'),
    ('StreamingMovies', 'No internet service'),
]
for column, redundant in redundant_labels:
    data.loc[data[column] == redundant, column] = 'No'
# Label Encoding Categorical Columns
# Binary label encoding of the Yes/No columns: 'Yes' -> 1, 'No' -> 0.
# Series.map() yields NaN for any value that is not a key of the mapping, so
# the columns are expected to hold only 'Yes'/'No' at this point.
# (The bare value_counts()/head() inspections were dropped: outside the last
# line of a notebook cell their results are discarded.)
mapping = {'Yes': 1, 'No': 0}
yes_no_columns = [
    'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn',
]
for column in yes_no_columns:
    data[column] = data[column].map(mapping)

# Label encoding for gender: the column effectively becomes "is male?"
# (Male = 1, Female = 0).
mapping = {'Male': 1, 'Female': 0}
data['gender'] = data['gender'].map(mapping)
# One-hot encoding for categorical features
def _one_hot_in_place(frame, column, renames):
    """Return *frame* with *column* replaced by its one-hot indicator columns.

    The indicator columns are inserted at the position the original column
    occupied, so no hard-coded column index or category count is needed.

    Parameters:
        frame: DataFrame that contains *column*.
        column: name of the categorical column to encode; also used as the
            prefix of the generated indicator columns.
        renames: mapping from generated indicator names to their final names.

    Returns:
        A new DataFrame; *frame* itself is left unmodified.
    """
    position = frame.columns.get_loc(column)
    n_kept = frame.shape[1] - 1  # every column except the one being encoded
    # dtype='uint8' keeps the indicators numeric (0/1) on every pandas version;
    # recent releases would otherwise produce booleans.
    encoded = pd.get_dummies(frame, columns=[column], prefix=column, dtype='uint8')
    # get_dummies appends the indicator columns at the end of the frame.
    ordered = encoded.columns.tolist()
    kept, dummies = ordered[:n_kept], ordered[n_kept:]
    encoded = encoded[kept[:position] + dummies + kept[position:]]
    return encoded.rename(columns=renames)


# One-hot encoding for column InternetService
data = _one_hot_in_place(data, 'InternetService', {
    'InternetService_DSL': 'IntrntSrvc_DSL',
    'InternetService_Fiber optic': 'IntrntSrvc_FiberOptic',
    'InternetService_No': 'IntrntSrvc_No',
})

# One-hot encoding for column Contract
data = _one_hot_in_place(data, 'Contract', {
    'Contract_Month-to-month': 'Contract_Monthly',
    'Contract_One year': 'Contract_OneYear',
    'Contract_Two year': 'Contract_TwoYear',
})

# One-hot encoding for column PaymentMethod
data = _one_hot_in_place(data, 'PaymentMethod', {
    'PaymentMethod_Bank transfer (automatic)': 'PayMthd_BankTransfer',
    'PaymentMethod_Credit card (automatic)': 'PayMthd_CreditCard',
    'PaymentMethod_Electronic check': 'PayMthd_ElectronicCheck',
    'PaymentMethod_Mailed check': 'PayMthd_MailedCheck',
})
# Typecasting Categorical columns to data type 'category'
##### Since many of our columns are of the categorical type it is poor practice to store them as 'object' dtype thus converting them to the dtype 'category'
data.info()
# Columns holding a small, fixed set of values; each is listed exactly once
# ('OnlineSecurity' used to appear twice and was converted twice).
# (Bare inspection expressions and the list used only by them were dropped:
# outside the last line of a notebook cell their results are discarded.)
categorical_columns = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
                       'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                       'PaperlessBilling', 'Churn']
# Convert all of them to the 'category' dtype in a single astype call.
data = data.astype({col: 'category' for col in categorical_columns})
data.info()
# Datatype downcasting eg. int 64 to int8 etc
##### Storing data as int64,float64 etc. not only requires more space but it also increases processing times. In such scenarios, downcasting will stop wastage of space and improve data processing times during training phases of our predictive models.
# Let pandas pick the smallest integer dtype that can hold every tenure value.
# A forced astype('int8') can silently wrap values outside -128..127, whereas
# the downcast yields int8 only when all values fit.
data['tenure'] = pd.to_numeric(data['tenure'], downcast='integer')
data.info()
# float32 halves the storage of float64 while keeping about 7 significant
# digits. float16 keeps only about 3, so it cannot hold currency amounts to the
# cent and rounds larger totals by whole units before they are written out.
data['MonthlyCharges'] = data['MonthlyCharges'].astype('float32')
data['TotalCharges'] = data['TotalCharges'].astype('float32')
# customerID is an identifier, stored with pandas' dedicated string dtype.
data['customerID'] = data['customerID'].astype('string')
data.info()
# Saving cleaned data into new csv file for EDA and Model Training
# Write the cleaned frame to 'clean_data.csv' (relative to the working
# directory) without the index column.
# NOTE: CSV is plain text, so the dtypes assigned earlier in this file are not
# stored in the output and have to be re-applied after loading it.
filename = 'clean_data.csv'
data.to_csv(filename, index=False)
# Final preview of the cleaned frame.
data.head()
