# Importing the data

In [5]:
import numpy as np
import pandas as pd

In [67]:
# Load the raw dataset; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv('data.csv')
# Preview the first rows with data.head() if needed.

In [None]:
# Show the last rows of the frame (DataFrame.tail defaults to 5 rows).
data.tail()

In [None]:
# Print column names, non-null counts and dtypes.
data.info()

In [None]:
# Summary statistics (with default arguments, numeric columns only).
data.describe()

# 1. Checking for duplicate entries in data

In [None]:
# True  -> rows that repeat an earlier row in full,
# False -> first occurrences / unique rows.
data.duplicated().value_counts()

In [None]:
# We have 100 duplicated rows

# Outliers removal

# TotalCharges - str to int

# Addressing Inconsistencies in Categorical Data: Category Merge

#### Column "MultipleLines" Category Merge

In [36]:
# Merge the 'No phone service' level of MultipleLines into 'No'.
column = 'MultipleLines'
data.loc[data[column] == 'No phone service', column] = 'No'

# Category counts after the merge. Only the last expression of a cell is
# displayed, so a value_counts() call placed before the merge would be discarded.
data[column].value_counts()

No     4119
Yes    3022
Name: MultipleLines, dtype: int64

#### Column "OnlineSecurity" Category Merge

In [37]:
# Merge the 'No internet service' level of OnlineSecurity into 'No'.
column = 'OnlineSecurity'
data.loc[data[column] == 'No internet service', column] = 'No'

# Category counts after the merge. Only the last expression of a cell is
# displayed, so a value_counts() call placed before the merge would be discarded.
data[column].value_counts()

No     5084
Yes    2040
Name: OnlineSecurity, dtype: int64

#### Column "OnlineBackup" Category Merge

In [38]:
# Merge the 'No internet service' level of OnlineBackup into 'No'.
column = 'OnlineBackup'
data.loc[data[column] == 'No internet service', column] = 'No'

# Category counts after the merge. Only the last expression of a cell is
# displayed, so a value_counts() call placed before the merge would be discarded.
data[column].value_counts()

No     4675
Yes    2466
Name: OnlineBackup, dtype: int64

#### Column "DeviceProtection" Category Merge

In [39]:
# Merge the 'No internet service' level of DeviceProtection into 'No'.
column = 'DeviceProtection'
data.loc[data[column] == 'No internet service', column] = 'No'

# Category counts after the merge. Only the last expression of a cell is
# displayed, so a value_counts() call placed before the merge would be discarded.
data[column].value_counts()

No     4679
Yes    2462
Name: DeviceProtection, dtype: int64

#### Column "TechSupport" Category Merge

In [40]:
# Merge the 'No internet service' level of TechSupport into 'No'.
column = 'TechSupport'
data.loc[data[column] == 'No internet service', column] = 'No'

# Category counts after the merge. Only the last expression of a cell is
# displayed, so a value_counts() call placed before the merge would be discarded.
data[column].value_counts()

No     5069
Yes    2072
Name: TechSupport, dtype: int64

#### Column "StreamingTV" Category Merge

In [41]:
# Merge the 'No internet service' level of StreamingTV into 'No'.
column = 'StreamingTV'
data.loc[data[column] == 'No internet service', column] = 'No'

# Category counts after the merge. Only the last expression of a cell is
# displayed, so a value_counts() call placed before the merge would be discarded.
data[column].value_counts()

No     4392
Yes    2749
Name: StreamingTV, dtype: int64

#### Column "StreamingMovies" Category Merge

In [42]:
# Merge the 'No internet service' level of StreamingMovies into 'No'.
column = 'StreamingMovies'
data.loc[data[column] == 'No internet service', column] = 'No'

# Category counts after the merge. Only the last expression of a cell is
# displayed, so a value_counts() call placed before the merge would be discarded.
data[column].value_counts()

No     4376
Yes    2765
Name: StreamingMovies, dtype: int64

# Standardizing Data Formats: Yes/No to 1/0

In [43]:
# Yes/No columns to encode as 1/0.
columns = ['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']
mapping = {'Yes': 1, 'No': 0}

for column in columns:
    present = data[column].dropna()

    # Already encoded (e.g. the cell is being re-run): leave the column alone.
    # Series.map turns every value that is not a key of `mapping` into NaN,
    # so mapping a 1/0 column a second time would wipe it out.
    if not present.empty and present.isin(list(mapping.values())).all():
        continue

    # Fail loudly rather than silently converting unexpected labels to NaN
    # (missing values are tolerated and stay missing).
    unexpected = set(present.unique()) - set(mapping)
    if unexpected:
        raise ValueError(
            "Column {!r} contains values outside {}: {}".format(
                column, list(mapping), sorted(unexpected, key=str)
            )
        )

    data[column] = data[column].map(mapping)

# Standardizing Data Formats: Male/Female to 1/0

In [44]:
mapping = {'Male': 1, 'Female': 0}

# Non-missing gender values currently in the frame.
present = data['gender'].dropna()

# Skip the encoding when the column already holds 1/0 (cell re-run):
# Series.map would turn those values into NaN because they are not keys of `mapping`.
if present.empty or not present.isin(list(mapping.values())).all():
    # Fail loudly rather than silently converting unexpected labels to NaN.
    unexpected = set(present.unique()) - set(mapping)
    if unexpected:
        raise ValueError(
            "Column 'gender' contains values outside {}: {}".format(
                list(mapping), sorted(unexpected, key=str)
            )
        )
    data['gender'] = data['gender'].map(mapping)

# One-hot encoding for categorical features

#### One-hot encoding for column InternetService

In [68]:
# Inspect the InternetService levels and their frequencies
# (nothing is converted in this cell; the encoding happens in the next one).
data['InternetService'].value_counts()

Fiber optic    3135
DSL            2460
No             1546
Name: InternetService, dtype: int64

In [69]:
# Perform one-hot encoding.
# dtype='uint8' pins the indicator columns to 1/0 integers: pandas >= 2.0
# defaults to bool (True/False), earlier versions defaulted to uint8.
data = pd.get_dummies(data, columns=['InternetService'], prefix='InternetService', dtype='uint8')

# Get the column names
columns = data.columns.tolist()

# get_dummies appends the three indicator columns at the end of the frame;
# move them to 0-based positions 8-10 (i.e. starting at the 9th column).
columns = columns[:8] + columns[-3:] + columns[8:-3]

# Reorder the frame and shorten the indicator names in one step
# (returns a new frame instead of renaming a sliced copy in place).
mapping = {'InternetService_DSL': 'IntrntSrvc_DSL', 'InternetService_Fiber optic': 'IntrntSrvc_FiberOptic', 'InternetService_No': 'IntrntSrvc_No'}
data = data[columns].rename(columns=mapping)

# New columns
data.iloc[:, 8:11]

Unnamed: 0,IntrntSrvc_DSL,IntrntSrvc_FiberOptic,IntrntSrvc_No
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,0,1,0
...,...,...,...
7136,0,1,0
7137,0,1,0
7138,1,0,0
7139,0,1,0


#### One-hot encoding for column Contract

In [70]:
# Inspect the Contract levels and their frequencies
# (nothing is converted in this cell; the encoding happens in the next one).
data['Contract'].value_counts()

Month-to-month    3932
Two year          1721
One year          1488
Name: Contract, dtype: int64

In [71]:
# Perform one-hot encoding.
# dtype='uint8' pins the indicator columns to 1/0 integers: pandas >= 2.0
# defaults to bool (True/False), earlier versions defaulted to uint8.
data = pd.get_dummies(data, columns=['Contract'], prefix='Contract', dtype='uint8')

# Get the column names
columns = data.columns.tolist()

# get_dummies appends the three indicator columns at the end of the frame;
# move them to 0-based positions 17-19 (i.e. starting at the 18th column).
columns = columns[:17] + columns[-3:] + columns[17:-3]

# Reorder the frame and shorten the indicator names in one step
# (returns a new frame instead of renaming a sliced copy in place).
mapping = {'Contract_Month-to-month': 'Contract_Monthly', 'Contract_One year': 'Contract_OneYear', 'Contract_Two year': 'Contract_TwoYear'}
data = data[columns].rename(columns=mapping)

# New columns
data.iloc[:, 17:20]

Unnamed: 0,Contract_Monthly,Contract_OneYear,Contract_TwoYear
0,1,0,0
1,0,1,0
2,1,0,0
3,0,1,0
4,1,0,0
...,...,...,...
7136,1,0,0
7137,1,0,0
7138,1,0,0
7139,0,0,1


#### One-hot encoding for column PaymentMethod

In [72]:
# Inspect the PaymentMethod levels and their frequencies
# (nothing is converted in this cell; the encoding happens in the next one).
data['PaymentMethod'].value_counts()

Electronic check             2398
Mailed check                 1629
Bank transfer (automatic)    1563
Credit card (automatic)      1551
Name: PaymentMethod, dtype: int64

In [73]:
# Perform one-hot encoding.
# dtype='uint8' pins the indicator columns to 1/0 integers: pandas >= 2.0
# defaults to bool (True/False), earlier versions defaulted to uint8.
data = pd.get_dummies(data, columns=['PaymentMethod'], prefix='PaymentMethod', dtype='uint8')

# Get the column names
columns = data.columns.tolist()

# get_dummies appends the four indicator columns at the end of the frame;
# move them to 0-based positions 21-24 (i.e. starting at the 22nd column).
columns = columns[:21] + columns[-4:] + columns[21:-4]

# Reorder the frame and shorten the indicator names in one step
# (returns a new frame instead of renaming a sliced copy in place).
mapping = {'PaymentMethod_Bank transfer (automatic)': 'PayMthd_BankTransfer', 'PaymentMethod_Credit card (automatic)': 'PayMthd_CreditCard', 'PaymentMethod_Electronic check': 'PayMthd_ElectronicCheck', 'PaymentMethod_Mailed check': 'PayMthd_MailedCheck'}
data = data[columns].rename(columns=mapping)

# New columns
data.iloc[:, 21:25]

Unnamed: 0,PayMthd_BankTransfer,PayMthd_CreditCard,PayMthd_ElectronicCheck,PayMthd_MailedCheck
0,0,0,1,0
1,0,0,0,1
2,0,0,0,1
3,1,0,0,0
4,0,0,1,0
...,...,...,...,...
7136,0,0,1,0
7137,0,1,0,0
7138,0,0,1,0
7139,0,1,0,0


# Typecasting all columns