In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import numpy as np

### Load Dataset

In [2]:
# Read the raw CSV (path is relative to the notebook's working directory)
raw_dataset = pd.read_csv(filepath_or_buffer="dataset.csv")

In [3]:
# Preview the first five rows of the raw dataset
raw_dataset.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Check Null Values

In [4]:
# Count missing (NaN/None) entries per column
raw_dataset.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### Dataset Preprocessing

In [5]:
# Remove fully duplicated rows, keeping the first occurrence of each
raw_dataset = raw_dataset.drop_duplicates(keep="first")

In [6]:
# Drop the customerID column from the dataset
raw_dataset = raw_dataset.drop(columns=["customerID"])

#### Drop Rows With Blank `TotalCharges` Values

In [7]:
# Drop rows whose TotalCharges entry is blank. The placeholder is a single
# space; values are stripped before comparing so that any whitespace-only
# entry is removed too (otherwise the float conversion below would fail).
value = ' '
total_charges_text = raw_dataset["TotalCharges"].astype(str).str.strip()
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the column assignment below does not raise SettingWithCopyWarning.
raw_dataset = raw_dataset[total_charges_text != value.strip()].copy()
# Type conversion for string values
raw_dataset["TotalCharges"] = raw_dataset["TotalCharges"].astype("float64")
# reset_index() without drop=True keeps the old row labels in an "index"
# column; the one-hot encoding cell removes that column later.
raw_dataset = raw_dataset.reset_index()

#### Categorical Data Encoding

In [8]:
# Summarize column dtypes and non-null counts after cleaning
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             7032 non-null   int64  
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


#### Encode Columns With Binary Values

In [9]:
# Names of the columns whose only values are "Yes" and "No"
yes_no_columns = [
    "Partner",
    "Dependents",
    "PhoneService",
    "PaperlessBilling",
    "Churn",
]

In [10]:
# Encode the values to Yes=1 and No=0 (any other value becomes NaN)
yes_no_mapping = {"Yes": 1, "No": 0}
for column in yes_no_columns:
    raw_dataset[column] = raw_dataset[column].map(yes_no_mapping)

In [11]:
# Binary encoding for the gender column: Female -> 1, Male -> 0
raw_dataset["gender"] = raw_dataset["gender"].map({"Female": 1, "Male": 0})

In [12]:
# Report, for every column, its number of distinct non-null values and dtype
print("Number Of Unique Values", end="\n\n")
for column_name, column_values in raw_dataset.items():
    print(f"{column_name}: ", column_values.nunique(dropna=True), column_values.dtype)

Number Of Unique Values

index:  7032 int64
gender:  2 int64
SeniorCitizen:  2 int64
Partner:  2 int64
Dependents:  2 int64
tenure:  72 int64
PhoneService:  2 int64
MultipleLines:  3 object
InternetService:  3 object
OnlineSecurity:  3 object
OnlineBackup:  3 object
DeviceProtection:  3 object
TechSupport:  3 object
StreamingTV:  3 object
StreamingMovies:  3 object
Contract:  3 object
PaperlessBilling:  2 int64
PaymentMethod:  4 object
MonthlyCharges:  1584 float64
TotalCharges:  6530 float64
Churn:  2 int64


#### Check Unique Values of Categorical Columns

In [13]:
# Collect the names of the columns that still hold object (string) data
categorical_columns = list(raw_dataset.select_dtypes(include="object").columns)

In [14]:
# Display the list of object-dtype column names collected above
categorical_columns

['MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaymentMethod']

In [15]:
# Show the distinct values found in each categorical column
for column in categorical_columns:
    distinct_values = raw_dataset[column].unique()
    print(column, distinct_values)

MultipleLines ['No phone service' 'No' 'Yes']
InternetService ['DSL' 'Fiber optic' 'No']
OnlineSecurity ['No' 'Yes' 'No internet service']
OnlineBackup ['Yes' 'No' 'No internet service']
DeviceProtection ['No' 'Yes' 'No internet service']
TechSupport ['No' 'Yes' 'No internet service']
StreamingTV ['No' 'Yes' 'No internet service']
StreamingMovies ['No' 'Yes' 'No internet service']
Contract ['Month-to-month' 'One year' 'Two year']
PaymentMethod ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']


#### One Hot Encoding of Categorical Columns

In [16]:
# Initialize the one-hot encoder. sparse_output=False asks for a dense array
# instead of a sparse matrix, so the result can be wrapped in a DataFrame.
encoder = OneHotEncoder(sparse_output=False)

In [19]:
# Encode categorical values: one output column per (column, category) pair
one_hot_encoded = encoder.fit_transform(raw_dataset[categorical_columns])
# Create a dataframe of encoded values. It reuses raw_dataset's index because
# the column-wise concat below aligns rows by index label; a fresh RangeIndex
# would only line up if raw_dataset's index happened to be 0..n-1, and any
# mismatch would silently produce NaN-padded, misaligned rows.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=raw_dataset.index,
)
# Combine those dataframes side by side
df_encoded = pd.concat([raw_dataset, one_hot_df], axis=1)
# Drop the original categorical columns, plus the helper "index" column that
# the earlier reset_index() call added
df_encoded = df_encoded.drop(columns=[*categorical_columns, "index"])

### Final Dataset

In [20]:
# Display the fully encoded dataset
df_encoded

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,0,1,0,1,0,1,29.85,29.85,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,0,0,0,34,1,0,56.95,1889.50,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0,0,0,0,2,1,1,53.85,108.15,1,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,0,0,0,45,0,0,42.30,1840.75,0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1,0,0,0,2,1,1,70.70,151.65,1,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,0,1,1,24,1,1,84.80,1990.50,0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
7028,1,0,1,1,72,1,1,103.20,7362.90,0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
7029,1,0,1,1,11,0,1,29.60,346.45,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
7030,0,1,1,0,4,1,1,74.40,306.60,1,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


#### Save Dataset as .CSV

In [24]:
# Write the encoded dataset to disk without the row index
df_encoded.to_csv(path_or_buf="Processed_dataset.csv", index=False)