# Introduction

In this notebook, the preprocessing steps needed to prepare the dataset for the modeling phase are applied: categorical variables are one-hot encoded and the numerical features are min-max scaled to the range [0, 1]. This puts all features on a comparable scale, which helps the subsequent machine learning models perform well.


In [7]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

## Loading the dataset and taking a first look

In [8]:
# Relative path to the cleaned dataset CSV produced before this notebook.
path = '../telco_customer_churn/data/cleaned_dataset.csv'
df = pd.read_csv(filepath_or_buffer=path)
# Show the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)

Unnamed: 0,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Encoding categorical variables with one-hot encoding

In [9]:

# Indicator columns that are cast to plain integers at the end of the chain.
# SeniorCitizen is listed together with the dummies generated below.
ohe_columns = [
    'SeniorCitizen', 'Gender_Male', 'Partner_Yes',
    'Dependents_Yes', 'PhoneService_Yes',
    'MultipleLines_Yes', 'InternetService_DSL',
    'InternetService_Fiber optic', 'InternetService_No',
    'OnlineSecurity_Yes', 'OnlineBackup_Yes', 'DeviceProtection_Yes',
    'TechSupport_Yes', 'StreamingTV_Yes', 'StreamingMovies_Yes',
    'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year',
    'PaperlessBilling_Yes', 'PaymentMethod_Bank transfer (automatic)',
    'PaymentMethod_Credit card (automatic)',
    'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
    'Churn_Yes',
]

df_encoded = (
    # Expand every categorical column (the Churn target included) into
    # one indicator column per category value.
    pd.get_dummies(
        df,
        columns=[
            'Gender', 'Partner', 'Dependents',
            'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
            'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
            'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
            'Churn',
        ],
    )
    # Remove the `_Female` / `_No` indicator of these features so that only
    # their `_Male` / `_Yes` counterpart remains. InternetService, Contract
    # and PaymentMethod are not touched here and keep all of their indicators.
    .drop(columns=[
        "Gender_Female",
        "Partner_No",
        "Dependents_No",
        "PhoneService_No",
        "MultipleLines_No",
        "OnlineSecurity_No",
        "OnlineBackup_No",
        "DeviceProtection_No",
        "TechSupport_No",
        "StreamingTV_No",
        "StreamingMovies_No",
        "PaperlessBilling_No",
        "Churn_No",
    ])
    # Cast the indicator columns to int so they are stored as 0/1 values.
    .astype({column: int for column in ohe_columns})
)
df_encoded.head()



Unnamed: 0,SeniorCitizen,Tenure,MonthlyCharges,TotalCharges,Gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_Yes,InternetService_DSL,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,1,29.85,29.85,0,1,0,0,0,1,...,0,1,0,0,1,0,0,1,0,0
1,0,34,56.95,1889.5,1,0,0,1,0,1,...,0,0,1,0,0,0,0,0,1,0
2,0,2,53.85,108.15,1,0,0,1,0,1,...,0,1,0,0,1,0,0,0,1,1
3,0,45,42.3,1840.75,1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0
4,0,2,70.7,151.65,0,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,1


## Scaling Numerical Features to Values Between 0 and 1:


In [10]:
# Numerical columns to rescale; every other column is copied over unchanged.
cols = ['Tenure', 'MonthlyCharges', 'TotalCharges']

# Learn the per-column minimum and maximum from the encoded frame.
# NOTE(review): `scarler` is a misspelling of "scaler"; the name is kept as-is
# in case other cells refer to it.
# NOTE(review): the scaler is fitted on all rows of df_encoded. If a train/test
# split is performed later, the test rows influence the scaling statistics --
# confirm this is acceptable for the modeling phase.
scarler = MinMaxScaler().fit(df_encoded[cols])

# Work on a copy so df_encoded keeps the original, unscaled values.
scaled_df = df_encoded.copy()
scaled_df[cols] = scarler.transform(df_encoded[cols])

In [11]:
# Preview the first five rows to check the rescaled numerical columns.
scaled_df.head(n=5)

Unnamed: 0,SeniorCitizen,Tenure,MonthlyCharges,TotalCharges,Gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_Yes,InternetService_DSL,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,0.0,0.115423,0.001275,0,1,0,0,0,1,...,0,1,0,0,1,0,0,1,0,0
1,0,0.464789,0.385075,0.215867,1,0,0,1,0,1,...,0,0,1,0,0,0,0,0,1,0
2,0,0.014085,0.354229,0.01031,1,0,0,1,0,1,...,0,1,0,0,1,0,0,0,1,1
3,0,0.619718,0.239303,0.210241,1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,0,0
4,0,0.014085,0.521891,0.01533,0,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,1


## Saving the Preprocessed Dataset in a CSV File:

In [12]:
# Write the encoded and scaled frame to disk; index=False leaves the row
# index out of the CSV file.
scaled_df.to_csv(
    path_or_buf='../telco_customer_churn/data/preprocessed_dataset.csv',
    index=False,
)