<h1 align='center'>Data Preprocessing</h1>

In [39]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [40]:
# Load the raw Telco customer-churn workbook into a DataFrame
raw_data_path = '../data/raw/Telco_customer_churn.xlsx'
df = pd.read_excel(raw_data_path)


In [41]:
# Fix data types: parse 'Total Charges' as numbers.
# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.
total_charges_col = 'Total Charges'
df[total_charges_col] = pd.to_numeric(df[total_charges_col], errors='coerce')
print(df[total_charges_col].dtype)

float64


In [42]:
# Separate the target column from the feature columns
target_col = 'Churn Value'
X = df.drop(columns=target_col)
y = df[target_col]

In [43]:
# Drop columns that should not be used as model inputs.
# Each column is listed exactly once (the original list repeated "Zip Code").
drop_cols = [
    # Row identifier / bookkeeping
    "CustomerID",
    "Count",
    # Location
    "Country",
    "State",
    "City",
    "Zip Code",
    "Lat Long",
    "Latitude",
    "Longitude",
    # Outcome-related or scored columns; presumably these would leak the
    # target into the features — TODO confirm against the data dictionary
    "Churn Label",
    "CLTV",
    "Churn Score",
    "Churn Reason",
]
# NOTE: X is overwritten, so re-running only this cell raises KeyError
# (the columns are already gone); re-run from the cell that defines X.
X = X.drop(columns=drop_cols)

In [44]:
# Preview the first rows of the feature frame after the column drops
X.head()

Unnamed: 0,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges
0,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
1,Female,No,No,Yes,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65
2,Female,No,No,Yes,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5
3,Female,No,Yes,Yes,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05
4,Male,No,No,Yes,49,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3


In [45]:
# Numerical features
num_features = ['Tenure Months', 'Monthly Charges', 'Total Charges']
# Backward-compatible alias: other cells refer to the original (misspelled) name
num_featers = num_features

# Fail early if a numeric column is missing or misspelled; otherwise the real
# column would silently land in the categorical list below.
missing_num = [col for col in num_features if col not in X.columns]
assert not missing_num, f"Numeric columns not found in X: {missing_num}"

# Categorical features: every remaining column of X
cat_features = [col for col in X.columns if col not in num_features]

In [46]:
# Split into train and test: hold out 20% of the rows.
# Stratify on y because the data is unbalanced; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [47]:
# Numerical pipeline: fill missing values with the median (the data is skewed),
# then standardize.
num_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

In [48]:
# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode into a dense array. handle_unknown='ignore' keeps
# transform from raising on categories that were not seen during fit.
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
cat_pipeline = Pipeline(steps=cat_steps)

#### Column Transformer
- Allows you to apply different pipelines to specific columns of the data

In [49]:
# ColumnTransformer: send the numeric columns through num_pipeline and the
# categorical columns through cat_pipeline.
# NOTE(review): `preprocesssor` is misspelled (three s); the name is kept
# because the cells below use this exact spelling.
column_pipelines = [
    ('num', num_pipeline, num_featers),
    ('cat', cat_pipeline, cat_features),
]
preprocesssor = ColumnTransformer(transformers=column_pipelines)

In [50]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the test split.
X_train_processed = preprocesssor.fit_transform(X_train)
X_test_processed = preprocesssor.transform(X_test)

In [None]:
# Dimensions (rows, columns) of the transformed training matrix
X_train_processed.shape

(5634, 46)

In [52]:
# Names of the output columns produced by the fitted ColumnTransformer
preprocesssor.get_feature_names_out()

array(['num__Tenure Months', 'num__Monthly Charges', 'num__Total Charges',
       'cat__Gender_Female', 'cat__Gender_Male', 'cat__Senior Citizen_No',
       'cat__Senior Citizen_Yes', 'cat__Partner_No', 'cat__Partner_Yes',
       'cat__Dependents_No', 'cat__Dependents_Yes',
       'cat__Phone Service_No', 'cat__Phone Service_Yes',
       'cat__Multiple Lines_No', 'cat__Multiple Lines_No phone service',
       'cat__Multiple Lines_Yes', 'cat__Internet Service_DSL',
       'cat__Internet Service_Fiber optic', 'cat__Internet Service_No',
       'cat__Online Security_No',
       'cat__Online Security_No internet service',
       'cat__Online Security_Yes', 'cat__Online Backup_No',
       'cat__Online Backup_No internet service', 'cat__Online Backup_Yes',
       'cat__Device Protection_No',
       'cat__Device Protection_No internet service',
       'cat__Device Protection_Yes', 'cat__Tech Support_No',
       'cat__Tech Support_No internet service', 'cat__Tech Support_Yes',
       'cat__Str

In [54]:
# Save processed data
processed_dir = Path('../data/processed')
# np.save does not create missing directories, so make sure the target exists
processed_dir.mkdir(parents=True, exist_ok=True)

# np.save converts its input to an array, so the pandas index of
# y_train / y_test is not stored — only the values are.
np.save(processed_dir / 'X_train.npy', X_train_processed)
np.save(processed_dir / 'X_test.npy', X_test_processed)
np.save(processed_dir / 'y_train.npy', y_train)
np.save(processed_dir / 'y_test.npy', y_test)