In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [15]:
# Load the raw Telco churn table and discard the TotalCharges column.
# The drop is chained onto the read (instead of mutating with inplace=True)
# so `df` is produced by a single expression and the cell stays idempotent.
df = (
    pd.read_csv('./data/Telco-Customer-Churn.csv')
    .drop(columns=['TotalCharges'])
)

In [16]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# shuffled split reproducible across runs.
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [17]:
# Show (rows, columns) for the training and test splits.
(train_set.shape, test_set.shape)

((5634, 20), (1409, 20))

## Data Preprocessing

In [18]:
# Display the column labels of the full frame (features plus the Churn target).
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'Churn'],
      dtype='object')

In [19]:
# Start again from the untouched training split and separate it into
# the target labels and the predictor columns.
churn_data_labels = train_set["Churn"].copy()
churn_data = train_set.drop(columns="Churn")

In [20]:
# Show the shapes of the predictor frame and the label series.
(churn_data.shape, churn_data_labels.shape)

((5634, 19), (5634,))

### Handling Categorical Variables

In [21]:
# Names of the columns that are treated as categorical attributes.
# The order is kept exactly as originally declared.
cat_attrs = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

In [22]:
# Preprocessing step: one-hot encode the columns listed in `cat_attrs` and
# pass every other column through unchanged.
#
# sparse_threshold=0 forces a dense array no matter how sparse the one-hot
# block is. The result is wrapped in a pandas DataFrame afterwards, so the
# output must not silently switch to a sparse matrix when the mix of encoded
# and passthrough columns changes.
#
# NOTE(review): `customerID` is not in `cat_attrs` and is not dropped from
# `churn_data`, so it lands in the passthrough remainder together with
# `tenure` and `MonthlyCharges`. It looks like an identifier rather than a
# feature -- confirm whether it should be dropped before modelling.
preprocess_pipeline = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), cat_attrs),
    ],
    remainder="passthrough",
    sparse_threshold=0,
    verbose_feature_names_out=False,
)

In [23]:
# Fit the preprocessing step on the training predictors and wrap the
# transformed array in a DataFrame labelled with the generated feature names.
#
# The original row index is carried over explicitly: `churn_data_labels` still
# has the shuffled index from the train/test split, so leaving the default
# RangeIndex here would misalign any index-based join or comparison between
# the prepared features and their labels.
churn_data_prepared = pd.DataFrame(
    preprocess_pipeline.fit_transform(churn_data),
    columns=preprocess_pipeline.get_feature_names_out(),
    index=churn_data.index,
)