# Preprocessing Pipeline

## Encoding pipeline

### Importing Libraries

In [1]:
import pickle
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

### Load cleaned data

In [2]:
# Load the cleaned dataset from disk.
# NOTE: read_pickle unpickles arbitrary objects - only load files you trust.
cleaned_data_path = "processed_data/cleaned_data.pkl"
df = pd.read_pickle(cleaned_data_path)
print("Cleaned data loaded")

Cleaned data loaded


In [3]:
# Summarize the loaded frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4931 entries, 0 to 4999
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            4931 non-null   object 
 1   SeniorCitizen     4931 non-null   int64  
 2   Partner           4931 non-null   object 
 3   Dependents        4931 non-null   object 
 4   tenure            4931 non-null   int64  
 5   PhoneService      4931 non-null   object 
 6   MultipleLines     4931 non-null   object 
 7   InternetService   4931 non-null   object 
 8   OnlineSecurity    4931 non-null   object 
 9   OnlineBackup      4931 non-null   object 
 10  DeviceProtection  4931 non-null   object 
 11  TechSupport       4931 non-null   object 
 12  StreamingTV       4931 non-null   object 
 13  StreamingMovies   4931 non-null   object 
 14  Contract          4931 non-null   object 
 15  PaperlessBilling  4931 non-null   object 
 16  PaymentMethod     4931 non-null   object 
 17  

### Custom transformer for multi-column LabelEncoder

In [4]:
# Custom transformer for multi-column LabelEncoder
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    """Label-encode several DataFrame columns, one ``LabelEncoder`` per column.

    Parameters
    ----------
    columns : list of str or None, default=None
        Names of the columns to encode. With ``None`` the transformer is a
        no-op that returns a copy of its input.

    Attributes
    ----------
    encoders_ : dict
        Maps each column name to the ``LabelEncoder`` fitted on it.
        Created by ``fit``.
    """

    def __init__(self, columns=None):
        # Only constructor parameters are stored here. Learned state
        # (``encoders_``) is created in ``fit`` so an unfitted instance does
        # not look fitted to scikit-learn's fitted-ness checks.
        self.columns = columns

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per configured column.

        Parameters
        ----------
        X : DataFrame-like
            Must support ``X[col].values`` for every name in ``columns``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Rebuild the mapping on every call so a refit (possibly with a
        # different ``columns`` setting) never keeps stale encoders around.
        self.encoders_ = {}
        if self.columns is not None:
            for col in self.columns:
                self.encoders_[col] = LabelEncoder().fit(X[col].values)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns label-encoded.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``columns`` is set and ``fit`` has not been called.
        ValueError
            Raised by ``LabelEncoder.transform`` when a column contains a
            label that was not seen during ``fit``.
        """
        X_copy = X.copy()
        if self.columns is None:
            # Nothing to encode: behave as a pass-through copy.
            return X_copy

        # Imported locally so this class does not depend on an extra
        # notebook-level import.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "encoders_")
        for col in self.columns:
            X_copy[col] = self.encoders_[col].transform(X_copy[col].values)
        return X_copy

### Pipeline

#### Encoding Pipeline for Tree-based models

In [5]:
# Column groups, one per encoding strategy
one_hot_cols = ['InternetService', 'PaymentMethod']
ordinal_cols = ['Contract']
label_cols = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'PaperlessBilling',
]
internet_service_label_cols = [
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

# Encoders for the one-hot and ordinal groups.
# Contract is encoded with an explicit category order.
contract_order = ['Month-to-month', 'One year', 'Two year']
one_hot_transformer = OneHotEncoder(drop=None, sparse_output=False)
ordinal_transformer = OrdinalEncoder(categories=[contract_order])

# Both label-encoded groups use the custom multi-column label encoder.
tree_transformers = [
    ('onehot', one_hot_transformer, one_hot_cols),
    ('ordinal', ordinal_transformer, ordinal_cols),
    ('label', MultiColumnLabelEncoder(columns=label_cols), label_cols),
    (
        'internet_label',
        MultiColumnLabelEncoder(columns=internet_service_label_cols),
        internet_service_label_cols,
    ),
]

# Columns not listed in any group are passed through unchanged.
encoder_preprocessor_tree = ColumnTransformer(
    tree_transformers,
    remainder='passthrough',
)

# Single-step pipeline wrapping the encoder
pipeline_tree = Pipeline([
    ('encoder_preprocessor', encoder_preprocessor_tree),
])

In [6]:
# Fit the tree-based pipeline and encode the full frame
df_encoded_tree_data = pipeline_tree.fit_transform(df)

# The custom label encoder exposes no feature names, so the names are rebuilt
# by hand. ColumnTransformer emits its outputs in the order the transformers
# were listed (onehot, ordinal, label, internet_label) followed by the
# passthrough remainder, so the names must be assembled in that same order.
fitted_onehot = pipeline_tree.named_steps['encoder_preprocessor'].named_transformers_['onehot']
onehot_names = fitted_onehot.get_feature_names_out(one_hot_cols)
encoded_cols = one_hot_cols + ordinal_cols + label_cols + internet_service_label_cols
remainder_features = [col for col in df.columns if col not in encoded_cols]
feature_names_tree = [
    *onehot_names,
    *ordinal_cols,
    *label_cols,
    *internet_service_label_cols,
    *remainder_features,
]

# Fail with a clear message if the hand-built names drift from the encoder output.
assert len(feature_names_tree) == df_encoded_tree_data.shape[1], (
    f"Built {len(feature_names_tree)} feature names for "
    f"{df_encoded_tree_data.shape[1]} encoded columns"
)

# Create a DataFrame and save it
df_encoded_tree = pd.DataFrame(df_encoded_tree_data, columns=feature_names_tree)
print("\nShape of data for tree models:", df_encoded_tree.shape)

# Save data for tree-based models
df_encoded_tree.to_pickle("processed_data/encoded_data_tree.pkl")
print("\n Data for tree-based models saved to encoded_data_tree.pkl")

# Preview as the last expression so the notebook actually renders it
# (a bare .head() in the middle of a cell displays nothing).
df_encoded_tree.head()


Shape of data for tree models: (4931, 24)

 Data for tree-based models saved to encoded_data_tree.pkl


#### Encoding Pipeline for Linear models

In [7]:
# Define columns for one-hot and ordinal encoding.
# For linear models every nominal column is one-hot encoded.
one_hot_cols_linear = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling',
    'InternetService', 'PaymentMethod', 'MultipleLines', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'
]
# Re-declared with the same values used for the tree pipeline so this cell
# carries its own definitions.
ordinal_cols = ['Contract']

# Create transformers
# Use handle_unknown='ignore' to prevent errors during prediction
one_hot_transformer_linear = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ordinal_transformer = OrdinalEncoder(categories=[['Month-to-month', 'One year', 'Two year']])

# Create the preprocessor for the linear pipeline;
# columns not listed are passed through unchanged.
encoder_preprocessor_linear = ColumnTransformer(
    transformers=[
        ('onehot', one_hot_transformer_linear, one_hot_cols_linear),
        ('ordinal', ordinal_transformer, ordinal_cols),
    ],
    remainder='passthrough'
)

# Create the pipeline for linear models
pipeline_linear = Pipeline(steps=[
    ('encoder_preprocessor', encoder_preprocessor_linear)
])

# Fit the linear pipeline
df_encoded_linear_data = pipeline_linear.fit_transform(df)

# Get feature names automatically from the linear pipeline.
# Names carry the transformer prefix, e.g. 'remainder__MonthlyCharges'.
feature_names_linear = pipeline_linear.named_steps['encoder_preprocessor'].get_feature_names_out()

# Create a DataFrame and save it
df_encoded_linear = pd.DataFrame(df_encoded_linear_data, columns=feature_names_linear)
print("\nShape of data for linear models:", df_encoded_linear.shape)

# Save data for linear models
df_encoded_linear.to_pickle("processed_data/encoded_data_linear.pkl")
print("\nData for linear models saved to encoded_data_linear.pkl")

# Preview as the last expression so the notebook actually renders it
# (a bare .head() in the middle of a cell displays nothing).
df_encoded_linear.head()


Shape of data for linear models: (4931, 43)

Data for linear models saved to encoded_data_linear.pkl


## Train-Test-Split

### Regression

In [8]:
# Regression on the LINEAR-ENCODED data: the passthrough MonthlyCharges
# column is the target, every other column is a feature.
y_reg = df_encoded_linear['remainder__MonthlyCharges']
X_reg = df_encoded_linear.drop(columns='remainder__MonthlyCharges')

# 80/20 train/test split with a fixed seed
(
    X_train_reg_linear,
    X_test_reg_linear,
    y_train_reg_linear,
    y_test_reg_linear,
) = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# Persist each split to its own pickle file
reg_split_files = {
    "processed_data/X_train_reg_linear.pkl": X_train_reg_linear,
    "processed_data/X_test_reg_linear.pkl": X_test_reg_linear,
    "processed_data/y_train_reg_linear.pkl": y_train_reg_linear,
    "processed_data/y_test_reg_linear.pkl": y_test_reg_linear,
}
for split_path, split_data in reg_split_files.items():
    with open(split_path, "wb") as f:
        pickle.dump(split_data, f)

print(" Data splits for LINEAR REGRESSION saved.")

 Data splits for LINEAR REGRESSION saved.


### Classification

In [9]:
# Classification on the TREE-ENCODED data: Churn is the target,
# every other column is a feature.
y_cls = df_encoded_tree['Churn']
X_cls = df_encoded_tree.drop(columns='Churn')

# 80/20 split with a fixed seed, stratified on the target
(
    X_train_cls,
    X_test_cls,
    y_train_cls,
    y_test_cls,
) = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42, stratify=y_cls)

# Persist each split to its own pickle file (file names use the "_clf" suffix)
cls_split_files = [
    ("processed_data/X_train_clf.pkl", X_train_cls),
    ("processed_data/X_test_clf.pkl", X_test_cls),
    ("processed_data/y_train_clf.pkl", y_train_cls),
    ("processed_data/y_test_clf.pkl", y_test_cls),
]
for split_path, split_data in cls_split_files:
    with open(split_path, "wb") as f:
        pickle.dump(split_data, f)

print("Data splits for CLASSIFICATION saved.")

Data splits for CLASSIFICATION saved.
