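# Preprocessing Pipeline

This notebook loads the cleaned dataset, encodes its categorical columns with a mixed one-hot / ordinal / label encoding pipeline, saves the encoded frame, and writes train/test splits for a regression task (target: `MonthlyCharges`) and a classification task (target: `Churn`).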

## Encoding pipeline

### Importing Libraries

In [1]:
import pickle
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

### Load cleaned data

In [2]:
# Load the cleaned dataset (presumably written by an earlier cleaning step — confirm).
# NOTE: read_pickle unpickles arbitrary Python objects, so only point it at a
# trusted, project-generated file.
df = pd.read_pickle("processed_data/cleaned_data.pkl")
print("Cleaned data loaded")

Cleaned data loaded


In [3]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4931 entries, 0 to 4999
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   gender                     4931 non-null   object  
 1   SeniorCitizen              4931 non-null   object  
 2   Partner                    4931 non-null   object  
 3   Dependents                 4931 non-null   object  
 4   PhoneService               4931 non-null   object  
 5   MultipleLines              4931 non-null   object  
 6   InternetService            4931 non-null   object  
 7   OnlineSecurity             4931 non-null   object  
 8   OnlineBackup               4931 non-null   object  
 9   DeviceProtection           4931 non-null   object  
 10  TechSupport                4931 non-null   object  
 11  StreamingTV                4931 non-null   object  
 12  StreamingMovies            4931 non-null   object  
 13  Contract                   4931 non-nu

### Pipeline

### 1. Encoding pipeline 

#### Custom transformer for multi-column LabelEncoder

In [4]:
# Custom transformer for multi-column LabelEncoder
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    """Apply an independent ``LabelEncoder`` to each listed column.

    Parameters
    ----------
    columns : list of str or None, default=None
        Names of the columns to encode. With ``None`` nothing is encoded and
        ``transform`` returns an unchanged copy of its input.

    Attributes
    ----------
    encoders_ : dict
        Maps each column name to the ``LabelEncoder`` fitted on it. Created by
        ``fit``; it does not exist on an unfitted instance.
    """

    def __init__(self, columns=None):
        # Only constructor parameters are stored here. Learned state is created
        # in fit(), so an unfitted instance carries no trailing-underscore
        # attribute and is not mistaken for a fitted one.
        self.columns = columns

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per configured column.

        Parameters
        ----------
        X : object supporting ``X[col].values`` for every configured column
            (e.g. a pandas DataFrame).
        y : ignored
            Present for pipeline compatibility.

        Returns
        -------
        self
        """
        # Start from an empty mapping on every fit so encoders learned during a
        # previous fit never linger.
        self.encoders_ = {}
        if self.columns is not None:
            for col in self.columns:
                le = LabelEncoder()
                # Fit on the specific column
                le.fit(X[col].values)
                self.encoders_[col] = le
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each configured column replaced by its
        encoded values; the input itself is not modified.
        """
        X_copy = X.copy()
        if self.columns is not None:
            for col in self.columns:
                X_copy[col] = self.encoders_[col].transform(X_copy[col].values)
        return X_copy

In [5]:
# Inspect the dtype of the tenure-group column before it is label-encoded.
df['Tenure_(12-month_groups)'].dtype

CategoricalDtype(categories=[[1, 13), [13, 25), [25, 37), [37, 49), [49, 61), [61, 73)], ordered=True, categories_dtype=interval[int64, left])

In [6]:
# Turn each tenure interval into a "left - right" string label.
# The mapper passes through anything that is not a pd.Interval, so re-running
# this cell is a no-op: after the first run the values are already strings,
# which have no .left/.right and would otherwise raise AttributeError.
df['Tenure_(12-month_groups)'] = df['Tenure_(12-month_groups)'].apply(
    lambda x: f"{x.left} - {x.right}" if isinstance(x, pd.Interval) else x
)

In [7]:
# Column groups, one list per encoding strategy
one_hot_cols = [
    'InternetService',
    'PaymentMethod',
]
ordinal_cols = ['Contract']
label_cols = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'PaperlessBilling',
    'Tenure_(12-month_groups)',
]
internet_service_label_cols = [
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

# Encoder instances for the one-hot and ordinal groups.
# The ordinal encoder is given an explicit category order for Contract.
one_hot_transformer = OneHotEncoder(drop=None, sparse_output=False)
ordinal_transformer = OrdinalEncoder(
    categories=[['Month-to-month', 'One year', 'Two year']]
)

# (name, transformer, columns) triples; both label groups use the custom
# multi-column label encoder.
encoding_steps = [
    ('onehot', one_hot_transformer, one_hot_cols),
    ('ordinal', ordinal_transformer, ordinal_cols),
    ('label', MultiColumnLabelEncoder(columns=label_cols), label_cols),
    (
        'internet_label',
        MultiColumnLabelEncoder(columns=internet_service_label_cols),
        internet_service_label_cols,
    ),
]

# Columns not named in any group are passed through unchanged
encoder_preprocessor_mixed = ColumnTransformer(
    transformers=encoding_steps,
    remainder='passthrough',
)

# Single-step pipeline wrapping the column transformer
pipeline_mixed_encoding = Pipeline(
    steps=[('encoder_preprocessor', encoder_preprocessor_mixed)]
)

In [8]:
# Fit the pipeline on the cleaned frame and encode it in one pass
encoded_values = pipeline_mixed_encoding.fit_transform(df)

# Rebuild the output column names by hand, in the order the column transformer
# emits them: one-hot, ordinal, label, internet-label, then passthrough columns.
fitted_onehot = (
    pipeline_mixed_encoding
    .named_steps['encoder_preprocessor']
    .named_transformers_['onehot']
)
onehot_names = fitted_onehot.get_feature_names_out(one_hot_cols)
explicitly_encoded = one_hot_cols + ordinal_cols + label_cols + internet_service_label_cols
remainder_features = [col for col in df.columns if col not in explicitly_encoded]
feature_names_mixed = [
    *onehot_names,
    *ordinal_cols,
    *label_cols,
    *internet_service_label_cols,
    *remainder_features,
]

print(encoded_values, feature_names_mixed)

# Wrap the encoded values in a labelled DataFrame and let pandas infer
# concrete dtypes for the object columns
df_mixed_encoded = pd.DataFrame(
    encoded_values, columns=feature_names_mixed
).infer_objects()

print("Shape of data for tree models:", df_mixed_encoded.shape)
df_mixed_encoded.info()

# Persist the encoded frame for the modelling notebooks
df_mixed_encoded.to_pickle("processed_data/data_mixed_encoded.pkl")
print("\n Data for general models saved to data_mixed_encoded.pkl")

[[0.0 1.0 0.0 ... 96.48 0 2]
 [0.0 0.0 1.0 ... 87.58 0 0]
 [0.0 0.0 1.0 ... 79.78 0 0]
 ...
 [0.0 0.0 1.0 ... 91.49 0 0]
 [0.0 0.0 1.0 ... 47.74 0 0]
 [0.0 0.0 1.0 ... 67.31 1 0]] ['InternetService_DSL', 'InternetService_Fiber optic', 'InternetService_No', 'PaymentMethod_Bank transfer (automatic)', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check', 'Contract', 'gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Tenure_(12-month_groups)', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'SeniorCitizen', 'MonthlyCharges', 'Churn', 'TotalInternetServicesUsed']
Shape of data for tree models: (4931, 25)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4931 entries, 0 to 4930
Data columns (total 25 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0

In [9]:
# Re-display the schema of the encoded frame (column order and inferred dtypes).
df_mixed_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4931 entries, 0 to 4930
Data columns (total 25 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   InternetService_DSL                      4931 non-null   float64
 1   InternetService_Fiber optic              4931 non-null   float64
 2   InternetService_No                       4931 non-null   float64
 3   PaymentMethod_Bank transfer (automatic)  4931 non-null   float64
 4   PaymentMethod_Credit card (automatic)    4931 non-null   float64
 5   PaymentMethod_Electronic check           4931 non-null   float64
 6   PaymentMethod_Mailed check               4931 non-null   float64
 7   Contract                                 4931 non-null   float64
 8   gender                                   4931 non-null   int64  
 9   Partner                                  4931 non-null   int64  
 10  Dependents                               4931 no

## Train-Test Split

### Regression

In [10]:
# Regression: MonthlyCharges is the target, every other encoded column is a feature
y_reg_mixed = df_mixed_encoded['MonthlyCharges']
X_reg_mixed = df_mixed_encoded.drop(columns='MonthlyCharges')

# 80/20 split with a fixed seed
X_train_reg_mixed, X_test_reg_mixed, y_train_reg_mixed, y_test_reg_mixed = train_test_split(
    X_reg_mixed,
    y_reg_mixed,
    test_size=0.2,
    random_state=42,
)

# Pickle each split to its own file
regression_split_files = {
    "processed_data/X_train_reg_mixed.pkl": X_train_reg_mixed,
    "processed_data/X_test_reg_mixed.pkl": X_test_reg_mixed,
    "processed_data/y_train_reg_mixed.pkl": y_train_reg_mixed,
    "processed_data/y_test_reg_mixed.pkl": y_test_reg_mixed,
}
for split_path, split_data in regression_split_files.items():
    with open(split_path, "wb") as pkl_file:
        pickle.dump(split_data, pkl_file)

print("Data splits for REGRESSION (MIXED ENCODED) saved.")

Data splits for REGRESSION (MIXED ENCODED) saved.


### Classification

In [11]:
# Classification: Churn is the target, every other encoded column is a feature
y_cls = df_mixed_encoded['Churn']
X_cls = df_mixed_encoded.drop(columns='Churn')

# 80/20 split with a fixed seed, stratified on the target
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_cls,
    y_cls,
    test_size=0.2,
    random_state=42,
    stratify=y_cls,
)

# Pickle each split to its own file
classification_split_files = {
    "processed_data/X_train_clf.pkl": X_train_cls,
    "processed_data/X_test_clf.pkl": X_test_cls,
    "processed_data/y_train_clf.pkl": y_train_cls,
    "processed_data/y_test_clf.pkl": y_test_cls,
}
for split_path, split_data in classification_split_files.items():
    with open(split_path, "wb") as pkl_file:
        pickle.dump(split_data, pkl_file)

print("Data splits for CLASSIFICATION saved.")

Data splits for CLASSIFICATION saved.
