# Cancelamento de Clientes - Telco (dataset criado pela IBM para demonstração da ferramenta IBM Cognos Analytics)

### Contém informações sobre uma empresa fictícia de telecomunicações que forneceu serviços de telefonia residencial e internet para 7043 clientes na Califórnia no 3º trimestre.

### Etapa do pipeline - Realizado por Sabrina Otoni da Silva - 2024/01

### Objetivo:

In [1]:
from pathlib import Path

import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

import sys
import os

automations_dir = os.path.join(os.getcwd(), '../automations')

if automations_dir not in sys.path:
    sys.path.append(automations_dir)

from data_processing import LogTransformer, BoxCoxTransformer, RBFTransformer, KMeansCluster, DropColumns, ServiceTransformer, CategoricalEncoder

import warnings
warnings.filterwarnings('ignore')

In [2]:
datapath = Path('../data')
csv_path = Path(f'{datapath}/d02_intermediate')
preprocesspath = Path('../preprocessing')

In [3]:
X_train = pd.read_csv(f'{csv_path}/X_train.csv')
y_train = pd.read_csv(f'{csv_path}/y_train.csv')

In [4]:
class PassthroughTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X

In [5]:
def get_transformer(transformer_type: str, columns: list = None):
    if transformer_type == 'log':
        return LogTransformer(model_path='../preprocessing/log_transformer_model.pkl', columns=columns)
    elif transformer_type == 'boxcox':
        return BoxCoxTransformer(model_path='../preprocessing/boxcox_transformer_model.pkl', columns=columns)
    else:
        return PassthroughTransformer()

In [6]:
class ConditionalServiceTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, service_transformer, encoder_type):
        self.service_transformer = service_transformer
        self.encoder_type = encoder_type

    def fit(self, X, y=None):
        if self.encoder_type == 'onehot':
            self.service_transformer.fit(X, y)
        return self

    def transform(self, X):
        if self.encoder_type == 'onehot':
            return self.service_transformer.transform(X)
        return X

In [7]:
def get_conditional_transformer(use_rbf, use_kmeans):
    transformers = []
    if use_rbf:
        transformers.append(('rbf_transformer', RBFTransformer(model_path='../preprocessing/rbf_transformer_model.pkl', column='Tenure Months')))
    else:
        transformers.append(('rbf_passthrough', PassthroughTransformer()))

    if use_kmeans:
        transformers.append(('kmeans_cluster', KMeansCluster(model_path='../preprocessing/kmeans_model.pkl', columns_cluster=['Latitude', 'Longitude'])))
    else:
        transformers.append(('kmeans_passthrough', PassthroughTransformer()))
        
    return FeatureUnion(transformers)

In [8]:
def get_scaler(scaler: str):
    if scaler == 'standard':
        return StandardScaler()
    elif scaler == 'minmax':
        return MinMaxScaler()
    elif scaler == 'robust':
        return RobustScaler()
    else:
        return PassthroughTransformer()

In [9]:
def select_model(model_name: str, **params):
    model_dict = {
        'dummy': DummyClassifier,
        'logistic_regression': LogisticRegression,
        #'svr': SVR,
        'knn': KNeighborsClassifier,
        'random_forest': RandomForestClassifier,
        'xgboost': XGBClassifier
    }

    model_class = model_dict.get(model_name)
    
    if model_class is not None:
        return model_class(**params) if params else model_class()
    else:
        raise ValueError(f"Modelo {model_name} não encontrado.")

In [10]:
def build_pipeline(transformer_type: str, cond_encoder_type: str, cat_encoder_type: str, use_rbf: bool, use_kmeans: bool, scaler_type: str, model_name: str, model_params: dict = None):
    pipeline_steps = [
        ('transformation', get_transformer(transformer_type, columns=['Total Charges'])),
        ('import_drop', DropColumns(drop_columns=['City', 'Latitude', 'Longitude', 'ID', 'Tenure Months'])),
        ('conditional_service', ConditionalServiceTransformer(
            service_transformer=ServiceTransformer(
                columns=['Multiple Lines', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies']), 
                 encoder_type=cond_encoder_type)),
        ('categorical_encoder', CategoricalEncoder(
            specified_columns=['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines', 'Internet Service',
                               'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 
                               'Contract', 'Paperless Billing', 'Payment Method', 'Cluster'], encoder_type=cat_encoder_type)),           
        ('conditional_transformer', get_conditional_transformer(use_rbf, use_kmeans)),
        ('scaler', get_scaler(scaler_type))
    ]
    if model_params:
        pipeline_steps.append(('model', select_model(model_name, **model_params)))
    else:
        pipeline_steps.append(('model', select_model(model_name)))
    return Pipeline(pipeline_steps)

In [11]:
pipeline = build_pipeline(
    transformer_type='log',
    cond_encoder_type='onehot',
    cat_encoder_type='onehot',
    use_rbf=True,
    use_kmeans=True,
    scaler_type='minmax',
    model_name='dummy'
)

In [12]:
param_grid = {
    'transformation': [get_transformer(transformer_type='log'), get_transformer(transformer_type='boxcox'), PassthroughTransformer()],
    'conditional_service__encoder_type': ['onehot', 'label'],
    'categorical_encoder__encoder_type': ['onehot', 'label'],
    'conditional_transformer': [get_conditional_transformer(True, True), get_conditional_transformer(True, False), get_conditional_transformer(False, True), get_conditional_transformer(False, False)],
    'scaler': [StandardScaler(), MinMaxScaler()],
    'model': [select_model('dummy'), select_model('logistic_regression'), select_model('knn'), select_model('random_forest'), select_model('xgboost')]
}

param_grid = {
    # ... outros parâmetros
    'model': [select_model('logistic_regression'), select_model('random_forest')],
    'model__C': [0.1, 1, 10],  # Parâmetro para LogisticRegression
    'model__n_estimators': [100, 200]  # Parâmetro para RandomForestClassifier
}
param_grid = {
    # ... outros parâmetros
    'model': [select_model('logistic_regression'), select_model('random_forest')],
    'model__C': [0.1, 1, 10],  # Parâmetro para LogisticRegression
    'model__n_estimators': [100, 200]  # Parâmetro para RandomForestClassifier
}

In [13]:
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


[CV] END categorical_encoder__encoder_type=onehot, conditional_service__encoder_type=onehot, conditional_transformer=FeatureUnion(transformer_list=[('rbf_transformer',
                                RBFTransformer(column='Tenure Months',
                                               model_path='../preprocessing/rbf_transformer_model.pkl')),
                               ('kmeans_cluster',
                                KMeansCluster(columns_cluster=['Latitude',
                                                               'Longitude'],
                                              model_path='../preprocessing/kmeans_model.pkl'))]), model=DummyClassifier(), scaler=StandardScaler(), transformation=LogTransformer(model_path='../preprocessing/log_transformer_model.pkl'); total time=   0.1s
[CV] END categorical_encoder__encoder_type=onehot, conditional_service__encoder_type=onehot, conditional_transformer=FeatureUnion(transformer_list=[('rbf_transformer',
                              

ValueError: 
All the 2400 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1200 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 416, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 370, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\joblib\memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 950, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\utils\_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 918, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\Desktop\Data Science & AI\PROJETOS\Churn\notebooks\../automations\data_processing.py", line 287, in fit
    self.encoders[col] = OneHotEncoder(sparse_output=False, handle_unknown='error', drop=self.drop).fit(X[[col]])
                                                                                                        ~^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\pandas\core\frame.py", line 3813, in __getitem__
    indexer = self.columns._get_indexer_strict(key, "columns")[1]
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\pandas\core\indexes\base.py", line 6070, in _get_indexer_strict
    self._raise_if_missing(keyarr, indexer, axis_name)
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\pandas\core\indexes\base.py", line 6130, in _raise_if_missing
    raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Index(['Cluster'], dtype='object')] are in the [columns]"

--------------------------------------------------------------------------------
1200 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\pandas\core\indexes\base.py", line 3802, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Cluster'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 416, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 370, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\joblib\memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 950, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\utils\_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 918, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\Desktop\Data Science & AI\PROJETOS\Churn\notebooks\../automations\data_processing.py", line 290, in fit
    self.encoders[col] = LabelEncoder().fit(X[col])
                                            ~^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\pandas\core\frame.py", line 3807, in __getitem__
    indexer = self.columns.get_loc(key)
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\012728631\AppData\Local\anaconda3\Lib\site-packages\pandas\core\indexes\base.py", line 3804, in get_loc
    raise KeyError(key) from err
KeyError: 'Cluster'


In [None]:
grid_search.best_params_

{'categorical_encoder__encoder_type': 'label',
 'conditional_service_transformer__encoder_type': 'label',
 'kmeans_cluster': KMeansCluster(columns_cluster=['Latitude', 'Longitude']),
 'random_forest__class_weight': 'balanced',
 'rbf_transformer': RBFTransformer(column='Tenure Months'),
 'scaler': StandardScaler(),
 'transformation': BoxCoxTransformer(columns=['Total Charges'])}