# Cancelamento de Clientes - Telco (dataset criado pela IBM para demonstração da ferramenta IBM Cognos Analytics)

### Contém informações sobre uma empresa fictícia de telecomunicações que forneceu serviços de telefonia residencial e internet para 7043 clientes na Califórnia no 3º trimestre.

### Etapa do pipeline - Realizado por Sabrina Otoni da Silva - 2024/01

### Objetivo: Testar diversas combinações dos tratamentos desenvolvidos e ajustes nos modelos escolhidos para refinamento das predições. 

In [1]:
from pathlib import Path

import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import sys
import os

automations_dir = os.path.join(os.getcwd(), '../automations')

if automations_dir not in sys.path:
    sys.path.append(automations_dir)

from data_processing import LogTransformer, BoxCoxTransformer, RBFTransformer, KMeansCluster, DropColumns, ServiceTransformer, CategoricalEncoder

import warnings
warnings.filterwarnings('ignore')

Por questões de processamento, foram selecionados 3 modelos.

In [2]:
datapath = Path('../data')
csv_path = Path(f'{datapath}/d02_intermediate')
preprocesspath = Path('../preprocessing')

In [3]:
X_train = pd.read_csv(f'{csv_path}/X_train.csv')
y_train = pd.read_csv(f'{csv_path}/y_train.csv')

In [4]:
class PassthroughTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X

In [5]:
def get_transformer(transformer_type: str, columns: list = None):
    if transformer_type == 'log':
        return LogTransformer(model_path='../preprocessing/log_transformer_model.pkl', columns=columns)
    elif transformer_type == 'boxcox':
        return BoxCoxTransformer(model_path='../preprocessing/boxcox_transformer_model.pkl', columns=columns)
    else:
        return PassthroughTransformer()

In [6]:
class ConditionalServiceTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, service_transformer, encoder_type):
        self.service_transformer = service_transformer
        self.encoder_type = encoder_type

    def fit(self, X, y=None):
        if self.encoder_type == 'onehot':
            self.service_transformer.fit(X, y)
        return self

    def transform(self, X):
        if self.encoder_type == 'onehot':
            return self.service_transformer.transform(X)
        return X

In [7]:
def get_conditional_transformer(use_rbf, use_kmeans):
    transformers = []
    if use_rbf:
        transformers.append(('rbf_transformer', RBFTransformer(model_path='../preprocessing/rbf_transformer_model.pkl', column='Tenure Months')))
    else:
        transformers.append(('rbf_passthrough', PassthroughTransformer()))

    if use_kmeans:
        transformers.append(('kmeans_cluster', KMeansCluster(model_path='../preprocessing/kmeans_model.pkl', columns_cluster=['Latitude', 'Longitude'])))
    else:
        transformers.append(('kmeans_passthrough', PassthroughTransformer()))
        
    return FeatureUnion(transformers)

In [8]:
def get_scaler(scaler: str):
    if scaler == 'standard':
        return StandardScaler()
    elif scaler == 'minmax':
        return MinMaxScaler()
    elif scaler == 'robust':
        return RobustScaler()
    else:
        return PassthroughTransformer()

In [9]:
def select_model(model_name: str, **params):
    model_dict = {
        'dummy': DummyClassifier,
        'logistic_regression': LogisticRegression,
        'svr': SVR,
        'knn': KNeighborsClassifier,
        'random_forest': RandomForestClassifier,
        'xgboost': XGBClassifier
    }

    model_class = model_dict.get(model_name)
    
    if model_class is not None:
        return model_class(**params) if params else model_class()
    else:
        raise ValueError(f"Modelo {model_name} não encontrado.")

In [10]:
def build_pipeline(transformer_type: str, cond_encoder_type: str, cat_encoder_type: str, use_rbf: bool, use_kmeans: bool, scaler_type: str, model_name: str, model_params: dict = None):
    pipeline_steps = [
        ('conditional_transformer', get_conditional_transformer(use_rbf, use_kmeans)),
        ('import_drop', DropColumns(drop_columns=['City', 'Latitude', 'Longitude', 'ID', 'Tenure Months'])),
        ('conditional_service', ConditionalServiceTransformer(
            service_transformer=ServiceTransformer(columns=['Multiple Lines', 'Online Security', 'Online Backup', 'Device Protection', 
                                                            'Tech Support', 'Streaming TV', 'Streaming Movies']), encoder_type=cond_encoder_type)),
        ('categorical_encoder', CategoricalEncoder(
            specified_columns=['Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines', 'Internet Service',
                               'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 
                               'Contract', 'Paperless Billing', 'Payment Method', 'Cluster'], encoder_type=cat_encoder_type)), 
        ('transformation', get_transformer(transformer_type, columns=['Total Charges'])),          
        ('scaler', get_scaler(scaler_type))
    ]
    if model_params:
        pipeline_steps.append(('model', select_model(model_name, **model_params)))
    else:
        pipeline_steps.append(('model', select_model(model_name)))
    return Pipeline(pipeline_steps)

In [11]:
pipeline = build_pipeline(
    transformer_type='log',
    cond_encoder_type='onehot',
    cat_encoder_type='onehot',
    use_rbf=True,
    use_kmeans=True,
    scaler_type='minmax',
    model_name='dummy'
)

In [12]:
param_grid = {
    'transformation': [get_transformer(transformer_type='log'), get_transformer(transformer_type='boxcox'), PassthroughTransformer()],
    'conditional_service__encoder_type': ['onehot', 'label'],
    'categorical_encoder__encoder_type': ['onehot', 'label'],
    'conditional_transformer': [get_conditional_transformer(True, True), get_conditional_transformer(True, False), get_conditional_transformer(False, True), get_conditional_transformer(False, False)],
    'scaler': [MinMaxScaler()],
    'model': [select_model('dummy'), select_model('logistic_regression'), select_model('xgboost')]
}

# param_grid = {
#     # ... outros parâmetros
#     'model': [select_model('logistic_regression'), select_model('random_forest')],
#     'model__C': [0.1, 1, 10],  # Parâmetro para LogisticRegression
#     'model__n_estimators': [100, 200]  # Parâmetro para RandomForestClassifier
# }
# param_grid = {
#     # ... outros parâmetros
#     'model': [select_model('logistic_regression'), select_model('random_forest')],
#     'model__C': [0.1, 1, 10],  # Parâmetro para LogisticRegression
#     'model__n_estimators': [100, 200]  # Parâmetro para RandomForestClassifier
# }

In [13]:
# grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=StratifiedKFold.split(X_train, y_train), scoring='accuracy', verbose=2)
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, error_score="raise")
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


NotImplementedError: Transformação de numpy.ndarray não suportada nesta versão.

In [None]:
grid_search.best_params_

{'categorical_encoder__encoder_type': 'label',
 'conditional_service_transformer__encoder_type': 'label',
 'kmeans_cluster': KMeansCluster(columns_cluster=['Latitude', 'Longitude']),
 'random_forest__class_weight': 'balanced',
 'rbf_transformer': RBFTransformer(column='Tenure Months'),
 'scaler': StandardScaler(),
 'transformation': BoxCoxTransformer(columns=['Total Charges'])}