## Bibliotecas

In [0]:
!pip install xgboost

In [0]:
import pyspark.sql.functions as F
from sklearn.model_selection import train_test_split
from pyspark.ml.feature import VectorAssembler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow
import seaborn as sns

colors = ['#353839', '#5D8AA8']

In [0]:
def split(df):
    """Split a labelled DataFrame into stratified train/validation/test sets.

    The rows are divided 60/20/20 (train/validation/test), stratifying on the
    ``label`` column in both steps, and a donut chart with the size of each
    partition is drawn on a single new matplotlib figure.

    Args:
        df: pandas DataFrame holding the feature columns plus a ``label`` column.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    X = df.drop(columns=['label'])
    y = df['label']

    # Hold out 20% of the rows as the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    # Carve the validation set out of the remaining 80%: 0.25 * 0.8 = 0.2 of the total.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train,
        test_size=0.25,
        random_state=42,
        stratify=y_train,
    )

    sizes = [len(X_train), len(X_val), len(X_test)]
    total = sum(sizes)
    labels = [
        f'{name}\n{size} amostras\n({size / total:.1%})'
        for name, size in zip(('Treino', 'Validação', 'Teste'), sizes)
    ]

    # Local palette with its own name so it does not shadow the notebook-level `colors`.
    partition_colors = ['#353839', '#E6C229', '#5D8AA8']

    # One figure only: opening a second one would leave an empty figure behind.
    plt.figure(figsize=(8, 6))
    plt.pie(sizes,
            labels=labels,
            colors=partition_colors,
            autopct='%1.1f%%',
            startangle=90,
            wedgeprops={'width': 0.1})

    return X_train, X_val, X_test, y_train, y_val, y_test


In [0]:
# Read the source table into a Spark DataFrame. `spark` is not defined in this
# notebook, so it is presumably the session injected by the runtime — confirm.
df_spark = spark.table('telecom.gold.teleco_ml_ready')

In [0]:
# The 'label' column is used as the target, so the 'Churn' column is discarded.
df_spark = df_spark.drop('Churn')
df = df_spark.toPandas()

# Move 'label' to the last position: take it out of the frame and append it again.
aux = df.pop('label')
df['label'] = aux

df.head()


In [0]:
# Stratified train/validation/test partition of the pandas frame; the call also
# draws the chart with the size of each partition.
X_train, X_val, X_test, y_train, y_val, y_test = split(df)

## Feature Engineering

In [0]:
# Classify every column by cardinality: more than 6 distinct values is treated
# as numerical, anything else as categorical.
col = df.columns.tolist()
distinct_counts = {name: len(df[name].unique()) for name in col}

numerical_features = [name for name in col if distinct_counts[name] > 6]
categorical_features = [name for name in col if not distinct_counts[name] > 6]

# The target is not a feature; take it out of the categorical list.
categorical_features.remove('label')

print('Categorical Features :',*categorical_features)
print('Numerical Features :',*numerical_features)

In [0]:
# Correlation of every column with the target, strongest positive first.
corr = (
    df.corrwith(df['label'])
      .sort_values(ascending=False)
      .to_frame(name='Correlation')
)

# Single-column heatmap on a fixed [-1, 1] colour scale.
fig, ax = plt.subplots(figsize=(5, 8))
sns.heatmap(
    corr,
    annot=True,
    cmap=colors,
    linewidths=.8,
    linecolor='black',
    vmin=-1,
    vmax=1,
    ax=ax,
)
fig.tight_layout()
plt.show()



Notamos que algumas features não apresentam correlação relevante com a variável alvo. Irei remover (dropar) as colunas cuja correlação esteja no intervalo (-0.1, 0.1).

In [0]:
# Columns whose correlation with the target lies strictly inside (-0.1, 0.1).
weak_mask = corr['Correlation'].abs() < 0.1
no_corr = corr.index[weak_mask].tolist()
print(no_corr)


## Feature selection

In [0]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,mutual_info_classif


In [0]:
features = df[categorical_features]
target = df['label']

# Chi-squared test to score the categorical features; k='all' keeps every
# feature so that only the scores are used.
best_features = SelectKBest(score_func=chi2, k='all')
fit = best_features.fit(features, target)

featureScores = pd.DataFrame(
    {'Chi Squared Score': fit.scores_},
    index=list(features.columns),
)

# Scores shown as a single-column heatmap, highest first.
fig, ax = plt.subplots(figsize=(5, 8))
sns.heatmap(
    featureScores.sort_values(by='Chi Squared Score', ascending=False),
    annot=True,
    cmap=colors,
    linewidths=0.4,
    linecolor='black',
    fmt='.2f',
    ax=ax,
);
ax.set_title('Selection of Categorical Features');

In [0]:
# Categorical features whose chi-squared score is below 20 are flagged for removal.
low_score_mask = featureScores['Chi Squared Score'] < 20
bad_features = featureScores.index[low_score_mask].tolist()
print(bad_features)


In [0]:
# Union of the weakly correlated columns and the low chi-squared columns.
# Sorted so that the list order is the same on every run (set order is not).
cols_to_drop = sorted(set(no_corr) | set(bad_features))

cols_to_drop

In [0]:
# Copy of the full frame without the rejected columns. Only its shape is printed
# here; `df_final` is not referenced again in the visible cells.
df_final = df.drop(columns=cols_to_drop)
print(df_final.shape)

In [0]:
!pip install -U imbalanced-learn
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [0]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler , FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import make_pipeline 



def drop_categorical_columns(X,column = cols_to_drop ):
    """Return ``X`` without the columns listed in ``column``.

    The function is applied to the categorical subset selected by the
    ColumnTransformer, while ``cols_to_drop`` is built from statistics computed
    over all columns, so it may name columns that are not in ``X``. Those names
    are ignored instead of raising ``KeyError``.

    Args:
        X: pandas DataFrame to filter.
        column: column names to remove; defaults to the notebook-level
            ``cols_to_drop`` as it was when this function was defined.

    Returns:
        A new DataFrame without the listed columns that were present in ``X``.
    """
    return X.drop(columns=column, errors='ignore')

# Categorical branch: its only step removes the rejected columns.
categorical_pipeline = Pipeline(
    steps=[('drop', FunctionTransformer(drop_categorical_columns))]
)

# Numerical columns are min-max scaled; categorical columns go through the
# branch above. Output order follows the order of the transformers.
prep_pipeline = ColumnTransformer(
    transformers=[
        ('minmax', MinMaxScaler(), numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
)

# Final pipeline: preprocessing followed by SMOTE oversampling.
smote_step = SMOTE(sampling_strategy=1, random_state=42)
full_pipeline = make_pipeline(prep_pipeline, smote_step)




In [0]:
# --------------------------------------------
# 1. TRAINING data
# --------------------------------------------
# With SMOTE (balanced classes)
X_train_smote, y_train_smote = full_pipeline.fit_resample(X_train, y_train)

# Without SMOTE (preprocessed only); the target is left unchanged
X_train_prep = prep_pipeline.fit_transform(X_train)
y_train_prep = y_train

# --------------------------------------------
# 2. VALIDATION data (no SMOTE)
# --------------------------------------------
X_val_prep = prep_pipeline.transform(X_val)
y_val_prep = y_val

# --------------------------------------------
# 3. TEST data (no SMOTE) - keep it for the very end!
# --------------------------------------------
X_test_prep = prep_pipeline.transform(X_test)
y_test_prep = y_test

# Sanity check of the resulting shapes
print("\n--- Shapes ---")
print(f"Treino (SMOTE): {X_train_smote.shape}, Target: {y_train_smote.shape}")
print(f"Treino (sem SMOTE): {X_train_prep.shape}, Target: {y_train.shape}")
print(f"Validação: {X_val_prep.shape}, Target: {y_val.shape}")
print(f"Teste: {X_test_prep.shape}, Target: {y_test.shape}")

In [0]:
# 1. Numerical feature names (the scaler does not rename them)
numeric_features_transformed = numerical_features

# 2. Categorical feature names that remain after the drop step
categorical_features_kept = list(
    filter(lambda name: name not in cols_to_drop, categorical_features)
)

# 3. All names together, in the same order as the ColumnTransformer branches
feature_names = [*numeric_features_transformed, *categorical_features_kept]

print("Nomes das features após pré-processamento:\n", feature_names)

## Modeling


### Func

In [0]:
from sklearn.model_selection import RepeatedStratifiedKFold,cross_val_score
from sklearn.metrics import (roc_auc_score,
                             roc_curve,
                            accuracy_score,
                            RocCurveDisplay,
                            confusion_matrix,
                            precision_recall_curve,
                            classification_report,f1_score, mean_squared_error)

import seaborn as sns

In [0]:
def plot_metric(y_val,y_probs):
    """Print ROC AUC and the precision/recall arrays, and plot both curves.

    The left panel shows the ROC curve and the right panel shows precision and
    recall as functions of the decision threshold, both on one figure.

    Args:
        y_val: true binary labels.
        y_probs: predicted scores/probabilities for the positive class.

    Returns:
        None. Output is printed and drawn on a new matplotlib figure.
    """
    fpr, tpr, _ = roc_curve(y_val, y_probs)
    rocauc_score = roc_auc_score(y_val, y_probs)
    print(f'ROC AUC SCORE: {(rocauc_score):.2f}')

    # A single call yields both arrays; print the precision array itself rather
    # than the whole (precision, recall, thresholds) tuple.
    precisions, recalls, thresholds = precision_recall_curve(y_val, y_probs)
    print(f'Precision Score: {precisions}')
    print(f'Recall Score: {recalls}')

    fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(8, 6))

    # Pass the target axes explicitly; otherwise the display opens its own figure.
    RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax_roc)

    # precision/recall have one more element than thresholds, hence the [:-1].
    ax_pr.plot(thresholds, precisions[:-1], "b--", label="Precision")
    ax_pr.plot(thresholds, recalls[:-1], "g-", label="Recall")
    ax_pr.set_xlabel("Threshold")
    ax_pr.legend()
    fig.tight_layout()
   
    


In [0]:
def model(classifier, x_train, y_train, x_val, y_val):
    """Fit a classifier, cross-validate it and plot validation diagnostics.

    The classifier is fitted in place on the training data. Repeated stratified
    10-fold cross-validation (3 repeats, ROC AUC) is then run on the same
    training data, and the ROC curve plus the precision/recall-vs-threshold
    curves for the validation data are drawn side by side.

    NOTE(review): when ``x_train`` has been oversampled beforehand, the
    cross-validation folds contain synthetic samples, so the CV score is likely
    optimistic compared with the validation ROC AUC.

    Args:
        classifier: estimator exposing ``fit`` and ``predict_proba``.
        x_train, y_train: training features and labels.
        x_val, y_val: validation features and labels.

    Returns:
        None. Metrics are printed and the figure is shown.
    """
    classifier.fit(x_train, y_train)
    # Column 1 of predict_proba is used as the positive-class score.
    y_probs = classifier.predict_proba(x_val)[:, 1]

    # Cross-validation on the training data
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
    cross_score = cross_val_score(classifier, x_train, y_train, cv=cv, scoring='roc_auc')
    print(f'Cross Validation Score: {cross_score.mean():.2f}')
    print(f'Cross Validation Std: {cross_score.std():.2f}')

    # ROC curve on the validation data
    fpr, tpr, _ = roc_curve(y_val, y_probs)
    roc_auc = roc_auc_score(y_val, y_probs)
    print(f'ROC AUC SCORE: {roc_auc:.2f}')

    # Precision/recall for every candidate threshold
    precision, recall, thresholds = precision_recall_curve(y_val, y_probs)

    fig, axs = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: ROC curve with the chance diagonal as reference
    axs[0].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    axs[0].plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
    axs[0].set_xlim([0.0, 1.0])
    axs[0].set_ylim([0.0, 1.05])
    axs[0].set_xlabel('False Positive Rate')
    axs[0].set_ylabel('True Positive Rate')
    axs[0].set_title('ROC Curve')
    axs[0].legend(loc="lower right")

    # Right panel: precision/recall have one more element than thresholds, hence [:-1]
    axs[1].plot(thresholds, precision[:-1], label='Precision', color='green')
    axs[1].plot(thresholds, recall[:-1], label='Recall', color='blue')
    axs[1].set_xlabel('Threshold')
    axs[1].set_ylabel('Score')
    axs[1].set_title('Precision-Recall vs Threshold')
    axs[1].legend()

    plt.tight_layout()
    plt.show()


In [0]:
def model_evaluation(classifier,x_test,y_test):
    """Show an annotated confusion matrix and print the classification report.

    Each cell of the heatmap is annotated with its name, its count and its share
    of all samples. The annotations are reshaped to 2x2, so the function only
    works for a binary problem.

    Args:
        classifier: fitted estimator exposing ``predict``.
        x_test: features to evaluate on.
        y_test: true labels for ``x_test``.

    Returns:
        None. The heatmap is shown and the report is printed.
    """
    # Predict once and reuse the result for both the matrix and the report.
    y_pred = classifier.predict(x_test)

    cm = confusion_matrix(y_test, y_pred)
    names = ['True Neg','False Pos','False Neg','True Pos']
    counts = cm.flatten()
    percentages = ['{0:.2%}'.format(value) for value in counts / np.sum(cm)]
    labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names, counts, percentages)]
    labels = np.asarray(labels).reshape(2,2)

    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(cm, annot=labels, cmap='Blues', fmt='')
    plt.title('Matriz de Confusão')
    plt.show()

    print(classification_report(y_test, y_pred))

### XGBoost Classifier

#### Busca dos melhores parametros com Optuna

In [0]:
!pip install optuna

In [0]:
def objective(trial):
    """Optuna objective for XGBClassifier; returns a validation error to minimise.

    A classifier is trained on the SMOTE-balanced training data with the
    hyperparameters suggested by ``trial`` and scored on the preprocessed
    validation data.

    Args:
        trial: ``optuna`` trial used to sample the hyperparameters.

    Returns:
        ``mean_squared_error`` between the validation labels and the predicted
        classes (lower is better). Assuming the labels are encoded as 0/1, this
        equals the misclassification rate; it is not a root mean squared error.
    """
    # Imported locally because the notebook-level import of XGBClassifier sits in
    # a later cell than the one that runs this study.
    from xgboost import XGBClassifier

    params = {
        # Classification objective, matching the default used by the final models
        # (the regression objective "reg:squarederror" was used here before).
        "objective": "binary:logistic",
        "n_estimators": 1000,
        "verbosity": 0,
        # Fixed seed so that row/column subsampling is repeatable between trials.
        "random_state": 42,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }

    # Named `classifier` so it does not shadow the notebook's `model` function.
    classifier = XGBClassifier(**params)
    classifier.fit(X_train_smote, y_train_smote, verbose=False)
    predictions = classifier.predict(X_val_prep)
    return mean_squared_error(y_val_prep, predictions)

In [0]:
import optuna
# TPE is optuna's default sampler; seeding it makes the search reproducible.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)


In [0]:
print('Melhores hiperparâmetros:', study.best_params)
# The objective returns mean_squared_error (no square root), so it is labelled MSE.
print('Melhor MSE:', study.best_value)

#### Sem Otimização dos hiperparametros

In [0]:
from xgboost import XGBClassifier

# Baseline XGBoost with hand-picked hyperparameters, trained on the
# SMOTE-balanced training data and evaluated on the validation data.
xgb_classifier = XGBClassifier(
    learning_rate=0.001,
    max_depth=5,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.7,
    random_state=42,
)

model(xgb_classifier, X_train_smote, y_train_smote, X_val_prep, y_val)
model_evaluation(xgb_classifier, X_val_prep, y_val)

In [0]:
# Same baseline XGBoost, this time trained on the preprocessed training data
# without SMOTE, for comparison with the balanced run.
xgb_classifier = XGBClassifier(
    learning_rate=0.001,
    max_depth=5,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.7,
    random_state=42,
)

model(xgb_classifier, X_train_prep, y_train_prep, X_val_prep, y_val)
model_evaluation(xgb_classifier, X_val_prep, y_val)

%md
#### Com Otimização dos hiperparametros

In [0]:

# XGBoost with hard-coded hyperparameters, trained on the SMOTE-balanced data.
# NOTE(review): the values presumably come from an earlier Optuna run — confirm;
# n_estimators is 700 here while the search objective fixes it at 1000.
xgb_classifier = XGBClassifier(
    learning_rate=0.021160840396171342,
    max_depth=7,
    n_estimators=700,
    subsample=0.8120575211020147,
    colsample_bytree=0.1441064068410362,
    min_child_weight=1,
    random_state=42,
)

model(xgb_classifier, X_train_smote, y_train_smote, X_val_prep, y_val)
model_evaluation(xgb_classifier, X_val_prep, y_val)

In [0]:
from xgboost import plot_importance

# Attach the real feature names to the booster so that the plots show them.
xgb_classifier.get_booster().feature_names = feature_names

# One explicitly sized figure per importance type. The axes are passed to
# plot_importance because it otherwise creates its own, leaving an empty figure.
importance_plots = (
    ('weight', 'Importância das Features - Weight'),
    ('cover', 'Importância das Features - cover'),
)
for importance_type, title in importance_plots:
    fig, ax = plt.subplots(figsize=(12, 8))
    plot_importance(xgb_classifier, ax=ax, max_num_features=20, importance_type=importance_type)
    ax.set_title(title)
plt.show()

### LGBM


In [0]:
!pip install lightgbm
from lightgbm import LGBMClassifier

In [0]:
def objective(trial):
    """Optuna objective for LGBMClassifier; returns the validation F1 to maximise.

    The classifier is trained on the SMOTE-balanced, preprocessed training data
    and scored on the preprocessed validation data, i.e. the same data the final
    LightGBM models in this notebook are trained and evaluated on (the raw
    ``X_train``/``X_val`` frames were used here before).

    This definition replaces the earlier XGBoost ``objective`` of the same name.

    Args:
        trial: ``optuna`` trial used to sample the hyperparameters.

    Returns:
        F1 score of the predicted classes on the validation data (higher is better).
    """
    params = {

        "metric": "binary",
        "n_estimators": 1000,
        "verbosity": -1,
        # Bagging must be enabled for `subsample` to have any effect.
        "bagging_freq": 1,
        # Fixed seed so that bagging/feature subsampling is repeatable between trials.
        "random_state": 42,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    # Named `classifier` so it does not shadow the notebook's `model` function.
    classifier = LGBMClassifier(**params)
    classifier.fit(X_train_smote, y_train_smote)
    predictions = classifier.predict(X_val_prep)
    return f1_score(y_val_prep, predictions)

In [0]:
# # study_lgbm = optuna.create_study(direction='minimize')
# # study_lgbm.optimize(objective, n_trials=30)
# print('Melhores hiperparâmetros:', study_lgbm.best_params)
# print('Melhor RMSE:', study_lgbm.best_value)

In [0]:
# TPE is optuna's default sampler; seeding it makes the search reproducible.
study_lgbm2 = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgbm2.optimize(objective, n_trials=30)
print('Melhores hiperparâmetros:', study_lgbm2.best_params)
# The objective returns an F1 score, so the best value is labelled as such.
print('Melhor F1:', study_lgbm2.best_value)

In [0]:
# LightGBM with hard-coded hyperparameters, trained on the SMOTE-balanced data.
# NOTE(review): the search objective tunes `num_leaves` and never `max_depth`,
# so 123 is presumably a tuned `num_leaves` that was passed as `max_depth` by
# mistake — confirm against the study output.
classifier_lgbm = LGBMClassifier(
    learning_rate=0.014511147170945548,
    num_leaves=123,
    # Bagging must be enabled for `subsample` to have any effect (as in the search).
    bagging_freq=1,
    subsample=0.12900147519059652,
    colsample_bytree=0.4668403391748687,
    min_data_in_leaf=75,
    n_estimators=1000,
)


model(classifier_lgbm,X_train_smote,y_train_smote,X_val_prep,y_val)
model_evaluation(classifier_lgbm,X_val_prep,y_val)


In [0]:
# LightGBM with the best hyperparameters recorded from the Optuna search,
# trained on the SMOTE-balanced data. The recorded result lists
# `num_leaves: 700`, so the value is passed as `num_leaves` (not `max_depth`).
classifier_lgbm = LGBMClassifier(
    learning_rate=0.06078628608330118,
    num_leaves=700,
    # Bagging must be enabled for `subsample` to have any effect (as in the search).
    bagging_freq=1,
    subsample=0.07546964460988906,
    colsample_bytree=0.21716907803240054,
    min_data_in_leaf=80,
    n_estimators=1000,
)


model(classifier_lgbm,X_train_smote,y_train_smote,X_val_prep,y_val)
model_evaluation(classifier_lgbm,X_val_prep,y_val)

In [0]:
# Melhores hiperparâmetros: {'learning_rate': 0.06078628608330118, 'num_leaves': 700, 'subsample': 0.07546964460988906, 'colsample_bytree': 0.21716907803240054, 'min_data_in_leaf': 80}
# Melhor F1 (valor maximizado pelo estudo): 0.6133720930232558