# Terra Signal Hackathon
This notebook is provided as a starting point. Feel free to use it, discard it, modify it, or pretend it doesn't exist.

In [0]:
%pip install pandas
%pip install xgboost

In [0]:
import matplotlib.pyplot as plt
import mlflow
import mlflow.pyfunc
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from mlflow.models.signature import infer_signature
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, 
    roc_auc_score, 
    confusion_matrix
)
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Load the raw customer history and preview the first rows with one column per record.
file_path = "./history.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
df.head().T

Análise Exploratória e limpeza dos dados

In [0]:
# Column names, non-null counts and dtypes of the raw frame, before any cleaning.
df.info()

In [0]:
# Print the value frequencies of every raw column to spot odd or dirty categories.
for col, column_values in df.items():
    header = f"\n--- {col} ---"
    print(header)
    print(column_values.value_counts())


In [0]:
# Build a cleaned copy of the raw frame; `df` itself is left untouched.
df_clean = df.copy()
df_clean.columns = df_clean.columns.str.strip()

# Clean the 'tenure' column.

# Replace 'unknown' with a missing value and convert to numeric.
df_clean["tenure"] = df_clean["tenure"].replace("unknown", pd.NA)
df_clean["tenure"] = pd.to_numeric(df_clean["tenure"], errors="coerce")

# Treat a tenure of 0 as missing.
df_clean.loc[df_clean["tenure"] == 0, "tenure"] = pd.NA

# Fill missing values with the median and convert to integer.
df_clean["tenure"] = df_clean["tenure"].fillna(df_clean["tenure"].median())
df_clean["tenure"] = df_clean["tenure"].astype(int)

# Normalize PhoneService: strip whitespace, lower-case, then map yes/no to 1/0.
df_clean["PhoneService"] = (
    df_clean["PhoneService"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({"yes": 1, "no": 0})
    .astype(int)  # <- avoid FutureWarning
)

# Normalize MultipleLines: "No phone service" counts as "No".
# NOTE(review): .astype(int) raises if any value is not exactly "Yes"/"No" after the replace.
df_clean["MultipleLines"] = (
    df_clean["MultipleLines"]
    .replace({"No phone service": "No"})
    .map({"Yes": 1, "No": 0})
    .astype(int)
)

# Normalize the internet add-on columns: "No internet service" counts as "No".
internet_cols = [
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies"
]

for col in internet_cols:
    df_clean[col] = (
        df_clean[col]
        .replace({"No internet service": "No"})
        .map({"Yes": 1, "No": 0})
        .astype(int)
    )

# Normalize the plain Yes/No columns.
for col in ["Partner", "Dependents", "PaperlessBilling"]:
    df_clean[col] = df_clean[col].map({"Yes": 1, "No": 0}).astype(int)

# Convert TotalCharges to numeric and fill unparseable/missing values with the median.
df_clean["TotalCharges"] = pd.to_numeric(df_clean["TotalCharges"], errors="coerce")
df_clean["TotalCharges"] = df_clean["TotalCharges"].fillna(df_clean["TotalCharges"].median())

# Clean the customer feedback text: lower-case and drop every character
# that is not an ASCII letter, digit or space.
df_clean["CustomerFeedback"] = df_clean["CustomerFeedback"].fillna("").astype(str)
df_clean["CustomerFeedback_clean"] = (
    df_clean["CustomerFeedback"]
    .str.lower()
    .str.replace("[^a-zA-Z0-9 ]", "", regex=True)
)

# One-hot encode the remaining categorical columns.
cat_cols = [
    "gender", "InternetService", "Contract", "PaymentMethod"
]

# drop_first=True drops the first level of each column, which becomes the implicit baseline.
df_clean = pd.get_dummies(df_clean, columns=cat_cols, drop_first=True)

# Convert the target to binary (Yes -> 1, No -> 0).
df_clean["Churn"] = df_clean["Churn"].map({"Yes": 1, "No": 0}).astype(int)


In [0]:
# Dtypes and non-null counts of the cleaned frame, to confirm the conversions above.
df_clean.info()


In [0]:
# Preview the first rows of the cleaned frame.
df_clean.head()

In [0]:
# Persist the cleaned frame without the index column.
# NOTE(review): the inference cell later writes "./history_clean.csv" as well, with a
# different one-hot layout (drop_first=False) - confirm which version should be kept.
df_clean.to_csv("history_clean.csv", index=False)

In [0]:
# List the final column names, including the generated one-hot columns.
df_clean.columns.tolist()


In [0]:
# Keep only the three columns wanted in the final table.
df_minimal = df_clean.loc[:, ["customerID", "Churn", "CustomerFeedback_clean"]]

# Hand the pandas frame to Spark and expose it to SQL as a temporary view.
df_spark = spark.createDataFrame(df_minimal)
df_spark.createOrReplaceTempView("tmp_history_clean")


In [0]:
%sql
-- Materialize the temp view tmp_history_clean as workspace.churn.history_clean,
-- replacing the table if it already exists.
CREATE OR REPLACE TABLE workspace.churn.history_clean AS
SELECT customerID, Churn, CustomerFeedback_clean
FROM tmp_history_clean;

In [0]:
# Disabled starter template: everything below is a string literal, so none of it runs.
# It holds the random-prediction baseline and the timestamped submission export.
'''
import datetime


def prediction_function(input_df):
    X = input_df[['customerID']].copy()
    X['prediction'] = np.random.uniform(size=len(X)) >= 0.5
    X['prediction'] = X['prediction'].map({True: 'Yes', False: 'No'})
    return X

test_df = pd.read_csv('inference.csv')
prediction = prediction_function(test_df)
print(prediction.head().transpose())
# Use this code to save the prediction to a csv file for submission:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
prediction.to_csv(f'prediction_<MY_GROUP_NAME>_{timestamp}.csv')
'''

In [0]:
# Class balance of the target: number of customers per churn label.
plt.figure(figsize=(6, 4))
sns.countplot(x="Churn", hue="Churn", data=df_clean, palette="Set2", legend=False)
plt.xticks(ticks=[0, 1], labels=["N√£o churn", "Churn"])
plt.title("Quantidade de Clientes: Churn vs N√£o Churn")
plt.xlabel("")
plt.ylabel("Quantidade")
plt.show()


In [0]:
# Stacked histogram of tenure, split by churn label.
plt.figure(figsize=(10, 5))
sns.histplot(df_clean, x="tenure", hue="Churn", bins=40, multiple="stack", palette="Set2")
plt.title("Distribui√ß√£o de Tenure por Churn")
plt.show()


In [0]:
# MonthlyCharges distribution for churned vs. retained customers.
plt.figure(figsize=(8, 5))
# legend=False avoids a duplicated legend (hue repeats the x variable).
sns.boxplot(x="Churn", y="MonthlyCharges", hue="Churn", data=df_clean, palette="Set2", legend=False)
plt.xticks(ticks=[0, 1], labels=["N√£o churn", "Churn"])
plt.title("MonthlyCharges por Churn")
plt.show()


In [0]:
# TotalCharges distribution for churned vs. retained customers.
plt.figure(figsize=(8, 5))
sns.violinplot(
    x="Churn",
    y="TotalCharges",
    hue="Churn",
    data=df_clean,
    palette="Set2",
    legend=False,
)
plt.title("Total Charges por Churn")
plt.show()


In [0]:
# Bar colours used by the stacked churn plots: [non-churn, churn].
colors = ["#C0C0C0", "#2ECC71"]

# The categorical columns were one-hot encoded with drop_first=True, so each original
# label is rebuilt from its dummy columns; the dropped level is the one whose dummies
# are all zero. np.select evaluates the conditions in order (first match wins), which
# mirrors the former if/else chain but runs column-wise instead of once per row.

# Rebuild Contract.
Contract_series = pd.Series(
    np.select(
        [df_clean["Contract_Two year"] == 1, df_clean["Contract_One year"] == 1],
        ["Two year", "One year"],
        default="Month-to-month",
    ),
    index=df_clean.index,
)

# Rebuild InternetService.
InternetService_series = pd.Series(
    np.select(
        [df_clean["InternetService_Fiber optic"] == 1, df_clean["InternetService_No"] == 1],
        ["Fiber optic", "No internet"],
        default="DSL",
    ),
    index=df_clean.index,
)

# Rebuild PaymentMethod.
PaymentMethod_series = pd.Series(
    np.select(
        [
            df_clean["PaymentMethod_Credit card (automatic)"] == 1,
            df_clean["PaymentMethod_Electronic check"] == 1,
            df_clean["PaymentMethod_Mailed check"] == 1,
        ],
        ["Credit card (automatic)", "Electronic check", "Mailed check"],
        default="Bank transfer (automatic)",
    ),
    index=df_clean.index,
)


In [0]:
def plot_stacked(series, churn, title, xlabel):
    """Draw a stacked bar chart with the churn / non-churn share of each category.

    Parameters
    ----------
    series : pandas.Series
        Categorical values, one per customer; each distinct value becomes a bar.
    churn : pandas.Series
        Churn label aligned with `series` (0 = no churn, 1 = churn).
    title : str
        Figure title.
    xlabel : str
        Label of the x axis.

    The figure is shown with `plt.show()`; nothing is returned.
    """
    # normalize="index" turns each row into proportions that sum to 1.
    ct = pd.crosstab(series, churn, normalize="index")
    # Relabel by value rather than by position: assigning a fixed two-item list raises
    # ValueError when `churn` contains a single class (e.g. on a filtered subset).
    ct = ct.rename(columns={0: "No", 1: "Yes"})
    ct.plot(kind="bar", stacked=True, figsize=(7,4), color=colors)
    plt.title(title)
    plt.ylabel("Propor√ß√£o")
    plt.xlabel(xlabel)
    plt.ylim(0,1)
    plt.legend(title="Churn")
    plt.show()


In [0]:
# Churn composition for each reconstructed categorical feature.
for series, title, xlabel in (
    (Contract_series, "Composi√ß√£o: Churn vs N√£o Churn ‚Äî Contract", "Contract Type"),
    (InternetService_series, "Composi√ß√£o: Churn vs N√£o Churn ‚Äî Internet Service", "Internet Service Type"),
    (PaymentMethod_series, "Composi√ß√£o: Churn vs N√£o Churn ‚Äî Payment Method", "Payment Method"),
):
    plot_stacked(series, df_clean["Churn"], title, xlabel)


In [0]:
# Pearson correlation between all numeric columns, drawn as a heatmap.
corr = df_clean.corr(numeric_only=True)
plt.figure(figsize=(14, 10))
sns.heatmap(corr, cmap="coolwarm", annot=False, linewidths=0.5)
plt.title("Matriz de Correla√ß√£o - Vari√°veis Num√©ricas")
plt.show()


In [0]:
# Churn share by OnlineSecurity. Reuses the plot_stacked helper defined earlier in the
# notebook instead of repeating its body (the figure is 7x4 instead of 6x4).
plot_stacked(
    df_clean["OnlineSecurity"],
    df_clean["Churn"],
    "Composi√ß√£o: Churn vs N√£o Churn ‚Äî OnlineSecurity (0 = No, 1 = Yes)",
    "OnlineSecurity"
)


In [0]:
# Churn share by TechSupport. Reuses the plot_stacked helper defined earlier in the
# notebook instead of repeating its body (the figure is 7x4 instead of 6x4).
plot_stacked(
    df_clean["TechSupport"],
    df_clean["Churn"],
    "Composi√ß√£o: Churn vs N√£o Churn ‚Äî TechSupport (0 = No, 1 = Yes)",
    "TechSupport"
)


In [0]:
# Churn share by StreamingTV. Reuses the plot_stacked helper defined earlier in the
# notebook instead of repeating its body (the figure is 7x4 instead of 6x4).
plot_stacked(
    df_clean["StreamingTV"],
    df_clean["Churn"],
    "Composi√ß√£o: Churn vs N√£o Churn ‚Äî StreamingTV (0 = No, 1 = Yes)",
    "StreamingTV"
)


In [0]:
# Churn share by PaperlessBilling. Reuses the plot_stacked helper defined earlier in the
# notebook instead of repeating its body (the figure is 7x4 instead of 6x4).
plot_stacked(
    df_clean["PaperlessBilling"],
    df_clean["Churn"],
    "Composi√ß√£o: Churn ‚Äî Paperless Billing (0 = No, 1 = Yes)",
    "Paperless Billing"
)


In [0]:
# MonthlyCharges by internet service type.
plt.figure(figsize=(7, 4))
sns.boxplot(y=df_clean["MonthlyCharges"], x=InternetService_series)
plt.xlabel("Internet Service Type")
plt.ylabel("MonthlyCharges")
plt.title("Distribui√ß√£o de MonthlyCharges por Tipo de Internet")
plt.show()


# MonthlyCharges by contract type (tick labels slightly rotated).
plt.figure(figsize=(8, 5))
sns.boxplot(y=df_clean["MonthlyCharges"], x=Contract_series)
plt.xlabel("Contract Type")
plt.ylabel("MonthlyCharges")
plt.title("Distribui√ß√£o de MonthlyCharges por Tipo de Contract")
plt.xticks(rotation=15)
plt.tight_layout()
plt.show()

# MonthlyCharges by payment method (tick labels rotated further to fit the long names).
plt.figure(figsize=(8, 5))
sns.boxplot(y=df_clean["MonthlyCharges"], x=PaymentMethod_series)
plt.xlabel("Payment Method")
plt.ylabel("MonthlyCharges")
plt.title("Distribui√ß√£o de MonthlyCharges por Payment Method")
plt.xticks(rotation=20)
plt.tight_layout()
plt.show()


# Conclusões Completas da Análise Exploratória

## 1. Distribuição geral do churn
O gráfico de contagem mostra que a maioria dos clientes não churnou, com churn representando uma minoria relevante.  
Isso revela um desbalanceamento natural da variável-alvo, mas não impede a análise exploratória.

## 2. Relação entre tenure e churn
O histograma mostra um padrão claro:
- Clientes com tenure muito baixo apresentam alta incidência de churn.
- Conforme o tenure aumenta, a proporção de churn cai drasticamente.
- Clientes com tenure muito alto quase não apresentam churn.

Conclusão: churn é predominantemente concentrado nos clientes que recém entraram na base.

## 3. MonthlyCharges e churn
O boxplot indica:
- Clientes churn possuem MonthlyCharges mais altos.
- A mediana e o intervalo interquartil de churn são superiores aos de não churn.

Conclusão: mensalidades altas aumentam a probabilidade de churn.

## 4. TotalCharges e churn
O violin plot mostra:
- Clientes churn apresentam TotalCharges muito mais baixos.
- Clientes não churn concentram-se em valores altos de TotalCharges.

Conclusão: churn está fortemente associado a clientes com pouco tempo de relacionamento (TotalCharges baixo é um proxy de tenure baixo).

## 5. Tipo de Internet e churn (InternetService reconstruído)
O gráfico de proporção mostra três padrões:
- Fiber optic apresenta a maior proporção de churn.
- DSL tem churn moderado.
- No internet praticamente não tem churn.

Conclusão: o tipo de internet é um driver importante, com fibra óptica associada a maior insatisfação ou maior sensibilidade a preço.

## 6. Método de pagamento e churn
O gráfico de PaymentMethod indica:
- Clientes que utilizam Electronic check possuem proporção significativamente maior de churn.
- Os demais métodos apresentam churn bem menor.

Conclusão: Electronic check é um forte indicador de risco.

## 7. Contrato (Contract reconstruído)
No gráfico reconstruído:
- Month-to-month apresenta claramente a maior proporção de churn.
- One year e Two year exibem churn drasticamente menor.

Conclusão: contrato é um dos fatores de retenção mais relevantes: quanto mais longo, menor o churn.

## 8. OnlineSecurity e churn
O gráfico mostra:
- Clientes sem OnlineSecurity têm churn visivelmente maior.
- Clientes com OnlineSecurity churnam menos.

Conclusão: segurança adicional funciona como um mecanismo de retenção.

## 9. TechSupport e churn
O padrão é semelhante ao anterior:
- Ausência de TechSupport está associada a maior churn.
- Clientes com suporte apresentam taxas menores.

Conclusão: suporte técnico reduz churn, possivelmente por aumentar valor percebido.

## 10. StreamingTV e churn
O gráfico mostra:
- Pequena diferença de churn entre usar ou não usar StreamingTV.
- A variável não apresenta impacto forte.

Conclusão: StreamingTV é um fator secundário e não parece explicar churn de maneira contundente.

## 11. Heatmap de correlação
O heatmap confirma:
- As correlações de Pearson entre as variáveis numéricas e churn são baixas ou nulas.
- Isso é esperado, pois churn é uma variável binária com relações não lineares.

Conclusão: correlação linear não é apropriada para medir relação com churn; os gráficos categóricos capturam muito melhor os padrões.

---

# Síntese Geral dos Principais Fatores de Churn

## Fatores fortes (alta separação nos gráficos)
- Tenure baixo → churn alto.
- MonthlyCharges alto → maior churn.
- TotalCharges baixo → churn alto (clientes novos).
- InternetService = Fiber optic → churn elevado.
- PaymentMethod = Electronic check → churn elevado.
- Contract = Month-to-month → maior churn da base.
- Ausência de OnlineSecurity e TechSupport → aumento do churn.

## Fatores fracos
- StreamingTV (impacto pequeno).
- Outras variáveis numéricas não mostram relação linear significativa.


Escolha de modelo

In [0]:
# Target vector.
y = df_clean["Churn"]

# Feature matrix: everything except the target, the identifier and the free-text columns.
X = df_clean.drop(columns=["customerID", "Churn", "CustomerFeedback", "CustomerFeedback_clean"])

# Select by dtype family instead of exact width. Listing only "int64"/"float64" leaves
# out int32 or uint8 columns (e.g. dummies from older pandas), and the ColumnTransformer
# below drops every column it was not told about.
numeric_cols = X.select_dtypes(include="number").columns.tolist()
boolean_cols = X.select_dtypes(include="bool").columns.tolist()

# Make any column that still falls through visible instead of losing it silently.
unassigned_cols = [c for c in X.columns if c not in numeric_cols and c not in boolean_cols]
if unassigned_cols:
    print("Columns not used by the preprocessing step:", unassigned_cols)


# Scale numeric features; boolean one-hot columns are passed through unchanged.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("bool", "passthrough", boolean_cols)
    ]
)

# Candidate classifiers compared under the same preprocessing and CV folds.
models = {
    "log_reg": LogisticRegression(max_iter=2000, class_weight="balanced"),
    "random_forest": RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        class_weight="balanced",
        random_state=42
    ),
    "xgboost": XGBClassifier(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42
    )
}

# Stratified folds keep the churn ratio stable across splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-model array of fold ROC AUC scores.
results = {}

for name, model in models.items():
    pipe = Pipeline([
        ("preprocess", preprocess),
        ("model", model)
    ])
    
    # cross_val_score fits clones of the pipeline, so `model` itself stays unfitted here.
    scores = cross_val_score(pipe, X, y, cv=cv, scoring="roc_auc")
    results[name] = scores
    print(f"{name} ‚Üí AUC m√©dio = {scores.mean():.4f} | std = {scores.std():.4f}")

# Pick the candidate with the highest mean ROC AUC.
best_model_name = max(results, key=lambda k: results[k].mean())
best_model = models[best_model_name]

print("Melhor modelo:", best_model_name)

Certificação

In [0]:
# Hold out 20% of the data, stratified on the target, to certify the selected model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

pipe_best = Pipeline([
    ("preprocess", preprocess),
    ("model", best_model)
])

pipe_best.fit(X_train, y_train)

# Hard labels for the report/confusion matrix and churn probabilities for the AUC.
# (A stray line-continuation backslash used to follow predict(); it only parsed
# because the next line happened to be blank.)
y_pred = pipe_best.predict(X_test)
y_proba = pipe_best.predict_proba(X_test)[:,1]


print("=== CERTIFICA√á√ÉO DO MODELO ===")
print(classification_report(y_test, y_pred, digits=3))
print("AUC:", roc_auc_score(y_test, y_proba))
print("Matriz de confus√£o:\n", confusion_matrix(y_test, y_pred))

# Baseline that always predicts the majority class, for comparison.
print("=== COMPARA√á√ÉO COM DUMMY BASELINE ===")
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)

y_dummy = dummy.predict(X_test)
y_dummy_prob = dummy.predict_proba(X_test)[:,1]

print("=== Dummy Classifier (Most Frequent) ===")
# zero_division=0 silences the warning for the class the dummy never predicts.
print(classification_report(y_test, y_dummy, zero_division=0))
print("AUC:", roc_auc_score(y_test, y_dummy_prob))
print(confusion_matrix(y_test, y_dummy))

In [0]:
# ROC curve of the selected model on the held-out split.
RocCurveDisplay.from_predictions(y_test, y_proba)
plt.gca().set_title("Curva ROC - Modelo Selecionado")
plt.show()

## Treinamento do modelo final com dataset completo

In [0]:
print("=== TREINANDO MODELO FINAL COM DATASET COMPLETO ===")

# Build the final pipeline from unfitted copies carrying the same parameters as the
# best model. Reusing the `preprocess` / `best_model` objects directly would refit the
# very estimators held by the certified pipeline and silently replace that model.
pipe_final = Pipeline([
    ("preprocess", clone(preprocess)),
    ("model", clone(best_model))
])

# Train on 100% of the labelled data.
pipe_final.fit(X, y)

print(f"Modelo final treinado com {len(X)} amostras")
print("Pronto para infer√™ncia!")

## Registrar modelo no MLflow Model Registry

In [0]:
# Reuse the pipeline already trained in this notebook.
base_model = pipe_final

# Small sample of the feature frame, used as input example and to infer the signature.
input_example = X.head(3)


class ProbWrapper(mlflow.pyfunc.PythonModel):
    """pyfunc wrapper returning the predicted class together with both class probabilities."""

    def load_context(self, context):
        """Bind the in-memory fitted pipeline (`base_model`) to this wrapper."""
        self.model = base_model

    def predict(self, context, model_input):
        """Score `model_input` and return one output row per input row.

        `model_input` is a DataFrame with the same columns the pipeline was fitted on.
        The result is a DataFrame with the columns `prediction` (int),
        `prob_class0` and `prob_class1`.
        """
        probs = self.model.predict_proba(model_input)  # shape (n, 2)
        preds = self.model.predict(model_input)        # shape (n,)
        # Return a DataFrame so the column meaning is explicit when serving.
        out = pd.DataFrame({
            "prediction": preds.astype(int),
            "prob_class0": probs[:, 0],
            "prob_class1": probs[:, 1]
        })
        return out


# Name of the registered model in the workspace (a new name, so nothing is overwritten).
registered_model_name = "workspace_modelo_churn_with_probs"

# Run the wrapper once on the example so the logged signature describes what `predict`
# really returns (a 3-column DataFrame), not the raw predict_proba array.
python_model = ProbWrapper()
python_model.load_context(None)
output_example = python_model.predict(None, input_example)

with mlflow.start_run(run_name="register_pyfunc_probs"):
    mlflow.pyfunc.log_model(
        artifact_path="model_pyfunc_probs",
        python_model=python_model,
        registered_model_name=registered_model_name,
        input_example=input_example,
        signature=infer_signature(input_example, output_example)
    )
    mlflow.log_param("note", "pyfunc wrapper that returns prediction + probabilities")

print("Registrado:", registered_model_name)

## Aplicação do Modelo Classificador ao dataset inference.csv e geração do Prediction.csv

In [0]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# ==============================
# 1) Fun√ß√£o de limpeza
# ==============================
def clean_like_history(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, model-ready copy of a raw customer frame.

    Every step is applied only when the corresponding column is present, so the
    function works for both the labelled history and the unlabelled inference data.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw data as read from CSV. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input where:
        - `tenure` is an integer column (unparseable values filled with the median);
        - `TotalCharges` is numeric (unparseable values filled with the median);
        - the Yes/No service columns are floats 1.0/0.0 (NaN for unrecognised values);
        - `CustomerFeedback_clean` holds the lower-cased, alphanumeric-only feedback;
        - gender/InternetService/Contract/PaymentMethod are one-hot encoded,
          keeping every level (drop_first=False).
    """
    df = df_raw.copy()

    # tenure: anything non-numeric (e.g. 'unknown') -> NaN -> median -> int
    if "tenure" in df.columns:
        tenure = pd.to_numeric(df["tenure"], errors="coerce")
        median_tenure = tenure.median()
        if pd.isna(median_tenure):
            # No parseable value at all: fall back to 0 so the int cast cannot fail.
            median_tenure = 0
        df["tenure"] = tenure.fillna(median_tenure).astype(int)

    # TotalCharges: numeric + median
    if "TotalCharges" in df.columns:
        df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
        median_total = df["TotalCharges"].median()
        df["TotalCharges"] = df["TotalCharges"].fillna(median_total)

    # Yes/No columns -> 1/0. Values are compared stripped and lower-cased, and the
    # "No phone service" / "No internet service" levels count as "No", as in the
    # cleaning applied to the history data earlier in the notebook. Without this
    # they would become NaN and later be imputed.
    yes_no = {
        "yes": 1,
        "no": 0,
        "no phone service": 0,
        "no internet service": 0,
    }
    binary_cols = [
        "PhoneService", "MultipleLines", "OnlineSecurity", "OnlineBackup",
        "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
        "Partner", "Dependents", "PaperlessBilling"
    ]
    for col in binary_cols:
        if col in df.columns:
            normalized = df[col].astype(str).str.strip().str.lower()
            # Unrecognised values (including missing ones) stay NaN, hence the float dtype.
            df[col] = normalized.map(yes_no).astype(float)

    # Cleaned text (when present): lower-case, non-alphanumerics to spaces, collapse spaces.
    if "CustomerFeedback" in df.columns:
        df["CustomerFeedback_clean"] = (
            df["CustomerFeedback"]
            .fillna("")
            .astype(str)  # guards the .str accessor when the column is not textual
            .str.lower()
            .str.replace(r"[^0-9a-zA-Z ]", " ", regex=True)
            .str.replace(r"\s+", " ", regex=True)
            .str.strip()
        )

    # One-hot encode the categorical columns that are present.
    cat_cols = ["gender", "InternetService", "Contract", "PaymentMethod"]
    cat_cols = [c for c in cat_cols if c in df.columns]
    if cat_cols:
        df = pd.get_dummies(df, columns=cat_cols, drop_first=False)

    return df

# ==============================
# 2) Treinar modelo com history.csv
# ==============================
history_raw = pd.read_csv("./history.csv")

history_clean = clean_like_history(history_raw)

# Map Churn to 0/1 while it is still textual (checked by kind, so it also works when
# pandas stores text in a dedicated string dtype instead of object).
if not pd.api.types.is_numeric_dtype(history_clean["Churn"]):
    history_clean["Churn"] = history_clean["Churn"].map({"No": 0, "Yes": 1}).astype(int)

# Optional: save the cleaned history.
history_clean.to_csv("./history_clean.csv", index=False)

# Features: everything except target, id and text columns.
drop_cols = ["Churn", "customerID", "CustomerFeedback", "CustomerFeedback_clean"]
feature_cols = [c for c in history_clean.columns if c not in drop_cols]

# Explicit copy: the imputation below must not write into a slice of history_clean.
X_train = history_clean[feature_cols].copy()
y_train = history_clean["Churn"]

# Training medians, reused to impute the inference data as well.
train_median = X_train.median(numeric_only=True)

# ---------- IMPUTATION ON TRAIN (so fit never sees NaN) ----------
# fillna with a Series fills each column with the value stored under its name;
# whatever is still NaN afterwards becomes 0.
X_train = X_train.fillna(train_median).fillna(0)

# Pipeline: scaler + class-balanced logistic regression.
# NOTE(review): this rebinds `pipe_final`, replacing the pipeline trained earlier in
# the notebook from the selected best model - confirm that is intended.
pipe_final = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000, class_weight="balanced"))
])

pipe_final.fit(X_train, y_train)
print("Modelo treinado. Linhas:", X_train.shape[0], "| Features:", X_train.shape[1])

# ==============================
# 3) Read and clean inference.csv
# ==============================
inference_raw = pd.read_csv("./inference.csv")

inference_clean = clean_like_history(inference_raw)

# Align columns with the training features; dummy columns absent here are filled with 0.
X_inf = inference_clean.reindex(columns=feature_cols, fill_value=0)

# IMPUTATION ON INFERENCE (same training medians, then 0).
X_inf = X_inf.fillna(train_median).fillna(0)

# Optional: save the cleaned inference features.
X_inf.to_csv("./inference_clean.csv", index=False)

# ==============================
# 4) Predict churn with pipe_final
# ==============================
y_pred = pipe_final.predict(X_inf).astype(int)

# ==============================
# 5) Write prediction_grupo2.csv
#    format: ,customerID,prediction
# ==============================
pred_str = np.where(y_pred == 1, "Yes", "No")

prediction_df = pd.DataFrame({
    "customerID": inference_raw["customerID"],
    "prediction": pred_str
})

# The index is written on purpose: the expected format starts with an unnamed index column.
prediction_df.to_csv("./prediction_grupo2.csv")
print("prediction_grupo2.csv gerado no formato esperado.")

## MODELO DE MOTIVOS

In [0]:
# Carregar tabela de motivos (Delta)
reason_df = spark.table("workspace.churn.churn_reason_final").toPandas()

# Visualizar
reason_df.head()


In [0]:
# Start from the cleaned feature frame, with the join key forced to string on both sides.
df_full = df_clean.assign(customerID=df_clean["customerID"].astype(str))
reason_df["customer_id"] = reason_df["customer_id"].astype(str)

# Keep only customers that have a labelled churn reason.
merged = pd.merge(
    df_full,
    reason_df[["customer_id", "churn_category"]],
    how="inner",
    left_on="customerID",
    right_on="customer_id",
)

print("Merged shape antes de filtrar:", merged.shape)

# Drop the rare catch-all class right here.
merged = merged.loc[merged["churn_category"].ne("Other / unclear")].copy()

print("Merged shape DEPOIS de filtrar:", merged.shape)
print(merged["churn_category"].value_counts())
merged.head()


In [0]:
import numpy as np
import pandas as pd

# Work on a copy of the merged frame so the checks cannot alter it.
df_check = merged.copy()

print("=== SHAPE ===")
print(df_check.shape)

print("\n=== TIPOS ===")
print(df_check.dtypes)

print("\n=== COUNT DE NULOS POR COLUNA ===")
print(df_check.isna().sum())

print("\n=== LINHAS COM ALGUM NA ===")
print(df_check.loc[df_check.isna().any(axis=1)].head())

print("\n=== CHECANDO VALORES INF E -INF ===")
num_df = df_check.select_dtypes(include=[np.number])
print(np.isinf(num_df).sum())

print("\n=== CHECANDO STRINGS VAZIAS ===")
obj_cols = df_check.select_dtypes(include=['object']).columns
# True wherever a text cell is empty once surrounding whitespace is removed.
empty_mask = df_check[obj_cols].apply(lambda col: col.astype(str).str.strip().eq(""))
print(empty_mask.sum())

print("\n=== MOSTRAR VALORES ESTRANHOS NAS CATEGORIAS ===")
for col in obj_cols:
    uniques = df_check[col].astype(str).str.strip().unique()
    # Textual stand-ins for "missing" that survive as ordinary strings.
    weird = [u for u in uniques if u in {"", " ", "None", "nan", "NaN", "NULL", "null"}]
    if weird:
        print(f"Coluna {col} tem valores suspeitos -> {weird}")

print("\n=== CHECANDO MISTURA DE TIPOS NAS COLUNAS ===")
for col in df_check.columns:
    types = df_check[col].map(lambda x: type(x).__name__).unique()
    if len(types) > 1:
        print(f"Coluna {col} tem mistura de tipos: {types}")

print("\n=== CONTAGEM DE CLASSES NO TARGET ===")
if "churn_category" in df_check.columns:
    print(df_check["churn_category"].value_counts(dropna=False))

print("\n=== EXEMPLOS DE LINHAS PROBLEM√ÅTICAS ===")
# Rows with a missing value or an empty text cell.
problem_rows = df_check.loc[df_check.isna().any(axis=1) | empty_mask.any(axis=1)]
print(problem_rows.head(20))

print("\n=== FIM DA VERIFICA√á√ÉO ===")


In [0]:
# Drop every row whose churn_category is missing.
merged = merged.dropna(subset=["churn_category"]).copy()

# Make sure the label is a clean, stripped string.
merged["churn_category"] = merged["churn_category"].astype(str).str.strip()

# Rebuild the target and the feature matrix for the reason model.
y_reason = merged["churn_category"]

non_feature_cols = [
    "churn_category",
    "customerID",
    "customer_id",
    "CustomerFeedback",
    "CustomerFeedback_clean",
    "reason_short",
    "reason_long",
]
# errors="ignore": some of these columns may not exist in `merged`.
X_reason = merged.drop(columns=non_feature_cols, errors="ignore")

print("Classes ap√≥s limpeza final:")
print(y_reason.value_counts(dropna=False))


In [0]:
# Select by dtype family: listing exact widths ("int64", "float64") would leave out
# int32/uint8 columns, and a ColumnTransformer drops the columns it is not given.
numeric_cols = X_reason.select_dtypes(include="number").columns.tolist()
bool_cols    = X_reason.select_dtypes(include="bool").columns.tolist()

numeric_cols, bool_cols


In [0]:
# Scale numeric features; boolean one-hot columns are passed through unchanged.
preprocess_reason = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("bool", "passthrough", bool_cols)
    ]
)

# Candidate multi-class classifiers for the churn-reason target.
models_reason = {
    # `multi_class` is no longer passed: it is deprecated in recent scikit-learn and
    # "auto" was its default, so the fitted model is unchanged.
    "log_reg": LogisticRegression(
        max_iter=3000,
        class_weight="balanced"
    ),
    "random_forest": RandomForestClassifier(
        n_estimators=400,
        class_weight="balanced",
        random_state=42
    ),
    "xgboost": XGBClassifier(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="mlogloss",
        random_state=42
    )
}


In [0]:
# -------------- safe XGBoost setup and cross-validated model comparison --------------
import traceback

import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# 1) Integer-encode the target (used for XGBoost only).
le = LabelEncoder()
y_enc = le.fit_transform(y_reason.astype(str).str.strip())
# Code -> class name, for reading the encoded predictions later.
label_map = {code: cls for code, cls in enumerate(le.classes_)}

print("Label encoding feito. classes:", label_map)

# 2) "Safe" XGBoost configuration used in place of the dictionary entry.
safe_xgb = XGBClassifier(
    n_estimators=200,
    eval_metric="mlogloss",
    use_label_encoder=False,
    random_state=42,
    verbosity=0,
)
# To persist it, replace the dictionary entry instead:
# models_reason['xgboost'] = safe_xgb

# 3) Number of folds capped by the size of the smallest class (never below 2).
min_count = int(y_reason.value_counts().min())
if min_count >= 2:
    n_splits = min(5, min_count)
else:
    n_splits = 2
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
print("Usando StratifiedKFold com n_splits =", n_splits)

# 4) Evaluate every candidate: encoded labels for XGBoost, original labels otherwise.
results = {}
errors = {}

for name, model in models_reason.items():
    print(f"\n>>> Testando: {name} (tipo: {type(model)})")
    # The XGBoost entry is swapped for safe_xgb and trained on encoded labels.
    is_xgb = 'xgboost' in name.lower()
    model_to_use = safe_xgb if is_xgb else model
    y_to_use = y_enc if is_xgb else y_reason

    pipe = Pipeline([("preprocess", preprocess_reason), ("model", model_to_use)])
    try:
        # error_score=np.nan turns a failing fold into NaN instead of an exception.
        scores = cross_val_score(pipe, X_reason, y_to_use, cv=cv, scoring="f1_macro", error_score=np.nan)
        mean = float(np.nanmean(scores))
        std = float(np.nanstd(scores))
        results[name] = (mean, std)
        print(f"  OK -> F1_macro = {mean:.4f} ¬± {std:.4f}  (scores: {scores})")
    except Exception as e:
        tb = traceback.format_exc()
        errors[name] = tb
        results[name] = (np.nan, np.nan)
        print(f"  FALHOU -> {type(e).__name__}: {e}")
        print(tb[:1000])

# 5) Summary and selection of the best model among the valid ones.
valid = {k: v for k, v in results.items() if not np.isnan(v[0])}

print("\n=== RESUMO ===")
for k, v in results.items():
    status = "VALID" if k in valid else "INVALID"
    print(k, ":", status, "->", v)

if valid:
    best_name = max(valid, key=lambda k: valid[k][0])
    print("\n>>> MELHOR MODELO:", best_name, "->", valid[best_name])

    # When XGBoost wins, show how the encoded labels map back to class names.
    if 'xgboost' in best_name.lower():
        print("Observa√ß√£o: esse modelo usou labels codificados. Mapeamento label->classe:")
        for k, v in label_map.items():
            print(k, "->", v)
else:
    print("\nNenhum modelo v√°lido. Tracebacks (resumidos):")
    for k, tb in errors.items():
        print(f"\n--- {k} ---\n{tb[:1000]}")
    raise RuntimeError("Nenhum modelo completou com sucesso.")


In [0]:

# Certify the winner of the reason-model comparison (`best_name`). The previous
# version fitted `best_model`, which is the winner of the binary churn comparison.
uses_encoded_labels = "xgboost" in best_name.lower()
# The XGBoost candidate was evaluated as `safe_xgb` on integer-encoded labels.
reason_model = safe_xgb if uses_encoded_labels else models_reason[best_name]

X_train, X_test, y_train, y_test = train_test_split(
    X_reason, y_reason, test_size=0.2, stratify=y_reason, random_state=42
)

pipe_reason = Pipeline([
    ("preprocess", preprocess_reason),
    ("model", reason_model)
])

# XGBoost needs integer class codes; the other models take the string labels directly.
y_fit = le.transform(y_train) if uses_encoded_labels else y_train
pipe_reason.fit(X_train, y_fit)

y_pred = pipe_reason.predict(X_test)
if uses_encoded_labels:
    # Back to class names so the report is comparable with y_test.
    y_pred = le.inverse_transform(y_pred)

print("==== CERTIFICA√á√ÉO DO MODELO DE MOTIVOS ====")
print(classification_report(y_test, y_pred, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))