In [35]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import joblib

In [36]:
# Cargar los datos
df = pd.read_excel('C:/Users/Pedro/Desktop/DataScience-SoyHenry/AGSM--power bi/BASE PRUEBA BI.xlsx', sheet_name='DATOS')

# --- 1. Preprocesamiento y creación de la variable objetivo ---


In [37]:
#  Estados exitosos
estados_venta_exitosa = [
    "abierto en solicitud de plan de ahorro",
    "abierto en preventa"
]

#  Filtrar y crear target
df_abiertos = df[df["Estado"].str.lower().str.startswith("abierto")].copy()
df_abiertos["Venta_Exitosa"] = df_abiertos["Estado"].str.lower().apply(
    lambda x: 1 if x in estados_venta_exitosa else 0
)

#  Variables derivadas
df_abiertos["Cantidad_Supervisores"] = df_abiertos["Supervisores"].astype(str).apply(lambda x: len(x.split(",")))
df_abiertos["Dia_Semana"] = pd.to_datetime(df_abiertos["Fecha alta"], errors='coerce').dt.dayofweek
df_abiertos["Días"] = pd.to_numeric(df_abiertos["Días"], errors='coerce')

#  Features y target
features = [
    "Días", "Cantidad_Supervisores", "Dia_Semana",
    "Origen", "Tipo de operación", "Rubro", "Categoría", "Tipo", "Tipo de Proceso Comercial"
]
X = df_abiertos[features].copy()
y = df_abiertos["Venta_Exitosa"]

# --- 2. Preprocesamiento y entrenamiento del modelo ---

In [38]:
#  Convertir columnas categóricas a string
for col in X.columns:
    if X[col].dtype == "object" or pd.api.types.is_categorical_dtype(X[col]) or pd.api.types.is_datetime64_any_dtype(X[col]):
        X[col] = X[col].astype(str)

#  Dividir
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

#  Detectar tipos de columnas
numeric_features = ["Días", "Cantidad_Supervisores", "Dia_Semana"]
categorical_features = ["Origen", "Tipo de operación", "Rubro", "Categoría", "Tipo", "Tipo de Proceso Comercial"]

#  Pipelines de transformación
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

#  Pipeline final con SMOTE y modelo
pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("classifier", GradientBoostingClassifier(random_state=42))
])

  if X[col].dtype == "object" or pd.api.types.is_categorical_dtype(X[col]) or pd.api.types.is_datetime64_any_dtype(X[col]):


In [39]:
print(type(clf))



<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>


# --- 3. Seleccionar variables de entrada (features) ---

In [40]:

#  Entrenar
pipeline.fit(X_train, y_train)

#  Predicciones y métricas
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

#  Importancia de variables
preprocessor_fitted = pipeline.named_steps["preprocessor"]
numeric_feature_names = preprocessor_fitted.transformers_[0][2]
categorical_encoder = preprocessor_fitted.transformers_[1][1].named_steps["onehot"]
categorical_feature_names = categorical_encoder.get_feature_names_out(categorical_features)
feature_names = list(numeric_feature_names) + list(categorical_feature_names)
importances = pipeline.named_steps["classifier"].feature_importances_

importancia_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print("Importancia de características:")
print(importancia_df.head(20))
importancia_df.to_csv("importancia_modelo_con_smote.csv", index=False)

Accuracy: 0.9826086956521739
ROC AUC: 0.99210669569951
              precision    recall  f1-score   support

           0       0.73      0.73      0.73        11
           1       0.99      0.99      0.99       334

    accuracy                           0.98       345
   macro avg       0.86      0.86      0.86       345
weighted avg       0.98      0.98      0.98       345

Importancia de características:
                            Feature    Importance
0                              Días  9.897823e-01
153                       Rubro_nan  5.958729e-03
9               Origen_Landing_Page  3.047721e-03
5              Origen_Facebook_Form  4.771660e-04
1             Cantidad_Supervisores  2.329209e-04
7             Origen_Instagram_Form  1.805547e-04
275  Tipo de Proceso Comercial_Otro  1.409209e-04
274  Tipo de Proceso Comercial_Cero  1.318234e-04
276  Tipo de Proceso Comercial_Plan  2.447065e-05
3               Origen_Bot_Whatsapp  1.034048e-05
21     Tipo de operación_FINANCIADO 

In [41]:
# Guardar para Streamlit
joblib.dump(pipeline, "modelo_con_smote.pkl")
joblib.dump(preprocessor.transform(X_test), "X_test_con_smote.pkl")
joblib.dump(y_test, "y_test_con_smote.pkl")


['y_test_con_smote.pkl']