# In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import numpy as np

# 1. Load the data file
file_path = 'C:/Users/jzuhayle/OneDrive - UC CHRISTUS Chile/Escritorio/INFO PROYECTO/DICIEMBRE 2024/DATA/111.xlsx'
df = pd.read_excel(file_path)

# 2. Prepare the variables
# Take an explicit copy of the feature columns: the encoded values are assigned
# back into X below, and writing into a bare selection of `df` triggers pandas'
# SettingWithCopyWarning (chained-assignment ambiguity). With .copy(), X is an
# independent frame and `df` keeps its original, un-encoded values.
X = df[['IR GRD', 'SERVICIO INGRESO', 'CONVENIO']].copy()
y_gain = df['GANANCIA']
y_stay = df['ESTANCIA']

# 3. Encode the categorical variables
# One encoder per column so each fitted mapping can be persisted and reused on
# its own. Note that `label_encoder_diagnosis` is the encoder for 'IR GRD'.
label_encoder_service = LabelEncoder()
label_encoder_diagnosis = LabelEncoder()
label_encoder_convenio = LabelEncoder()

X['SERVICIO INGRESO'] = label_encoder_service.fit_transform(X['SERVICIO INGRESO'])
X['IR GRD'] = label_encoder_diagnosis.fit_transform(X['IR GRD'])
X['CONVENIO'] = label_encoder_convenio.fit_transform(X['CONVENIO'])

# 4. Scale the input features
# NOTE(review): the scaler is fit on every row before the split below, so its
# mean/variance include the rows that later form the test set. Consider fitting
# on the training rows only if strict train/test isolation is required.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 5. Split the data into training and test sets
# A single call partitions the features and both targets with the same shuffle,
# so X_train/X_test are guaranteed to line up with the gain AND the stay
# targets. (Previously two separate calls overwrote X_train/X_test and relied
# on the shared seed to produce matching partitions.)
(
    X_train, X_test,
    y_gain_train, y_gain_test,
    y_stay_train, y_stay_test,
) = train_test_split(X_scaled, y_gain, y_stay, test_size=0.2, random_state=42)

# Hyper-parameters shared by both forests; only the tree depth differs.
shared_rf_params = {
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "n_estimators": 100,
    "random_state": 42,
}

# 6. Train the tuned model for GANANCIA (gain): shallow trees (depth 3)
rf_gain = RandomForestRegressor(max_depth=3, **shared_rf_params)
rf_gain.fit(X_train, y_gain_train)

# 7. Train the tuned model for ESTANCIA (stay): deeper trees (depth 10)
rf_stay = RandomForestRegressor(max_depth=10, **shared_rf_params)
rf_stay.fit(X_train, y_stay_train)

# 8. Evaluate the models
def evaluate_model(model, X_train, y_train, X_test, y_test, target_name):
    """Score a fitted model on its training and test splits.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Feature matrix the model is scored on as "train".
        y_train: True target values matching ``X_train``.
        X_test: Feature matrix the model is scored on as "test".
        y_test: True target values matching ``X_test``.
        target_name: Label stored under the ``"Target"`` key.

    Returns:
        dict with the keys ``"Target"``, ``"RMSE Train"``, ``"RMSE Test"``,
        ``"R2 Train"`` and ``"R2 Test"``, in that order.
    """
    fitted_values = model.predict(X_train)
    held_out_values = model.predict(X_test)

    def root_mse(actual, predicted):
        # RMSE is the square root of the mean squared error.
        return np.sqrt(mean_squared_error(actual, predicted))

    return {
        "Target": target_name,
        "RMSE Train": root_mse(y_train, fitted_values),
        "RMSE Test": root_mse(y_test, held_out_values),
        "R2 Train": r2_score(y_train, fitted_values),
        "R2 Test": r2_score(y_test, held_out_values),
    }

# Score each model against its own target on the shared train/test split.
gain_metrics, stay_metrics = (
    evaluate_model(fitted_model, X_train, y_tr, X_test, y_te, label)
    for fitted_model, y_tr, y_te, label in (
        (rf_gain, y_gain_train, y_gain_test, "Gain"),
        (rf_stay, y_stay_train, y_stay_test, "Stay"),
    )
)

# Show the metrics, one row per target
metrics_df = pd.DataFrame([gain_metrics, stay_metrics])
print(metrics_df)

# 9. Save the models and the fitted transformers
# (object, destination file) pairs, written in this order.
artifacts_to_save = (
    (rf_gain, "optimized_rf_gain.pkl"),
    (rf_stay, "optimized_rf_stay.pkl"),
    (label_encoder_service, "label_encoder_service.pkl"),
    (label_encoder_diagnosis, "label_encoder_diagnosis.pkl"),
    (label_encoder_convenio, "label_encoder_convenio.pkl"),
    (scaler, "scaler.pkl"),
)
for artifact, artifact_path in artifacts_to_save:
    joblib.dump(artifact, artifact_path)

print("Modelos guardados correctamente.")

