# Implementación 1: Modelo RandomForest

## Importación de librerias y creación de funciones

In [1]:
# Importar las bibliotecas necesarias
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.model_selection import train_test_split
import os
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC  # Para clasificación SVM
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.calibration import CalibratedClassifierCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score,GridSearchCV,ShuffleSplit
from sklearn import tree
from sklearn.metrics import accuracy_score,confusion_matrix,ConfusionMatrixDisplay,recall_score,precision_score,f1_score

In [2]:
def cargar_csv(ruta_csv):
    try:
        df = pd.read_csv(ruta_csv)
        return df
    except FileNotFoundError:
        print(f"El archivo en la ruta {ruta_csv} no se encontró.")
    except pd.errors.EmptyDataError:
        print("El archivo CSV está vacío.")
    except pd.errors.ParserError:
        print("Error al analizar el archivo CSV.")
    except Exception as e:
        print(f"Ocurrió un error: {e}")

In [3]:
df=cargar_csv(os.path.join('..','Data','output','train_data_1.csv'))

## Desarrollo

### Parametrizacion

In [4]:
independent_var_erase=[]
dependent_var='deslizamientos'
independent_var=[x for x in df.columns if x not in independent_var_erase and x not in dependent_var]

In [5]:
X=df[independent_var]
y=df[dependent_var]

# Seleccionar columnas categoricas
VAR_categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]

# Seleccionar columnas numericas
VAR_numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]

# Total de columnas
VAR_cols = VAR_categorical_cols + VAR_numerical_cols

print('Variables categoricas: ',VAR_categorical_cols)
print('Variables numericas: ',VAR_numerical_cols)
print('Variables independientes: ',VAR_cols)

Variables categoricas:  ['zonificacion', 'Nomenclatura_del_Suelo', 'tipo_geologia']
Variables numericas:  ['ELEVACION', 'PENDIENT', '24h', '7d', '10d', '15d', '30d', '60d', '90d']
Variables independientes:  ['zonificacion', 'Nomenclatura_del_Suelo', 'tipo_geologia', 'ELEVACION', 'PENDIENT', '24h', '7d', '10d', '15d', '30d', '60d', '90d']


In [6]:
# Crea un diccionario para los codificadores
label_encoders = {}

for feature in VAR_categorical_cols:
    le = LabelEncoder()
    X[feature] = le.fit_transform(X[feature])
    label_encoders[feature] = le

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [8]:
shuffle_split = ShuffleSplit(test_size=.3, train_size=.7, n_splits=10)

In [9]:
param_grid_rf = {
    'classifier__n_estimators': [100,250,500], 
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
    }

In [10]:
preprocessor_rf = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), VAR_numerical_cols)
    ],
    remainder='passthrough'
)

In [11]:
pipeline_rf= Pipeline([
    ('preprocessor',preprocessor_rf),
    ('classifier', RandomForestClassifier())
])
pipeline_rf

In [12]:
X_train.head()

Unnamed: 0,ELEVACION,PENDIENT,zonificacion,Nomenclatura_del_Suelo,tipo_geologia,24h,7d,10d,15d,30d,60d,90d
899,1054.006714,22.413567,4,25,9,10.0,167.0,188.0,313.0,456.0,536.0,694.0
462,1064.613892,13.493474,1,6,8,0.0,0.0,0.0,0.0,31.6,147.9,380.9
747,1064.599976,60.087574,5,25,7,24.4,114.8,139.0,165.2,262.6,371.2,449.0
651,1029.619263,4.604765,1,25,9,0.0,21.3,21.3,21.3,84.6,417.5,622.5
289,1307.238647,22.076199,5,15,7,1.0,7.0,7.2,24.6,264.2,294.8,309.2


In [13]:
y_train.head()

899    0
462    1
747    0
651    1
289    1
Name: deslizamientos, dtype: int64

In [14]:
grid_search = GridSearchCV(pipeline_rf, param_grid_rf, cv=shuffle_split,scoring='precision')
grid_search.fit(X_train, y_train)
msg_training=f"Rendimiento en training -> RandomForestClassifier \n Puntaje recall en entrenamiento: {grid_search.best_score_}, \n modelo ganador: {grid_search.best_estimator_} \n mejor combinacion de parametros: {grid_search.best_params_}"
print(msg_training)
model_rf=grid_search.best_estimator_

Rendimiento en training -> RandomForestClassifier 
 Puntaje recall en entrenamiento: 0.9354255284147465, 
 modelo ganador: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['ELEVACION', 'PENDIENT',
                                                   '24h', '7d', '10d', '15d',
                                                   '30d', '60d', '90d'])])),
                ('classifier', RandomForestClassifier(n_estimators=500))]) 
 mejor combinacion de parametros: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 500}


In [15]:
y_pred = model_rf.predict(X_valid)
precision = precision_score(y_valid, y_pred)
print(f"Recall en test para RandomForestClassifier: {precision}")

Recall en test para RandomForestClassifier: 0.9444444444444444
