## Preprocesamiento

In [5]:
import pandas as pd
from PIL import Image
import numpy as np
import joblib

In [17]:
dataset = pd.read_csv('data/data.csv')

In [4]:
numeric_features = ['extent']
categorical_features = ['growth_stage', 'damage', 'season']

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

transformers_encoder_onehot = [
    ('encoder', OneHotEncoder(), categorical_features)
]
column_transformer = ColumnTransformer(transformers_encoder_onehot, remainder='passthrough')

In [6]:
dataset_encoded = column_transformer.fit_transform(dataset)
dataset = pd.DataFrame(dataset_encoded, columns=column_transformer.get_feature_names_out())

In [7]:
new_column_names = [col.replace('encoder__', '').replace('remainder__', '') for col in column_transformer.get_feature_names_out()]
dataset.columns = new_column_names

In [8]:
dataset = dataset.drop(columns=['ID'])

In [9]:
dataset.head()

Unnamed: 0,growth_stage_F,growth_stage_M,growth_stage_S,growth_stage_V,damage_DR,damage_DS,damage_FD,damage_G,damage_ND,damage_PS,damage_WD,damage_WN,season_LR2020,season_LR2021,season_SR2020,season_SR2021,filename,extent
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,L427F01330C01S03961Rp02052.jpg,0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,L1083F00930C39S12674Ip.jpg,0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,24_initial_1_1463_1463.JPG,0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,L341F00167C01S00324Rp14178.jpg,60
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,L1084F02394C39S13931Ip.jpg,0


In [10]:
def resizeImg(path, size):
    img = Image.open(path)

    width, height = img.size
    aspectRatio = width / height

    if aspectRatio < 1:
        nHeight = size
        nWidth = int(size * aspectRatio)
    else:
        nWidth = size
        nHeight = int(size / aspectRatio)

    resizedImg = img.resize((nWidth, nHeight), Image.LANCZOS)

    newImg = Image.new("RGB", (size, size), (255, 255, 255))

    xOffset = (size - nWidth) // 2
    yOffset = (size - nHeight) // 2
    newImg.paste(resizedImg, (xOffset, yOffset))

    return newImg

In [11]:
import os
from tqdm import tqdm

# Ruta donde se encuentran las imágenes
image_path = "data/train/"
image_size = 128
output_file = "resized_images.npy"

def load_and_resize_images(df, path, size):
    images = []
    for filename in tqdm(df['filename']):
        img = resizeImg(os.path.join(path, filename), size)
        img_array = np.array(img) / 255.0
        images.append(img_array)
    return np.array(images)

# Verificar si ya existe el archivo .npy
if not os.path.exists(output_file):
    images = load_and_resize_images(dataset, image_path, image_size)
    np.save(output_file, images)  # Guardar el archivo .npy
else:
    images = np.load(output_file)  # Cargar el archivo .npy


100%|██████████| 26068/26068 [16:31<00:00, 26.29it/s] 


## CNN

In [12]:
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model

# Cargar el modelo preentrenado VGG16
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(image_size, image_size, 3))
# Extraer las características de la última capa convolucional
model = Model(inputs=base_model.input, outputs=base_model.layers[-1].output)

# Extraer características
features = model.predict(images)
# Aplanar las características para usarlas en Random Forest
features_flat = features.reshape(len(features), -1)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 0us/step
[1m815/815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1581s[0m 2s/step


In [14]:
# Convertir las características a un DataFrame y unir con el dataset original
features_df = pd.DataFrame(features_flat)
df_features = pd.concat([dataset.reset_index(drop=True), features_df], axis=1)
df_features.drop(columns=["filename"], inplace=True)  # Retirar la columna 'filename' ya que ahora tenemos las features

In [26]:
# Guardar el DataFrame completo con las características y etiquetas
df_features.to_csv("CNN_RandomForest/final_dataset.csv", index=False)

In [3]:
# Cargar el DataFrame en caso de que se haya guardado previamente
df_features = pd.read_csv("CNN_RandomForest/final_dataset.csv")

## Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Dividir datos
X = df_features.drop(columns=["extent"])
y = df_features["extent"]

# Convertir nombres de columnas a tipo string
X.columns = X.columns.astype(str)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el modelo Random Forest
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [None]:
# Grid search

from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'max_features': ['auto', 'sqrt']
}
model_rf = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(model_rf, param_grid, n_iter=10, cv=3, random_state=42)
random_search.fit(X_train, y_train)

In [11]:

print(f"Best parameters: {random_search.best_params_}")

Mejores parámetros: {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'max_features': 'sqrt'}


In [17]:
from sklearn.metrics import mean_squared_error, r2_score

# Evaluar el modelo
y_pred = model_rf.predict(X_test)

# Calcular RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE: {rmse}")

# Calcular R²
r2 = r2_score(y_test, y_pred)
print(f"R²: {r2}")

RMSE: 8.726778820651981
R²: 0.7901191755276311




In [16]:
# Validación cruzada

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

# Usar RMSE como métrica de evaluación en la validación cruzada
rmse_scorer = make_scorer(mean_squared_error, squared=False)

# Aplicar validación cruzada
cv_scores = cross_val_score(model_rf, X, y, cv=5, scoring=rmse_scorer)

# Imprimir resultados de validación cruzada
print(f"RMSE para cada fold de validación cruzada: {cv_scores}")
print(f"RMSE promedio: {np.mean(cv_scores)}")
print(f"Desviación estándar de RMSE: {np.std(cv_scores)}")




RMSE para cada fold de validación cruzada: [8.51452233 8.25258047 9.12848175 8.02106844 8.6620312 ]
RMSE promedio: 8.515736838548188
Desviación estándar de RMSE: 0.37704322085867964




In [9]:
# Visualizando el primer registro del conjunto de prueba
single_example = X_test.iloc[0].values.reshape(1, -1)  # Redimensionar para que sea un array de 2D

# Hacer la predicción
single_prediction = model_rf.predict(single_example)

print(f"\nValor real para el ejemplo: {y_test.iloc[0]}")
print(f"Predicción para el ejemplo: {single_prediction[0]}")


Valor real para el ejemplo: 30
Predicción para el ejemplo: 34.0




In [None]:
# Guardar el modelo entrenado
joblib.dump(model_rf, "CNN_RandomForest/random_forest_model.pkl")

In [13]:
# Seleccionar varios ejemplos del conjunto de prueba
num_examples = 100  # Número de ejemplos a observar
examples = X_test.iloc[:num_examples]  # Seleccionar los primeros ejemplos
true_values = y_test.iloc[:num_examples]  # Valores reales correspondientes

# Hacer predicciones para varios ejemplos
predictions = model_rf.predict(examples)

# Mostrar resultados
for i in range(num_examples):
    print(f"\nEjemplo {i+1}:")
    print(f"Valor real: {true_values.iloc[i]}")
    print(f"Predicción: {predictions[i]}")


Ejemplo 1:
Valor real: 30
Predicción: 34.0

Ejemplo 2:
Valor real: 0
Predicción: 0.0

Ejemplo 3:
Valor real: 0
Predicción: 0.0

Ejemplo 4:
Valor real: 0
Predicción: 0.0

Ejemplo 5:
Valor real: 0
Predicción: 0.0

Ejemplo 6:
Valor real: 10
Predicción: 34.1

Ejemplo 7:
Valor real: 0
Predicción: 0.0

Ejemplo 8:
Valor real: 0
Predicción: 0.0

Ejemplo 9:
Valor real: 0
Predicción: 0.0

Ejemplo 10:
Valor real: 0
Predicción: 0.0

Ejemplo 11:
Valor real: 0
Predicción: 0.0

Ejemplo 12:
Valor real: 0
Predicción: 0.0

Ejemplo 13:
Valor real: 0
Predicción: 0.0

Ejemplo 14:
Valor real: 0
Predicción: 0.0

Ejemplo 15:
Valor real: 0
Predicción: 0.0

Ejemplo 16:
Valor real: 0
Predicción: 0.0

Ejemplo 17:
Valor real: 0
Predicción: 0.0

Ejemplo 18:
Valor real: 0
Predicción: 0.0

Ejemplo 19:
Valor real: 0
Predicción: 0.0

Ejemplo 20:
Valor real: 0
Predicción: 0.0

Ejemplo 21:
Valor real: 40
Predicción: 41.2

Ejemplo 22:
Valor real: 0
Predicción: 0.0

Ejemplo 23:
Valor real: 80
Predicción: 47.7

Ejemplo 24: