In [9]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling2D, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [10]:
# Target (height, width) every image is resized to when it is loaded.
IMG_SIZE = (96, 96)

# Directory that holds one image file per training sample.
IMAGE_DIR = "train_images"

# Columns of the training table that are used as regression targets.
TARGET_COLS = [
    "X4_mean",
    "X11_mean",
    "X18_mean",
    "X26_mean",
    "X50_mean",
    "X3112_mean",
]

In [11]:
# Training table: read from the working directory. Its `id` column is what the
# image loader uses to locate each sample's picture.
train_csv_path = "train.csv"
train_df = pd.read_csv(train_csv_path)

In [12]:
# Tabular features: everything except `id` and the trailing six columns.
# The positional slice alone does not guarantee that the targets are excluded
# (it silently depends on the column order of train.csv), so TARGET_COLS is
# also dropped by name; this is a no-op when the slice already removed them.
X_tab_raw = (
    train_df
    .drop(columns=["id"])
    .iloc[:, :-6]
    .drop(columns=TARGET_COLS, errors="ignore")
)

# Per-feature Tukey fences (1.5 * IQR beyond the quartiles).
Q1 = X_tab_raw.quantile(0.25)
Q3 = X_tab_raw.quantile(0.75)
IQR = Q3 - Q1

# Count, for every row, how many features fall outside their fences.
outlier_mask = (X_tab_raw < (Q1 - 1.5 * IQR)) | (X_tab_raw > (Q3 + 1.5 * IQR))
outlier_count = outlier_mask.sum(axis=1)

# Keep rows with at most 5 outlying features.
mask = outlier_count <= 5

X_tab_raw = X_tab_raw[mask]
# NOTE(review): train_df is overwritten with its filtered version, so running
# this cell a second time filters the already-filtered frame again. Re-run the
# cell that reads train.csv first if this cell has to be re-executed.
train_df = train_df[mask]

y = train_df[TARGET_COLS].values

# Standardise the targets; y_scaler is kept so predictions can be mapped back
# to the original units with y_scaler.inverse_transform.
y_scaler = StandardScaler()
y_scaled = y_scaler.fit_transform(y)

# NOTE(review): both scalers and the PCA are fitted on all rows, before the
# train/validation split, so validation statistics influence the transforms.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_tab_raw)

# random_state pins the result when scikit-learn selects a randomized SVD
# solver; it has no effect with the exact solvers.
pca = PCA(n_components=20, random_state=42)
X_pca = pca.fit_transform(X_scaled)
print(f"Total de varianza explicada por los 20 componentes: {pca.explained_variance_ratio_.sum():.4f}")
print(f"Total de instancias para entrenamiento: {len(train_df)}")

Total de varianza explicada por los 20 componentes: 0.9253
Total de instancias para entrenamiento: 37706


In [13]:
def load_and_process_image(img_id):
    """Load the image belonging to ``img_id`` and preprocess it for MobileNetV2.

    The file is looked up inside ``IMAGE_DIR`` as ``<img_id>.jpg``,
    ``<img_id>.png`` and ``<img_id>.jpeg`` (in that order); the first one that
    exists is used. The image is resized to ``IMG_SIZE``, converted to an
    array and passed through ``preprocess_input``.

    Args:
        img_id: Identifier used as the file stem.

    Returns:
        The preprocessed image array.

    Raises:
        FileNotFoundError: If none of the candidate files exists. The message
            lists every path that was tried instead of only the last one.
    """
    candidates = [
        os.path.join(IMAGE_DIR, f"{img_id}{ext}")
        for ext in (".jpg", ".png", ".jpeg")
    ]
    img_path = next((path for path in candidates if os.path.exists(path)), None)
    if img_path is None:
        raise FileNotFoundError(
            f"No image found for id {img_id!r}; tried: {', '.join(candidates)}"
        )

    img = load_img(img_path, target_size=IMG_SIZE)
    return preprocess_input(img_to_array(img))

# Fill a preallocated array image by image. Building a Python list first and
# converting it with np.array(...) keeps two full copies of the image data
# alive at the same time, which doubles peak memory for a dataset this size.
image_ids = train_df['id']
image_array = None
for row, img_id in enumerate(tqdm(image_ids, desc="Cargando imágenes")):
    img = load_and_process_image(img_id)
    if image_array is None:
        # Shape and dtype come from the first image, so the result matches
        # what stacking the individual arrays would produce.
        image_array = np.empty((len(image_ids),) + img.shape, dtype=img.dtype)
    image_array[row] = img

if image_array is None:
    # No ids to load: same value the list-based construction yields.
    image_array = np.array([])

Cargando imágenes: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 37706/37706 [05:02<00:00, 124.50it/s]


In [14]:
def build_model(img_shape, tab_shape, output_dim):
    """Assemble the two-input (image + tabular) regression network.

    Args:
        img_shape: Shape of a single image; used for the image ``Input`` and
            as ``input_shape`` of the MobileNetV2 backbone.
        tab_shape: Number of tabular features per sample.
        output_dim: Number of units of the final ``Dense`` layer (no
            activation).

    Returns:
        An uncompiled ``Model`` whose inputs are ``[image, tabular]``.
    """
    # Image branch: ImageNet-initialised MobileNetV2 without its classifier
    # head, left fully trainable, followed by global average pooling.
    image_in = Input(shape=img_shape)
    backbone = MobileNetV2(include_top=False, weights='imagenet', input_shape=img_shape)
    backbone.trainable = True
    image_features = backbone(image_in)
    image_features = GlobalAveragePooling2D()(image_features)

    # Tabular branch: one hidden layer with dropout.
    tabular_in = Input(shape=(tab_shape,))
    tabular_features = Dense(128, activation='relu')(tabular_in)
    tabular_features = Dropout(0.3)(tabular_features)

    # Fusion head: concatenate both branches, one hidden layer, linear output.
    fused = Concatenate()([image_features, tabular_features])
    fused = Dense(128, activation='relu')(fused)
    fused = Dropout(0.3)(fused)
    prediction = Dense(output_dim)(fused)

    return Model(inputs=[image_in, tabular_in], outputs=prediction)

# The image input shape is derived from IMG_SIZE (plus 3 colour channels)
# rather than repeated as a literal, so the network always matches the size
# the images were resized to when they were loaded.
model = build_model((*IMG_SIZE, 3), X_pca.shape[1], y_scaled.shape[1])
# Very small learning rate because the whole pretrained backbone is trainable.
model.compile(optimizer=Adam(1e-5), loss='mse', metrics=['root_mean_squared_error'])

In [15]:
# Hold out 20 % of the samples for validation. Images, tabular features and
# targets go through a single call so the three arrays are split on the same rows.
split = train_test_split(
    image_array,
    X_pca,
    y_scaled,
    test_size=0.2,
    random_state=42,
)
X_img_train, X_img_val, X_tab_train, X_tab_val, y_train, y_val = split

# Stop after 3 epochs without improvement of the monitored quantity (Keras
# default) and roll the model back to its best weights.
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)

history = model.fit(
    x=[X_img_train, X_tab_train],
    y=y_train,
    validation_data=([X_img_val, X_tab_val], y_val),
    epochs=15,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/15
[1m943/943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 360ms/step - loss: 2.5696 - root_mean_squared_error: 1.5478 - val_loss: 1.4981 - val_root_mean_squared_error: 1.2240
Epoch 2/15
[1m943/943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m331s[0m 352ms/step - loss: 0.9828 - root_mean_squared_error: 0.9792 - val_loss: 1.3865 - val_root_mean_squared_error: 1.1775
Epoch 3/15
[1m943/943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 350ms/step - loss: 0.8326 - root_mean_squared_error: 0.8945 - val_loss: 1.3531 - val_root_mean_squared_error: 1.1632
Epoch 4/15
[1m943/943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m334s[0m 354ms/step - loss: 0.7122 - root_mean_squared_error: 0.8197 - val_loss: 1.3366 - val_root_mean_squared_error: 1.1561
Epoch 5/15
[1m943/943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 344ms/step - loss: 0.6724 - root_mean_squared_error: 0.8123 - val_loss: 1.3279 - val_root_mean_squared_error: 1.1524
Epoch 6/15
[1m943/9

In [None]:
import numpy as np
from sklearn.metrics import r2_score as sklearn_r2

# Validation targets and predictions in the original (unscaled) units.
# No cell visible above defines these two arrays, so on a fresh kernel this
# cell would stop with a NameError; they are derived here from the fitted
# model, the validation split and y_scaler.
# NOTE(review): assumes "real" means "inverse of the StandardScaler fitted on
# the targets" - confirm if they used to be produced some other way.
y_val_real = y_scaler.inverse_transform(y_val)
y_pred_val_real = y_scaler.inverse_transform(
    model.predict([X_img_val, X_tab_val], verbose=0)
)

print("\nR² por variable:")
for i, col in enumerate(TARGET_COLS):
    # Column i of the true values and of the predictions.
    y_true = y_val_real[:, i]
    y_pred = y_pred_val_real[:, i]

    # Score only the samples whose true value lies between the 5th and 95th
    # percentile of that target. The selector has its own name so it does not
    # overwrite the row mask created by the outlier-filtering cell.
    p5, p95 = np.percentile(y_true, [5, 95])
    in_range = (y_true >= p5) & (y_true <= p95)

    # R² on the retained samples.
    r2 = sklearn_r2(y_true[in_range], y_pred[in_range])
    print(f"{col}: {r2:.4f}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Expects y_val_real and y_pred_val_real (2-D arrays, one column per target)
# to be present in the kernel namespace; they are not created in this cell.

# Per-column 5th/95th percentiles of the real values and of the predictions.
real_lo, real_hi = np.percentile(y_val_real, [5, 95], axis=0)
pred_lo, pred_hi = np.percentile(y_pred_val_real, [5, 95], axis=0)

# A sample is kept only if, for every target, both its real value and its
# prediction lie inside the corresponding percentile range.
mask_global = (
    (y_val_real >= real_lo) & (y_val_real <= real_hi)
    & (y_pred_val_real >= pred_lo) & (y_pred_val_real <= pred_hi)
).all(axis=1)

# Boolean indexing already returns new arrays, so no explicit copies are needed.
y_val_real_filt = y_val_real[mask_global]
y_pred_val_real_filt = y_pred_val_real[mask_global]

# One real-vs-predicted scatter per target, with the identity line as reference.
fig, axs = plt.subplots(2, 3, figsize=(15, 8))
for i, ax in enumerate(axs.flat):
    real_i = y_val_real_filt[:, i]
    ax.scatter(real_i, y_pred_val_real_filt[:, i], alpha=0.3)
    if real_i.size:
        # Identity line spanning the range of the real values; skipped when
        # the filter left no points, where min/max would raise.
        lo, hi = real_i.min(), real_i.max()
        ax.plot([lo, hi], [lo, hi], 'r--')
    ax.set_title(f'{TARGET_COLS[i]}')
    ax.set_xlabel('Real')
    ax.set_ylabel('Predicción')
plt.tight_layout()
plt.show()