# Script de Submission Múltiple

Este notebook genera múltiples submissions con diferentes factores de riesgo para optimizar las predicciones.

In [None]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
import re
from sklearn.neighbors import NearestNeighbors
import lightgbm as lgb
import warnings

In [None]:
# Input file names and tunable parameters
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"

# Set this to the 'best_iteration' found in Script 1
N_ESTIMATORS_OPTIMO = 115 # (This is an EXAMPLE value)

# Number of nearest neighbours requested from the k-NN model
N_NEIGHBORS = 5

# "Risk factors": one submission file is written per entry, with the base
# predictions multiplied by the entry's value.
# NOTE(review): the original comment announced 8 factors, but only 3 are defined here.
RISK_FACTORS = {
    'intento_1': 30,
    'intento_2': 20,
    'intento_3': 25,
}

In [None]:
# Funciones auxiliares
# One decimal number: optional sign, digits with optional fraction, optional exponent.
_FLOAT_PATTERN = re.compile(r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?")


def parse_embedding(embedding_str, dim=512):
    """Parse a serialized embedding into a 1-D float array of length ``dim``.

    Args:
        embedding_str: Text holding the numbers of the embedding (for example
            ``"[0.1, -2.5e-03, ...]"``), a missing value (``None``/``NaN``),
            or an already materialised list/tuple/array of numbers.
        dim: Expected number of components. Defaults to 512.

    Returns:
        A ``numpy.ndarray`` with ``dim`` floats. A zero vector is returned
        when the value is missing, cannot be converted, or does not contain
        exactly ``dim`` numbers.
    """
    fallback = np.zeros(dim)

    # Already a sequence of numbers: pd.isna() would return an array here and
    # make the truth test below raise, so handle it up front.
    if isinstance(embedding_str, (list, tuple, np.ndarray)):
        try:
            vec = np.asarray(embedding_str, dtype=float).ravel()
        except (TypeError, ValueError):
            return fallback
        return vec if vec.size == dim else fallback

    if pd.isna(embedding_str):
        return fallback

    # Extract numbers token by token instead of stripping characters: removing
    # every letter would also remove the 'e' of scientific notation
    # ("1e-05" -> "1-05") and turn a valid embedding into the zero fallback.
    values = [float(token) for token in _FLOAT_PATTERN.findall(str(embedding_str))]
    return np.array(values) if len(values) == dim else fallback

def preprocess_features_lgbm(df, categorical_features):
    """Derive date features and fill missing values, modifying ``df`` in place.

    Args:
        df: DataFrame that contains a ``phase_in`` column. The frame is
            mutated and also returned.
        categorical_features: Names of the columns to treat as categorical.

    Returns:
        The same DataFrame with:
        * ``phase_in`` parsed as day-first datetimes (unparseable -> NaT),
        * ``phase_in_month`` and ``phase_in_week`` added (0 where the date is missing),
        * categorical columns filled with ``"Missing"`` and cast to ``category``,
        * remaining numeric columns filled with 0.

    Raises:
        KeyError: If ``df`` has no ``phase_in`` column.
    """
    df['phase_in'] = pd.to_datetime(df['phase_in'], errors='coerce', dayfirst=True)
    df['phase_in_month'] = df['phase_in'].dt.month.fillna(0).astype(int)
    df['phase_in_week'] = df['phase_in'].dt.isocalendar().week.fillna(0).astype('UInt32')

    for col in df.select_dtypes(include=['bool']).columns:
        df[col] = df[col].fillna(False)

    # Computed once: the loop below never changes the dtype of a
    # non-categorical column, so the numeric set stays valid throughout.
    categorical = set(categorical_features)
    numeric = set(df.select_dtypes(include=np.number).columns)

    for col in df.columns:
        if col in categorical:
            df[col] = df[col].fillna("Missing").astype('category')
        elif col in numeric:
            df[col] = df[col].fillna(0)
    return df

## Paso 1: Cargar y limpiar datos

In [None]:
# Read the train and test tables (semicolon-delimited CSV files)
print(f"Paso 1: Cargando {TRAIN_FILE} y {TEST_FILE}...")
train_df, test_df = (pd.read_csv(path, delimiter=';') for path in (TRAIN_FILE, TEST_FILE))
# Keep only the test columns whose name does not match '^Unnamed'
unnamed_mask = test_df.columns.str.contains('^Unnamed')
test_df = test_df.loc[:, ~unnamed_mask]

In [None]:
# Preprocessing: remove every column whose share of missing train values exceeds 40%
print("Paso 1.1: Preprocessing - Eliminando columnas con >40% missings...")

# Fraction of null entries per train column
missing_pct = train_df.isnull().sum() / len(train_df)
cols_to_drop = [col for col, pct in missing_pct.items() if pct > 0.40]
print(f"Columnas eliminadas por >40% missings: {cols_to_drop}")

# The same names are removed from both frames; names a frame lacks are ignored
train_df = train_df.drop(cols_to_drop, axis=1, errors='ignore')
test_df = test_df.drop(cols_to_drop, axis=1, errors='ignore')

# Remember the number of train rows at this point
original_train_len = train_df.shape[0]
print(f"Filas originales train: {original_train_len}")

## Paso 2: Agregar datos de entrenamiento

In [None]:
# Collapse the train rows into a single row per ID
print("Paso 2: Agregando datos de entrenamiento por ID...")
# Output column order of the aggregation; every column defaults to 'first'
_agg_columns = [
    'weekly_demand', 'weekly_sales', 'Production',
    'image_embedding', 'num_stores', 'num_sizes',
    'price', 'life_cycle_length', 'phase_in',
    'aggregated_family', 'family', 'category', 'fabric',
    'color_name', 'length_type', 'silhouette_type',
    'waist_type', 'neck_lapel_type', 'sleeve_length_type',
    'woven_structure', 'knit_structure', 'print_type',
    'archetype', 'moment', 'has_plus_sizes',
]
agg_ops = dict.fromkeys(_agg_columns, 'first')
# Overriding existing keys keeps their position in the dict
agg_ops.update({'weekly_demand': 'sum', 'weekly_sales': 'sum', 'price': 'mean'})
# Only aggregate the columns that are still present in train
agg_ops = {col: op for col, op in agg_ops.items() if col in train_df.columns}

train_agg_df = train_df.groupby('ID').agg(agg_ops).reset_index()
train_agg_df = train_agg_df.rename(columns={'weekly_demand': 'total_demand', 'weekly_sales': 'total_sales'})

# Sell-through ratio; the small constant avoids dividing by an exact zero
sell_through_ratio = train_agg_df['total_sales'] / (train_agg_df['Production'] + 1e-6)
train_agg_df['sell_through'] = sell_through_ratio.fillna(0)
# Flag rows whose sell-through reaches at least 0.98
train_agg_df['is_stockout'] = train_agg_df['sell_through'].ge(0.98).astype(int)

In [None]:
# Drop aggregated rows that have a missing value in any key column
print("Paso 2.1: Eliminando filas con valores faltantes en columnas clave...")
cols_critical = ['Production', 'image_embedding']  # Columns that must not contain NaN
# Count rows of the aggregated frame right before dropping, so the reported
# number of removed rows reflects this step only (original_train_len counts
# the un-aggregated train rows and is not comparable).
rows_before_dropna = len(train_agg_df)
train_agg_df = train_agg_df.dropna(subset=[c for c in cols_critical if c in train_agg_df.columns])
print(f"Filas después de eliminar missings críticos: {len(train_agg_df)} (eliminadas: {rows_before_dropna - len(train_agg_df)})")

## Paso 3: Procesamiento de embeddings y k-NN

In [None]:
# Turn the serialized embeddings into numeric matrices
print("Paso 3: Procesando embeddings y creando features k-NN...")
for _frame in (train_agg_df, test_df):
    # One parsed vector per row, stored next to the raw text
    _frame['embedding_vec'] = _frame['image_embedding'].apply(parse_embedding)
# Stack the per-row vectors into 2-D arrays (one row per item)
X_train_embeddings = np.stack(train_agg_df['embedding_vec'].to_numpy())
X_test_embeddings = np.stack(test_df['embedding_vec'].to_numpy())

In [None]:
# Fit the k-NN index on the train embeddings and look up the neighbours of each test row
knn_model = NearestNeighbors(n_neighbors=N_NEIGHBORS, metric='cosine', n_jobs=-1)
knn_model.fit(X_train_embeddings)
distances, indices = knn_model.kneighbors(X_test_embeddings)

# Neighbours of every TRAIN row, excluding the row itself (leave-one-out).
# Without these, the avg_neighbor_* columns exist only in test; train rows get
# NaN for them when both frames are concatenated, which is later filled with 0,
# so the model would be trained on constant columns.
n_train = len(train_agg_df)
train_neighbor_indices = None
if n_train > 1:
    k_train = min(N_NEIGHBORS + 1, n_train)
    _, train_candidates = knn_model.kneighbors(X_train_embeddings, n_neighbors=k_train)
    is_self = train_candidates == np.arange(n_train)[:, None]
    # If a row did not retrieve itself (e.g. duplicated embeddings), discard
    # its farthest candidate instead so every row keeps k_train - 1 neighbours.
    is_self[~is_self.any(axis=1), -1] = True
    train_neighbor_indices = train_candidates[~is_self].reshape(n_train, k_train - 1)

# train column -> suffix used in the generated 'avg_neighbor_<suffix>' feature
cols_to_fetch = {
    'total_demand': 'total_demand', 'total_sales': 'total_sales', 'Production': 'Production',
    'num_stores': 'num_stores', 'sell_through': 'sell_through', 'is_stockout': 'is_stockout'
}
for col_name_in_train, new_feature_prefix in cols_to_fetch.items():
    if col_name_in_train in train_agg_df.columns:
        # Positional lookup: the k-NN indices refer to row positions of train_agg_df
        source_values = train_agg_df[col_name_in_train].to_numpy()
        test_df[f'avg_neighbor_{new_feature_prefix}'] = np.mean(source_values[indices], axis=1)
        if train_neighbor_indices is not None:
            train_agg_df[f'avg_neighbor_{new_feature_prefix}'] = np.mean(
                source_values[train_neighbor_indices], axis=1
            )

## Paso 4: Pre-procesamiento final

In [None]:
# Prepare a single train+test frame for one-hot encoding
print("Paso 4: Pre-procesamiento final - One-Hot Encoding y Escalado...")
from sklearn.preprocessing import StandardScaler

# Columns that are never used as model features
special_cols = ['image_embedding', 'embedding_vec', 'phase_in', 'phase_out', 'color_rgb', 'total_demand', 'total_sales', 'ID']

# Non-special columns that exist only in train. Keeping them would let the
# model train on values that test rows do not have: after the concat they are
# NaN for test and then filled with 0 by preprocess_features_lgbm.
train_only_cols = [
    col for col in train_agg_df.columns
    if col not in test_df.columns and col not in special_cols
]

# Sorted so the order of the one-hot columns does not depend on set iteration order
all_categorical = sorted(
    col
    for col in set(train_agg_df.select_dtypes(include=['object']).columns)
    | set(test_df.select_dtypes(include=['object']).columns)
    if col not in special_cols and col not in train_only_cols
)

train_shape = len(train_agg_df)
combined_df = pd.concat(
    [
        train_agg_df.drop(['total_demand', 'total_sales'] + train_only_cols, axis=1, errors='ignore').copy(),
        test_df.copy(),
    ],
    ignore_index=True,
)

# Date features and basic missing-value handling
combined_df = preprocess_features_lgbm(combined_df, all_categorical)

In [None]:
# One-hot encode the categorical columns
print(f"Aplicando One-Hot Encoding a {len(all_categorical)} columnas categóricas...")
# Cast the categorical columns that are present to plain strings first
_str_casts = {col: str for col in all_categorical if col in combined_df.columns}
combined_df = combined_df.astype(_str_casts)

# Expand each categorical column into integer dummies, dropping the first level
combined_df = pd.get_dummies(combined_df, columns=all_categorical, drop_first=True, dtype=int)

In [None]:
# Column-name cleanup for LightGBM:
# LightGBM does not accept special JSON characters in feature names, so each
# of the characters below is replaced with an underscore in a single pass.
_forbidden_chars = '[]<>{}"\':, '
_underscore_table = str.maketrans(dict.fromkeys(_forbidden_chars, '_'))
combined_df.columns = combined_df.columns.str.translate(_underscore_table)

In [None]:
# Split the combined frame back into train (first train_shape rows) and test (the rest)
train_processed_df, test_processed_df = (
    combined_df.iloc[:train_shape].copy(),
    combined_df.iloc[train_shape:].copy(),
)
# Re-attach the target by position, ignoring the index of train_agg_df
train_processed_df['total_demand'] = train_agg_df['total_demand'].to_numpy()

In [None]:
# Standardise the numeric features
print("Escalando features numéricas con StandardScaler...")
target = 'total_demand'
features = [col for col in train_processed_df.columns if not (col in special_cols or col == target)]

# Numeric feature columns of the train frame
numeric_cols = list(train_processed_df[features].select_dtypes(include=[np.number]).columns)
# Skip columns with at most two distinct train values (typically the 0/1 one-hot columns)
_distinct_counts = train_processed_df[numeric_cols].nunique()
numeric_cols_to_scale = [c for c in numeric_cols if _distinct_counts[c] > 2]

if len(numeric_cols_to_scale) > 0:
    # Fit on train only, then apply the same transformation to both frames
    scaler = StandardScaler()
    scaler.fit(train_processed_df[numeric_cols_to_scale])
    train_processed_df[numeric_cols_to_scale] = scaler.transform(train_processed_df[numeric_cols_to_scale])
    test_processed_df[numeric_cols_to_scale] = scaler.transform(test_processed_df[numeric_cols_to_scale])
    print(f"Escaladas {len(numeric_cols_to_scale)} columnas numéricas")

## Paso 5: Definir features y target

In [None]:
# Select the model inputs and the target
print("Paso 5: Definiendo features y target...")
target = 'total_demand'
features = [col for col in train_processed_df.columns if not (col in special_cols or col == target)]
# No categorical feature list is needed: the categoricals were one-hot encoded
X_train_final, y_train_final = train_processed_df[features], train_processed_df[target]
# Test uses the same columns, in the same order, as train
X_test_final = test_processed_df[features][X_train_final.columns]
print(f"Usando {len(features)} features (incluyendo one-hot encoded).")

## Paso 6: Entrenar modelo final

In [None]:
# Fit the final LightGBM regressor on the whole training set
print("Paso 6: Entrenando LightGBM final con 100% de datos...")
print(f"Parámetros: objective='regression_l1', n_estimators={N_ESTIMATORS_OPTIMO}")
lgbm_params = dict(
    objective='regression_l1',
    metric='mae',
    n_estimators=N_ESTIMATORS_OPTIMO,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
    verbose=-1,
)
model_final = lgb.LGBMRegressor(**lgbm_params)
# categorical_feature is not passed because the categoricals are already one-hot encoded
model_final.fit(X_train_final, y_train_final)
print("Entrenamiento final completado.")

## Paso 7: Generar submissions con diferentes factores

In [None]:
# Compute the base predictions
print("Paso 7: Generando predicciones base (MAE)...")
# The model is evaluated a single time; every submission reuses this array
_raw_predictions = model_final.predict(X_test_final)
# Negative predictions are replaced with 0
base_predictions = np.where(_raw_predictions < 0, 0, _raw_predictions)

In [None]:
# Write one submission file per risk factor
print(f"¡Generando {len(RISK_FACTORS)} archivos de submission!")
for name, factor in RISK_FACTORS.items():

    # Scale the base predictions by the current factor
    final_predictions = base_predictions * factor

    submission_df = pd.DataFrame({'ID': test_df['ID'], 'Production': final_predictions})
    # Round to the nearest whole number and store as integers
    submission_df = submission_df.assign(Production=submission_df['Production'].round().astype(int))

    filename = "submission_{}_factor_{:.2f}.csv".format(name, factor)
    submission_df.to_csv(filename, index=False)

    print(f"Archivo guardado: {filename}")
    print(submission_df.head())
    print("---")
print(f"{len(RISK_FACTORS)} archivos de submission generados. ¡Súbelos y compara los scores!")