In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
from glob import glob
import warnings
# Install a blanket "ignore" filter so no warnings are shown in the notebook output.
warnings.filterwarnings('ignore')

# Select the compute device: CUDA when a GPU is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Usando dispositivo: {device}")

In [None]:
# Configure the data location and load a small subset of the training data.
train_path = '/home/stargix/Desktop/hackathons/datathon/train/train'

# Collect every parquet part file below train_path. glob() returns matches in
# arbitrary (filesystem-dependent) order, so the list is sorted to make the
# subset selected below identical on every run and every machine.
parquet_files = sorted(glob(os.path.join(train_path, '**/part-*.parquet'), recursive=True))
print(f"Total de archivos parquet: {len(parquet_files)}")

# Fail early with an actionable message instead of letting pd.concat raise
# "No objects to concatenate" further down.
if not parquet_files:
    raise FileNotFoundError(f"No se encontraron archivos parquet en: {train_path}")

# Use only the first 5 files for training (small subset).
sample_files = parquet_files[:5]
print(f"Usando {len(sample_files)} archivos para entrenamiento")

# Load each file, reporting the name of its parent directory and its shape.
dfs = []
for file in sample_files:
    df = pd.read_parquet(file)
    dfs.append(df)
    print(f"Cargado: {os.path.basename(os.path.dirname(file))}, shape: {df.shape}")

# Stack the per-file frames into a single frame with a fresh 0..n-1 index.
data = pd.concat(dfs, ignore_index=True)
print(f"\nDatos combinados - Shape: {data.shape}")
print(f"Columnas: {data.columns.tolist()}")
print(f"\nPrimeras filas:\n{data.head()}")
print(f"\nInfo de tipos:\n{data.dtypes}")
print(f"\nValores faltantes:\n{data.isnull().sum()}")

In [None]:
# Prepare the data for the neural network: split the numeric columns into
# features and target ('revenue' is the expected target column).

# Only numeric columns are used; everything else is ignored.
numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
if len(numeric_cols) < 2:
    # One column is needed for the target and at least one for the features.
    raise ValueError(
        f"Se necesitan al menos dos columnas numéricas (features y target); encontradas: {numeric_cols}"
    )

# Identify the target column.
if 'revenue' in numeric_cols:
    target_col = 'revenue'
    feature_cols = [col for col in numeric_cols if col != 'revenue']
else:
    # Fallback: no 'revenue' column, so the last numeric column is used as the
    # target and the remaining ones as features.
    print("Columnas disponibles:", numeric_cols)
    target_col = numeric_cols[-1]
    feature_cols = numeric_cols[:-1]

print(f"Target: {target_col}")
print(f"Features ({len(feature_cols)}): {feature_cols}")

# Rows without a target cannot be used for supervised training and a single
# NaN target turns the MSE loss (and then every weight) into NaN, so they are
# dropped here. When no target is missing this keeps every row.
labeled = data[data[target_col].notna()]
n_dropped = len(data) - len(labeled)
if n_dropped:
    print(f"Filas descartadas por target faltante: {n_dropped}")
if labeled.empty:
    raise ValueError(f"No hay filas con target '{target_col}' disponible")

# Build X (features) and y (target) as float32 arrays.
X = labeled[feature_cols].values.astype(np.float32)
y = labeled[target_col].values.astype(np.float32)

print(f"\nShape de X: {X.shape}")
print(f"Shape de y: {y.shape}")
# Label the range with the column actually used as target (it is not
# necessarily 'revenue' when the fallback branch ran).
print(f"Rango de {target_col}: [{y.min():.2f}, {y.max():.2f}]")

In [None]:
# Split into train and test BEFORE fitting the scaler: fitting on the full
# dataset would leak test-set statistics (mean/std) into training and make the
# test metrics optimistic. The split only depends on the number of rows and
# random_state, so the same rows end up in each partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features with statistics learned from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw).astype(np.float32)
X_test = scaler.transform(X_test_raw).astype(np.float32)

# StandardScaler disregards NaNs when fitting and keeps them when transforming.
# A NaN reaching the network makes the loss NaN, so missing values are set to 0,
# which in standardized space is imputation with the training-split mean.
X_train[np.isnan(X_train)] = 0.0
X_test[np.isnan(X_test)] = 0.0

print(f"Shape de entrenamiento: {X_train.shape}, {y_train.shape}")
print(f"Shape de prueba: {X_test.shape}, {y_test.shape}")

# Convert to PyTorch tensors on the selected device; targets become column
# vectors so they match the (batch, 1) output of the model.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1).to(device)

# Mini-batch loader over the training tensors, reshuffled every epoch.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

print(f"\nTensores creados en dispositivo: {device}")

In [None]:
# Neural network architecture (PyTorch)
class RevenueNN(nn.Module):
    """Fully connected regressor: input_dim -> 64 -> 32 -> 16 -> 1.

    ReLU follows each of the first three linear layers, and dropout (p=0.2)
    follows the first two activations. The last layer is linear and produces
    one value per input row.
    """

    def __init__(self, input_dim):
        """Create the layers; `input_dim` is the number of input features."""
        super().__init__()
        # Hidden block 1
        self.fc1 = nn.Linear(input_dim, 64)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)

        # Hidden block 2
        self.fc2 = nn.Linear(64, 32)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)

        # Hidden block 3 (no dropout)
        self.fc3 = nn.Linear(32, 16)
        self.relu3 = nn.ReLU()

        # Output layer
        self.fc4 = nn.Linear(16, 1)

    def forward(self, x):
        """Run `x` through the three hidden blocks and the linear output layer."""
        hidden = self.dropout1(self.relu1(self.fc1(x)))
        hidden = self.dropout2(self.relu2(self.fc2(hidden)))
        hidden = self.relu3(self.fc3(hidden))
        return self.fc4(hidden)

# Seed PyTorch's global RNG before the model is built so that weight
# initialization, and the shuffling/dropout that later draw from the same RNG,
# are repeatable from one run of the notebook to the next on the same setup.
torch.manual_seed(42)

# Build the model with one input per feature column and move it to the device.
input_dim = X_train.shape[1]
model = RevenueNN(input_dim).to(device)

# Loss function (mean squared error) and optimizer (Adam, learning rate 1e-3).
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print("Arquitectura del modelo:")
print(model)
print(f"\nTotal de parámetros: {sum(p.numel() for p in model.parameters())}")

In [None]:
# Train the model
num_epochs = 50
train_losses = []  # mean training batch loss, one entry per epoch
val_losses = []    # loss on the held-out tensors, one entry per epoch

for epoch in range(num_epochs):
    # Training mode (dropout active) for the optimisation pass.
    model.train()
    batch_losses = []
    for X_batch, y_batch in train_loader:
        # Forward pass
        predictions = model(X_batch)
        loss = criterion(predictions, y_batch)

        # Backward pass: clear old gradients, backpropagate, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Validation: evaluation mode (dropout off) and no gradient tracking.
    model.eval()
    with torch.no_grad():
        val_predictions = model(X_test_tensor)
        val_loss = criterion(val_predictions, y_test_tensor)

    # Average of the per-batch losses over the number of batches.
    epoch_loss = sum(batch_losses)
    avg_loss = epoch_loss / len(train_loader)
    train_losses.append(avg_loss)
    val_losses.append(val_loss.item())

    # Report progress every 10 epochs.
    if (epoch + 1) % 10 == 0:
        print(f"Época {epoch + 1}/{num_epochs} - Loss: {avg_loss:.4f} - Val Loss: {val_loss.item():.4f}")

# Leave the model in training mode once the loop is done.
model.train()

print("\nEntrenamiento completado!")

In [None]:
# Evaluate on the test data
model.eval()
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)
    test_loss = criterion(y_pred_tensor, y_test_tensor)

# Scalar metrics: MSE from the criterion, RMSE as its square root, and MAE as
# the mean absolute difference between predictions and targets.
test_mse = test_loss.item()
test_rmse = np.sqrt(test_mse)
absolute_errors = torch.abs(y_pred_tensor - y_test_tensor)
test_mae = absolute_errors.mean().item()

# NumPy copy of the predictions for reporting.
y_pred = y_pred_tensor.cpu().numpy()

print(f"Resultados en datos de prueba:")
print(f"Pérdida (MSE): {test_mse:.4f}")
print(f"MAE: {test_mae:.4f}")
print(f"RMSE: {test_rmse:.4f}")

# Show the first 10 predictions next to the true values and their absolute error
print(f"\nPrimeras 10 predicciones vs valores reales:")
real_head = y_test[:10]
predicted_head = y_pred[:10].flatten()
comparison = pd.DataFrame({
    'Real': real_head,
    'Predicho': predicted_head,
    'Error': np.abs(real_head - predicted_head)
})
print(comparison)

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: loss history (left) and predicted vs. real (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
loss_ax, scatter_ax = axes

# Left panel: training and validation loss per epoch
loss_curves = (
    (train_losses, 'Pérdida de Entrenamiento'),
    (val_losses, 'Pérdida de Validación'),
)
for curve, curve_label in loss_curves:
    loss_ax.plot(curve, label=curve_label)
loss_ax.set_xlabel('Época')
loss_ax.set_ylabel('Pérdida (MSE)')
loss_ax.set_title('Histórico de Pérdida')
loss_ax.legend()
loss_ax.grid(True)

# Right panel: predictions against true values, with the y = x reference line
# drawn across the range of the true values
scatter_ax.scatter(y_test, y_pred, alpha=0.5, s=20)
diagonal = [y_test.min(), y_test.max()]
scatter_ax.plot(diagonal, diagonal, 'r--', lw=2)
scatter_ax.set_xlabel('Valores Reales')
scatter_ax.set_ylabel('Predicciones')
scatter_ax.set_title('Predicciones vs Valores Reales')
scatter_ax.grid(True)

plt.tight_layout()
plt.show()

print("Gráficos de entrenamiento generados")

In [None]:
# Function to make predictions on new data
def predecir_revenue(ids, features_dict):
    """
    Predict the target for a batch of new rows with the trained model.

    Relies on the notebook-level objects `feature_cols`, `scaler`, `model` and
    `device` created by the earlier cells.

    ids: sequence of identifiers, one per row of features.
    features_dict: mapping of column name -> sequence of RAW (unscaled) values.
        It must contain every column in `feature_cols`; extra columns are
        ignored. The scaler is applied here, so already-standardized values
        must not be passed.

    Returns a DataFrame with the columns 'id' and 'revenue_predicho'.

    Raises:
        KeyError: if a column of `feature_cols` is missing from features_dict.
        ValueError: if `ids` and the features have a different number of rows.
    """
    # Build a frame from the incoming features
    df_nuevo = pd.DataFrame(features_dict)

    # Validate the input up front so the caller gets a precise message.
    missing = [col for col in feature_cols if col not in df_nuevo.columns]
    if missing:
        raise KeyError(f"Faltan features requeridos: {missing}")
    if len(ids) != len(df_nuevo):
        raise ValueError(
            f"ids tiene {len(ids)} elementos pero hay {len(df_nuevo)} filas de features"
        )

    # Nothing to predict: return an empty, correctly shaped result instead of
    # letting the scaler reject a zero-row array.
    if len(df_nuevo) == 0:
        return pd.DataFrame({
            'id': ids,
            'revenue_predicho': np.empty(0, dtype=np.float32)
        })

    # Keep only the training features, in training order
    X_nuevo = df_nuevo[feature_cols].values.astype(np.float32)

    # Standardize with the scaler fitted in the training cells
    X_nuevo_scaled = scaler.transform(X_nuevo).astype(np.float32)

    # StandardScaler keeps NaNs on transform, and a NaN input yields a NaN
    # prediction. 0 in standardized space is the mean the scaler learned, so
    # this is mean imputation of missing feature values.
    X_nuevo_scaled[np.isnan(X_nuevo_scaled)] = 0.0

    # Convert to a tensor on the model's device
    X_nuevo_tensor = torch.FloatTensor(X_nuevo_scaled).to(device)

    # Predict in evaluation mode (dropout off) without tracking gradients, then
    # restore whatever mode the model was in so calling this function does not
    # silently change the state of a model that is still being trained.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            predicciones = model(X_nuevo_tensor)
    finally:
        model.train(was_training)

    # Bring the result back to NumPy
    predicciones_np = predicciones.cpu().numpy()

    # Assemble the result, one row per id
    resultado = pd.DataFrame({
        'id': ids,
        'revenue_predicho': predicciones_np.flatten()
    })

    return resultado

# Example prediction on a few rows of the test split.
# X_test holds STANDARDIZED features, while predecir_revenue expects raw values
# and applies the scaler itself. Passing X_test directly would scale the data
# twice, so the sample is first mapped back to the original feature space.
raw_sample = scaler.inverse_transform(X_test[:5])

# One id per sampled row (the test split may contain fewer than 5 rows).
example_ids = list(range(1, len(raw_sample) + 1))
example_features = {col: raw_sample[:, idx] for idx, col in enumerate(feature_cols)}

predicciones_ejemplo = predecir_revenue(example_ids, example_features)
print("Ejemplo de predicciones:")
print(predicciones_ejemplo)

In [None]:
# Persist the trained weights. Only the state_dict is written, so loading it
# back requires building a RevenueNN with the same input_dim first.
trained_weights = model.state_dict()
torch.save(trained_weights, 'revenue_model.pth')
print("Modelo guardado como 'revenue_model.pth'")