In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 1. Carga de datos
df = pd.read_csv("Data/video_games_sales_completo.csv")

# 2. Cuantiles q90/q60 y target multiclas
q90 = df['Global_Sales'].quantile(0.90)
q60 = df['Global_Sales'].quantile(0.60)

def clasificar_q90_60(v):
    if v >= q90:    return 'Alta'
    elif v >= q60:  return 'Media'
    else:           return 'Baja'

df['Ventas_Clase_Q'] = df['Global_Sales'].apply(clasificar_q90_60)

# 3. Features y target
features = [
    'Platform','Genre','Publisher','Year','Nota PEGI',
    'User Score','Estado_Consola','Price','Play Time',
    'Price_Platform','Year_Consola','Modo Juego', 'Años_desde_lanzamiento_consola', 'Precio_relativo', 'PEGI_categoria','Duracion_juego_cat', 'Nombre_Base', 'Es_Saga', 'Tipo_Saga'
]
X = df[features]
y = df['Ventas_Clase_Q']

# 4. División entrenamiento/prueba
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 5. Preprocesamiento
categorical = ['Platform','Genre','Publisher','Modo Juego','Estado_Consola', 'PEGI_categoria', 'Duracion_juego_cat','Nombre_Base', 'Tipo_Saga']
numerical   = ['Year','Nota PEGI','User Score',
               'Price','Play Time','Price_Platform','Year_Consola', 'Años_desde_lanzamiento_consola', 'Precio_relativo', 'Es_Saga']

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', StandardScaler(), numerical)
])

# 6. Pipeline base
pipeline = Pipeline([
    ('pre', preprocessor),
    ('rf', RandomForestClassifier(random_state=42))
])

# 7. Grid de hiperparámetros
param_grid = {
    'rf__n_estimators': [200, 500, 800],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__max_features': ['sqrt', 'log2', 0.5],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4]
}

# 8. GridSearchCV
grid = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

# 9. Mejores parámetros y evaluación
print("Mejores parámetros:", grid.best_params_)
best_rf = grid.best_estimator_
y_pred = best_rf.predict(X_test)

print("\nClassification Report (q90/q60):")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 324 candidates, totalling 972 fits


KeyboardInterrupt: 