In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import joblib

In [2]:
df = pd.read_csv('Data/video_games_sales_completo.csv')
X = df.drop(columns=['Ventas_Clase', 'Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Name', 'Nota PEGI','Play Time','Es_Saga'])  
y = df['Ventas_Clase']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])
rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)


In [3]:
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Random Forest Classification Report:
              precision    recall  f1-score   support

        Alta       0.57      0.46      0.51      1506
    Muy Alta       0.66      0.38      0.48       490
      Normal       0.75      0.88      0.81      2882

    accuracy                           0.70      4878
   macro avg       0.66      0.57      0.60      4878
weighted avg       0.69      0.70      0.68      4878

Confusion Matrix:
[[ 690   70  746]
 [ 208  184   98]
 [ 317   24 2541]]


In [4]:
print(df['Ventas_Clase'].value_counts())

Ventas_Clase
Normal      9607
Alta        5020
Muy Alta    1632
Name: count, dtype: int64


In [5]:
print(df.columns.unique())

Index(['Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Ventas_Clase',
       'Nota PEGI', 'User Score', 'User Ratings Count', 'Estado_Consola',
       'Price', 'Price_Platform', 'Year_Consola', 'Play Time', 'Modo Juego',
       'Años_desde_lanzamiento_consola', 'Precio_relativo', 'PEGI_categoria',
       'Duracion_juego_cat', 'Nombre_Base', 'Es_Saga', 'Tipo_Saga',
       'Situacion_Economica'],
      dtype='object')


In [6]:
pipeline_final = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])
pipeline_final.fit(X_train, y_train)
joblib.dump(pipeline_final, 'Modelo/modelo_videojuegos.pkl')
print("Modelo guardado como 'modelo_videojuegos.pkl'")

FileNotFoundError: [Errno 2] No such file or directory: 'Modelo/modelo_videojuegos.pkl'