In [39]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

In [40]:
# Load the video-game sales dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='Data/video_games_sales_completo.csv')

In [41]:
# Columns kept out of the feature matrix: the target itself, the sales figures
# (presumably what the target class is derived from -- confirm), and several
# descriptive fields.
excluded_columns = [
    'Ventas_Clase',
    'Global_Sales',
    'NA_Sales',
    'EU_Sales',
    'JP_Sales',
    'Other_Sales',
    'Name',
    'User Ratings Count',
    'Platforms Info',
    'Nota PEGI',
    'Play Time',
]
X = df.drop(columns=excluded_columns)

# Target: the sales class label.
y = df['Ventas_Clase']

In [42]:
# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Partition the feature columns into exactly two groups so that every column
# reaches a transformer. Listing specific dtypes (int64/float64 and
# object/category) leaves any other dtype -- bool, int32/float32, nullable or
# dedicated string dtypes -- in neither list, and a ColumnTransformer with the
# default remainder setting silently drops columns it was not given.
# 'number' matches every numeric dtype (bool is not counted as numeric); all
# remaining columns are treated as categorical.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(exclude='number').columns.tolist()

In [43]:
# Column-wise preprocessing: standardize the numeric features and one-hot
# encode the categorical ones. handle_unknown='ignore' lets the encoder accept
# categories at predict time that were not seen during fit instead of raising.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [44]:
# Chain preprocessing and the classifier in one estimator so the transformers
# are fitted only on whatever data is passed to fit(). The fixed random_state
# keeps the forest reproducible across runs.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
]
rf_pipeline = Pipeline(pipeline_steps)

In [None]:
# Fit the preprocessing steps and the random forest on the training split only.
# Kept as the cell's last expression so the fitted pipeline is displayed.
rf_pipeline.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted pipeline on the held-out split.
y_pred = rf_pipeline.predict(X_test)

# Per-class precision/recall/F1 and the raw confusion counts
# (rows are true classes, columns are predicted classes).
report_text = classification_report(y_test, y_pred)
confusion_counts = confusion_matrix(y_test, y_pred)

print("Random Forest Classification Report:")
print(report_text)
print("Confusion Matrix:")
print(confusion_counts)

Random Forest Classification Report:
              precision    recall  f1-score   support

        Alta       0.63      0.57      0.60      1225
        Baja       0.66      0.71      0.68      1938
       Media       0.51      0.49      0.50      1715

    accuracy                           0.60      4878
   macro avg       0.60      0.59      0.59      4878
weighted avg       0.60      0.60      0.60      4878

Confusion Matrix:
[[ 701  165  359]
 [  99 1382  457]
 [ 316  558  841]]
