In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [25]:
df = pd.read_csv('Data/video_games_sales_completo.csv')

In [26]:
print(df.columns)

Index(['Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Ventas_Clase',
       'Nota PEGI', 'User Score', 'User Ratings Count', 'Platforms Info',
       'Estado_Consola', 'Price', 'Price_Platform', 'Year_Consola',
       'Play Time', 'Modo Juego'],
      dtype='object')


In [27]:
# Definir variables #
X = df.drop(columns=['Ventas_Clase', 'Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Name', 'User Ratings Count', 'Platforms Info'])  # Variables independientes
y = df['Ventas_Clase']                                  # Variable dependiente

# Identificar variables categóricas y numéricas #
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [28]:
# Preprocesamiento #
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),   # Imputar numéricos
    ('scaler', StandardScaler())                   # Escalar
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Imputar categóricos
    ('onehot', OneHotEncoder(handle_unknown='ignore'))     # Codificar
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols)
])

In [29]:
# Crear pipeline final #
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [30]:
# Separar en train/test #
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Entrenar el modelo #
clf.fit(X_train, y_train)

In [32]:
# Predecir y evaluar #
y_pred = clf.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

        Alta       0.36      0.28      0.32       662
        Baja       0.63      0.71      0.67      1295
  Gran Éxito       0.46      0.29      0.35       315
       Media       0.38      0.42      0.40       980

    accuracy                           0.49      3252
   macro avg       0.46      0.42      0.43      3252
weighted avg       0.48      0.49      0.48      3252

Confusion Matrix:
 [[187 131  62 282]
 [ 68 920  18 289]
 [102  31  91  91]
 [158 385  29 408]]


In [33]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy del modelo: {accuracy:.2f}")

Accuracy del modelo: 0.49


In [34]:

le = LabelEncoder()
y_numeric = le.fit_transform(df['Ventas_Clase'])

# Calcular correlación con variables numéricas
correlaciones = df.select_dtypes(include=['int64', 'float64']).corrwith(pd.Series(y_numeric, name='Ventas_Clase'))
print(correlaciones.sort_values(ascending=False))

JP_Sales              0.027646
Year_Consola          0.027208
Global_Sales          0.025641
NA_Sales              0.023559
EU_Sales              0.018640
Other_Sales           0.014730
User Ratings Count    0.010742
Price                 0.008457
Play Time            -0.004940
Price_Platform       -0.005677
User Score           -0.026655
Nota PEGI            -0.027492
Year                 -0.031406
dtype: float64
