In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [25]:
df = pd.read_csv('Data/video_games_sales_completo.csv')

In [26]:
print(df.columns)

Index(['Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Ventas_Clase',
       'Nota PEGI', 'User Score', 'User Ratings Count', 'Platforms Info',
       'Estado_Consola', 'Price', 'Price_Platform', 'Year_Consola',
       'Play Time', 'Modo Juego'],
      dtype='object')


In [40]:
# Definicion de variables
X = df.drop(columns=['Ventas_Clase', 'Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Name', 'User Ratings Count', 'Platforms Info'])  # Variables independientes
y = df['Ventas_Clase']                                  # Variable dependiente

# Dividir el conjunto de datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Definir las columnas categóricas para One-Hot Encoding
categorical_columns = ['Platform', 'Year', 'Genre', 'Publisher', 'Nota PEGI', 'Estado_Consola', 'Year_Consola', 'Modo Juego']

# Crear un transformador para las columnas categóricas (One-Hot Encoding)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_columns),  # One-Hot Encoding en las columnas categóricas
        ('num', StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns)  # Escalado en columnas numéricas
    ])

# Crear el pipeline que primero realiza la transformación y luego aplica el modelo
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Entrenar el modelo
pipeline.fit(X_train, y_train)

# Hacer predicciones
y_pred = pipeline.predict(X_test)

# Evaluar el modelo
print(f"Exactitud: {accuracy_score(y_test, y_pred)}")
print("Matriz de confusión:")
print(confusion_matrix(y_test, y_pred))
print("Reporte de clasificación:")
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy del modelo: {accuracy:.2f}")

Exactitud: 0.5015375153751538
Matriz de confusión:
[[197 128  63 274]
 [ 71 942  14 268]
 [110  34  94  77]
 [156 396  30 398]]
Reporte de clasificación:
              precision    recall  f1-score   support

        Alta       0.37      0.30      0.33       662
        Baja       0.63      0.73      0.67      1295
  Gran Éxito       0.47      0.30      0.36       315
       Media       0.39      0.41      0.40       980

    accuracy                           0.50      3252
   macro avg       0.46      0.43      0.44      3252
weighted avg       0.49      0.50      0.49      3252

Accuracy del modelo: 0.50


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
le = LabelEncoder()
y_numeric = le.fit_transform(df['Ventas_Clase'])

# Calcular correlación con variables numéricas
correlaciones = df.select_dtypes(include=['int64', 'float64']).corrwith(pd.Series(y_numeric, name='Ventas_Clase'))
print(correlaciones.sort_values(ascending=False))

JP_Sales              0.027646
Year_Consola          0.027208
Global_Sales          0.025641
NA_Sales              0.023559
EU_Sales              0.018640
Other_Sales           0.014730
User Ratings Count    0.010742
Price                 0.008457
Play Time            -0.004940
Price_Platform       -0.005677
User Score           -0.026655
Nota PEGI            -0.027492
Year                 -0.031406
dtype: float64
