In [23]:
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [24]:
# Load the raw Steam games dataset. Forward slashes are used so the relative
# path resolves on Windows as well as on POSIX systems (a backslash-separated
# path only works on Windows).
df2 = pd.read_json('../datos/steam_games2.json')

In [25]:
# ------ Cleaning ------
# The cleaned frame is built as a new object instead of aliasing df2 and
# mutating it in place, so the raw data stays intact and this cell can be
# re-run without failing on columns that were already dropped.

# Columns that are not used by the model.
colum = ['publisher','app_name','title','url','release_date','tags','discount_price','reviews_url','specs','id','developer','sentiment','metascore']

df_ml = (
    df2
    # Rows without genres are discarded.
    .dropna(subset=['genres'])
    # Coerce 'price' to numeric; values that cannot be parsed become NaN ...
    .assign(price=lambda frame: pd.to_numeric(frame['price'], errors='coerce'))
    # ... and those rows are discarded as well.
    .dropna(subset=['price'])
    # Keep a single row per game id (must happen before 'id' is dropped).
    .drop_duplicates(subset='id')
    .drop(columns=colum)
)

df_ml

Unnamed: 0,genres,price,early_access
0,"[Action, Casual, Indie, Simulation, Strategy]",4.99,False
3,"[Action, Adventure, Casual]",0.99,False
5,"[Action, Adventure, Simulation]",3.99,False
6,"[Free to Play, Indie, Simulation, Sports]",9.99,False
7,"[Free to Play, Indie, Simulation, Sports]",18.99,False
...,...,...,...
32129,"[Action, Adventure, Casual, Indie]",1.99,False
32130,"[Casual, Indie, Simulation, Strategy]",1.99,False
32131,"[Casual, Indie, Strategy]",4.99,False
32132,"[Indie, Racing, Simulation]",1.99,False


In [26]:
# ------ Machine learning ------

# Note: parsing 'genres' with ast.literal_eval is intentionally not applied
# here; presumably the column already holds Python lists after loading.

# Unnest the genre lists: one row per (game, genre) pair. Each resulting row
# keeps the index label of the game it came from.
df_exploded = df_ml.explode(column='genres')
df_exploded

Unnamed: 0,genres,price,early_access
0,Action,4.99,False
0,Casual,4.99,False
0,Indie,4.99,False
0,Simulation,4.99,False
0,Strategy,4.99,False
...,...,...,...
32132,Indie,1.99,False
32132,Racing,1.99,False
32132,Simulation,1.99,False
32133,Casual,4.99,False


In [28]:
# One-hot encode the genre of every exploded row (one indicator column per genre).
df_dummies = pd.get_dummies(data=df_exploded['genres'])
df_dummies

Unnamed: 0,Accounting,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,Early Access,Education,Free to Play,...,Photo Editing,RPG,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32132,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
32132,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
32132,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
32133,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [29]:
# Attach the 'price' and 'early_access' columns of the exploded frame to the
# one-hot encoded genres (both frames share the same index).
df_dummies = df_dummies.assign(
    price=df_exploded['price'],
    early_access=df_exploded['early_access'],
)
df_dummies

Unnamed: 0,Accounting,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,Early Access,Education,Free to Play,...,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,price,early_access
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,4.99,False
0,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,4.99,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,4.99,False
0,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,4.99,False
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,4.99,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32132,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,1.99,False
32132,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,1.99,False
32132,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,1.99,False
32133,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,4.99,False


In [30]:
# Collapse the exploded rows back to one row per game: group by the index
# label (the original row of each game, not the genre) and add up the
# indicator columns. 'price' and 'early_access' are summed as well at this
# point, so their values here are not meaningful per game.
df_grouped = df_dummies.groupby(level=0).sum()
df_grouped

Unnamed: 0,Accounting,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,Early Access,Education,Free to Play,...,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,price,early_access
0,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,24.95,0
3,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2.97,0
5,0,1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,11.97,0
6,0,0,0,0,0,0,0,0,0,1,...,0,1,0,1,0,0,0,0,39.96,0
7,0,0,0,0,0,0,0,0,0,1,...,0,1,0,1,0,0,0,0,75.96,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32129,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,7.96,0
32130,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,7.96,0
32131,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,14.97,0
32132,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,5.97,0


In [31]:
# Restore the per-game 'price' and 'early_access' values after the groupby-sum,
# which added them up once per genre row (e.g. 4.99 over five genres -> 24.95).
# Every exploded row of a game repeats the same values, so the first entry of
# each index group is the game's own value. The grouping is computed a single
# time, on just the two needed columns, and reused for both assignments.
valores_por_juego = df_exploded.groupby(level=0)[['price', 'early_access']].first()
df_grouped['price'] = valores_por_juego['price']
df_grouped['early_access'] = valores_por_juego['early_access']
df_grouped

Unnamed: 0,Accounting,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,Early Access,Education,Free to Play,...,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,price,early_access
0,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,4.99,False
3,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.99,False
5,0,1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,3.99,False
6,0,0,0,0,0,0,0,0,0,1,...,0,1,0,1,0,0,0,0,9.99,False
7,0,0,0,0,0,0,0,0,0,1,...,0,1,0,1,0,0,0,0,18.99,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32129,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1.99,False
32130,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,1.99,False
32131,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,4.99,False
32132,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,1.99,False


In [32]:
# Rebuild df_ml from the aggregated per-game table, keeping only games with a
# non-zero price that are not flagged 'Free to Play', then discard the
# 'Free to Play' indicator column.
df_ml = (
    df_grouped
    .loc[lambda tabla: (tabla['price'] != 0.00) & (tabla['Free to Play'] != 1)]
    .drop(columns='Free to Play')
)
df_ml

Unnamed: 0,Accounting,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,Early Access,Education,Indie,...,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,price,early_access
0,0,1,0,0,0,1,0,0,0,1,...,0,1,0,0,1,0,0,0,4.99,False
3,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.99,False
5,0,1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,3.99,False
12,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,10.99,False
13,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,3.99,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32129,0,1,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1.99,False
32130,0,0,0,0,0,1,0,0,0,1,...,0,1,0,0,1,0,0,0,1.99,False
32131,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,4.99,False
32132,0,0,0,0,0,0,0,0,0,1,...,1,1,0,0,0,0,0,0,1.99,False


In [35]:
# Target: the price of each game.
y = df_ml['price']
# Features: every remaining column except the 'Early Access' genre indicator
# (presumably redundant with the 'early_access' flag column — TODO confirm).
X = df_ml.drop(columns=['price', 'Early Access'])
X

Unnamed: 0,Accounting,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,Education,Indie,Massively Multiplayer,...,RPG,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,early_access
0,0,1,0,0,0,1,0,0,1,0,...,0,0,1,0,0,1,0,0,0,False
3,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
5,0,1,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,False
12,0,1,1,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,False
13,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32129,0,1,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,False
32130,0,0,0,0,0,1,0,0,1,0,...,0,0,1,0,0,1,0,0,0,False
32131,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,False
32132,0,0,0,0,0,0,0,0,1,0,...,0,1,1,0,0,0,0,0,0,False


In [36]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Degree of the polynomial expansion (1 keeps the features linear).
grado_polinomio = 1

# Polynomial feature transformer. include_bias=False leaves out the constant
# column of ones: the LinearRegression model used afterwards fits its own
# intercept by default, so that column would only duplicate it.
poly = PolynomialFeatures(degree=grado_polinomio, include_bias=False)

In [37]:
# Expand the features: the transformer is fitted on the training split only
# and then applied unchanged to the test split.
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Fit a linear regression on the expanded training features.
modelo_regresion = LinearRegression().fit(X_train_poly, y_train)

# Predict prices for the held-out split.
y_pred = modelo_regresion.predict(X_test_poly)

# Mean Squared Error and its square root (RMSE, in the same units as price).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Display the RMSE.
rmse

11.778746950356984