In [1]:
import pandas as pd
import numpy as np
import ast
from enum import Enum
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression


EN ESTE NOTEBOOK PREPARAREMOS NUESTRO MODELO DE PREDICCION 


In [2]:
data = []
with open('steam_games.json') as steam:
    for linea in steam.readlines():
        data.append(ast.literal_eval(linea))

In [3]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,discount_price,reviews_url,specs,price,early_access,id,developer,sentiment,metascore
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.49,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,,
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",,http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL,Mostly Positive,
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",,http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com,Mostly Positive,
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",0.83,http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域,,
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",1.79,http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570,,,


In [4]:
# Para este modelo utilizaremos los campos 'genres', 'metascore', 'price', por ende crearemos un 
# dataframe con estas variables
data_modelo = df.copy()
data_modelo = data_modelo[['genres','metascore', 'price']]
data_modelo.head()

Unnamed: 0,genres,metascore,price
0,"[Action, Casual, Indie, Simulation, Strategy]",,4.99
1,"[Free to Play, Indie, RPG, Strategy]",,Free To Play
2,"[Casual, Free to Play, Indie, Simulation, Sports]",,Free to Play
3,"[Action, Adventure, Casual]",,0.99
4,,,2.99


In [5]:
data_modelo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32135 entries, 0 to 32134
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   genres     28852 non-null  object
 1   metascore  2677 non-null   object
 2   price      30758 non-null  object
dtypes: object(3)
memory usage: 753.3+ KB


In [6]:
#Realizo estos cambios a los campos, que son importantes para realizar nustro modelo ML
data_modelo['price'] = pd.to_numeric(data_modelo['price'], errors='coerce').fillna(0)


In [7]:
data_modelo.dropna(subset=['price'],inplace=True)

In [14]:
data_modelo['metascore'].isnull().sum()

29528

In [13]:
data_modelo['metascore'] = data_modelo['metascore'].replace("NA",pd.NA)

In [15]:
data_modelo['metascore'] = pd.to_numeric(data_modelo['metascore'], errors='coerce')


In [16]:
data_modelo.dropna(subset='metascore', inplace=True)

In [17]:
data_modelo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2607 entries, 28 to 32117
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   genres     2545 non-null   object 
 1   metascore  2607 non-null   float64
 2   price      2607 non-null   float64
dtypes: float64(2), object(1)
memory usage: 81.5+ KB


In [18]:
data_modelo['genres'].explode().unique()



array(['Action', 'Strategy', 'Indie', 'RPG', 'Casual', 'Adventure',
       'Racing', 'Simulation', 'Massively Multiplayer', nan, 'Sports',
       'Free to Play', 'Early Access', 'Video Production'], dtype=object)

In [19]:
#El campo genres lo convertimos a varibles dummies por medio de one-hot-encoding
dummies = data_modelo["genres"].str.join(",").str.get_dummies(sep=",")
#dummies.columns
data_modelo = pd.concat([data_modelo, dummies], axis=1)
data_modelo.head()

Unnamed: 0,genres,metascore,price,Action,Adventure,Casual,Early Access,Free to Play,Indie,Massively Multiplayer,RPG,Racing,Simulation,Sports,Strategy,Video Production
28,[Action],96.0,9.99,1,0,0,0,0,0,0,0,0,0,0,0,0
39,[Strategy],84.0,6.99,0,0,0,0,0,0,0,0,0,0,0,1,0
40,[Strategy],80.0,6.99,0,0,0,0,0,0,0,0,0,0,0,1,0
41,"[Action, Indie, RPG]",76.0,9.99,1,0,0,0,0,1,0,1,0,0,0,0,0
55,[Action],70.0,9.99,1,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
data_modelo.drop(columns='genres', axis=1, inplace=True)
data_modelo.head()

Unnamed: 0,metascore,price,Action,Adventure,Casual,Early Access,Free to Play,Indie,Massively Multiplayer,RPG,Racing,Simulation,Sports,Strategy,Video Production
28,96.0,9.99,1,0,0,0,0,0,0,0,0,0,0,0,0
39,84.0,6.99,0,0,0,0,0,0,0,0,0,0,0,1,0
40,80.0,6.99,0,0,0,0,0,0,0,0,0,0,0,1,0
41,76.0,9.99,1,0,0,0,0,1,0,1,0,0,0,0,0
55,70.0,9.99,1,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
data_modelo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2607 entries, 28 to 32117
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   metascore              2607 non-null   float64
 1   price                  2607 non-null   float64
 2   Action                 2607 non-null   int64  
 3   Adventure              2607 non-null   int64  
 4   Casual                 2607 non-null   int64  
 5   Early Access           2607 non-null   int64  
 6   Free to Play           2607 non-null   int64  
 7   Indie                  2607 non-null   int64  
 8   Massively Multiplayer  2607 non-null   int64  
 9   RPG                    2607 non-null   int64  
 10  Racing                 2607 non-null   int64  
 11  Simulation             2607 non-null   int64  
 12  Sports                 2607 non-null   int64  
 13  Strategy               2607 non-null   int64  
 14  Video Production       2607 non-null   int64  
dtypes: floa

In [22]:
# Bueno ahora iniciremos el modelo, como primera medida asignamos los valore de x e y
X = data_modelo.drop(columns=['price']) 

y = data_modelo['price'] # asignamos  'price' a la variable y



In [23]:
# Instanciamos un objeto de la clase linearregression, para crear nuestro modelo predictivo
modelo_lineal = LinearRegression()

In [24]:
# Es hora de entrenar el modelo y hacer un 'train_test_split', en el cual sepera nuestros datos en dos
# subcojuntos uno de entrenamiento y otro de testeo
X_train,X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)

In [25]:
#Entrenamos el modelo
modelo_lineal.fit(X_train,y_train)

In [26]:
# Predecimos las etiquetas de salida Y en el subset de entrenamiento y en el subset de testeo
#para cada una de las instancias
#y_train_predict = modelo_lineal.predict(X_train)
y_test_predict = modelo_lineal.predict(X_test)

In [27]:
# Aquí observamos cómo el modelo ha aprendido de los datos, a partir de la pendiente y 
#la ordenada al origen que calculó
print('La pendiente es :', modelo_lineal.coef_)
print('La ordenada al origen es:', modelo_lineal.intercept_)

La pendiente es : [  0.15073383   0.88489138   1.57541147  -4.13559368   3.7289906
 -12.33461191  -3.11651994   0.77651777   2.06071118   1.11896903
   2.55595362  10.34082629   0.97753388   0.        ]
La ordenada al origen es: 3.733896301279451


In [28]:
#Evaluaremos el modelo para cuantificar su performence y poder determinar la calidad de sus predicciones
# Mean Squared Error
mean_squared_error = mean_squared_error(y_test,y_test_predict)


In [29]:
mean_squared_error

98.07829594258044

In [30]:
r2 = r2_score(y_test, y_test_predict)
r2

0.1365658153798105

In [31]:
root_mean_squared_erro = mean_squared_error ** 0.5
root_mean_squared_erro

9.903448689349606

In [40]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2085 entries, 123 to 5930
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   metascore              2085 non-null   float64
 1   Action                 2085 non-null   int64  
 2   Adventure              2085 non-null   int64  
 3   Casual                 2085 non-null   int64  
 4   Early Access           2085 non-null   int64  
 5   Free to Play           2085 non-null   int64  
 6   Indie                  2085 non-null   int64  
 7   Massively Multiplayer  2085 non-null   int64  
 8   RPG                    2085 non-null   int64  
 9   Racing                 2085 non-null   int64  
 10  Simulation             2085 non-null   int64  
 11  Sports                 2085 non-null   int64  
 12  Strategy               2085 non-null   int64  
 13  Video Production       2085 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 244.3 KB


: 

In [32]:

class genero(Enum):
    Action = 'Action'
    Adventura = 'Adventure'
    Casual = 'Casual'
    Early_Access = 'Early Access'
    Free_to_Play = "Free to Play"
    Indie = "Indie"
    Massively_Multiplayer = "Massively Multiplayer"
    RPG = "RPG"
    Racing = "Racing"
    Simulation = "Simulation"
    Sports = "Sports"
    Strategy = "Strategy"
    Video_Production = "Video Production"        





In [38]:
resultado = pd.DataFrame([[metascore, *[1 if genre.value == i else 0 for i in genero._member_names_]]],
     columns=[data_modelo['metascore'], *genero._member_names_])
     

NameError: name 'metascore' is not defined

In [35]:
def prediccion(metascore:float=None, genre :genero=None ):

      
    
    if metascore is None or genre is None:
        print('error')

    
    
    resultado = pd.DataFrame([[metascore, *[1 if genre.value == i else 0 for i in genero._member_names_]]],
     columns=['metascore', *genero._member_names_])
     
    if genre == genero.Free_to_Play:
        
        return {"price": 0}
    else:
        price = modelo_lineal.predict(resultado)[0]
    
               
        return {"price": price}
    

In [37]:
prediccion(82, genero.Action)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Adventura
- Early_Access
- Free_to_Play
- Massively_Multiplayer
- Video_Production
Feature names seen at fit time, yet now missing:
- Adventure
- Early Access
- Free to Play
- Massively Multiplayer
- Video Production
