In [1]:
import pandas as pd
import numpy as np
from surprise import SVD
from surprise import accuracy
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
import pickle

In [2]:
reader = Reader()

In [3]:
df_user_reviews = pd.read_parquet('../../data/user_reviews.parquet', engine='pyarrow')
df_games = pd.read_parquet('../../data/steam_games.parquet', engine='pyarrow')

In [4]:
df_user_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59305 entries, 0 to 59332
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          59305 non-null  object
 1   item_id          59305 non-null  object
 2   helpful          59305 non-null  object
 3   recommend        59305 non-null  bool  
 4   review           59305 non-null  object
 5   sentiment_score  59305 non-null  int64 
 6   posted year      49186 non-null  object
dtypes: bool(1), int64(1), object(5)
memory usage: 3.2+ MB


In [5]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27657 entries, 88310 to 120443
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   genres        27657 non-null  object 
 1   app_name      27657 non-null  object 
 2   release_date  27657 non-null  object 
 3   price         27657 non-null  float32
 4   id            27657 non-null  object 
 5   developer     27657 non-null  object 
dtypes: float32(1), object(5)
memory usage: 1.4+ MB


In [6]:
# Select only the desired columns
df_reviews = df_user_reviews[['user_id', 'item_id', 'sentiment_score', 'recommend']]
df_games = df_games[['id', 'app_name']]

In [7]:
# Merge df_games and df_user_reviews
df = df_reviews.merge(df_games, left_on='item_id', right_on='id', how='inner')

In [8]:
df.drop('id', inplace=True, axis=1)

In [9]:
df.shape

(49699, 5)

In [10]:
# Create a new 'rating' column by adding 'recommend' (multiplied by a factor) and 'sentiment_score'
df['rating'] = df['recommend'] * 2 + df['sentiment_score']

In [11]:
df.head(1)

Unnamed: 0,user_id,item_id,sentiment_score,recommend,app_name,rating
0,76561197970982479,1250,2,True,killing floor,4


In [12]:
df[df["user_id"]== "GamerFag"]

Unnamed: 0,user_id,item_id,sentiment_score,recommend,app_name,rating
5,GamerFag,1250,2,True,killing floor,4
6340,GamerFag,220,1,True,half-life 2,3
6597,GamerFag,440,2,True,team fortress 2,4
26833,GamerFag,70,0,True,half-life,2
30123,GamerFag,420,1,True,half-life 2: episode two,3
30142,GamerFag,380,2,True,half-life 2: episode one,4


In [13]:
df.drop(['item_id', 'sentiment_score', 'recommend'], inplace=True, axis=1)

In [14]:
model = df[['user_id', 'app_name', 'rating']]

In [15]:
model

Unnamed: 0,user_id,app_name,rating
0,76561197970982479,killing floor,4
1,death-hunter,killing floor,4
2,DJKamBer,killing floor,2
3,diego9031,killing floor,3
4,76561198081962345,killing floor,3
...,...,...,...
49694,llDracuwulf,asteria,3
49695,ChrisCoroner,street fighter x tekken,4
49696,MeloncraftLP,the journey down: chapter one,3
49697,MeloncraftLP,the journey down: chapter two,3


In [16]:
Scala = Reader(rating_scale=(0, 5))

In [17]:
data = Dataset.load_from_df(model, Scala)

In [18]:
trainset, testset = train_test_split(data, test_size=0.25)

In [19]:
algorithm = SVD()
trained_model = algorithm.fit(trainset)

In [20]:
trained_model

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fce41942d90>

In [21]:
# Asumiendo que 'model' es tu modelo entrenado
with open("ML_model.pkl", "wb") as model_file:
    pickle.dump(trained_model, model_file)

In [22]:
with open("ML_model.pkl", 'rb') as model_file:
    pickle_model = pickle.load(model_file)

In [24]:
pickle_model

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fce41941d50>

In [25]:
predictions = algorithm.test(testset)

In [27]:
accuracy.rmse(predictions)

RMSE: 0.9429


0.9429085846039366

In [28]:
played_games = df[df["user_id"] == "76561197970982479"]["app_name"].unique()
played_games

array(['killing floor', 'zeno clash'], dtype=object)

In [29]:
games = df["app_name"].unique()

In [30]:
games

array(['killing floor', 'zeno clash', 'euro truck simulator 2', ...,
       'the journey down: chapter one', 'the journey down: chapter two',
       'mystic destinies: serendipity of aeons'], dtype=object)

In [43]:
not_played_games = list(set(games) - set(played_games))

In [32]:
predic_not_played = [pickle_model.predict("76561197970982479", line) for line in not_played_games]

In [34]:
top5_recomm = sorted(predic_not_played, key=lambda x: x.est, reverse=True)[:5]

In [35]:
top5_recomm

[Prediction(uid='76561197970982479', iid='bastion', r_ui=None, est=3.723853738508814, details={'was_impossible': False}),
 Prediction(uid='76561197970982479', iid='gunpoint', r_ui=None, est=3.721030485617725, details={'was_impossible': False}),
 Prediction(uid='76561197970982479', iid='the wolf among us', r_ui=None, est=3.6821367858519802, details={'was_impossible': False}),
 Prediction(uid='76561197970982479', iid='star wars™ knights of the old republic™ ii - the sith lords™', r_ui=None, est=3.6619771895120095, details={'was_impossible': False}),
 Prediction(uid='76561197970982479', iid="king arthur's gold", r_ui=None, est=3.658554023904421, details={'was_impossible': False})]

In [41]:
games_dict = {}
for i, rec in enumerate(top5_recomm, start=1):
    games_dict[f'game {i}'] = rec.iid

In [42]:
games_dict

{'game 1': 'bastion',
 'game 2': 'gunpoint',
 'game 3': 'the wolf among us',
 'game 4': 'star wars™ knights of the old republic™ ii - the sith lords™',
 'game 5': "king arthur's gold"}