In [None]:
import pandas as pd

file_name = "final.csv"

df = pd.read_csv(file_name)

df

Unnamed: 0.1,Unnamed: 0,user_id,item_id,recommend,playtime_forever,playtime_2weeks
0,0,76561197970982479,1250,True,10006,0
1,1,76561197970982479,22200,True,271,0
2,2,76561197970982479,43110,True,834,0
3,3,js41637,227300,True,551,0
4,4,js41637,239030,True,349,0
...,...,...,...,...,...,...
40310,40310,Ghoustik,730,True,3969,0
40311,40311,76561198312638244,233270,True,533,0
40312,40312,76561198312638244,130,True,221,0
40313,40313,76561198312638244,70,True,1010,0


In [None]:
df.shape

(40315, 6)

In [None]:
number_of_games = df[['item_id']].nunique()
number_of_users = df[['user_id']].nunique()

print(f'Total number of games:')
print(number_of_games)
print("\n")
print(f'Total number of games:')
print(number_of_users)

Total number of games:
item_id    912
dtype: int64


Total number of games:
user_id    18741
dtype: int64


In [None]:
# check Nan
nan_count = df.isna().sum().sum()
nan_count

0

In [None]:
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Algorithm attempt: Singular Value Decomposition (SVD)**

In [None]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [None]:
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(df[['user_id', 'item_id', 'recommend']], reader)

In [None]:
svd = SVD()

In [None]:
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=20, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 20 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Fold 11 Fold 12 Fold 13 Fold 14 Fold 15 Fold 16 Fold 17 Fold 18 Fold 19 Fold 20 Mean    Std     
RMSE (testset)    0.2894  0.2814  0.2775  0.2815  0.2813  0.2861  0.2842  0.2795  0.2750  0.2691  0.2780  0.2878  0.2884  0.2885  0.2949  0.2943  0.2859  0.2836  0.2746  0.2843  0.2833  0.0064  
MAE (testset)     0.1640  0.1572  0.1580  0.1571  0.1577  0.1585  0.1634  0.1570  0.1521  0.1502  0.1597  0.1648  0.1616  0.1628  0.1639  0.1621  0.1615  0.1606  0.1563  0.1626  0.1596  0.0039  
Fit time          1.94    2.71    4.56    3.39    3.16    4.61    1.14    0.85    0.84    0.84    0.87    0.86    0.86    0.86    0.83    0.85    1.05    3.19    1.93    1.27    1.83    1.27    
Test time         0.05    0.20    0.10    0.08    0.05    0.02    0.01    0.02    0.02    0.01    0.01    0.01    0.01    0.01    0.01    0.01    0.03    0.06    0.0

In [None]:
print(f"Mean RMSE: {results['test_rmse'].mean():.4f}")
print(f"Mean MAE: {results['test_mae'].mean():.4f}")

Mean RMSE: 0.2833
Mean MAE: 0.1596


In [None]:
trainset = data.build_full_trainset()
svd.fit(trainset)

#for specific user and game
user_id = '111222333444555666888'
game_id = '730'

predicted_recommend = svd.predict(user_id, game_id)
print(f"Recommendation value for user {user_id} - game {game_id}: {predicted_recommend.est:.4f}")


Recommendation value for user 111222333444555666888 - game 730: 0.8552


In [None]:
all_game_ids = set(df['item_id'].unique())

#Get rated games (recommended or not)
user_rated_games = set(df.loc[df['user_id'] == user_id, 'item_id'])

#Get unrated games for the user
user_unrated_games = all_game_ids - user_rated_games


In [None]:
recommendations = []
for game_id in user_unrated_games:
  predicted_recommend = svd.predict(user_id, game_id)
  recommendations.append((game_id, predicted_recommend.est))


In [None]:
#Sort the recommendations
recommendations.sort(key=lambda x: x[1], reverse=True)

#Top n recommendations:
top_n = 300
top_n_recommendations = recommendations[:top_n]

print(f"Top {top_n} recommendations for user {user_id}:")
for game_id, predicted_value in top_n_recommendations:
  print(f"Game ID: {game_id}, Predicted recommendation value: {predicted_value:.4f}")


Top 300 recommendations for user 111222333444555666888:
Game ID: 311340, Predicted recommendation value: 1.0000
Game ID: 20530, Predicted recommendation value: 1.0000
Game ID: 204880, Predicted recommendation value: 1.0000
Game ID: 102500, Predicted recommendation value: 1.0000
Game ID: 305260, Predicted recommendation value: 1.0000
Game ID: 444560, Predicted recommendation value: 1.0000
Game ID: 233630, Predicted recommendation value: 1.0000
Game ID: 78000, Predicted recommendation value: 1.0000
Game ID: 200900, Predicted recommendation value: 1.0000
Game ID: 352460, Predicted recommendation value: 1.0000
Game ID: 39120, Predicted recommendation value: 1.0000
Game ID: 240, Predicted recommendation value: 1.0000
Game ID: 2300, Predicted recommendation value: 1.0000
Game ID: 65800, Predicted recommendation value: 1.0000
Game ID: 207170, Predicted recommendation value: 1.0000
Game ID: 420, Predicted recommendation value: 1.0000
Game ID: 238010, Predicted recommendation value: 1.0000
Game

**Method 1 -  threshold:**

In [None]:
threshold = 0.5

print(f"Top {top_n} recommendations for user {user_id}:")
for game_id, predicted_value in top_n_recommendations:
  if predicted_value > threshold:
    print(f"Game ID: {game_id}, Predicted recommendation value: {predicted_value:.4f} - Recommend")
  else:
    print(f"Game ID: {game_id}, Predicted recommendation value: {predicted_value:.4f} - Not recommend")

Top 300 recommendations for user 111222333444555666888:
Game ID: 311340, Predicted recommendation value: 1.0000 - Recommend
Game ID: 20530, Predicted recommendation value: 1.0000 - Recommend
Game ID: 204880, Predicted recommendation value: 1.0000 - Recommend
Game ID: 102500, Predicted recommendation value: 1.0000 - Recommend
Game ID: 305260, Predicted recommendation value: 1.0000 - Recommend
Game ID: 444560, Predicted recommendation value: 1.0000 - Recommend
Game ID: 233630, Predicted recommendation value: 1.0000 - Recommend
Game ID: 78000, Predicted recommendation value: 1.0000 - Recommend
Game ID: 200900, Predicted recommendation value: 1.0000 - Recommend
Game ID: 352460, Predicted recommendation value: 1.0000 - Recommend
Game ID: 39120, Predicted recommendation value: 1.0000 - Recommend
Game ID: 240, Predicted recommendation value: 1.0000 - Recommend
Game ID: 2300, Predicted recommendation value: 1.0000 - Recommend
Game ID: 65800, Predicted recommendation value: 1.0000 - Recommend
G

In [None]:
threshold = 0.85

print(f"Top {top_n} recommendations for user {user_id}:")
for game_id, predicted_value in top_n_recommendations:
  if predicted_value > threshold:
    print(f"Game ID: {game_id}, Predicted recommendation value: {predicted_value:.4f} - Recommend")
  else:
    print(f"Game ID: {game_id}, Predicted recommendation value: {predicted_value:.4f} - Not recommend")

Top 300 recommendations for user 111222333444555666888:
Game ID: 311340, Predicted recommendation value: 1.0000 - Recommend
Game ID: 20530, Predicted recommendation value: 1.0000 - Recommend
Game ID: 204880, Predicted recommendation value: 1.0000 - Recommend
Game ID: 102500, Predicted recommendation value: 1.0000 - Recommend
Game ID: 305260, Predicted recommendation value: 1.0000 - Recommend
Game ID: 444560, Predicted recommendation value: 1.0000 - Recommend
Game ID: 233630, Predicted recommendation value: 1.0000 - Recommend
Game ID: 78000, Predicted recommendation value: 1.0000 - Recommend
Game ID: 200900, Predicted recommendation value: 1.0000 - Recommend
Game ID: 352460, Predicted recommendation value: 1.0000 - Recommend
Game ID: 39120, Predicted recommendation value: 1.0000 - Recommend
Game ID: 240, Predicted recommendation value: 1.0000 - Recommend
Game ID: 2300, Predicted recommendation value: 1.0000 - Recommend
Game ID: 65800, Predicted recommendation value: 1.0000 - Recommend
G

In [None]:
check_user_data = df[df['user_id'] == user_id]

#Count the number of recommended and not recommended games
recommend_count = check_user_data[check_user_data['recommend'] == True].shape[0]
not_recommend_count = check_user_data[check_user_data['recommend'] == False].shape[0]

print(f"User {user_id} has {recommend_count} recommended games and {not_recommend_count} not recommended games.")


User 111222333444555666888 has 24 recommended games and 4 not recommended games.


**Method 2 - threshold for the specific user:**

In [None]:
total_rated_games = recommend_count + not_recommend_count

threshold_for_user = recommend_count / total_rated_games
threshold_for_user

0.8571428571428571

In [None]:
top_n = 500
top_n_recommendations = recommendations[:top_n]

print(f"Top {top_n} recommendations for user {user_id}:")
for game_id, predicted_value in top_n_recommendations:
  if predicted_value > threshold:
    print(f"Game ID: {game_id}, Predicted recommendation value: {predicted_value:.4f} - Recommend")
  else:
    print(f"Game ID: {game_id}, Predicted recommendation value: {predicted_value:.4f} - Not recommend")

Top 500 recommendations for user 111222333444555666888:
Game ID: 311340, Predicted recommendation value: 1.0000 - Recommend
Game ID: 20530, Predicted recommendation value: 1.0000 - Recommend
Game ID: 204880, Predicted recommendation value: 1.0000 - Recommend
Game ID: 102500, Predicted recommendation value: 1.0000 - Recommend
Game ID: 305260, Predicted recommendation value: 1.0000 - Recommend
Game ID: 444560, Predicted recommendation value: 1.0000 - Recommend
Game ID: 233630, Predicted recommendation value: 1.0000 - Recommend
Game ID: 78000, Predicted recommendation value: 1.0000 - Recommend
Game ID: 200900, Predicted recommendation value: 1.0000 - Recommend
Game ID: 352460, Predicted recommendation value: 1.0000 - Recommend
Game ID: 39120, Predicted recommendation value: 1.0000 - Recommend
Game ID: 240, Predicted recommendation value: 1.0000 - Recommend
Game ID: 2300, Predicted recommendation value: 1.0000 - Recommend
Game ID: 65800, Predicted recommendation value: 1.0000 - Recommend
G

**KNN** 

In [None]:
from surprise import Reader, Dataset, KNNBasic

reader = Reader(rating_scale=(0, 1))

data = Dataset.load_from_df(df[['user_id', 'item_id', 'recommend']], reader)

In [None]:
sim_options = {
  'name': 'pearson',  # can use 'cosine' or 'pearson_baseline'
  'user_based': False  # True - user-based | False - item-based
}

knn = KNNBasic(sim_options=sim_options)

cv_results = cross_validate(knn, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

print(f"Mean RMSE: {cv_results['test_rmse'].mean():.4f}")
print(f"Mean MAE: {cv_results['test_mae'].mean():.4f}")


Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.3111  0.3003  0.3053  0.3103  0.311

In [None]:
trainset = data.build_full_trainset()
knn.fit(trainset)

user_id = '111222333444555666888'
top_n = 15

all_game_ids = set(df['item_id'].unique())

#Get rated games (recommended or not)
user_rated_games = set(df.loc[df['user_id'] == user_id, 'item_id'])

#Get unrated games for the user
user_unrated_games = all_game_ids - user_rated_games

recommendations = []
for game_id in user_unrated_games:
  predicted_recommend = knn.predict(user_id, game_id)
  recommendations.append((game_id, predicted_recommend.est))

recommendations.sort(key=lambda x: x[1], reverse=True)

# Get the top_n recommendations
top_n_recommendations = recommendations[:top_n]

print(f"Top {top_n} recommendations for user {user_id}:")
for game_id, predicted_value in top_n_recommendations:
  print(f"Game ID: {game_id}, Predicted recommendation value: {predicted_value:.4f}")


Computing the pearson similarity matrix...
Done computing similarity matrix.
Top 15 recommendations for user 111222333444555666888:
Game ID: 34900, Predicted recommendation value: 1.0000
Game ID: 243870, Predicted recommendation value: 1.0000
Game ID: 295110, Predicted recommendation value: 1.0000
Game ID: 63710, Predicted recommendation value: 1.0000
Game ID: 240, Predicted recommendation value: 1.0000
Game ID: 104700, Predicted recommendation value: 1.0000
Game ID: 225540, Predicted recommendation value: 1.0000
Game ID: 303390, Predicted recommendation value: 1.0000
Game ID: 207140, Predicted recommendation value: 1.0000
Game ID: 102700, Predicted recommendation value: 1.0000
Game ID: 299360, Predicted recommendation value: 1.0000
Game ID: 49520, Predicted recommendation value: 1.0000
Game ID: 24960, Predicted recommendation value: 1.0000
Game ID: 20900, Predicted recommendation value: 1.0000
Game ID: 65980, Predicted recommendation value: 1.0000


# Create the game rating by playtime

create a new column - playtime_hours

In [None]:
df

Unnamed: 0.1,Unnamed: 0,user_id,item_id,recommend,playtime_forever,playtime_2weeks
0,0,76561197970982479,1250,True,10006,0
1,1,76561197970982479,22200,True,271,0
2,2,76561197970982479,43110,True,834,0
3,3,js41637,227300,True,551,0
4,4,js41637,239030,True,349,0
...,...,...,...,...,...,...
40310,40310,Ghoustik,730,True,3969,0
40311,40311,76561198312638244,233270,True,533,0
40312,40312,76561198312638244,130,True,221,0
40313,40313,76561198312638244,70,True,1010,0


In [None]:
time = df['playtime_forever']

time_to_hours = []

for t in time:
  time_to_hours.append(t/60)

In [None]:
df['total_playtime_hours'] = time_to_hours

In [None]:
df

Unnamed: 0.1,Unnamed: 0,user_id,item_id,recommend,playtime_forever,playtime_2weeks,total_playtime_hours
0,0,76561197970982479,1250,True,10006,0,166.766667
1,1,76561197970982479,22200,True,271,0,4.516667
2,2,76561197970982479,43110,True,834,0,13.900000
3,3,js41637,227300,True,551,0,9.183333
4,4,js41637,239030,True,349,0,5.816667
...,...,...,...,...,...,...,...
40310,40310,Ghoustik,730,True,3969,0,66.150000
40311,40311,76561198312638244,233270,True,533,0,8.883333
40312,40312,76561198312638244,130,True,221,0,3.683333
40313,40313,76561198312638244,70,True,1010,0,16.833333


**Create the rating:**

Based on the user's review, if user recommend the game, the value would be 1.5. If not, the value would be 0.5.

Based on the playtime, the average playtime is 168.27484145686054. Devide the cases into:


1.   playtime_hours < 10 => score = 1
2.   10 <= playtime_hours < 100 => score = 2
1.   100 <= playtime_hours < 200 => score = 3
2.   200 <= playtime_hours < 500 => score = 4
1.   500 <= playtime_hours => score = 5








Multiple the recommend value with the playtime, to get the personal rating.

In [None]:
total_time_hour = df['total_playtime_hours'].sum()
avg_playtime = total_time_hour / len(df['total_playtime_hours'])

avg_playtime

168.27484145686054

In [None]:
playtime_hours = df['total_playtime_hours']
score = []

for t in playtime_hours:
  if t <10:
    score.append(1)
  elif 10 <= t < 100:
    score.append(2)
  elif 100 <= t < 200:
    score.append(3)
  elif 200 <= t < 500:
    score.append(4)
  else:
    score.append(5)

In [None]:
recommend = df['recommend']

recommend_ratings = []

for r in recommend:
  if r == True:
    recommend_ratings.append(1.5)
  else:
    recommend_ratings.append(0.5)

In [None]:
set_ratings = [0]*len(recommend_ratings)

for i in range(len(recommend_ratings)):
  set_ratings[i] = recommend_ratings[i]*score[i]

In [None]:
df['ratings'] = set_ratings

In [None]:
df

Unnamed: 0.1,Unnamed: 0,user_id,item_id,recommend,playtime_forever,playtime_2weeks,total_playtime_hours,ratings
0,0,76561197970982479,1250,True,10006,0,166.766667,4.5
1,1,76561197970982479,22200,True,271,0,4.516667,1.5
2,2,76561197970982479,43110,True,834,0,13.900000,3.0
3,3,js41637,227300,True,551,0,9.183333,1.5
4,4,js41637,239030,True,349,0,5.816667,1.5
...,...,...,...,...,...,...,...,...
40310,40310,Ghoustik,730,True,3969,0,66.150000,3.0
40311,40311,76561198312638244,233270,True,533,0,8.883333,1.5
40312,40312,76561198312638244,130,True,221,0,3.683333,1.5
40313,40313,76561198312638244,70,True,1010,0,16.833333,3.0


SVD using generated ratings:

In [None]:
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(df[['user_id', 'item_id', 'ratings']], reader)

In [None]:
svd = SVD()

In [None]:
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=20, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 20 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Fold 11 Fold 12 Fold 13 Fold 14 Fold 15 Fold 16 Fold 17 Fold 18 Fold 19 Fold 20 Mean    Std     
RMSE (testset)    2.9988  3.0170  3.0010  3.0631  3.0754  2.9919  3.0345  3.0023  2.9928  2.9331  2.9561  3.0102  3.0304  2.9938  2.9784  3.0467  2.9945  2.9705  3.0349  3.0175  3.0071  0.0338  
MAE (testset)     2.3487  2.3664  2.3496  2.3944  2.4009  2.3386  2.3606  2.3441  2.3411  2.2575  2.2981  2.3302  2.3728  2.3659  2.3372  2.3758  2.3184  2.2892  2.3663  2.3524  2.3454  0.0339  
Fit time          0.89    0.86    0.85    0.85    0.86    0.87    0.91    1.22    1.22    1.25    0.90    1.02    1.26    1.28    2.35    2.23    1.37    1.24    1.27    0.84    1.18    0.41    
Test time         0.01    0.01    0.01    0.02    0.01    0.01    0.02    0.02    0.02    0.02    0.02    0.02    0.02    0.03    0.04    0.05    0.02    0.02    0.0

In [None]:
print(f"Mean RMSE: {results['test_rmse'].mean():.4f}")
print(f"Mean MAE: {results['test_mae'].mean():.4f}")

Mean RMSE: 3.0071
Mean MAE: 2.3454


In [None]:
trainset = data.build_full_trainset()
svd.fit(trainset)

#for specific user and game
user_id = '111222333444555666888'
game_id = '362890'

predicted_recommend = svd.predict(user_id, game_id)
print(f"Recommendation value for user {user_id} - game {game_id}: {predicted_recommend.est:.4f}")


Recommendation value for user 111222333444555666888 - game 362890: 1.0000
