In [1]:
!pip install scikit-learn pandas



In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [2]:
df = pd.read_csv('dataset/games_cleaned.csv', index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80385 entries, 0 to 80384
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   AppID                80385 non-null  int64  
 1   Name                 80385 non-null  object 
 2   Release date         80385 non-null  object 
 3   Peak CCU             80385 non-null  int64  
 4   Required age         80385 non-null  int64  
 5   Price                80385 non-null  float64
 6   DLC count            80385 non-null  int64  
 7   Supported languages  80385 non-null  object 
 8   Windows              80385 non-null  bool   
 9   Mac                  80385 non-null  bool   
 10  Linux                80385 non-null  bool   
 11  Metacritic score     80385 non-null  int64  
 12  User score           80385 non-null  int64  
 13  Developers           80385 non-null  object 
 14  Publishers           80385 non-null  object 
 15  Categories           80385 non-null  obje

In [3]:
genres = df['Genres'].str.get_dummies(sep=',')
categories = df['Categories'].str.get_dummies(sep=',')
tags = df['Tags'].str.get_dummies(sep=',').add_prefix('T')

In [5]:
feat = genres.join(categories).join(tags)

## KMeans

In [136]:
kmeans = KMeans(n_clusters=750, verbose=1, random_state=42)
df['Cluster'] = kmeans.fit_predict(feat.values)


KeyboardInterrupt



In [98]:
print(sorted(df['Metacritic score'].unique()))

[np.int64(0), np.int64(20), np.int64(23), np.int64(24), np.int64(27), np.int64(29), np.int64(30), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(49), np.int64(50), np.int64(51), np.int64(52), np.int64(53), np.int64(54), np.int64(55), np.int64(56), np.int64(57), np.int64(58), np.int64(59), np.int64(60), np.int64(61), np.int64(62), np.int64(63), np.int64(64), np.int64(65), np.int64(66), np.int64(67), np.int64(68), np.int64(69), np.int64(70), np.int64(71), np.int64(72), np.int64(73), np.int64(74), np.int64(75), np.int64(76), np.int64(77), np.int64(78), np.int64(79), np.int64(80), np.int64(81), np.int64(82), np.int64(83), np.int64(84), np.int64(85), np.int64(86), np.int64(87), np.int64(88), np.int64(89), np.int64(90), np.int64(91), np.int64(92), np.int64(93), np.int64(94), np.int64(95), np.int

In [83]:
df.to_csv('games_clustered.csv')

In [84]:
def recommend_games_by_name(game_name, df):
    if game_name not in df['Name'].values:
        return f"Game '{game_name}' not found in the dataset."
    cluster_label = df[df['Name'] == game_name]['Cluster'].values[0]
    if cluster_label == -1:
        return f"Game '{game_name}' is considered noise and doesn't belong to any cluster."
    recommended_games = df[df['Cluster'] == cluster_label]['Name'].tolist()
    recommended_games.remove(game_name)
    return recommended_games if recommended_games else "No other games found in the same cluster."

In [85]:
game_to_recommend = "Half-Life 2"
recommendations = recommend_games_by_name(game_to_recommend, df)
print(f"Recommendations for '{game_to_recommend}': {recommendations}")

Recommendations for 'Half-Life 2': ['Half-Life 2: Episode Two', 'STAR WARS™ Jedi Knight II - Jedi Outcast™', 'Half-Life 2: Episode One', 'Black Mesa', 'Half-Life: Opposing Force', 'Half-Life', 'Half-Life: Blue Shift', 'STAR WARS™ Jedi Knight - Jedi Academy™', 'Half-Life 2: Lost Coast', 'Half-Life: Source']


## KNN

In [6]:
knn = NearestNeighbors(n_neighbors=11)
#knn.fit(df[['Peak CCU', 'Metacritic score', 'User score']].join(feat))
knn.fit(feat)

In [7]:
def recommend_games_by_name_knn(game_name, df):
    if game_name not in df['Name'].values:
        return f"Game '{game_name}' not found in the dataset."
    info = []
    pred, ind = knn.kneighbors([feat.loc[df[df['Name'] == game_name].index[0]].tolist()])
    predict = (pred[0].tolist(), ind[0].tolist())
    s = sorted(predict, key=lambda x: x[0])
    for i in s[1]:
        info.append(df.loc[i]['Name'])
    return info if info else "No other games found in the same cluster."

In [8]:
game_to_recommend = "Half-Life 2"
recommendations = recommend_games_by_name_knn(game_to_recommend, df)
print(f"Recommendations for '{game_to_recommend}': {recommendations}")

Recommendations for 'Half-Life 2': ['Half-Life 2', 'Half-Life 2: Episode Two', 'Half-Life 2: Episode One', 'Half-Life: Source', 'Half-Life: Blue Shift', 'Red Alliance', 'Black Mesa', 'Half-Life 2: Lost Coast', 'Red Faction', 'Portal', 'Half-Life: Opposing Force']




## Save best

In [9]:
import pickle

best = knn
with open('knn.model', 'wb') as sf:
    pickle.dump(best, sf)

In [10]:
df['Feat'] = [item.tolist() for item in feat.values]

In [217]:
df.to_csv('dataset/games_knn.csv')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80385 entries, 0 to 80384
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   AppID                80385 non-null  int64  
 1   Name                 80385 non-null  object 
 2   Release date         80385 non-null  object 
 3   Peak CCU             80385 non-null  int64  
 4   Required age         80385 non-null  int64  
 5   Price                80385 non-null  float64
 6   DLC count            80385 non-null  int64  
 7   Supported languages  80385 non-null  object 
 8   Windows              80385 non-null  bool   
 9   Mac                  80385 non-null  bool   
 10  Linux                80385 non-null  bool   
 11  Metacritic score     80385 non-null  int64  
 12  User score           80385 non-null  int64  
 13  Developers           80385 non-null  object 
 14  Publishers           80385 non-null  object 
 15  Categories           80385 non-null  obje