In [1]:
# https://nik-davis.github.io/posts/2019/steam-data-collection/
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.neighbors import NearestNeighbors
import re

import ast
# from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [112]:
def extract_values(x, key, default):
    """Pull values out of a (possibly stringified) Python literal.

    Args:
        x: A Python object, or a string holding a Python literal which is
            parsed with ``ast.literal_eval``.
        key: ``'total'`` returns ``data['total']`` directly. Any other key is
            collected from every item of ``data`` that contains it.
        default: Value returned when nothing is found or the input cannot be
            parsed / indexed.

    Returns:
        ``data['total']`` when ``key == 'total'``; otherwise the list of
        ``item[key]`` values, or ``default`` if that list is empty or any
        ordinary error occurs while parsing or indexing.
    """
    try:
        data = ast.literal_eval(x) if isinstance(x, str) else x
        if key == 'total':
            return data['total']
        result = [item[key] for item in data if key in item]
        return result if result else default
    except Exception:
        # Deliberate best-effort: NaN cells, malformed literals, wrong shapes
        # etc. all fall back to `default`. `Exception` (rather than a bare
        # `except:`) lets KeyboardInterrupt / SystemExit propagate so a long
        # `.apply` can still be interrupted.
        return default

In [None]:
def safe_concat(*cols):
    """Flatten the given values into one list.

    List arguments contribute their elements unchanged; any other argument
    that ``pd.notna`` reports as present is appended as ``str(value)``;
    missing values (``None`` / ``NaN``) are skipped.
    """
    merged = []
    for value in cols:
        if isinstance(value, list):
            merged += value
            continue
        if pd.notna(value):
            merged.append(str(value))
    return merged

# NOTE(review): `df` is rebound by the read of './data/steam_app_data.csv' in
# the following cell, so this frame is not used by the cleaning code below.
# Presumably it fed the `df4` frame used in the vectorization section — confirm
# before removing.
df = pd.read_csv('./data/steam.csv')

In [68]:
# Load the raw Steam app data; this rebinds `df`.
df = pd.read_csv('./data/steam_app_data.csv')

In [69]:
# Snapshot of the freshly loaded frame; the cleaning section restarts from it
# (presumably so it can be re-run without re-reading the CSV).
df1 = df.copy()

## Data manipulation

In [169]:
# Work on a copy of the snapshot so `df1` itself is not rebound or reassigned.
df = df1.copy()

In [170]:
def _clean_labels(labels, remove):
    """Delete every substring in `remove` from each label, then lower-case it."""
    cleaned = []
    for label in labels:
        for fragment in remove:
            label = label.replace(fragment, '')
        cleaned.append(label.lower())
    return cleaned


def _name_tokens(names):
    """Turn a column of stringified name lists into lists of space-free tokens.

    Spaces and square brackets are removed, missing values become 'Unknown',
    and the result is split on whitespace.

    NOTE(review): quote characters and commas are left in the tokens (the
    displayed tags read "'valve'"), so several names in one cell stay fused
    into a single token — confirm whether that is intended.
    """
    compact = names.str.replace(' ', '', regex=False)
    compact = compact.str.replace('[', '', regex=False)
    compact = compact.str.replace(']', '', regex=False)
    return compact.apply(lambda x: 'Unknown' if pd.isna(x) else str(x)).apply(lambda x: x.split())


# Keep only rows describing games. `.copy()` makes the column assignments
# below write to an independent frame rather than a slice of the raw data.
games = df[df['type'] == 'game'].copy()

# Categories / genres: list of descriptions, separators stripped, lower-cased.
games['categories'] = games['categories'].apply(
    lambda x: _clean_labels(extract_values(x, key='description', default=[]), ('-', '/', ' ')))
games['genres'] = games['genres'].apply(
    lambda x: _clean_labels(extract_values(x, key='description', default=[]), (' ', '&')))

# Developers and publishers go through the same normalisation. Publishers
# previously had no missing-value guard, so a NaN cell crashed on `.split()`.
games['developers'] = _name_tokens(games['developers'])
games['publishers'] = _name_tokens(games['publishers'])

games['recommendations'] = games['recommendations'].apply(
    lambda x: extract_values(x, key='total', default=0))

# Keep only games whose supported-language string starts with 'English'.
games = games[~(games['supported_languages'].isna())]
games = games[games['supported_languages'].str.startswith('English')].copy()

# Description -> list of lower-cased words: strip HTML tags, treat hyphens as
# word separators. Missing descriptions become an empty list instead of the
# literal token 'nan'.
descriptions = games['detailed_description'].fillna('').astype(str)
games['text'] = (
    descriptions
    .apply(lambda x: re.sub(r'\<.*?>', '', x))
    .str.replace('-', ' ', regex=False)
    .str.lower()
    .str.split()
)

# drop unnecessary columns
df = games.drop(columns=['type', 'required_age', 'is_free', 'controller_support', 'dlc', 'about_the_game', 'detailed_description', 'short_description', 'fullgame', 'supported_languages', 'header_image', 'website','pc_requirements','mac_requirements','linux_requirements',
                  'legal_notice', 'drm_notice', 'ext_user_account_notice', 'demos', 'price_overview', 'packages','package_groups', 'platforms', 'metacritic', 'reviews', 'screenshots', 'movies', 'achievements', 'release_date',
                  'support_info', 'background', 'content_descriptors'])

## Data Normalization

In [171]:
# will not take into account the ratings as they should be normalized together
recommendations_df = df[['recommendations']]
# Data scaling: MinMaxScaler maps the recommendation counts onto [0, 1].
min_max_scaler = preprocessing.MinMaxScaler()
inputs_scaled = min_max_scaler.fit_transform(recommendations_df)
# `df` keeps the gapped index left behind by the row filters, so the scaled
# frame must reuse that index. With a fresh RangeIndex an index-based merge
# pairs values with the wrong games and drops rows whose label is >= len(df).
inputs_n = pd.DataFrame(inputs_scaled, columns=recommendations_df.columns, index=df.index)
# Replace the raw counts with the scaled ones. The values are assigned
# positionally and the column ends up last, as it did after the old
# merge / drop / rename sequence.
df2 = df.drop(columns=['recommendations']).assign(recommendations=inputs_scaled[:, 0])

# adding the tags
def _row_tags(row):
    """Merge one row's developer, publisher, category, genre and description tokens into a single lower-cased, space-separated string."""
    pieces = safe_concat(row['developers'], row['publishers'], row['categories'], row['genres'], row['text'])
    return ' '.join(str(piece) for piece in pieces).lower()


df2['tags'] = df2.apply(_row_tags, axis=1)

In [172]:
# Preview the engineered frame (pandas truncates the display to the first and last rows).
df2

Unnamed: 0,name,steam_appid,developers,publishers,categories,genres,text,recommendations,tags
0,Counter-Strike,10,['Valve'],['Valve'],"[multiplayer, onlinemultiplayer, localmultipla...",[action],"[play, the, world's, number, 1, online, action...",0.079284,'valve' 'valve' multiplayer onlinemultiplayer ...
1,Team Fortress Classic,20,['Valve'],['Valve'],"[multiplayer, onlinemultiplayer, localmultipla...",[action],"[one, of, the, most, popular, online, action, ...",0.003380,'valve' 'valve' multiplayer onlinemultiplayer ...
2,Day of Defeat,30,['Valve'],['Valve'],"[multiplayer, valveanticheatenabled]",[action],"[enlist, in, an, intense, brand, of, axis, vs....",0.002403,'valve' 'valve' multiplayer valveanticheatenab...
3,Deathmatch Classic,40,['Valve'],['Valve'],"[multiplayer, onlinemultiplayer, localmultipla...",[action],"[enjoy, fast, paced, multiplayer, gaming, with...",0.001123,'valve' 'valve' multiplayer onlinemultiplayer ...
4,Half-Life: Opposing Force,50,['GearboxSoftware'],['Valve'],"[singleplayer, multiplayer, valveanticheatenab...",[action],"[return, to, the, black, mesa, research, facil...",0.005253,'gearboxsoftware' 'valve' singleplayer multipl...
...,...,...,...,...,...,...,...,...,...
28113,Deez,997730,['UPandQ'],['UPandQ'],[singleplayer],[indie],"[this, is, a, unique, indie, game, in, which, ...",0.000000,'upandq' 'upandq' singleplayer indie this is a...
28114,Scream of the Viking,997790,['LTrust'],['LTrust'],[singleplayer],"[action, adventure, casual, indie, simulation]","[scream, of, the, viking, is, a, hardcore, 2d,...",0.000000,'ltrust' 'ltrust' singleplayer action adventur...
28115,City Defense,997870,"['Studio48','TurykinNikolay']",['Studio48'],[singleplayer],"[indie, strategy]","[in, city, defense, you, need, to, protect, yo...",0.000000,"'studio48','turykinnikolay' 'studio48' singlep..."
28118,The Dungeon of Lulu Farea,998220,['GalaxyWars'],['DLsite'],"[singleplayer, steamcloud]","[indie, rpg]","[this, game's, system, is, based, on, a, game,...",0.000000,'galaxywars' 'dlsite' singleplayer steamcloud ...


## Getting data ready for vectorization

In [7]:
# concatenate numerical columns
# NOTE(review): `df4` is not defined in the cells visible here, and these
# numeric columns are not in `df2` — presumably `df4` joins the cleaned frame
# with './data/steam.csv'. Confirm so the notebook survives Restart & Run All.
num_cols = ['achievements', 'average_playtime', 'median_playtime', 'owners', 'price']
scaler = preprocessing.MinMaxScaler()
numerical_data = scaler.fit_transform(df4[num_cols])
numerical_data_sparse = csr_matrix(numerical_data)
# concatenate text and remove stop words
# (filtering happens on whitespace-split words, before TF-IDF tokenisation)
stop_words = ENGLISH_STOP_WORDS
text_data = df4['tags'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
vectorizer = TfidfVectorizer(max_features=5000)
text_data = vectorizer.fit_transform(text_data)
# Finally, the dataframe with all the features normalized and represented as numbers

# Force CSR: depending on the scipy version, hstack may return a COO matrix,
# which does not support the row indexing (`all_features[index]`) that
# recommend() performs.
all_features = hstack([numerical_data_sparse, text_data], format='csr')


In [8]:
# Sanity check: (rows, 5 scaled numeric columns + up to 5000 TF-IDF terms).
all_features.shape

(27075, 5005)

In [9]:
# Brute-force cosine nearest-neighbour index over the combined feature matrix.
# n_neighbors=6 is only the default for queries; recommend() passes its own
# n_neighbors (top_n + 1) on every call.
model = NearestNeighbors(n_neighbors=6, metric='cosine', algorithm='brute')
model.fit(all_features)

In [10]:
def recommend(game, top_n=5):
    """Return the names of the `top_n` games closest to `game`.

    Uses the module-level `df4`, `all_features` and `model`. Row i of
    `all_features` is taken to describe row i (by position) of `df4`.

    Args:
        game: Exact value of the `name` column to look up; the first match is
            used when several rows share the name.
        top_n: Number of recommendations to return.

    Returns:
        List of game names ordered as returned by `model.kneighbors`.

    Raises:
        IndexError: If no row of `df4` has `name == game`.
    """
    # Work with row *positions*, not index labels: `all_features` and
    # `df4.iloc` are positional, and the two differ whenever `df4` does not
    # carry a clean 0..n-1 index.
    matches = np.flatnonzero((df4['name'] == game).to_numpy())
    if matches.size == 0:
        raise IndexError(f"game {game!r} not found in df4['name']")
    position = int(matches[0])

    # Ask for one extra neighbour because the query game itself is normally
    # among the results.
    _, indices = model.kneighbors(all_features[position], n_neighbors=top_n + 1)

    # Drop the query by position instead of assuming it comes first: with
    # identical feature vectors (distance ties) it may appear later or not at all.
    neighbours = [i for i in indices.ravel() if i != position][:top_n]
    return df4.iloc[neighbours]['name'].tolist()

In [12]:
# Example query using the default top_n=5.
recommend('Team Fortress 2')

['Counter-Strike: Global Offensive',
 'Counter-Strike: Source',
 'Day of Defeat: Source',
 'Portal 2',
 'Alien Swarm']

In [38]:
# Print a header followed by one recommended title per line.
game_to_recommend = "Team Fortress 2"
recommended_games = recommend(game_to_recommend)

report_lines = [f"Recommended games for {game_to_recommend}:"]
report_lines.extend(str(title) for title in recommended_games)
print("\n".join(report_lines))

Recommended games for Team Fortress 2:
Day of Defeat: Source
Counter-Strike: Source
Counter-Strike: Global Offensive
Alien Swarm
Portal
