In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import sklearn
from sentence_transformers import SentenceTransformer
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.preprocessing import MinMaxScaler

warnings.simplefilter(action='ignore', category=FutureWarning)


In [2]:
# Directory holding the Steam CSV exports. Set the STEAM_DATA_DIR environment
# variable to point somewhere else; the default is the path this notebook was
# originally written against.
steam_data_dir = os.environ.get('STEAM_DATA_DIR', '/home/javier/Documents/Projects/Steam/archive')
df = pd.read_csv(os.path.join(steam_data_dir, 'steam.csv'))

In [3]:
# work on an independent copy so the frame loaded from disk stays untouched
df1 = df.copy(deep=True)

In [4]:
# drop the columns that are not used anywhere below
unused_columns = ['release_date', 'english', 'platforms', 'required_age', 'steamspy_tags']
df1 = df1.drop(columns=unused_columns)

## Data manipulation

In [10]:
# every row whose appid occurs more than once (keep=False flags all occurrences)
duplicates = df1.loc[df1.duplicated(subset=['appid'], keep=False)]

In [11]:
# Collapse the 'categories' text of all rows sharing an appid into one ';'-joined string
def _join_with_semicolon(values):
    """Join the strings of one appid group with ';'."""
    return ';'.join(values)


df_aggregated = df1.groupby('appid')['categories'].agg(_join_with_semicolon).reset_index()

In [12]:
# keep only the first row for each appid
df1 = df1.drop_duplicates(subset=['appid'], keep='first')

In [13]:
# attach the aggregated categories next to the original column
# ('categories_original' vs 'categories_aggregated')
df_merged = df1.merge(
    df_aggregated,
    how='left',
    on='appid',
    suffixes=('_original', '_aggregated'),
)

In [14]:
# overwrite the original categories with the aggregated version
df_merged = df_merged.assign(categories_original=df_merged['categories_aggregated'])

In [15]:
# the aggregated column has been copied over, so it can go
df_merged = df_merged.drop('categories_aggregated', axis='columns')

In [16]:
# restore the plain 'categories' column name
df_merged = df_merged.rename(columns={'categories_original': 'categories'})

In [17]:
# continue from an independent copy of the de-duplicated frame
df1 = df_merged.copy(deep=True)

In [18]:
# Directory holding the Steam CSV exports. Set the STEAM_DATA_DIR environment
# variable to point somewhere else; the default is the path this notebook was
# originally written against.
steam_data_dir = os.environ.get('STEAM_DATA_DIR', '/home/javier/Documents/Projects/Steam/archive')
df_desc = pd.read_csv(os.path.join(steam_data_dir, 'steam_description_data.csv'))

In [19]:
# use the same key name as the main table so the two can be merged on it
df_desc = df_desc.rename(columns={'steam_appid': 'appid'})

In [20]:
# inner join: only appids present in both tables are kept
final_df = pd.merge(df1, df_desc, how='inner', on='appid')

In [21]:
# independent copy for the text clean-up that follows
df3 = final_df.copy(deep=True)

In [22]:
# string manipulation
def _split_names(value):
    """Split a ';'-separated name field into a list of names with spaces removed.

    A missing value (NaN/None) gives an empty list; casting it with
    ``astype(str)`` would instead produce the literal token 'nan', which
    would then be treated as a real name.
    """
    if pd.isna(value):
        return []
    return str(value).replace(' ', '').split(';')


def _split_words(value):
    """Split free text on whitespace; a missing value gives an empty list."""
    if pd.isna(value):
        return []
    return str(value).split()


df3['developer'] = df3['developer'].apply(_split_names)
df3['publisher'] = df3['publisher'].apply(_split_names)

# owners is text: when it contains '-', keep the part after the first '-' as an int,
# otherwise convert the whole value
df3['owners'] = df3['owners'].apply(lambda x: int(x.split('-')[1]) if '-' in x else int(x))

# categories: one list entry per ';'-separated item, with spaces and hyphens removed
df3['categories'] = df3['categories'].str.split(';').apply(
    lambda names: [name.replace(' ', '').replace('-', '') for name in names]
)

# NOTE(review): genres keep their internal spaces, unlike categories - confirm this is intended
df3['genres'] = df3['genres'].str.split(';')

# free-text columns become lists of whitespace-separated words
for text_column in ['detailed_description', 'about_the_game', 'short_description']:
    df3[text_column] = df3[text_column].apply(_split_words)

## Data Normalization

In [23]:
# the two ratings columns are left out of this list; they are scaled in a separate step
numerical_columns = [
    'achievements',
    'average_playtime',
    'median_playtime',
    'owners',
    'price',
]
ratings_df = df3.loc[:, ['positive_ratings', 'negative_ratings']]

In [24]:
# Data scaling
# MinMaxScaler rescales every feature (column) independently to the [0, 1] range.
min_max_scaler = preprocessing.MinMaxScaler()
for column in numerical_columns:
    # the scaler is refit for each column; double brackets keep the input 2-D
    df3[column] = min_max_scaler.fit_transform(df3[[column]])

# the same scaler object is refit on the two ratings columns
inputs_scaled = min_max_scaler.fit_transform(ratings_df)
# Keep ratings_df's index: the scaled ratings are merged back on the index,
# so a fresh 0..n-1 index would only line up when df3 happens to use one too.
inputs_n = pd.DataFrame(inputs_scaled, columns=ratings_df.columns, index=ratings_df.index)

In [25]:
# join the scaled ratings back row-by-row on the index; the clashing ratings
# columns get the default '_x' (from df3) and '_y' (from inputs_n) suffixes
df_merged2 = df3.merge(inputs_n, left_index=True, right_index=True)

In [26]:
# keep only the scaled ratings ('_y') and give them back their plain names
df_merged2 = (
    df_merged2
    .drop(columns=['positive_ratings_x', 'negative_ratings_x'])
    .rename(columns={'positive_ratings_y': 'positive_ratings', 'negative_ratings_y': 'negative_ratings'})
)

In [27]:
# Only the short description is used, as it is the only description column
# without the <> characters.
# Each column holds a list per row, so '+' concatenates the lists in this order.
tag_columns = ['developer', 'publisher', 'categories', 'short_description', 'genres']
tags = df_merged2[tag_columns[0]]
for tag_column in tag_columns[1:]:
    tags = tags + df_merged2[tag_column]
df_merged2['tags'] = tags

In [28]:
# turn each list of tokens into one lowercase, space-separated string
df_merged2['tags'] = df_merged2['tags'].apply(
    lambda tokens: ' '.join(str(token) for token in tokens).lower()
)

In [29]:
# columns carried into the feature-building steps, taken as an independent copy
recommender_columns = [
    'appid',
    'name',
    'tags',
    'achievements',
    'average_playtime',
    'median_playtime',
    'owners',
    'price',
]
df4 = df_merged2.loc[:, recommender_columns].copy()

## Getting data ready for vectorization

In [31]:
# min-max scale the numerical columns into a single 2-D array
num_cols = ['achievements', 'average_playtime', 'median_playtime', 'owners', 'price']
scaler = preprocessing.MinMaxScaler()
scaler.fit(df4[num_cols])
numerical_data = scaler.transform(df4[num_cols])

In [32]:
# remove English stop words from the tags, then build a bag-of-words count matrix
stop_words = ENGLISH_STOP_WORDS


def _drop_stop_words(tags):
    """Return `tags` with every whitespace-separated stop word removed."""
    kept_words = [word for word in tags.split() if word not in stop_words]
    return ' '.join(kept_words)


vectorizer = CountVectorizer()
# NOTE(review): .toarray() turns the sparse count matrix into a dense array,
# which can be very large for a big vocabulary - confirm memory is acceptable
text_data = vectorizer.fit_transform(df4['tags'].apply(_drop_stop_words)).toarray()

In [33]:
# Final feature matrix: scaled numerical columns followed by the word counts,
# stacked side by side (one row per game)
all_features = np.hstack((numerical_data, text_data))

In [34]:
# (rows, columns) of the combined feature matrix
all_features.shape

(27075, 56708)

In [35]:
# Slowest step of the notebook because of the size of the feature matrix; could be improved.
# With a single input, every row is compared against every other row.
similarity = cosine_similarity(X=all_features)

In [36]:
def recommend(game, top_n=5):
    """Return the names of the games most similar to `game`.

    Reads the module-level `df4` (for the 'name' column) and `similarity`
    (pairwise similarity scores, one row per row of `df4`, in the same order).

    Parameters
    ----------
    game : str
        Exact value to look up in ``df4['name']``. If several rows match,
        the first one is used.
    top_n : int, default 5
        Maximum number of names to return; values below 1 give an empty list.

    Returns
    -------
    list
        Names of the highest-scoring other games, best first. Ties keep
        the row order of `df4`.

    Raises
    ------
    IndexError
        If no row of `df4` has `game` as its name.
    """
    # Work with row positions, not index labels: rows of `similarity` are
    # positional, so this stays correct even if df4's index is not 0..n-1.
    name_matches = np.flatnonzero((df4['name'] == game).to_numpy())
    if name_matches.size == 0:
        raise IndexError(f"game {game!r} not found in df4['name']")
    position = int(name_matches[0])

    scores = np.asarray(similarity[position])
    # stable sort on the negated scores = descending order, ties in row order
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried game explicitly instead of assuming it sorts first;
    # another game with an equal score could otherwise push it into the results.
    ranked = ranked[ranked != position][:max(top_n, 0)]

    return df4['name'].iloc[ranked].tolist()

In [38]:
# example query: print the recommendations for one title, one per line
game_to_recommend = "Team Fortress 2"
recommended_games = recommend(game_to_recommend)

print(f"Recommended games for {game_to_recommend}:")
for recommended_title in recommended_games:
    print(recommended_title)

Recommended games for Team Fortress 2:
Day of Defeat: Source
Counter-Strike: Source
Counter-Strike: Global Offensive
Alien Swarm
Portal
