In [4]:
import pandas as pd

games_dataframe = pd.read_csv("dataset/games.csv", on_bad_lines="skip")

# Iterate over smaller chunks at a time as the data is very large in these files.
users_iterator = pd.read_csv("dataset/users.csv", on_bad_lines="skip", chunksize=100_000)
recs_iterator = pd.read_csv("dataset/recommendations.csv", on_bad_lines="skip", iterator=True, chunksize=1_000_000)

In [5]:
# Preprocessing.
games_duplicates = 0
users_duplicates = 0
recs_duplicates = 0

games_duplicates = games_dataframe.duplicated(subset="title").sum()
for chunk in users_iterator:
    users_duplicates += chunk.duplicated(subset="user_id").sum()
for chunk in recs_iterator:
    recs_duplicates += chunk.duplicated(subset="app_id").sum()

print(f"DUPLICATE GAMES: {games_duplicates}")
print(f"DUPLICATE USERS: {users_duplicates}")
print(f"DUPLICATE RECS: {recs_duplicates}")

# Removing duplicates.
games_dataframe = games_dataframe.drop_duplicates(subset="title")
print(f"DUPLICATE GAMES NOW: {games_dataframe.duplicated(subset='title')}")

DUPLICATE GAMES: 121
DUPLICATE USERS: 0
DUPLICATE RECS: 38223375
DUPLICATE GAMES NOW: 0        False
1        False
2        False
3        False
4        False
         ...  
50791    False
50792    False
50793    False
50794    False
50795    False
Length: 50675, dtype: bool


In [6]:
# Random sampling.
sample_size = 15_000
games_dataframe = games_dataframe.sample(n=sample_size, replace=False, random_state=490)

games_dataframe = games_dataframe.reset_index()
games_dataframe = games_dataframe.drop("index", axis=1)

In [7]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Process textual data
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(games_dataframe['title'].fillna(''))

# 2. Calculate similarity scores
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


def get_content_based_recommendations(title, games_dataframe, cosine_sim=cosine_sim):
    idx = games_dataframe.index[games_dataframe['title'] == title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]
    game_indices = [i[0] for i in sim_scores]
    return games_dataframe['title'].iloc[game_indices]


print(get_content_based_recommendations("Call of Duty", games_dataframe))

IndexError: list index out of range