In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval
pd.options.mode.chained_assignment = None

## Import and clean data

In [79]:
metadata = pd.read_csv('final_dataset.csv', converters={'platform': literal_eval})
features = pd.read_csv('comeon.csv')

In [3]:
features.head(100)

Unnamed: 0,developer,genre,type,rating
0,Nintendo,"['Action Adventure', 'Fantasy']",singleplayer,E
1,Nintendo,"['Action', 'Platformer', '3D']",singleplayer,E
2,Nintendo EAD Tokyo,"['Action', 'Platformer', '3D']",singleplayer,E
3,Retro Studios,"['Action', 'Shooter', 'First-Person', 'Sci-Fi']",singleplayer,T
4,Nintendo,"['Action', 'Platformer', '3D']",singleplayer,E10+
...,...,...,...,...
95,Bandai Namco Games,"['Action', 'Fighting', '3D', '2D']",multiplayer,E10+
96,TOSE,"['Role-Playing', 'Console-style RPG', 'Japanes...",singleplayer,E10+
97,Monolith Soft,"['Role-Playing', 'Action RPG', 'Console-style ...",singleplayer,T
98,Rare Ltd.,"['Action', 'Platformer', '3D']",,M


In [4]:
metadata = metadata.join(features)

In [5]:
len(metadata)

8831

In [6]:
metadata = metadata.dropna(subset=['genre'])

In [7]:
len(metadata)

8827

In [8]:
metadata['genre'] = metadata['genre'].apply(literal_eval)

In [None]:
# Sort games based on score calculated above
metadata = metadata.sort_values('meta_score', ascending=False).reset_index(inplace=False, drop=True)

## Start recommender

In [80]:
pd.options.display.max_colwidth = 100
metadata['description'].head()

0    As a young boy, Link is tricked by Ganondorf, the King of the Gerudo Thieves. The evil human use...
1    [Metacritic's 2007 Wii Game of the Year] The ultimate Nintendo hero is taking the ultimate step ...
2    Super Mario Galaxy 2, the sequel to the galaxy-hopping original game, includes the gravity-defyi...
3    Samus returns in a new mission to unravel the mystery behind the ruined walls scattered across T...
4    New Evolution of Mario Sandbox-Style Gameplay. Mario embarks on a new journey through unknown wo...
Name: description, dtype: object

In [26]:
# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
metadata['description'] = metadata['description'].fillna('')

# Construct the required TF_IDF matrix by fitting and transformating the data
tfidf_matrix = tfidf.fit_transform(metadata['description'])

# Output the shape of tfidf_matrix
tfidf_matrix.shape

(8827, 32326)

In [27]:
# Array mapping from feature integer indices to feature name.
tfidf.get_feature_names_out()[5000:5010]

array(['ceremonial', 'ceremonies', 'ceremony', 'ceres', 'cersei',
       'certain', 'certainly', 'certified', 'cesar', 'cessna'],
      dtype=object)

In [28]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [29]:
tfidf_matrix.shape

(8827, 32326)

In [30]:
# Construct a reverse map of indices and games titles
indices = pd.Series(metadata.index, index=metadata['name_game'])

In [31]:
indices[:10]

name_game
The Legend of Zelda: Ocarina of Time                             0
Super Mario Odyssey                                              1
Halo: Combat Evolved                                             2
The House in Fata Morgana - Dreams of the Revenants Edition -    3
NFL 2K1                                                          4
Super Mario Galaxy                                               5
Super Mario Galaxy 2                                             6
Metroid Prime                                                    7
Grand Theft Auto V                                               8
The Legend of Zelda: Breath of the Wild                          9
dtype: int64

In [32]:
# Function that takes in game title as input and outputs most similar games

def get_recommendations(name_game, cosine_sim=cosine_sim):
    # Get the index of the game thath matches the title 
    
    idx = indices[name_game]
    
    # Get the pairwise similarity scores of all games with that game
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the games based on the similarity scores
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar games
    
    sim_scores = sim_scores[1:11]
    
    # Get the games indices
    
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar games
    
    return metadata['name_game'].iloc[movie_indices]

In [33]:
metadata.head()

Unnamed: 0,name_game,meta_score,user_score,platform,description,developer,genre,type,rating
0,The Legend of Zelda: Ocarina of Time,99.0,91.0,[nintendo-64],"As a young boy, Link is tricked by Ganondorf, the King of the Gerudo Thieves. The evil human use...",Nintendo,"[Action Adventure, Fantasy]",singleplayer,E
1,Super Mario Odyssey,97.0,89.0,[switch],New Evolution of Mario Sandbox-Style Gameplay. Mario embarks on a new journey through unknown wo...,Nintendo,"[Action, Platformer, 3D]",singleplayer,E10+
2,Halo: Combat Evolved,97.0,87.0,[xbox],"Enter the mysterious world of Halo, an alien planet shaped like a ring. As mankind's super sold...",Bungie Software,"[Action, Shooter, First-Person, Sci-Fi]",multiplayer,M
3,The House in Fata Morgana - Dreams of the Revenants Edition -,97.0,83.0,[switch],A gothic suspense tale set in a cursed mansion. ‘The House in Fata Morgana’ is a full-length vis...,HuneX,"[Adventure, Visual Novel]",singleplayer,M
4,NFL 2K1,97.0,62.0,[dreamcast],"In the end, NFL 2K1 is a deeper, more refined version of the original game.",Visual Concepts,"[Sports, Traditional, Football, Sim]",,E


In [None]:
metadata['developer']

In [65]:
def create_soup(x):
    return ' '.join(x['platform']).lower() + ' ' + ''.join(str(x['developer']).split()) + ' ' + ' '.join(x['genre']) + ' ' + str(x['type']) + ' ' + str(x['rating'])
    #' ' + str(x['developer']).lower() + ' ' + str(' '.join(x['genre'])).lower() + ' ' + str(x['type']).lower() + ' ' + str(x['rating']).lower()

In [66]:
# Create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)

In [67]:
metadata[['soup']].head()

Unnamed: 0,soup
0,nintendo-64 Nintendo Action Adventure Fantasy singleplayer E
1,switch Nintendo Action Platformer 3D singleplayer E10+
2,xbox BungieSoftware Action Shooter First-Person Sci-Fi multiplayer M
3,switch HuneX Adventure Visual Novel singleplayer M
4,dreamcast VisualConcepts Sports Traditional Football Sim nan E


In [68]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])

In [69]:
count_matrix.shape

(8827, 3190)

In [70]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [71]:
# Reset index of your main DataFrame and construct reverse mapping as before
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['name_game'])

In [76]:
get_recommendations('Dota 2', cosine_sim2)

6930                                    Kingdom Under Fire II
8278                                                    Tryst
115                              Warcraft III: Reign of Chaos
181                                    Myth: The Fallen Lords
271                                             Black & White
329                                          Age of Mythology
419                                     Myth II: Soulblighter
446                           Warcraft III: The Frozen Throne
846                              Age of Mythology: The Titans
940     The Lord of the Rings: The Battle for Middle-Earth II
Name: name_game, dtype: object