In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
metadata = pd.read_csv('final_dataset.csv', low_memory=False)
#metadata = metadata[1:25000]

In [3]:
metadata.head()

Unnamed: 0,name_game,meta_score,user_score,platform,description
0,The Legend of Zelda: Ocarina of Time,99.0,91.0,Nintendo 64,"As a young boy, Link is tricked by Ganondorf, ..."
1,Super Mario Galaxy,97.0,91.0,Wii,[Metacritic's 2007 Wii Game of the Year] The u...
2,Super Mario Galaxy 2,97.0,91.0,Wii,"Super Mario Galaxy 2, the sequel to the galaxy..."
3,Metroid Prime,97.0,89.0,GameCube,Samus returns in a new mission to unravel the ...
4,Super Mario Odyssey,97.0,89.0,Switch,New Evolution of Mario Sandbox-Style Gameplay....


In [4]:
# Calculate mean of score average column
C_1 = metadata['meta_score'].mean()
C_2 = metadata['user_score'].mean()

# Calculate the minimum score required to be in the chart
m_1 = metadata['meta_score'].quantile(0.80)
m_2 = metadata['user_score'].quantile(0.80)
print(C_1)
print(C_2)
print(m_1)
print(m_2)

70.86292718831389
69.8855395764919
81.0
81.0


In [5]:
# Filter out all qualified games into a new DataFrame
best_games = metadata[(metadata.meta_score> m_1) & (metadata.user_score > m_2)]
best_games.shape

(737, 5)

In [6]:
#est_games['score'] = best_games.apply(weighted_rating, axis=1)

In [7]:
# Sort games based on score calculated above

best_games = best_games.sort_values('meta_score', ascending=False)

# Print the top 15 movies
best_games.head(20)

Unnamed: 0,name_game,meta_score,user_score,platform,description
0,The Legend of Zelda: Ocarina of Time,99.0,91.0,Nintendo 64,"As a young boy, Link is tricked by Ganondorf, ..."
1,Super Mario Galaxy,97.0,91.0,Wii,[Metacritic's 2007 Wii Game of the Year] The u...
2,Super Mario Galaxy 2,97.0,91.0,Wii,"Super Mario Galaxy 2, the sequel to the galaxy..."
3,Metroid Prime,97.0,89.0,GameCube,Samus returns in a new mission to unravel the ...
4,Super Mario Odyssey,97.0,89.0,Switch,New Evolution of Mario Sandbox-Style Gameplay....
5,Halo: Combat Evolved,97.0,87.0,Xbox,"Enter the mysterious world of Halo, an alien p..."
6,The House in Fata Morgana - Dreams of the Reve...,97.0,83.0,Switch,A gothic suspense tale set in a cursed mansion...
4263,10 Second Ninja X,96.8,81.4,"['Xbox One', 'PlayStation 3', 'Xbox 360', 'Pla...",Los Santos is a sprawling sun-soaked metropoli...
4265,11-11: Memories Retold,96.5,85.0,"['Switch', 'Wii U']",Forget everything you know about The Legend of...
9,Tekken 3,96.0,91.0,PlayStation,"An ancient evil force has reawakened, attackin..."


In [8]:
pd.options.display.max_colwidth = 100
metadata['description'].head()

0    As a young boy, Link is tricked by Ganondorf, the King of the Gerudo Thieves. The evil human use...
1    [Metacritic's 2007 Wii Game of the Year] The ultimate Nintendo hero is taking the ultimate step ...
2    Super Mario Galaxy 2, the sequel to the galaxy-hopping original game, includes the gravity-defyi...
3    Samus returns in a new mission to unravel the mystery behind the ruined walls scattered across T...
4    New Evolution of Mario Sandbox-Style Gameplay. Mario embarks on a new journey through unknown wo...
Name: description, dtype: object

In [9]:
# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
metadata['description'] = metadata['description'].fillna('')

# Construct the required TF_IDF matrix by fitting and transformating the data
tfidf_matrix = tfidf.fit_transform(metadata['description'])

# Output the shape of tfidf_matrix
tfidf_matrix.shape

(8831, 32332)

In [10]:
# Array mapping from feature integer indices to feature name.
tfidf.get_feature_names_out()[5000:5010]

array(['ceremonial', 'ceremonies', 'ceremony', 'ceres', 'cersei',
       'certain', 'certainly', 'certified', 'cesar', 'cessna'],
      dtype=object)

In [11]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [12]:
tfidf_matrix.shape

(8831, 32332)

In [13]:
# Construct a reverse map of indices and games titles
indices = pd.Series(metadata.index, index=metadata['name_game'])

In [14]:
indices[:10]

name_game
The Legend of Zelda: Ocarina of Time                             0
Super Mario Galaxy                                               1
Super Mario Galaxy 2                                             2
Metroid Prime                                                    3
Super Mario Odyssey                                              4
Halo: Combat Evolved                                             5
The House in Fata Morgana - Dreams of the Revenants Edition -    6
NFL 2K1                                                          7
Uncharted 2: Among Thieves                                       8
Tekken 3                                                         9
dtype: int64

In [15]:
# Function that takes in game title as input and outputs most similar games

def get_recommendations(name_game, cosine_sim=cosine_sim):
    # Get the index of the game thath matches the title 
    
    idx = indices[name_game]
    
    # Get the pairwise similarity scores of all games with that game
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the games based on the similarity scores
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar games
    
    sim_scores = sim_scores[1:11]
    
    # Get the games indices
    
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar games
    
    return metadata['name_game'].iloc[movie_indices]

In [16]:
def create_soup(x):
    return ' '.join(x['platform'])

In [17]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" "," "))
        else:
            return ''

In [18]:
metadata.head()

Unnamed: 0,name_game,meta_score,user_score,platform,description
0,The Legend of Zelda: Ocarina of Time,99.0,91.0,Nintendo 64,"As a young boy, Link is tricked by Ganondorf, the King of the Gerudo Thieves. The evil human use..."
1,Super Mario Galaxy,97.0,91.0,Wii,[Metacritic's 2007 Wii Game of the Year] The ultimate Nintendo hero is taking the ultimate step ...
2,Super Mario Galaxy 2,97.0,91.0,Wii,"Super Mario Galaxy 2, the sequel to the galaxy-hopping original game, includes the gravity-defyi..."
3,Metroid Prime,97.0,89.0,GameCube,Samus returns in a new mission to unravel the mystery behind the ruined walls scattered across T...
4,Super Mario Odyssey,97.0,89.0,Switch,New Evolution of Mario Sandbox-Style Gameplay. Mario embarks on a new journey through unknown wo...


In [19]:
def get_list(x):
    if isinstance(x, list): # For tested
        return 1
    else:
        x = x.replace(" ", "",).replace(",", " ").replace("'", "").replace("[","").replace("]","").split()
        return list(dict.fromkeys(x))
            
        
    
    # Return empty list in case of missing/malformed data
    
    return []

In [20]:
# Convert string into list

metadata['platform'] = metadata['platform'].apply(get_list)

In [21]:
# Apply clean_data functions to your features.
features = ['platform']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

In [22]:
metadata.head()

Unnamed: 0,name_game,meta_score,user_score,platform,description
0,The Legend of Zelda: Ocarina of Time,99.0,91.0,[nintendo64],"As a young boy, Link is tricked by Ganondorf, the King of the Gerudo Thieves. The evil human use..."
1,Super Mario Galaxy,97.0,91.0,[wii],[Metacritic's 2007 Wii Game of the Year] The ultimate Nintendo hero is taking the ultimate step ...
2,Super Mario Galaxy 2,97.0,91.0,[wii],"Super Mario Galaxy 2, the sequel to the galaxy-hopping original game, includes the gravity-defyi..."
3,Metroid Prime,97.0,89.0,[gamecube],Samus returns in a new mission to unravel the mystery behind the ruined walls scattered across T...
4,Super Mario Odyssey,97.0,89.0,[switch],New Evolution of Mario Sandbox-Style Gameplay. Mario embarks on a new journey through unknown wo...


In [23]:
# Create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)

In [24]:
metadata[['soup']].head()

Unnamed: 0,soup
0,nintendo64
1,wii
2,wii
3,gamecube
4,switch


In [25]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])

In [26]:
count_matrix.shape

(8831, 22)

In [27]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [28]:
# Reset index of your main DataFrame and construct reverse mapping as before
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['name_game'])

In [31]:
get_recommendations('FIFA 21', cosine_sim2)

188    NieR: Automata - Become as Gods Edition
229                Minecraft: Xbox One Edition
299                            Forza Horizon 2
349          Halo: The Master Chief Collection
381                             Gears of War 4
382                          Halo 5: Guardians
443                                        FRU
530                              Madden NFL 17
558                 Castle Crashers Remastered
598          Path of Exile: The Fall of Oriath
Name: name_game, dtype: object