In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
metadata = pd.read_csv('final_dataset.csv', low_memory=False)
#metadata = metadata[1:25000]

In [3]:
metadata.head()

Unnamed: 0,name_game,meta_score,user_score,platform,description
0,The Legend of Zelda: Ocarina of Time,99.0,91.0,Nintendo 64,"As a young boy, Link is tricked by Ganondorf, ..."
1,Super Mario Galaxy,97.0,91.0,Wii,[Metacritic's 2007 Wii Game of the Year] The u...
2,Super Mario Galaxy 2,97.0,91.0,Wii,"Super Mario Galaxy 2, the sequel to the galaxy..."
3,Metroid Prime,97.0,89.0,GameCube,Samus returns in a new mission to unravel the ...
4,Super Mario Odyssey,97.0,89.0,Switch,New Evolution of Mario Sandbox-Style Gameplay....


In [4]:
# Calculate mean of score average column
C_1 = metadata['meta_score'].mean()
C_2 = metadata['user_score'].mean()

# Calculate the minimum score required to be in the chart
m_1 = metadata['meta_score'].quantile(0.80)
m_2 = metadata['user_score'].quantile(0.80)
print(C_1)
print(C_2)
print(m_1)
print(m_2)

70.86292718831389
69.8855395764919
81.0
81.0


In [5]:
# Filter out all qualified games into a new DataFrame
best_games = metadata[(metadata.meta_score> m_1) & (metadata.user_score > m_2)]
best_games.shape

(737, 5)

In [6]:
# Function that computes the weighted rating of each game

def weighted_rating(x, m=m_1, C=C_1):
    v = x['vote_count']
    R = x['vote_average']
    
    # Calculation based on the IMDB formula
    
    return round((v/(v+m) * R) + (m/(m+v) * C), 2)

In [7]:
#est_games['score'] = best_games.apply(weighted_rating, axis=1)

In [8]:
# Sort movies based on score calculated above

best_games = best_games.sort_values('meta_score', ascending=False)

# Print the top 15 movies
best_games.head(20)

Unnamed: 0,name_game,meta_score,user_score,platform,description
0,The Legend of Zelda: Ocarina of Time,99.0,91.0,Nintendo 64,"As a young boy, Link is tricked by Ganondorf, ..."
1,Super Mario Galaxy,97.0,91.0,Wii,[Metacritic's 2007 Wii Game of the Year] The u...
2,Super Mario Galaxy 2,97.0,91.0,Wii,"Super Mario Galaxy 2, the sequel to the galaxy..."
3,Metroid Prime,97.0,89.0,GameCube,Samus returns in a new mission to unravel the ...
4,Super Mario Odyssey,97.0,89.0,Switch,New Evolution of Mario Sandbox-Style Gameplay....
5,Halo: Combat Evolved,97.0,87.0,Xbox,"Enter the mysterious world of Halo, an alien p..."
6,The House in Fata Morgana - Dreams of the Reve...,97.0,83.0,Switch,A gothic suspense tale set in a cursed mansion...
4263,10 Second Ninja X,96.8,81.4,"['Xbox One', 'PlayStation 3', 'Xbox 360', 'Pla...",Los Santos is a sprawling sun-soaked metropoli...
4265,11-11: Memories Retold,96.5,85.0,"['Switch', 'Wii U']",Forget everything you know about The Legend of...
9,Tekken 3,96.0,91.0,PlayStation,"An ancient evil force has reawakened, attackin..."


In [9]:
# Print plot overviews of the first 5 movies.
pd.options.display.max_colwidth = 100
metadata['description'].head()

0    As a young boy, Link is tricked by Ganondorf, the King of the Gerudo Thieves. The evil human use...
1    [Metacritic's 2007 Wii Game of the Year] The ultimate Nintendo hero is taking the ultimate step ...
2    Super Mario Galaxy 2, the sequel to the galaxy-hopping original game, includes the gravity-defyi...
3    Samus returns in a new mission to unravel the mystery behind the ruined walls scattered across T...
4    New Evolution of Mario Sandbox-Style Gameplay. Mario embarks on a new journey through unknown wo...
Name: description, dtype: object

In [10]:
# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
metadata['description'] = metadata['description'].fillna('')

# Construct the required TF_IDF matrix by fitting and transformating the data
tfidf_matrix = tfidf.fit_transform(metadata['description'])

# Output the shape of tfidf_matrix
tfidf_matrix.shape

(8831, 32332)

In [11]:
# Array mapping from feature integer indices to feature name.
tfidf.get_feature_names_out()[2500:2505]

array(['avenue', 'avenues', 'average', 'avernus', 'averse'], dtype=object)

In [12]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
tfidf_matrix.shape

(8831, 32332)

In [14]:
# Construct a reverse map of indices and games titles
indices = pd.Series(metadata.index, index=metadata['name_game'])

In [15]:
indices[:10]

name_game
The Legend of Zelda: Ocarina of Time                             0
Super Mario Galaxy                                               1
Super Mario Galaxy 2                                             2
Metroid Prime                                                    3
Super Mario Odyssey                                              4
Halo: Combat Evolved                                             5
The House in Fata Morgana - Dreams of the Revenants Edition -    6
NFL 2K1                                                          7
Uncharted 2: Among Thieves                                       8
Tekken 3                                                         9
dtype: int64

In [16]:
# Function that takes in game title as input and outputs most similar games

def get_recommendations(name_game, cosine_sim=cosine_sim):
    # Get the index of the movie thath matches the title 
    
    idx = indices[title]
    
    # Get the pairwise similarity scores of all movies with that movie
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies
    
    sim_scores = sim_scores[1:11]
    
    # Get the movie indices
    
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    
    return metadata['name_game'].iloc[movie_indices]

In [27]:
def create_soup(x):
    return ' '.join(x['platform']) + ' ' + ' '.join(x['description'])

In [28]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        # Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ",""))
        else:
            return ''

In [35]:
from ast import literal_eval

features = ['description', 'platform']

for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

SyntaxError: unexpected EOF while parsing (<unknown>, line 1)

In [37]:
# Apply clean_data functions to your features.
features = ['description']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

In [38]:
# Create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)

In [40]:
metadata['soup']

0       n i n t e n d o 6 4 a s a y o u n g b o y , l i n k i s t r i c k e d b y g a n o n d o r f , t ...
1       w i i [ m e t a c r i t i c ' s 2 0 0 7 w i i g a m e o f t h e y e a r ] t h e u l t i m a t e ...
2       w i i s u p e r m a r i o g a l a x y 2 , t h e s e q u e l t o t h e g a l a x y - h o p p i n ...
3       g a m e c u b e s a m u s r e t u r n s i n a n e w m i s s i o n t o u n r a v e l t h e m y s ...
4       s w i t c h n e w e v o l u t i o n o f m a r i o s a n d b o x - s t y l e g a m e p l a y . m ...
                                                       ...                                                 
8826    [ ' x b o x ' , ' x b o x ' ] y o u r f a v o r i t e c h a r a c t e r s f r o m t h e m o v i ...
8827    [ ' p l a y s t a t i o n 3 ' , ' p l a y s t a t i o n 3 ' , ' x b o x 3 6 0 ' ] d u e f o r r ...
8828    [ ' x b o x 3 6 0 ' , ' p c ' , ' p l a y s t a t i o n 3 ' ] t h e l e i s u r e s u i t l a r ...
8829    [ ' x b o x 3 6 0 ' 

In [41]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])

ValueError: empty vocabulary; perhaps the documents only contain stop words