In [1]:
from ast import literal_eval

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
pd.options.mode.chained_assignment = None

## Import and clean data

In [3]:
# Per-game feature table (developer, genre, type, rating).
features = pd.read_csv('datasets/comeon.csv')

# Main game table; the 'platform' column is parsed from its Python-literal
# string form while the file is being read.
metadata = pd.read_csv(
    'datasets/final_dataset.csv',
    converters={'platform': literal_eval},
)

In [4]:
features.head(100)

Unnamed: 0,developer,genre,type,rating
0,Nintendo,"['Action Adventure', 'Fantasy']",singleplayer,E
1,Nintendo,"['Action', 'Platformer', '3D']",singleplayer,E
2,Nintendo EAD Tokyo,"['Action', 'Platformer', '3D']",singleplayer,E
3,Retro Studios,"['Action', 'Shooter', 'First-Person', 'Sci-Fi']",singleplayer,T
4,Nintendo,"['Action', 'Platformer', '3D']",singleplayer,E10+
...,...,...,...,...
95,Bandai Namco Games,"['Action', 'Fighting', '3D', '2D']",multiplayer,E10+
96,TOSE,"['Role-Playing', 'Console-style RPG', 'Japanes...",singleplayer,E10+
97,Monolith Soft,"['Role-Playing', 'Action RPG', 'Console-style ...",singleplayer,T
98,Rare Ltd.,"['Action', 'Platformer', '3D']",,M


In [5]:
# Attach the feature columns to the game table. DataFrame.join aligns rows on
# the index, so this assumes both CSVs list the games in the same row order
# -- TODO confirm.
# Only columns that metadata does not have yet are joined: re-running this cell
# is then a no-op instead of raising "columns overlap but no suffix specified".
new_feature_columns = [col for col in features.columns if col not in metadata.columns]
metadata = metadata.join(features[new_feature_columns])

In [6]:
len(metadata)

8831

In [7]:
# Keep only the games that have a genre value.
metadata = metadata[metadata['genre'].notna()]

In [8]:
len(metadata)

8827

In [None]:
# Turn each genre cell from its string form (e.g. "['Action', 'Platformer']")
# into a real Python list. Values that are not strings are passed through
# unchanged, so running this cell a second time does not raise ValueError from
# literal_eval being handed an already-parsed list.
metadata['genre'] = metadata['genre'].apply(
    lambda value: literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# Order the games from highest to lowest meta_score, then renumber the rows
# 0..n-1 so the index matches each row's position.
metadata = metadata.sort_values(by='meta_score', ascending=False)
metadata = metadata.reset_index(drop=True)

## Start recommender

In [None]:
# Show up to 100 characters per cell so the descriptions are readable.
pd.set_option('display.max_colwidth', 100)
metadata['description'].head()

In [None]:
# Missing descriptions become empty strings so the vectorizer only sees text.
metadata['description'] = metadata['description'].fillna('')

# TF-IDF vectorizer that ignores English stop words such as 'the' and 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Fit the vocabulary and transform the descriptions into the TF-IDF matrix.
tfidf_matrix = tfidf.fit_transform(metadata['description'])

# Display the matrix dimensions (one row per description).
tfidf_matrix.shape

In [None]:
# Array mapping from feature integer indices to feature name.
tfidf.get_feature_names_out()[5000:5010]

In [None]:
# Pairwise similarity of every description with every other description.
# TfidfVectorizer L2-normalises its rows by default, so the plain dot product
# computed by linear_kernel is the cosine similarity. With a single argument
# the kernel is evaluated between the matrix and itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [None]:
tfidf_matrix.shape

In [None]:
# Reverse lookup: game title -> row label in metadata.
indices = pd.Series(
    data=metadata.index,
    index=metadata['name_game'],
)

In [None]:
indices[:10]

In [None]:
# Function that takes in game title as input and outputs most similar games

def get_recommendations(name_game, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the games most similar to ``name_game``.

    Parameters
    ----------
    name_game : str
        Title to look up in the notebook-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose rows/columns follow the row positions
        of ``metadata``. The default is the ``cosine_sim`` object that existed
        when this function was defined (default arguments are bound once).
    top_n : int, optional
        How many recommendations to return. Defaults to 10.

    Returns
    -------
    pandas.Series
        ``name_game`` values of the ``top_n`` highest-scoring other games,
        best match first.

    Raises
    ------
    KeyError
        If ``name_game`` is not present in ``indices``.
    """
    idx = indices[name_game]

    # A title that occurs more than once makes the lookup return a Series of
    # row numbers; use the first occurrence so the scoring below still works.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every other game with its similarity to the query game. The query
    # row is removed by position rather than by assuming it sorts first: when
    # several games tie with it (e.g. identical feature text, score 1.0) it is
    # not guaranteed to be at rank 0.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    game_indices = [position for position, _ in sim_scores[:top_n]]

    return metadata['name_game'].iloc[game_indices]

In [None]:
metadata.head()

In [None]:
metadata['developer']

In [None]:
def create_soup(x):
    """Build the space-separated feature string ("soup") for one game row.

    The string is made of, in order: the lower-cased platform names, the
    developer name with all whitespace removed (so it stays one token), the
    genres, the play type and the rating.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide 'platform' and 'genre' as iterables of strings and
        'developer', 'type' and 'rating' as scalars, any of which may be
        missing (None / NaN).

    Returns
    -------
    str
        The non-empty parts joined by single spaces.
    """
    def _text(value):
        # A missing value contributes nothing; str(NaN) would otherwise add
        # the literal token 'nan' and make unrelated games look alike.
        return '' if pd.isna(value) else str(value)

    parts = [
        ' '.join(x['platform']).lower(),
        ''.join(_text(x['developer']).split()),
        ' '.join(x['genre']),
        _text(x['type']),
        _text(x['rating']),
    ]
    return ' '.join(part for part in parts if part)

In [None]:
# Add the 'soup' column: one combined feature string per game, built row by row.
metadata = metadata.assign(soup=metadata.apply(create_soup, axis=1))

In [None]:
metadata[['soup']].head()

In [None]:
# Build the token-count matrix from the soup strings (CountVectorizer is
# imported in the notebook's top import cell).
# The default token_pattern only keeps tokens of two or more characters, which
# silently discards the one-letter ratings 'E', 'T' and 'M' that create_soup
# puts in the soup. Accepting single-character tokens keeps them as features.
count = CountVectorizer(stop_words='english', token_pattern=r"(?u)\b\w+\b")
count_matrix = count.fit_transform(metadata['soup'])

In [None]:
count_matrix.shape

In [None]:
# Cosine similarity between every pair of soup count vectors. Raw counts are
# not normalised, so cosine_similarity (already imported in the notebook's
# first cell) is used here rather than a plain dot product.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Renumber the rows 0..n-1 and rebuild the title -> row lookup so it matches
# the row order used for the similarity matrices.
# drop=True discards the old index instead of inserting it as an extra 'index'
# column, which also keeps this cell safe to re-run (without it, repeated runs
# add 'index', then 'level_0', then raise ValueError).
metadata = metadata.reset_index(drop=True)
indices = pd.Series(metadata.index, index=metadata['name_game'])

In [None]:
# Recommend games for 'Minecraft' using the metadata-based similarity matrix.
get_recommendations('Minecraft', cosine_sim=cosine_sim2)