# Game Finder

### Importing Libraries

In [385]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

### Data Exploration & Preprocessing

In [386]:
#load the games table; 'game.csv' is a relative path, so it is resolved against the notebook's working directory
df = pd.read_csv('game.csv')

In [387]:
#preview the first rows to sanity-check the columns that were loaded
df.head()

Unnamed: 0,Title,User Score,Meta Score,Genre,Age Rating
0,The Legend of Zelda: Ocarina of Time,9.1,99,Open-World Action,Rated E
1,SoulCalibur,8.2,98,3D Fighting,Rated T
2,Grand Theft Auto IV,8.1,98,Open-World Action,Rated M
3,Super Mario Galaxy,9.1,97,3D Platformer,Rated E
4,Super Mario Galaxy 2,9.0,97,3D Platformer,Rated E


In [388]:
#inspect dtypes and non-null counts before any cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 936 entries, 0 to 935
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Title       936 non-null    object
 1   User Score  936 non-null    object
 2   Meta Score  936 non-null    object
 3   Genre       936 non-null    object
 4   Age Rating  845 non-null    object
dtypes: object(5)
memory usage: 36.7+ KB


In [389]:
#count missing values per column
df.isna().sum()

Title          0
User Score     0
Meta Score     0
Genre          0
Age Rating    91
dtype: int64

In [390]:
#only Age Rating has missing values; label them 'None' explicitly so the
#placeholder string can never leak into the score columns, and avoid inplace mutation
df['Age Rating'] = df['Age Rating'].fillna('None')

In [391]:
#re-check non-null counts after filling the missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 936 entries, 0 to 935
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Title       936 non-null    object
 1   User Score  936 non-null    object
 2   Meta Score  936 non-null    object
 3   Genre       936 non-null    object
 4   Age Rating  936 non-null    object
dtypes: object(5)
memory usage: 36.7+ KB


In [392]:
#(rows, columns) of the table at this point
df.shape

(936, 5)

In [393]:
#list the distinct values of every column that will be cleaned or encoded
cols = ['User Score', 'Meta Score', 'Age Rating', 'Genre']
for distinct_values in (df[name].unique() for name in cols):
    print(distinct_values)

['9.1' '8.2' '8.1' '9.0' '8.7' '7.7' '8.4' '8.9' '8.3' '6.1' '8.8' '8.6'
 '7.9' '9.2' '7.0' '4.7' '8.5' '9.3' '8.0' '7.5' '5.8' '7.8' '7.6' '7.3'
 '7.2' '6.9' '7.1' '5.9' '6.5' '6.7' '3.3' '6.4' '9.7' '6.0' '6.2' '2.3'
 '9.5' '9.4' '7.4' '5.7' '6.8' '6.3' '3.1' '5.6' '5.5' '9.8' 'tbd' '5.2'
 '4.9' '3.6' '2.0' '6.6' '5.0' '4.2' '3.8' '4.8' '2.2' '5.1']
['99' '98' '97' '96' '95' '94' '93' '92' '91' '90' '89' '88' 'tbd' '87'
 '86']
['Rated E' 'Rated T' 'Rated M' 'Rated E +10' 'None']
['Open-World Action' '3D Fighting' '3D Platformer' 'Skating' 'FPS'
 'Football Sim' 'Linear Action Adventure' 'Survival' 'Western RPG'
 'Compilation' 'Action RPG' 'Visual Novel' 'Auto Racing Sim'
 'Action Adventure' '2D Platformer' 'JRPG' 'Third Person Shooter'
 '4X Strategy' 'Third-Person Adventure' 'Auto Racing' '2D Fighting'
 'Real-Time Strategy' 'Arcade' 'Management' 'Command RTS' 'Skiing'
 'Soccer Sim' 'Rhythm' 'Basketball Sim' "2D Beat-'Em-Up" 'Roguelike'
 'Future Racing' 'MMORPG' 'Application' 'Hockey S

In [394]:
#titles whose user score is still the 'tbd' placeholder
matching_titles = df.loc[df['User Score'].eq('tbd'), 'Title']
matching_titles

384                                              NHL 2K3
417                        World Soccer Winning Eleven 9
424                            Football Manager Handheld
481                                            Soul Edge
506    Pinball FX 2: Marvel Pinball - Avengers Chroni...
556                            Tiger Woods PGA Tour 2003
587                                 Bomberman Tournament
693                                  Ultimate Card Games
757                        Worldwide Soccer Manager 2007
827                                    MVP Baseball 2005
881                               ESPN College Hoops 2K5
Name: Title, dtype: object

A few games have 'tbd' as their user score. Metacritic only shows a user score once a game has at least 4 user reviews, so it is better to drop these games because they have too few reviews.

In [395]:
#keep only the rows where neither score is the 'tbd' placeholder, then renumber the rows from 0
has_scores = df[['User Score', 'Meta Score']].ne('tbd').all(axis=1)
df = df.loc[has_scores].reset_index(drop=True)

I then divide the meta scores by 10 to make them consistent with the user scores.

In [396]:
#parse the meta scores as numbers and divide by 10 so they sit on the same scale as the user scores
#note: this overwrites the column it reads, so re-running the cell divides by 10 again
meta_numeric = pd.to_numeric(df['Meta Score'])
df['Meta Score'] = meta_numeric.div(10)
df['Meta Score']

0      9.9
1      9.8
2      9.8
3      9.7
4      9.7
      ... 
920    8.6
921    8.6
922    8.6
923    8.6
924    8.6
Name: Meta Score, Length: 925, dtype: float64

### Feature Extraction

In [397]:
#convert the user scores from strings to numbers so they can be averaged;
#pd.to_numeric raises on non-numeric text, so the 'tbd' rows must already be gone
df['User Score'] = pd.to_numeric(df['User Score'])

Next, I calculate the average score of a game by adding the user score and meta score then dividing by 2.

In [398]:
#mean of the user score and the meta score
df['Average Score'] = (df['User Score'] + df['Meta Score'])/2
#Series.round() returns a new Series, so a bare .round(1) call changes nothing;
#no rounding is applied here and the full-precision value is kept as a similarity feature
df['Average Score']

0      9.50
1      9.00
2      8.95
3      9.40
4      9.35
       ... 
920    8.20
921    8.50
922    8.40
923    8.45
924    8.15
Name: Average Score, Length: 925, dtype: float64

In [399]:
#one hot encode the genres and age ratings to transform to numerical values
categorical_cols = ['Genre', 'Age Rating']
one_hot = OneHotEncoder()
#toarray() densifies the sparse matrix returned by the encoder
one_hot_df = one_hot.fit_transform(df[categorical_cols]).toarray()
#reuse df's index so a later column-wise concat aligns row-for-row
#even if df's index is not a clean 0..n-1 range
new_one_hot_df = pd.DataFrame(
    one_hot_df,
    columns=one_hot.get_feature_names_out(categorical_cols),
    index=df.index,
)
new_one_hot_df

Unnamed: 0,Genre_2D Beat-'Em-Up,Genre_2D Fighting,Genre_2D Platformer,Genre_3D Fighting,Genre_3D Platformer,Genre_4X Strategy,Genre_Action,Genre_Action Adventure,Genre_Action Puzzle,Genre_Action RPG,...,Genre_Vertical Shoot-'Em-Up,Genre_Virtual Life,Genre_Visual Novel,Genre_Western RPG,Genre_Wrestling,Age Rating_None,Age Rating_Rated E,Age Rating_Rated E +10,Age Rating_Rated M,Age Rating_Rated T
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
920,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [400]:
#swap the raw categorical columns for their one-hot encoded versions,
#giving a single frame that can be used for cosine similarity
score_columns = df.drop(columns=['Genre', 'Age Rating'])
new_df = pd.concat([score_columns, new_one_hot_df], axis=1)
new_df

Unnamed: 0,Title,User Score,Meta Score,Average Score,Genre_2D Beat-'Em-Up,Genre_2D Fighting,Genre_2D Platformer,Genre_3D Fighting,Genre_3D Platformer,Genre_4X Strategy,...,Genre_Vertical Shoot-'Em-Up,Genre_Virtual Life,Genre_Visual Novel,Genre_Western RPG,Genre_Wrestling,Age Rating_None,Age Rating_Rated E,Age Rating_Rated E +10,Age Rating_Rated M,Age Rating_Rated T
0,The Legend of Zelda: Ocarina of Time,9.1,9.9,9.50,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,SoulCalibur,8.2,9.8,9.00,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Grand Theft Auto IV,8.1,9.8,8.95,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Super Mario Galaxy,9.1,9.7,9.40,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,Super Mario Galaxy 2,9.0,9.7,9.35,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
920,Unreal Tournament 2003,7.8,8.6,8.20,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
921,Burnout 2: Point of Impact,8.4,8.6,8.50,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
922,Daytona USA,8.2,8.6,8.40,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
923,EverQuest: The Ruins of Kunark,8.3,8.6,8.45,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Compute Cosine Similarity

In [401]:
#the title is an identifier, not a feature, so it is left out of the similarity input
#NOTE(review): the score columns are not rescaled and appear to dominate the 0/1
#one-hot columns (every similarity displayed is above 0.98); consider scaling them,
#e.g. with the already-imported MinMaxScaler
feature_matrix = new_df.drop(columns=['Title'])
cosine_sim = cosine_similarity(feature_matrix)
cosine_sim

array([[1.        , 0.99155788, 0.99519314, ..., 0.99568783, 0.99151743,
        0.99132194],
       [0.99155788, 1.        , 0.99182496, ..., 0.98986869, 0.99398388,
        0.99060871],
       [0.99519314, 0.99182496, 1.        , ..., 0.98955519, 0.98931304,
        0.99493077],
       ...,
       [0.99568783, 0.98986869, 0.98955519, ..., 1.        , 0.99068552,
        0.99003842],
       [0.99151743, 0.99398388, 0.98931304, ..., 0.99068552, 1.        ,
        0.98995477],
       [0.99132194, 0.99060871, 0.99493077, ..., 0.99003842, 0.98995477,
        1.        ]])

### Creating a Function to Recommend Games

In [402]:
#using Elden Ring as an example
game_title = "Elden Ring"
def recommendGame(game_title, cosine_sim=None, data=None, top_n=10):
    """Print the titles of the games most similar to ``game_title``.

    Parameters
    ----------
    game_title : str
        Exact title to look up in the ``Title`` column.
    cosine_sim : array-like, optional
        Square pairwise similarity matrix whose rows follow the row order of
        the game table. When omitted, the notebook-level ``cosine_sim`` is
        looked up at call time, so re-running the similarity cell is picked
        up without redefining this function.
    data : pandas.DataFrame, optional
        Game table with a ``Title`` column. Defaults to the notebook-level ``df``.
    top_n : int, optional
        Number of recommendations to print (default 10).

    Returns
    -------
    None
        The result is printed: either the matching titles or a
        "not found in dataset" message.
    """
    games = df if data is None else data
    similarity = globals()['cosine_sim'] if cosine_sim is None else cosine_sim
    titles = games['Title']

    #row positions (not index labels) of the matching title, because the
    #similarity matrix and .iloc are both positional
    matches = np.flatnonzero((titles == game_title).to_numpy())
    #check if game exists in dataset
    if matches.size == 0:
        print(f"{game_title} not found in dataset")
        return
    #if a title appears more than once, the first occurrence is used
    position = int(matches[0])

    #rank every game by similarity, highest first; the stable sort keeps
    #dataset order among ties
    scores = np.asarray(similarity)[position]
    ranked = np.argsort(-scores, kind='stable')
    #remove the queried game explicitly: it is not guaranteed to sort first
    #when another game has an identical (or rounding-equal) feature vector
    ranked = ranked[ranked != position][:top_n]
    print(titles.iloc[ranked])

In [403]:
#run the recommender for the example title set in the cell above; the matches are printed
recommendGame(game_title)

425    NieR: Automata - The End of YoRHa Edition
595               Diablo III: Eternal Collection
271            Diablo III: Ultimate Evil Edition
161                                Demon's Souls
381                                     Fable II
726                  Diablo III: Reaper of Souls
480                               Asgard's Wrath
73                                        Diablo
22                                 Mass Effect 2
900                 Nioh 2: The Complete Edition
Name: Title, dtype: object
