In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/steam-store-games/steam_support_info.csv
/kaggle/input/steam-store-games/steam_media_data.csv
/kaggle/input/steam-store-games/steamspy_tag_data.csv
/kaggle/input/steam-store-games/steam_requirements_data.csv
/kaggle/input/steam-store-games/steam_description_data.csv
/kaggle/input/steam-store-games/steam.csv


To make the project work, I need to import necessary libraries.


1.  Tfidf Vectorizer, this library will help us to convert words into vectors which we will be the basis of recommendation.
2.  Cosine similarity, this is a function which finds the cosine distance between two vectors.

scikit-learn provides both the libraries.

In [2]:
# Importing the libraries from sklearn

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

For our recommender system to work, we need two types of data:

1. Data which will provide details about game like genre, publisher, category, price, etc.
2. Second and the last type of data that we need is description of the game

We will use pandas to import our data to variables.

In [3]:
game = pd.read_csv('/kaggle/input/steam-store-games/steam.csv')
description = pd.read_csv('/kaggle/input/steam-store-games/steam_description_data.csv')

In [4]:
game.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


As we can see there are multiple column that will not help in our recommender system, columns being ratings, playtime, owners etc. Reason for not choosing playtime and ratings is that it comes under collaborative filtering, and owners are not defined in terms of number but is defined in terms of range. Therefore, not choosing owners column.

If this dataset has a problem then I have to check for the description dataset.

In [5]:
description.head()

Unnamed: 0,steam_appid,detailed_description,about_the_game,short_description
0,10,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...
1,20,One of the most popular online action games of...,One of the most popular online action games of...,One of the most popular online action games of...
2,30,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...
3,40,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...
4,50,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...


I either want 'detailed_description' or 'short_description'. I will gor detailed_description as it will provide me more data to recommend on.

Now, changing the dataset.

In [6]:
game = game[['appid', 'name', 'english', 'developer', 'publisher', 'platforms', 'categories', 'genres', 'price']]
description = description[['steam_appid', 'detailed_description', 'about_the_game']]

We need to work on a single dataset. So, I will join both of them through merge and fit it in a new variable.

In [7]:
description.rename(columns = {'steam_appid':'appid'}, inplace = True) # had to rename it because merge won't work on different column name
games_df = pd.merge(game, description, on = ['appid'])

In [8]:
games_df.head()

Unnamed: 0,appid,name,english,developer,publisher,platforms,categories,genres,price,detailed_description,about_the_game
0,10,Counter-Strike,1,Valve,Valve,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action,7.19,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...
1,20,Team Fortress Classic,1,Valve,Valve,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action,3.99,One of the most popular online action games of...,One of the most popular online action games of...
2,30,Day of Defeat,1,Valve,Valve,windows;mac;linux,Multi-player;Valve Anti-Cheat enabled,Action,3.99,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...
3,40,Deathmatch Classic,1,Valve,Valve,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action,3.99,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...
4,50,Half-Life: Opposing Force,1,Gearbox Software,Valve,windows;mac;linux,Single-player;Multi-player;Valve Anti-Cheat en...,Action,3.99,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...


Now, both the dataset are merged, I want to drop NaN values.

In [9]:
games_df.dropna(inplace = True)

We need to copy the dataset to new variable because we want to retain appid column and the name column

In [10]:
new_df = games_df.copy()

Price columns are float type data, we need to convert it into a string because of Tfidf Vectorizer

In [11]:
games_df['price'] = games_df['price'].apply(lambda x: str(x))

Due to some problems few data were integers or float, I converted all of them to strings

In [12]:
columns = games_df.columns.tolist()
del columns[0]

In [13]:
for x in columns:
    
    games_df[x] = games_df[x].apply(lambda x: str(x))
    games_df[x] = games_df[x].apply(lambda x:x.split())

After converting them to string objects, we need join all the columns and store them in a new column.
Then we will convert all the list into one single sentence and use 'lower' function to convert them into lower characters.

In [14]:
games_df["conversion"] = games_df['name'] # conversion is our new column and name column will be the first one and others will join on it

for x in range(1, len(columns)):
    
    games_df['conversion'] = games_df['conversion'] + games_df[columns[x]] # this program make them join

In [15]:
games_df['conversion'] = games_df['conversion'].apply(lambda x: ' '.join(x))
games_df['conversion'] = games_df['conversion'].apply(lambda x: x.lower())

I will use TfidfVectorizer, with max features (maximum amount of unique words) being 5000 and stop words(words like 'to', 'and', 'I' to be removed) parameter to 'english'

In [16]:
vectorizer = TfidfVectorizer(max_features = 5000, stop_words = 'english')
vectorizer = vectorizer.fit_transform(games_df['conversion'])

Cosine similarity computes the cosine distance between two vectors. It will compute cosine distance of all vectors between each other.

In [17]:
similarity = cosine_similarity(vectorizer)

Creating a function that will recommend top 10 games on the basis of names

1. it will get the index of the game
2. it will find the similarity array
3. then it will sort the array in decreasing order
4. finally it will display one by one

In [18]:
def recommend(game_name):
    
    game_index = new_df[new_df['name'] == game_name].index[0]
    distance = similarity[game_index]
    games_list = sorted(list(enumerate(distance)), reverse = True, key = lambda x: x[1])[1:10]
    
    for x in games_list:
        print(new_df['name'].iloc[x[0]])

In [19]:
recommend('Counter-Strike')

Team Fortress Classic
Half-Life
Ricochet
Day of Defeat: Source
Deathmatch Classic
Counter-Strike: Condition Zero
Counter-Strike: Global Offensive
Counter-Strike: Source
Insurgency
