In [52]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [53]:
games = pd.read_csv('steam.csv')

In [54]:
games.shape # Check the shape of the data

(65111, 9)

In [55]:
games = games[['App ID', 'Title', 'Reviews Total', 'Reviews Score Fancy', 'Release Date', 'Launch Price','Tags', 'Modified Tags', 'Steam Page']]

In [56]:
games.isnull().sum() # Check for missing values

App ID                 0
Title                  0
Reviews Total          0
Reviews Score Fancy    0
Release Date           0
Launch Price           0
Tags                   0
Modified Tags          0
Steam Page             0
dtype: int64

In [57]:
games.dropna(inplace=True)

In [58]:
games['Tags'] = games['Modified Tags'] + ' ' + games['Tags']
games['Tags'] = games['Tags'].astype(str)

In [59]:
games.head() # Check the first few rows

Unnamed: 0,App ID,Title,Reviews Total,Reviews Score Fancy,Release Date,Launch Price,Tags,Modified Tags,Steam Page
0,10,Counter-Strike,137421,97%,01-11-2000,"$9,99","Action_, FPS_, Multiplayer_, Shooter_, Classic...","Action_, FPS_, Multiplayer_, Shooter_, Classic...",https://store.steampowered.com/app/10
1,20,Team Fortress Classic,5475,85%,01-04-1999,"$4,99","Action_, FPS_, Multiplayer_, Classic_, Hero Sh...","Action_, FPS_, Multiplayer_, Classic_, Hero Sh...",https://store.steampowered.com/app/20
2,30,Day of Defeat,3692,87%,01-05-2003,"$4,99","FPS_, World War II_, Multiplayer_, Shooter_, A...","FPS_, World War II_, Multiplayer_, Shooter_, A...",https://store.steampowered.com/app/30
3,40,Deathmatch Classic,1923,80%,01-06-2001,"$4,99","Action_, FPS_, Classic_, Multiplayer_, Shooter...","Action_, FPS_, Classic_, Multiplayer_, Shooter...",https://store.steampowered.com/app/40
4,50,Half-Life: Opposing Force,15498,95%,01-11-1999,"$4,99","FPS_, Action_, Classic_, Sci fi_, Singleplayer...","FPS_, Action_, Classic_, Sci fi_, Singleplayer...",https://store.steampowered.com/app/50


In [60]:
games.shape # Check the shape of the data

(65111, 9)

In [61]:
new_df = games[['Title', 'Reviews Score Fancy', 'Tags', 'Steam Page']]

In [62]:
ps = PorterStemmer()

In [63]:
new_df.head() # Check the first few rows

Unnamed: 0,Title,Reviews Score Fancy,Tags,Steam Page
0,Counter-Strike,97%,"Action_, FPS_, Multiplayer_, Shooter_, Classic...",https://store.steampowered.com/app/10
1,Team Fortress Classic,85%,"Action_, FPS_, Multiplayer_, Classic_, Hero Sh...",https://store.steampowered.com/app/20
2,Day of Defeat,87%,"FPS_, World War II_, Multiplayer_, Shooter_, A...",https://store.steampowered.com/app/30
3,Deathmatch Classic,80%,"Action_, FPS_, Classic_, Multiplayer_, Shooter...",https://store.steampowered.com/app/40
4,Half-Life: Opposing Force,95%,"FPS_, Action_, Classic_, Sci fi_, Singleplayer...",https://store.steampowered.com/app/50


In [64]:
new_df.shape # Check the shape of the data

(65111, 4)

In [65]:
def stem(text): 
    y = [] 
    for i in text.split(): 
        y.append(ps.stem(i)) 
    return ' '.join(y)

In [66]:
new_df['Tags'] = new_df['Tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Tags'] = new_df['Tags'].apply(stem)


In [67]:
cv = CountVectorizer(max_features=8000, stop_words='english')

In [68]:
vectors = cv.fit_transform(new_df['Tags']).toarray()

In [69]:
import faiss

# Convert to float32 as faiss expects this data type
vectors = vectors.astype('float32')

# Initialize the index
index = faiss.IndexFlatL2(vectors.shape[1])  # L2 similarity index

# Add the vectors to the index
index.add(vectors)

# Search for the top-10 nearest neighbors
k = 10  # You can adjust this based on your needs
D, I = index.search(vectors, k)  # D is the distances, I is the indices


In [70]:
def recommend(name):
    # Check if the game exists in the DataFrame
    if name not in new_df['Title'].values:
        return f"Game '{name}' not found in the dataset."
    
    # Get the index of the game
    game_index = new_df[new_df['Title'] == name].index[0]
    
    # Get the vector for the selected game
    query_vector = vectors[game_index].reshape(1, -1)
    
    # Search the faiss index for the top k nearest neighbors
    D, I = index.search(query_vector, k)  # D = distances, I = indices
    
    # Exclude the first match since it's the game itself
    similar_games_indices = I[0][1:]  # Exclude the first match (itself)
    
    # Get the titles of the similar games
    recommended_games = new_df.iloc[similar_games_indices]['Title'].values.tolist()
    
    return recommended_games


In [71]:
new_df.head()
game_name = new_df[new_df['Title'] == 'Grand Theft Auto V']['Title'].values[0]
print(game_name)

Grand Theft Auto V


In [72]:
new_df.head()

Unnamed: 0,Title,Reviews Score Fancy,Tags,Steam Page
0,Counter-Strike,97%,"action_, fps_, multiplayer_, shooter_, classic...",https://store.steampowered.com/app/10
1,Team Fortress Classic,85%,"action_, fps_, multiplayer_, classic_, hero sh...",https://store.steampowered.com/app/20
2,Day of Defeat,87%,"fps_, world war ii_, multiplayer_, shooter_, a...",https://store.steampowered.com/app/30
3,Deathmatch Classic,80%,"action_, fps_, classic_, multiplayer_, shooter...",https://store.steampowered.com/app/40
4,Half-Life: Opposing Force,95%,"fps_, action_, classic_, sci fi_, singleplayer...",https://store.steampowered.com/app/50


In [73]:
recommend('Grand Theft Auto V') # Recommend games similar to Grand Theft Auto V

['Grand Theft Auto: San Andreas',
 'Grand Theft Auto: Vice City',
 'Driver® Parallel Lines',
 'Just Cause™ 3',
 'Grand Theft Auto III',
 'Bionic Commando',
 'Grand Theft Auto IV: The Complete Edition',
 'Mafia II (Classic)',
 'Sleeping Dogs: Definitive Edition']