In [1]:
import pandas as pd
import ast
import numpy as np

# Load the dataset
games_df = pd.read_csv('games.csv')

# Step 1: Remove Unnecessary Columns
# Drop 'Unnamed: 0' column if not useful
games_df.drop(columns=['Unnamed: 0'], inplace=True)
# Remove duplicate titles
games_df = games_df.drop_duplicates(subset=['Title'], keep='first')

# Reset index after removing duplicates
games_df = games_df.reset_index(drop=True)

# Step 2: Data Type Conversion
# Convert list-like columns from strings to actual lists
games_df['Team'] = games_df['Team'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])
games_df['Genres'] = games_df['Genres'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])

# Convert numerical columns from strings to numeric data types
games_df['Times Listed'] = games_df['Times Listed'].str.replace('K', '000', regex=False).str.replace('.', '', regex=False).astype(int)
games_df['Number of Reviews'] = games_df['Number of Reviews'].str.replace('K', '000', regex=False).str.replace('.', '', regex=False).astype(int)
games_df['Plays'] = games_df['Plays'].str.replace('K', '000', regex=False).str.replace('.', '', regex=False).astype(int)
games_df['Playing'] = games_df['Playing'].str.replace('K', '000', regex=False).str.replace('.', '', regex=False).astype(int)
games_df['Backlogs'] = games_df['Backlogs'].str.replace('K', '000', regex=False).str.replace('.', '', regex=False).astype(int)
games_df['Wishlist'] = games_df['Wishlist'].str.replace('K', '000', regex=False).str.replace('.', '', regex=False).astype(int)

games_df['Rating'] = pd.to_numeric(games_df['Rating'], errors='coerce')

# Step 3: Handle Missing Values
# Fill or remove missing values
# For simplicity, we'll fill numerical missing values with the mean and drop rows where essential data is missing
games_df.fillna({'Rating': games_df['Rating'].mean()}, inplace=True)
games_df.dropna(subset=['Title', 'Genres'], inplace=True)

# Step 4: Feature Engineering
# Create binary features for each genre
genres = set([genre for sublist in games_df['Genres'] for genre in sublist])
for genre in genres:
    games_df[f'Genre_{genre}'] = games_df['Genres'].apply(lambda x: 1 if genre in x else 0)

# Step 5: Cleaned Data Overview
# Display the cleaned dataframe
games_df.head()


Unnamed: 0,Title,Release Date,Team,Rating,Times Listed,Number of Reviews,Genres,Summary,Reviews,Plays,...,Genre_Arcade,Genre_Point-and-Click,Genre_Music,Genre_Platform,Genre_Racing,Genre_Pinball,Genre_Real Time Strategy,Genre_Shooter,Genre_Fighting,Genre_Tactical
0,Elden Ring,"Feb 25, 2022","[Bandai Namco Entertainment, FromSoftware]",4.5,39000,39000,"[Adventure, RPG]","Elden Ring is a fantasy, action and open world...","[""The first playthrough of elden ring is one o...",17000,...,0,0,0,0,0,0,0,0,0,0
1,Hades,"Dec 10, 2019",[Supergiant Games],4.3,29000,29000,"[Adventure, Brawler, Indie, RPG]",A rogue-lite hack and slash dungeon crawler in...,['convinced this is a roguelike for people who...,21000,...,0,0,0,0,0,0,0,0,0,0
2,The Legend of Zelda: Breath of the Wild,"Mar 03, 2017","[Nintendo, Nintendo EPD Production Group No. 3]",4.4,43000,43000,"[Adventure, RPG]",The Legend of Zelda: Breath of the Wild is the...,['This game is the game (that is not CS:GO) th...,30000,...,0,0,0,0,0,0,0,0,0,0
3,Undertale,"Sep 15, 2015","[tobyfox, 8-4]",4.2,35000,35000,"[Adventure, Indie, RPG, Turn Based Strategy]","A small child falls into the Underground, wher...",['soundtrack is tied for #1 with nier automata...,28000,...,0,0,0,0,0,0,0,0,0,0
4,Hollow Knight,"Feb 24, 2017",[Team Cherry],4.4,3000,3000,"[Adventure, Indie, Platform]",A 2D metroidvania with an emphasis on close co...,"[""this games worldbuilding is incredible, with...",21000,...,0,0,0,1,0,0,0,0,0,0


In [16]:
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1099 entries, 0 to 1098
Data columns (total 41 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Title                      1099 non-null   object        
 1   Release Date               1096 non-null   datetime64[ns]
 2   Team                       1099 non-null   object        
 3   Rating                     1099 non-null   float64       
 4   Times Listed               1099 non-null   float64       
 5   Number of Reviews          1099 non-null   float64       
 6   Genres                     1099 non-null   object        
 7   Summary                    1098 non-null   object        
 8   Reviews                    1099 non-null   object        
 9   Plays                      1099 non-null   float64       
 10  Playing                    1099 non-null   float64       
 11  Backlogs                   1099 non-null   float64       
 12  Wishli

In [2]:
def min_max_normalize(series):
    min_val = series.min()
    max_val = series.max()
    return (series - min_val) / (max_val - min_val)

# Create aggregate metrics
games_df['Engagement_Score'] = (
    games_df['Times Listed'] + 
    games_df['Number of Reviews'] + 
    games_df['Plays'] + 
    games_df['Playing'] + 
    games_df['Backlogs'] + 
    games_df['Wishlist']
) / 6

# Normalize numerical features
numerical_features = ['Rating', 'Times Listed', 'Number of Reviews', 'Plays', 
                     'Playing', 'Backlogs', 'Wishlist', 'Engagement_Score']

for feature in numerical_features:
    games_df[feature] = min_max_normalize(games_df[feature])

In [3]:
import numpy as np

def cosine_similarity_manual(matrix):
    # Compute the dot product
    dot_product = np.dot(matrix, matrix.T)
    
    # Compute the norms
    norms = np.sqrt(np.sum(matrix**2, axis=1))
    
    # Compute similarity
    similarity = dot_product / np.outer(norms, norms)
    
    # Handle division by zero
    similarity = np.nan_to_num(similarity, 0)
    
    return similarity

# Create feature matrix using genre binary columns and normalized numerical features
feature_cols = [col for col in games_df.columns if col.startswith('Genre_')] + numerical_features
feature_matrix = games_df[feature_cols].values

# Calculate similarity matrix
similarity_matrix = cosine_similarity_manual(feature_matrix)

In [4]:
def get_recommendations(game_title, n_recommendations=5):
    try:
        # Find the index of the game
        idx = games_df[games_df['Title'] == game_title].index[0]
        
        # Get similarity scores
        sim_scores = [(i, similarity_matrix[idx][i]) for i in range(len(games_df))]
        
        # Sort games based on similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        # Get top N most similar games (excluding the input game)
        sim_scores = sim_scores[1:n_recommendations+1]
        
        # Get game indices
        game_indices = [i[0] for i in sim_scores]
        similarity_scores = [i[1] for i in sim_scores]
        
        # Create recommendations dataframe
        recommendations = games_df.iloc[game_indices][['Title', 'Genres', 'Rating']]
        recommendations['Similarity_Score'] = similarity_scores
        
        return recommendations
        
    except IndexError:
        return "Game not found! Please check the spelling."

In [5]:
def create_user_engagement_matrix():
    # Create engagement matrix using reviews and other interaction data
    engagement_features = ['Playing', 'Backlogs', 'Wishlist']
    
    # Normalize and combine engagement metrics
    engagement_matrix = games_df[engagement_features].values
    
    # Add review engagement
    def get_review_engagement(reviews):
        if pd.isna(reviews):
            return 0
        try:
            return len(ast.literal_eval(reviews))
        except:
            return 0
    
    games_df['Review_Engagement'] = games_df['Reviews'].apply(get_review_engagement)
    games_df['Review_Engagement'] = min_max_normalize(games_df['Review_Engagement'])
    
    # Add review engagement to matrix
    engagement_matrix = np.column_stack([
        engagement_matrix, 
        games_df['Review_Engagement'].values.reshape(-1, 1)
    ])
    
    return engagement_matrix

def hybrid_recommendations(game_title, n_recommendations=5, content_weight=0.7):
    try:
        # Get game index
        idx = games_df[games_df['Title'] == game_title].index[0]
        
        # Get content-based similarity
        content_scores = similarity_matrix[idx]
        
        # Get collaborative scores
        engagement_matrix = create_user_engagement_matrix()
        collab_similarity = cosine_similarity_manual(engagement_matrix)
        collab_scores = collab_similarity[idx]
        
        # Combine scores
        hybrid_scores = (content_weight * content_scores) + ((1 - content_weight) * collab_scores)
        
        # Get recommendations
        sim_scores = [(i, hybrid_scores[i]) for i in range(len(games_df))]
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:n_recommendations+1]
        
        # Create recommendations dataframe
        game_indices = [i[0] for i in sim_scores]
        similarity_scores = [i[1] for i in sim_scores]
        
        recommendations = games_df.iloc[game_indices][['Title', 'Genres', 'Rating']]
        recommendations['Similarity_Score'] = similarity_scores
        
        return recommendations
        
    except IndexError:
        return "Game not found! Please check the spelling."

In [6]:
# Test with some popular games
test_games = ['Elden Ring', 'Minecraft', 'Hollow Knight']

print("Testing Hybrid Recommendations:")
for game in test_games:
    print(f"\nRecommendations for {game}:")
    recommendations = hybrid_recommendations(game)
    print(recommendations)

Testing Hybrid Recommendations:

Recommendations for Elden Ring:
                                      Title                            Genres  \
2   The Legend of Zelda: Breath of the Wild                  [Adventure, RPG]   
14                               Bloodborne                  [Adventure, RPG]   
17                    Red Dead Redemption 2         [Adventure, RPG, Shooter]   
1                                     Hades  [Adventure, Brawler, Indie, RPG]   
12                               God of War         [Adventure, Brawler, RPG]   

      Rating  Similarity_Score  
2   0.902439          0.952053  
14  0.926829          0.944455  
17  0.902439          0.904022  
1   0.878049          0.897055  
12  0.853659          0.879012  

Recommendations for Minecraft:
                                           Title                  Genres  \
551                Animal Crossing: New Horizons             [Simulator]   
831                               Tomodachi Life  [Adventure, Simu

  similarity = dot_product / np.outer(norms, norms)
  similarity = dot_product / np.outer(norms, norms)
  similarity = dot_product / np.outer(norms, norms)


adding temporal features for better feature engineering

In [10]:
import datetime
from datetime import datetime

# Print sample of dates to verify format
print("Sample Release Dates:", games_df['Release Date'].head())

# Convert Release Date to datetime
games_df['Release Date'] = pd.to_datetime(games_df['Release Date'], format='%b %d, %Y', errors='coerce')

# Get today's date
today = datetime.now()

# Calculate days since release
games_df['Days_Since_Release'] = (today - games_df['Release Date']).dt.days

# Calculate recent popularity (handle potential division by zero)
games_df['Recent_Popularity'] = games_df['Playing'] / (games_df['Days_Since_Release'].clip(lower=1))

# Normalize the Recent_Popularity
games_df['Recent_Popularity'] = (games_df['Recent_Popularity'] - games_df['Recent_Popularity'].min()) / \
                               (games_df['Recent_Popularity'].max() - games_df['Recent_Popularity'].min())

# Handle any NaN values
games_df['Recent_Popularity'] = games_df['Recent_Popularity'].fillna(0)

# Print some statistics to verify the calculations
print("\nTemporal Features Statistics:")
print("\nDays Since Release:")
print(games_df['Days_Since_Release'].describe())
print("\nRecent Popularity:")
print(games_df['Recent_Popularity'].describe())

Sample Release Dates: 0   2022-02-25
1   2019-12-10
2   2017-03-03
3   2015-09-15
4   2017-02-24
Name: Release Date, dtype: datetime64[ns]

Temporal Features Statistics:

Days Since Release:
count     1096.000000
mean      4583.239051
std       3129.493316
min       -136.000000
25%       1994.000000
50%       4012.000000
75%       6573.000000
max      16248.000000
Name: Days_Since_Release, dtype: float64

Recent Popularity:
count    1099.000000
mean        0.007641
std         0.046573
min         0.000000
25%         0.000159
50%         0.000542
75%         0.002062
max         1.000000
Name: Recent_Popularity, dtype: float64


adding reviews and summary for text based similarity

In [11]:
# Add text similarity features
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine Summary and Reviews for text analysis
games_df['Text_Content'] = games_df['Summary'].fillna('') + ' ' + games_df['Reviews'].fillna('')

# Create TF-IDF features
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
text_features = tfidf.fit_transform(games_df['Text_Content'])

In [15]:
# 1. Calculate collaborative filtering similarity matrix
def create_engagement_matrix():
    # Use engagement features for collaborative filtering
    engagement_features = ['Playing', 'Backlogs', 'Wishlist']
    
    # Create and normalize engagement matrix
    engagement_matrix = games_df[engagement_features].values
    
    # Add review engagement
    games_df['Review_Engagement'] = games_df['Reviews'].apply(
        lambda x: len(str(x)) if pd.notna(x) else 0
    )
    games_df['Review_Engagement'] = (
        games_df['Review_Engagement'] - games_df['Review_Engagement'].min()
    ) / (games_df['Review_Engagement'].max() - games_df['Review_Engagement'].min())
    
    # Add review engagement to matrix
    engagement_matrix = np.column_stack([
        engagement_matrix, 
        games_df['Review_Engagement'].values.reshape(-1, 1)
    ])
    
    return engagement_matrix

# Calculate all similarity matrices
engagement_matrix = create_engagement_matrix()
collab_similarity = cosine_similarity(engagement_matrix)

# 2. Create enhanced hybrid recommendations function
def enhanced_hybrid_recommendations(game_title, n_recommendations=5, 
                                  content_weight=0.5, 
                                  text_weight=0.3,
                                  engagement_weight=0.2):
    try:
        # Get game index
        idx = games_df[games_df['Title'] == game_title].index[0]
        
        # Get different similarity scores
        genre_scores = similarity_matrix[idx]  # Original content-based similarity
        text_scores = text_similarity_matrix[idx]  # Text-based similarity
        engagement_scores = collab_similarity[idx]  # Collaborative filtering scores
        
        # Combine scores
        final_scores = (
            content_weight * genre_scores +
            text_weight * text_scores +
            engagement_weight * engagement_scores
        )
        
        # Get recommendations
        sim_scores = list(enumerate(final_scores))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = [s for s in sim_scores if s[0] != idx][:n_recommendations]
        
        game_indices = [i[0] for i in sim_scores]
        similarity_scores = [i[1] for i in sim_scores]
        
        # Create recommendations dataframe
        recommendations = games_df.iloc[game_indices][['Title', 'Genres', 'Rating']]
        recommendations['Similarity_Score'] = similarity_scores
        
        # Add individual similarity scores for transparency
        recommendations['Genre_Similarity'] = genre_scores[game_indices]
        recommendations['Text_Similarity'] = text_scores[game_indices]
        recommendations['Engagement_Similarity'] = engagement_scores[game_indices]
        
        return recommendations
        
    except IndexError:
        return f"Game '{game_title}' not found! Please check the spelling."

# Test the enhanced recommendations
print("\nTesting enhanced hybrid recommendations:")
test_game = "Elden Ring"
recommendations = enhanced_hybrid_recommendations(test_game)
print(f"\nRecommendations for {test_game}:")
print(recommendations)


Testing enhanced hybrid recommendations:

Recommendations for Elden Ring:
                                      Title                            Genres  \
2   The Legend of Zelda: Breath of the Wild                  [Adventure, RPG]   
14                               Bloodborne                  [Adventure, RPG]   
1                                     Hades  [Adventure, Brawler, Indie, RPG]   
51                 The Witcher 3: Wild Hunt                  [Adventure, RPG]   
63                               Dark Souls                  [Adventure, RPG]   

      Rating  Similarity_Score  Genre_Similarity  Text_Similarity  \
2   0.902439          0.704382          0.960515         0.122890   
14  0.926829          0.698502          0.960658         0.145585   
1   0.878049          0.666133          0.860649         0.137647   
51  0.878049          0.657124          0.820392         0.290275   
63  0.878049          0.655351          0.868436         0.260665   

    Engagement_Similari

In [13]:
# Create a function to calculate team similarity between two games
def calculate_team_similarity(team1, team2):
    team1_set = set(team1)
    team2_set = set(team2)
    
    if len(team1_set) == 0 or len(team2_set) == 0:
        return 0
    
    # Jaccard similarity
    return len(team1_set & team2_set) / len(team1_set | team2_set)

# Function to add team similarity for a specific game
def add_team_similarity(games_df, game_title):
    try:
        # Get the index of the reference game
        idx = games_df[games_df['Title'] == game_title].index[0]
        reference_team = games_df.loc[idx, 'Team']
        
        # Calculate similarity for all games
        games_df['Team_Similarity'] = games_df['Team'].apply(
            lambda x: calculate_team_similarity(x, reference_team)
        )
        
        return games_df
        
    except IndexError:
        print(f"Game '{game_title}' not found!")
        return games_df

# Example usage:
# When you want to get recommendations for a specific game:
def get_recommendations_with_team(game_title, n_recommendations=5):
    try:
        # Add team similarity scores
        temp_df = add_team_similarity(games_df.copy(), game_title)
        
        # Get the index of the game
        idx = temp_df[temp_df['Title'] == game_title].index[0]
        
        # Calculate final similarity score (combining existing similarity with team similarity)
        similarity_scores = 0.7 * similarity_matrix[idx] + 0.3 * temp_df['Team_Similarity'].values
        
        # Get recommendations
        sim_scores = list(enumerate(similarity_scores))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = [s for s in sim_scores if s[0] != idx][:n_recommendations]
        
        game_indices = [i[0] for i in sim_scores]
        similarity_scores = [i[1] for i in sim_scores]
        
        # Create recommendations dataframe
        recommendations = temp_df.iloc[game_indices][['Title', 'Genres', 'Rating', 'Team']]
        recommendations['Similarity_Score'] = similarity_scores
        recommendations['Team_Similarity'] = temp_df.iloc[game_indices]['Team_Similarity']
        
        return recommendations
        
    except IndexError:
        return f"Game '{game_title}' not found! Please check the spelling."

# Test the function
print("\nTesting recommendations with team similarity:")
test_game = "Elden Ring"
recommendations = get_recommendations_with_team(test_game)
print(f"\nRecommendations for {test_game}:")
print(recommendations)


Testing recommendations with team similarity:

Recommendations for Elden Ring:
                                       Title            Genres    Rating  \
30                            Dark Souls III  [Adventure, RPG]  0.853659   
63                                Dark Souls  [Adventure, RPG]  0.878049   
121  Dark Souls II: Scholar of the First Sin  [Adventure, RPG]  0.682927   
14                                Bloodborne  [Adventure, RPG]  0.926829   
220                            Dark Souls II  [Adventure, RPG]  0.634146   

                                            Team  Similarity_Score  \
30    [Bandai Namco Entertainment, FromSoftware]          0.932358   
63    [FromSoftware, Bandai Namco Entertainment]          0.907905   
121   [Bandai Namco Entertainment, FromSoftware]          0.782928   
14   [FromSoftware, Sony Computer Entertainment]          0.772461   
220   [FromSoftware, Bandai Namco Entertainment]          0.768705   

     Team_Similarity  
30          1.00000