In [2]:
import pandas as pd
import re

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
file_path = '/content/drive/MyDrive/Project/Video_Games_5.json'

In [None]:
df = pd.read_json(file_path, lines=True)
print(df.head())

   overall  verified   reviewTime      reviewerID        asin  \
0        5      True  10 17, 2015  A1HP7NVNPFMA4N  0700026657   
1        4     False  07 27, 2015  A1JGAP0185YJI6  0700026657   
2        3      True  02 23, 2015  A1YJWEXHQBWK2B  0700026657   
3        2      True  02 20, 2015  A2204E1TH211HT  0700026657   
4        5      True  12 25, 2014  A2RF5B5H74JLPE  0700026657   

        reviewerName                                         reviewText  \
0        Ambrosia075  This game is a bit hard to get the hang of, bu...   
1             travis  I played it a while but it was alright. The st...   
2  Vincent G. Mezera                                           ok game.   
3         Grandma KR  found the game a bit too complicated, not what...   
4                jon  great game, I love it and have played it since...   

                                       summary  unixReviewTime vote style  \
0                  but when you do it's great.      1445040000  NaN   NaN   
1  B

**Data Preprocessing**

In [None]:
df['reviewTime'] = pd.to_datetime(df['reviewTime'])

In [None]:
df = df.dropna(subset=['reviewerID', 'asin', 'overall', 'reviewText'])

In [None]:
user_counts = df['reviewerID'].value_counts()
df = df[df['reviewerID'].isin(user_counts[user_counts >= 5].index)]

game_counts = df['asin'].value_counts()
df = df[df['asin'].isin(game_counts[game_counts >= 10].index)]

In [None]:
print(df['overall'].value_counts())

overall
5    275806
4     84598
3     43986
1     27440
2     21411
Name: count, dtype: int64


In [None]:
df = df.drop_duplicates(subset=['reviewerID', 'asin'])

In [None]:
def clean_text(text):
    if isinstance(text, str):
        # Lowercase
        text = text.lower()
        # Remove punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)
        return text
    else:
        return ''

In [None]:
df['reviewText'] = df['reviewText'].apply(clean_text)
df['summary'] = df['summary'].apply(clean_text)

In [None]:
df = df.drop(columns=['vote', 'style', 'image'], errors='ignore')

In [None]:
df['review_year'] = df['reviewTime'].dt.year
df['review_month'] = df['reviewTime'].dt.month

In [None]:
df['review_length'] = df['reviewText'].apply(len)
df['summary_length'] = df['summary'].apply(len)

In [None]:
df.to_csv('/content/drive/MyDrive/Project/cleaned_amazon_videogames_reviews.csv', index=False)

In [None]:
print(df.head())

   overall  verified reviewTime      reviewerID        asin  \
0        5      True 2015-10-17  A1HP7NVNPFMA4N  0700026657   
1        4     False 2015-07-27  A1JGAP0185YJI6  0700026657   
2        3      True 2015-02-23  A1YJWEXHQBWK2B  0700026657   
3        2      True 2015-02-20  A2204E1TH211HT  0700026657   
4        5      True 2014-12-25  A2RF5B5H74JLPE  0700026657   

        reviewerName                                         reviewText  \
0        Ambrosia075  this game is a bit hard to get the hang of but...   
1             travis  i played it a while but it was alright the ste...   
2  Vincent G. Mezera                                            ok game   
3         Grandma KR  found the game a bit too complicated not what ...   
4                jon  great game i love it and have played it since ...   

                                      summary  unixReviewTime  review_year  \
0                   but when you do its great      1445040000         2015   
1  but in spit

In [None]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 429544 entries, 0 to 497575
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   overall         429544 non-null  int64         
 1   verified        429544 non-null  bool          
 2   reviewTime      429544 non-null  datetime64[ns]
 3   reviewerID      429544 non-null  object        
 4   asin            429544 non-null  object        
 5   reviewerName    429475 non-null  object        
 6   reviewText      429544 non-null  object        
 7   summary         429544 non-null  object        
 8   unixReviewTime  429544 non-null  int64         
 9   review_year     429544 non-null  int32         
 10  review_month    429544 non-null  int32         
 11  review_length   429544 non-null  int64         
 12  summary_length  429544 non-null  int64         
dtypes: bool(1), datetime64[ns](1), int32(2), int64(4), object(5)
memory usage: 39.7+ MB
None


In [4]:
clean_path = '/content/drive/MyDrive/Project/cleaned_amazon_videogames_reviews.csv'
df = pd.read_csv(clean_path, parse_dates=['reviewTime'])
print(df.head())

   overall  verified reviewTime      reviewerID        asin  \
0        5      True 2015-10-17  A1HP7NVNPFMA4N  0700026657   
1        4     False 2015-07-27  A1JGAP0185YJI6  0700026657   
2        3      True 2015-02-23  A1YJWEXHQBWK2B  0700026657   
3        2      True 2015-02-20  A2204E1TH211HT  0700026657   
4        5      True 2014-12-25  A2RF5B5H74JLPE  0700026657   

        reviewerName                                         reviewText  \
0        Ambrosia075  this game is a bit hard to get the hang of but...   
1             travis  i played it a while but it was alright the ste...   
2  Vincent G. Mezera                                            ok game   
3         Grandma KR  found the game a bit too complicated not what ...   
4                jon  great game i love it and have played it since ...   

                                      summary  unixReviewTime  review_year  \
0                   but when you do its great      1445040000         2015   
1  but in spit

**Feature Engineering**

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

user_item_matrix = df.pivot_table(index='reviewerID', columns='asin', values='overall')
print(f"User-Item matrix shape: {user_item_matrix.shape}")

User-Item matrix shape: (55173, 10671)


In [6]:
df['reviewText'] = df['reviewText'].fillna('')

In [7]:
game_reviews = df.groupby('asin')['reviewText'].apply(lambda x: " ".join(x)).reset_index()

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(game_reviews['reviewText'])

print(f"TF-IDF matrix shape (games x features): {tfidf_matrix.shape}")

TF-IDF matrix shape (games x features): (10671, 5000)


In [9]:
metadata_features = df[['asin', 'review_year', 'review_month', 'review_length', 'summary_length']].drop_duplicates(subset='asin').set_index('asin')

print("Sample metadata features for games:")
print(metadata_features.head())

Sample metadata features for games:
            review_year  review_month  review_length  summary_length
asin                                                                
0700026657         2015            10             68              25
0700099867         2011             8            414              42
0804161380         2017             2            205               8
6050036071         2009            11            560              27
7293000936         2014             7             18              10


**Collaborative Filtering - Matrix Factorization**

In [11]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2469549 sha256=8cd01701acadd8e267e2d75696b2a5b3b8bdb6c6f53a5db573d91410c9da05b2
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163

In [None]:
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

In [10]:
!pip install 'numpy<2'



In [10]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['reviewerID', 'asin', 'overall']], reader)

In [11]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [12]:
algo = SVD(random_state=42)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x78adbd5434d0>

In [13]:
predictions = algo.test(testset)
print("RMSE:", accuracy.rmse(predictions))

RMSE: 1.0378
RMSE: 1.0377639166594441


In [14]:
def get_svd_recommendations(user_id, n=5):
    user_games = set(df[df['reviewerID']==user_id]['asin'])
    all_games = set(df['asin'].unique())
    games_to_predict = list(all_games - user_games)
    pred = [ (game, algo.predict(user_id, game).est) for game in games_to_predict ]
    pred_sorted = sorted(pred, key=lambda x: x[1], reverse=True)[:n]
    return [x[0] for x in pred_sorted]

In [15]:
user_id = df['reviewerID'].iloc[0]
print("SVD Recommendations for user:", get_svd_recommendations(user_id))

SVD Recommendations for user: ['B0144K8KQW', 'B00GGUPCFQ', 'B00005UK88', 'B00BAWXD88', 'B0014WJ78E']


**Content-Based Filtering - TF-IDF**

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [17]:
asin_to_index = pd.Series(game_reviews.index, index=game_reviews['asin'])

In [18]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [19]:
def get_similar_games(asin, n=5):
    idx = asin_to_index[asin]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_games_idx = [i[0] for i in sim_scores[1:n+1]]
    return list(game_reviews.loc[top_games_idx, 'asin'])

In [20]:
sample_asin = game_reviews['asin'].iloc[0]
print("Content-based similar games:", get_similar_games(sample_asin))

Content-based similar games: ['B001TOQ8R0', 'B001E2I4H4', 'B00BGAA29M', 'B0001X5YN4', 'B00DB84XQK']


**Content-Based Filtering - Cold Start**

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
meta = metadata_features.fillna(metadata_features.mean())
scaler = StandardScaler()
meta_scaled = scaler.fit_transform(meta)

In [23]:
meta_cos_sim = cosine_similarity(meta_scaled, meta_scaled)
meta_index = pd.Series(meta.index, index=range(len(meta.index)))

In [24]:
def recommend_by_metadata(asin, n=5):
    idx = meta.index.get_loc(asin)
    scores = list(enumerate(meta_cos_sim[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    top_idx = [i[0] for i in scores[1:n+1]]
    return list(meta_index[top_idx])

In [25]:
print("Metadata-based similar games:", recommend_by_metadata(sample_asin))

Metadata-based similar games: ['B005LN5UOC', 'B0090PX7KQ', 'B00BBU8VFY', 'B00F6YISIQ', 'B01DJPA9YE']


**Hybrid Approach**

In [26]:
def hybrid_recommend(user_id, n=5, alpha=0.5):
    """alpha: weight for collaborative, (1-alpha) for content-based"""
    user_games = set(df[df['reviewerID']==user_id]['asin'])
    all_games = set(df['asin'].unique())
    candidate_games = list(all_games - user_games)

    collab_scores = {g: algo.predict(user_id, g).est for g in candidate_games}

    played_idx = [asin_to_index[g] for g in user_games if g in asin_to_index]
    content_scores = {}
    for g in candidate_games:
        if g in asin_to_index and played_idx:
            idx = asin_to_index[g]
            sim = np.mean([cosine_sim[idx][pi] for pi in played_idx])
            content_scores[g] = sim
        else:
            content_scores[g] = 0

    hybrid_scores = {g: alpha*collab_scores.get(g,0) + (1-alpha)*content_scores.get(g,0) for g in candidate_games}
    top_games = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)[:n]
    return [g for g,s in top_games]

In [27]:
print("Hybrid recommendations:", hybrid_recommend(user_id, n=5, alpha=0.7))

Hybrid recommendations: ['B005T5OBWY', 'B00BAWXCP2', 'B00KVSQAGO', 'B00000F1GM', 'B017AGIDT6']


**Performance Metrics**

In [28]:
from surprise import accuracy

rmse = accuracy.rmse(predictions)
print(f'RMSE: {rmse:.4f}')

RMSE: 1.0378
RMSE: 1.0378


In [29]:
from collections import defaultdict
import numpy as np

def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

top_n_pred = get_top_n(predictions, n=10)

ground_truth = defaultdict(set)
for uid, iid, true_r, est, _ in predictions:
    if true_r >= 4.0:
        ground_truth[uid].add(iid)

def precision_recall_at_k(top_n_pred, ground_truth, k=5):
    precisions = []
    recalls = []
    for uid, pred_ratings in top_n_pred.items():
        pred_items = [iid for iid, est in pred_ratings[:k]]
        true_items = ground_truth.get(uid, set())
        if not true_items:
            continue
        n_relevant = len(set(pred_items) & true_items)
        precisions.append(n_relevant / k)
        recalls.append(n_relevant / len(true_items))
    return np.mean(precisions), np.mean(recalls)

precision_5, _ = precision_recall_at_k(top_n_pred, ground_truth, k=5)
_, recall_10 = precision_recall_at_k(top_n_pred, ground_truth, k=10)

print(f'Precision@5: {precision_5:.4f}')
print(f'Recall@10: {recall_10:.4f}')

Precision@5: 0.3572
Recall@10: 0.9977
