In [3]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import nltk

# Loading the dataset

In [4]:
revs_og = pd.read_csv('kaggle/dataset.csv')
revs_og.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,10,Counter-Strike,Ruined my life.,1,0
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,10,Counter-Strike,This game saved my virginity.,1,0
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1


Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,10,Counter-Strike,Ruined my life.,1,0
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,10,Counter-Strike,This game saved my virginity.,1,0
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1


In [None]:
revs_og.shape

In [None]:
revs_og.info()

# 1. Data Cleaning

**Null Value Checking**

In [None]:
revs_og.isnull().sum()

In [None]:
revs = revs_og[['app_id', 'app_name', 'review_score','review_text']]

In [None]:
revs.isnull().sum()

In [None]:
revs.dropna(inplace = True)

In [None]:
revs.head()

In [None]:
revs['review_score'].unique()

In [None]:
revs.shape

In [None]:
new_df = revs.sample(n = 50000)

In [None]:
new_df.shape

In [None]:
new_df.isnull().sum()

In [None]:
new_df.info()

In [None]:
new_df.reset_index(drop = True, inplace = True)

In [None]:
new_df.head()

**Checking if there is any duplicated row**

In [None]:
# Print the duplicated
# new_df[new_df.duplicated()]
new_df.duplicated().sum()

**Dropping the duplicated rows**

In [None]:
new_df.drop_duplicates(inplace=True)

In [None]:
new_df.duplicated().sum()

In [None]:
new_df.shape

In [None]:
new_df.rename(columns={'review_score':'target','review_text':'text'},inplace=True)

In [None]:
new_df.head()

# 2. EDA

In [None]:
new_df['target'].value_counts()

In [None]:
plt.pie(new_df['target'].value_counts(),labels=['positive','negative'],autopct='%0.2f')
plt.show()

> Data is imbalanced

In [None]:
nltk.download('punkt')

In [None]:
new_df['char_num'] = new_df['text'].apply(len)

In [None]:
new_df.head()

In [None]:
#word nums
new_df['word_num'] = new_df['text'].apply(lambda x: len(nltk.word_tokenize(x)))
new_df.head()

In [None]:
#num of sentences
new_df['sent_num'] = new_df['text'].apply(lambda x: len(nltk.sent_tokenize(x)))
new_df.head()

In [None]:
new_df[['char_num','word_num','sent_num']].describe()

In [None]:
#neg. reviews description
new_df[new_df['target'] == -1][['char_num','word_num','sent_num']].describe()

In [None]:
#pos. reviews description
new_df[new_df['target'] == 1][['char_num','word_num','sent_num']].describe()

In [None]:
import seaborn as sns

In [None]:
sns.histplot(new_df[new_df['target'] == 1]['char_num'])
sns.histplot(new_df[new_df['target'] == -1]['char_num'],color='red')

plt.show()

In [None]:
sns.histplot(new_df[new_df['target'] == 1]['word_num'])
sns.histplot(new_df[new_df['target'] == -1]['word_num'],color='red')

plt.show()

In [None]:
sns.histplot(new_df[new_df['target'] == 1]['sent_num'])
sns.histplot(new_df[new_df['target'] == -1]['sent_num'],color='red')

plt.show()

In [None]:
#2D plots with the, using the target value as the X-axis, while the rest of the DF as the Y-axis
# sns.pairplot(new_df,hue='target')
# plt.show()

In [None]:
# TODO Doesn't work(?) -> convert a String into a Float for whathever reason
# sns.heatmap(new_df.corr(),annot=True)
# plt.show()

# 3. Data Preprocessing
* Lower case
* Tokenization
* Removing special characters
* Removing stop words and punctuation
* stemming

In [None]:
new_df.reset_index(drop = True, inplace = True)

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
string.punctuation

In [None]:
#Self-contained function for pandarallel to work
import nltk
nltk.download('stopwords')

def convert_text(text):
    #Importing libraries to work with pandarallel
    import nltk
    from nltk.corpus import stopwords
    import string
    from nltk.stem.porter import PorterStemmer

    ps = PorterStemmer()

    text = text.lower()
    text = nltk.word_tokenize(text)
    
    x = []
    y =[]
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            x.append(i)
    for i in x:
        y.append(ps.stem(i))
    return ' '.join(y)

In [None]:
new_df['text'][8]

In [None]:
from pandarallel import pandarallel
pandarallel.initialize()
new_df['converted_text']=new_df['text'].parallel_apply(convert_text)
# new_df['converted_text']=new_df['text'].apply(convert_text)

In [None]:
new_df.head()

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width = 500,height=500,min_font_size = 10,background_color = 'white')

In [None]:
pos_wc = wc.generate(new_df[new_df['target']==1]['converted_text'].str.cat(sep=" "))
plt.imshow(pos_wc)
plt.show()

In [None]:
neg_wc = wc.generate(new_df[new_df['target']==-1]['converted_text'].str.cat(sep=" "))
plt.imshow(neg_wc)
plt.show()

# 4. Model Building

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer(max_features = 3000)
tfidf = TfidfVectorizer(max_features=5000)

In [None]:
# X_cv = cv.fit_transform(new_df['converted_text']).toarray()
X_tfid = tfidf.fit_transform(new_df['converted_text']).toarray()
X = X_tfid

In [None]:
X.shape

In [None]:
y = new_df['target'].values

In [None]:
#y

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state = 34)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
X_test.shape

In [None]:
y_test.shape

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, f1_score, recall_score


In [None]:
# gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()
tree = tree.DecisionTreeClassifier()
sgd = SGDClassifier(loss="modified_huber")

In [None]:
#Training the 5 defined models

# gnb.fit(X_train,y_train)
# print("Gaussian Naive Bayes fitted.")
mnb.fit(X_train,y_train)
print("Multinomial Naive Bayes fitted.")
bnb.fit(X_train,y_train)
print("Bernoulli Naive Bayes fitted.")
# tree.fit(X_train, y_train)
print("Decisions tree fitted.")
sgd.fit(X_train, y_train)
print("Stochastic Gradient Descent fitted.")

In [None]:
stats = {}
def add_stats(algorithm, test, pred):
    stats[algorithm] = {
                        "F1 %" : round(f1_score(test,pred)*100,2),
                        "Precision %" : round(precision_score(test,pred)*100,2),
                        "Recall %" : round(recall_score(test,pred)*100,2),
                        "Accuracy %" : round(accuracy_score(test, pred)*100,2),
                        # "Confusion Matrix" : confusion_matrix(test,pred)
                        }

In [None]:
#Predictions and adding stats to the dictionary.
# y_pred1 = gnb.predict(X_test)
# add_stats("Gaussian Native Bayes", y_pred1, y_test)

y_pred2 = mnb.predict(X_test)
add_stats("Multinomial Native Bayes", y_pred2, y_test)

y_pred3 = bnb.predict(X_test)
add_stats("Bernoulli Native Bayes", y_pred3, y_test)

# y_pred4 = tree.predict(X_test)
# add_stats("Decisions Tree", y_pred4, y_test)

y_pred5 = sgd.predict(X_test)
add_stats("Stochastic Gradient Descent", y_pred5, y_test)

#Building table
df_stats = pd.DataFrame.from_dict(stats, orient='index')
df_stats = df_stats.sort_values(by="F1 %", ascending=False)
fig, ax = plt.subplots(figsize=(8, 4))

# Hide axes
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)

# Hide axes' labels
ax.set_frame_on(False)
ax.table(cellText=df_stats.values, colLabels=df_stats.columns, rowLabels=df_stats.index, loc='center')

plt.show()

In [None]:
import math
def execute_sentiment_analysis(text, sa_algorithm):
    if text is None or text == "" or (math.isnan(text) if type(text) == float else False):
        return 0
    test = convert_text(text)
    X = tfidf.transform([test])
    return sa_algorithm.predict(X)[0] * max(sa_algorithm.predict_proba(X)[0])

In [None]:
print(execute_sentiment_analysis('game is a waste of time', sgd))
print(execute_sentiment_analysis('I really loved this game', sgd))
print(execute_sentiment_analysis('', sgd))

In [None]:
#benchmark the algorithms
reviews = [
    ["Imagine waiting for an hour and ten minutes and still not being able to get into a private game.",-1],
    ["I have been playing this game fairly consistently for the past 8 or so years. I also lead an in-game Fleet (Guild) that has had a constant player base that has grown and shrunk over time with sometimes hundreds of people. In other words, I truly love this game and almost always have enjoyed playing it. Which makes what I'm about to say very hard.Over these years the game has steadily become one of the most greediest, dirtiest cash grabs in the current Free-to-Play MMO market. It takes literal months to grind (and I mean GRIND) for a free, premium ship. Even other free ships, like the summer/winter event ones, still require weeks of grinding - and it NEVER changes. They don't change these events, so every year it's the same activities. which becomes extremely dull extremely fast.Not to mention EVERYTHING else in the game requires a purchase. Uniforms, lockbox keys, bridge officers, playable factions, inventory slots, bank slots, ship slots, duty officers, shuttles, equipment upgrades, etc. While some fans may attack me and say that not ALL of these items are only attainable through real money purchases, it is MUCH easier to buy them than to grind for them.Another in-game currency, Dilithium, has had a very small cap that has never changed for countless years. This currency allows you to upgrade your gear, support your Fleet, buy special equipment you unlock through Reputations (which, of course, require a little bit of grinding, but not nearly as bad as the previously mentioned gripes) and other useful items. The cap is insultingly low, and once again, some fans may say 'well, you can refine a little extra from your Fleet mine!' or 'If you're a lifetime member, you can refine an extra 1,000 every few days'. Of course, lifetime subscriptions cost $200-$300, assuming they're on sale. Obviously a small price to pay for a little extra dilithium that can only be initiated through a certain mission on a certain planet.This is only scratching the surface of this game's issues. A lot of the older missions have loads of bugs and are simply outdated (think Nimbus), some of the newer ones have some too, the engine has been out of date for years and they show no sign of upgrading it, and there's just so many bugs sometimes. I can recall when expansions would drop and certain missions or activities would literally cause your game to crash. I understand some of these times bugs will happen, but Cryptic never seems to truly test their new content.I am truly saddened and sickened by how awful the monetisation methods have become in this game. Like I said, I still do love this game, but I don't recommend new players to try it right now. I PRAY that they will eventually realize the insanity behind their practices and actually put some love and effort into this game, but until they do, you're better off playing older Star Trek games like Armada. ",-1],
    ["The demo ran flawlessly on Linux via Proton. After release, Denuvo absolutely destroys performance. Micro-stutters abound, the death of a game in this genre. On windows, there are fewer stutters, so it's a bit more playable, but they are still present at times. This game looks like a masterpiece, shame it can't be played normally. ",-1],
    ["I COULDNT GO DOWN THE SLIDE! THIS GAME SUCKED DONKEY BALLS!",-1],
    ["I should have refunded this a bit sooner. ",-1],
    ["It's a good game ruined by jank and the boring loot mechanics. Also no music??. ",-1],
    ["they like turned a sick ass tactics mmo into a crappy mobile game, it's insane",-1],
    ["Server issues aside, Game is solid 8/10",1],
    ["To sum up this game in a way only players will understand: This game has a unique ability to make you think about your next play through while you're still playing your current one.",1],
    ["Don't play if you enjoy remembering to eat or drink, remembering there is an outside or having free time. Other than that amazing game that can keep you entertained for hours on end",1],
    ["Don't spend real money on this. You can easily get to rank 5 by playing nothing but story missions, get some fanservice, fly the Enterprise-E around, do all the things you really want to do as a Star Trek fan, all within a few dozen hours. If, at the end of that journey, you find that you enjoy MMORPGS, then knock yourself out. But if you're a non-MMO-player like me who just really loves the Star Trek franchise, free-to-play is the way to go. ",1],
    ["Phenomenal. The combat is excellent and the level design is on point. This is a soulsborne where the developers did their homework. ",1],
    ["The game is filled with bugs but I'm still recommending it because I got to witness my friends getting scared to death which was a really pleasant experience.",1],
    ["some ♥♥♥♥ called me the n word so I nuked the whole map. i love this game",1],
    ["this is cheaper then drugs and more addictive but with somehow the same damaging potential to your social life but with non of the long lasting effects and with better story telling highly recommend",1],
    ["You must run to collect more pesos",1]
    ]

sent_analysis_algs = [
    [mnb, 'MultinomialNB'],
    # [gnb, 'GaussianNB'],
    [bnb, 'BernoulliNB'],
    [tree, 'tree'],
    [sgd, 'SGD']]

benchmark = {}
for alg, name in sent_analysis_algs:
    benchmark[name] = [0, []]
    for rev, expected_res in reviews:
        res = execute_sentiment_analysis(rev, alg)
        if res == expected_res:
            benchmark[name][0] +=1
        else:
            benchmark[name][1].append(rev)
benchmark

# 5.Saving the models


In [None]:
from joblib import dump
# TODO add the text indicization (tf_idf, cv)
# dump(mnb, 'MultinomialNB.joblib')
# # dump(gnb, 'GaussianNB.joblib')
# dump(bnb, 'BernoulliNB.joblib')
# dump(tree, 'tree.joblib')
# dump(sgd, 'SGD.joblib')

# 6.Loading the models

In [None]:
from joblib import load
# TODO add the text indicization (tf_idf, cv)
# mnb = load('MultinomialNB.joblib')
# # gnb = load('GaussianNB.joblib')
# bnb = load('BernoulliNB.joblib')
# tree = load('tree.joblib')
# sgd = load('SGD.joblib')

# 7.Extract the text from the review

In [None]:
all_ids = set(revs_og['app_id'])
all_ids

In [None]:
keys = {'app_id', 'app_name'}
all_games = {key: revs_og[key] for key in keys}

In [None]:
text_extracted = revs_og.groupby('app_id')['review_text'].apply(list)

In [None]:
text_extracted.describe()

# 8.Load game categories

In [165]:
#loading game categories

gameid_categories_transported = pd.read_csv('games_tags.csv')

# Reset the index to have a numeric index
gameid_categories_transported.reset_index(drop=True, inplace=True)

# Melt the DataFrame to convert it back to the original format
game_categories = gameid_categories_transported.melt(var_name="game_id", value_name="categories")

# Drop rows with NaN values (optional, if you want to remove them)
game_categories = game_categories.dropna()

# Group by game_id and aggregate categories into a list
game_categories = game_categories.groupby('game_id')['categories'].agg(list).reset_index()
game_categories['game_id'] = game_categories['game_id'].astype('int64')

# Reset the index of the melted DataFrame
game_categories.reset_index(drop=True, inplace=True)

# Display the resulting DataFrame
game_categories.head()

Unnamed: 0,game_id,categories
0,10,"[Action, FPS, Multiplayer, Shooter, Classic, T..."
1,1002,"[2D Fighter, Martial Arts, Intentionally Awkwa..."
2,100400,"[Animation & Modeling, Software]"
3,10090,"[Zombies, World War II, FPS, Multiplayer, Acti..."
4,100980,"[Animation & Modeling, Utilities, Design & Ill..."


# 9.Load the previous analysis done to calculate the total scores.

In [166]:
game_scores = pd.read_csv("games_score.csv")
game_scores.head()

Unnamed: 0,game_id,score
0,10,21169.215589
1,20,2278.877286
2,30,1707.040703
3,40,680.946681
4,50,3060.880613


# 10.Load the user's gaming behaviour

## 10.1 Loading the steam api key

In [167]:
 # Get the stored api key
file_path = 'api_key.txt'

try:
    # Open the file in read mode
    with open(file_path, 'r') as file:
        # Read the API key from the file
        api_key = file.read().strip()
        print(f"Steam API Key found")
except FileNotFoundError:
    print(f"Api file '{file_path}' not found.")


Steam API Key found


## 10.2 Requesting the user's total playtime on videogames

In [168]:
import requests
steam_user_ids = [76561198055831348, 76561198378195772, 76561198051361780, 76561198252171946, 76561198048420079, 76561198448640683, 76561198092311292, 76561199418749721, 76561199483351680, 76561199482701362, 76561199418605621, 76561198071705583, 76561198331830889, 76561199157185623, 76561198213180295, 76561199076757859]
users_playtime = pd.DataFrame()
counter = 0
for s in steam_user_ids:
    get_owned_games_url = "http://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/?key={}&steamid={}&format=json&include_played_free_games".format(api_key, s)
    response = requests.get(get_owned_games_url)
    if response.status_code == 200 and response.json().get("response").__len__() != 0:
        response = response.json()
        games_list = response["response"]["games"]
        appid_playtime_dict = {game["appid"]: game["playtime_forever"] for game in games_list}
        user_playtime = pd.DataFrame.from_dict(appid_playtime_dict, orient='index')
        user_playtime.reset_index(inplace=True)
        user_playtime.columns = ['game_id', 'playtime_forever']
        user_playtime['user_id'] = counter
        counter += 1
        users_playtime = pd.concat([users_playtime, user_playtime], ignore_index=True)
        print("Success")
    else:
        print("Something went wrong: ",response.status_code,response.text)

Success
Something went wrong:  200 {"response":{}}
Something went wrong:  200 {"response":{}}
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success
Success


In [169]:
users_playtime = users_playtime[['game_id','user_id','playtime_forever']]
users_playtime.head()

Unnamed: 0,game_id,user_id,playtime_forever
0,2100,0,531
1,2130,0,0
2,4000,0,1747
3,12900,0,904
4,19900,0,156


## Recommender system

## Games score normalizer

In [170]:
#Preparing dataset based on the sentiment_analysis that contains: user_id, item_id, score, normalized_score
new_game_scores = game_scores.copy()
game_id_counts = revs_og['app_id'].value_counts().reset_index()
game_id_counts.columns = ['app_id', 'occurrence']
game_id_counts.rename(columns={'app_id': 'game_id'}, inplace=True)

new_game_scores_normalized = pd.merge(new_game_scores, game_id_counts, on ='game_id', how='left')

#Sweet spot
k = 20

new_game_scores_normalized['normalized_score'] = (new_game_scores_normalized['score'] / new_game_scores_normalized['occurrence']) * (0.8 + 0.2 * (new_game_scores_normalized['occurrence'] / (new_game_scores_normalized['occurrence'] + k))) / 2 * 10

print(new_game_scores_normalized.head())

   game_id         score  occurrence  normalized_score
0       10  21169.215589       12353          8.565681
1       20   2278.877286        1542          7.370433
2       30   1707.040703        1030          8.255037
3       40    680.946681         457          7.387707
4       50   3060.880613        1743          8.760573


In [171]:
#Preparing feature dataset, starting by copying the game_categories dataframe
new_game_categories = game_categories.copy()
new_game_categories.head()

Unnamed: 0,game_id,categories
0,10,"[Action, FPS, Multiplayer, Shooter, Classic, T..."
1,1002,"[2D Fighter, Martial Arts, Intentionally Awkwa..."
2,100400,"[Animation & Modeling, Software]"
3,10090,"[Zombies, World War II, FPS, Multiplayer, Acti..."
4,100980,"[Animation & Modeling, Utilities, Design & Ill..."


In [172]:
#Merging the previous dataframes into one that has all the data that we've got for every game.
game_df = pd.merge(new_game_scores_normalized, new_game_categories, on ='game_id', how='inner')
game_df = game_df[['game_id','normalized_score','categories', 'occurrence', 'score']]
game_df.head()

Unnamed: 0,game_id,normalized_score,categories,occurrence,score
0,10,8.565681,"[Action, FPS, Multiplayer, Shooter, Classic, T...",12353,21169.215589
1,20,7.370433,"[Action, FPS, Multiplayer, Classic, Hero Shoot...",1542,2278.877286
2,30,8.255037,"[FPS, World War II, Multiplayer, Shooter, Acti...",1030,1707.040703
3,40,7.387707,"[Action, FPS, Classic, Multiplayer, Shooter, F...",457,680.946681
4,50,8.760573,"[FPS, Action, Classic, Sci-fi, Singleplayer, S...",1743,3060.880613


In [173]:
# Building the recommender system dataset
from lightfm.data import Dataset

new_dataset = Dataset(user_identity_features=False)
# Fitting the recommender system by giving every used_id and game_id in the datasets.
new_dataset.fit(users_playtime['user_id'], game_df['game_id'])

# Printing the shape of both the datasets
num_users, num_items = new_dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 14, num_items 9368.


## Building item features, without weights

In [174]:
# Partial fit of items with their own features (not using weights, because all the weights would be the same)

print("Starting item Partial fit")
for _, row in game_df.iterrows():
    game_id, _, categories, _, _ = row
    new_dataset.fit_partial(items=[game_id], item_features=categories)
print("Partial item fit done")

Starting item Partial fit
Partial item fit done


## Building User features with categories weight

In [175]:
# Building the dataframe that correlates (user -> category score), to fit users with their own weights

# Step 1: Join the users_playtime and game_categories df
new_game_categories_user_score = pd.merge(new_game_categories, users_playtime, on="game_id", how="right")
new_game_categories_user_score.head()

# Step 2: Explode the "categories" column
exploded_df = new_game_categories_user_score.explode("categories")

# Step 2: Group by "user_id" and "categories" and sum the "score"
aggregated_df = exploded_df.groupby(["user_id", "categories"])["playtime_forever"].sum().reset_index()

# Step 3: Create a new DataFrame with the aggregated data
user_category_score = aggregated_df.rename(columns={"categories": "category","playtime_forever": "user_category_score"})

# Display the new DataFrame
user_category_score.head()

Unnamed: 0,user_id,category,user_category_score
0,0,1980s,65
1,0,1990's,1272
2,0,2.5D,622
3,0,2D,50344
4,0,2D Fighter,2372


## Partial fit user features with weights.

In [176]:
# Partial fit of users with their own features (categories) and weights

print("Starting user Partial fit")
for _, row in user_category_score.iterrows():
    user_id, category, category_score= row
    new_dataset.fit_partial(users=[user_id], user_features={category: category_score})
print("Partial user fit done")

Starting user Partial fit
Partial user fit done


In [177]:
users_features = new_dataset.build_user_features(((x['user_id'], {x['category']: x['user_category_score']})for _,x in user_category_score.iterrows()))

In [178]:
item_features = new_dataset.build_item_features(((x['game_id'], x['categories'])for _,x in game_df.iterrows()))

In [179]:
# Clean interactions for games not in the list.
merged_df = pd.merge(users_playtime, game_df[['game_id', 'normalized_score', 'score']], on='game_id', how='inner')

print(merged_df.head())

max_playtime = merged_df['playtime_forever'].max()

# Building user -> videogames interaction dataset, using their playtime
(interactions, weights) = new_dataset.build_interactions(((x['user_id'], x['game_id'], x['playtime_forever'])for _,x in merged_df.iterrows()))

print(repr(interactions))
print(repr(weights))

   game_id  user_id  playtime_forever  normalized_score         score
0     2100        0               531          8.593249   2859.767550
1     4000        0              1747          7.021904  39242.607207
2     4000        9             32761          7.021904  39242.607207
3     4000       11              1123          7.021904  39242.607207
4     4000       12              7232          7.021904  39242.607207
<14x9368 sparse matrix of type '<class 'numpy.int32'>'
	with 1484 stored elements in COOrdinate format>
<14x9368 sparse matrix of type '<class 'numpy.float32'>'
	with 1484 stored elements in COOrdinate format>


# Train the model

In [180]:
from tqdm import tqdm
from lightfm import LightFM
new_model = LightFM()
epoch_number = 100
new_model.fit(interactions=interactions, sample_weight=weights, user_features=users_features,  item_features=item_features, epochs=epoch_number, verbose=True)
print("Training done")

Epoch: 100%|██████████| 100/100 [00:09<00:00, 10.59it/s]

Training done





In [184]:
# Step 3: Generate Recommendations

# Compute predicted scores for all games
user_id = 0  # This is a mock user since the system needs an int to predict the values for given users
predicted_scores = new_model.predict(user_ids=user_id, item_ids=np.arange(len(game_scores)))

# Exclude games the user has played or owns
played_or_owned_games = users_playtime.index.tolist()

# Recommend the top N games with the highest predicted scores
top_n = 10
top_game_indices = np.argsort(predicted_scores)[::-1]
top_recommendations = game_df.loc[top_game_indices]
top_recommendations['predicted_score'] = predicted_scores[top_game_indices]


# Step 1: Get a list of game_ids for the target user_id in users_playtime
user_game_ids = users_playtime[users_playtime['user_id'] == user_id]['game_id'].tolist()

# Step 2: Remove rows from top_recommendations where game_id is associated with the target user_id in user_game_ids
top_recommendations = top_recommendations[~top_recommendations['game_id'].isin(user_game_ids)]

# Print the top recommended games
# print("Top Recommended Games:")
# print(top_recommendations)
# top_recommendations = top_recommendations[top_recommendations['score'] > 500]
#
# Assuming both DataFrames have a 'game_id' column
merged_recommendations = pd.merge(top_recommendations, revs_og[['app_id', 'app_name']], left_on='game_id', right_on='app_id', how='left')

# Drop the 'app_id' column as it's no longer needed
merged_recommendations = merged_recommendations.drop_duplicates(subset='game_id')

# Take only the game with a normalized score bigger than
merged_recommendations = merged_recommendations[merged_recommendations['normalized_score'] > 9]

# merged_recommendations = merged_recommendations.drop(columns=['app_id', 'game_id'])
merged_recommendations = merged_recommendations[['app_name', 'predicted_score', 'normalized_score']]
merged_recommendations = merged_recommendations.head(10)
merged_recommendations.reset_index(drop = True, inplace = True)
# Print the top recommended games with game names
print("Top Recommended Games:")
print(merged_recommendations)

Top Recommended Games:
                            app_name  predicted_score  normalized_score
0                           Aseprite         1.012455          9.162002
1           The Witcher 3: Wild Hunt         0.962091          9.007140
2     METAL GEAR RISING: REVENGEANCE         0.875963          9.382562
3     Danganronpa 2: Goodbye Despair         0.869091          9.315718
4   Danganronpa: Trigger Happy Havoc         0.558649          9.080543
5      The Binding of Isaac: Rebirth         0.498337          9.147735
6         Sid Meier's Civilization V         0.442620          9.442103
7  Deus Ex: Game of the Year Edition         0.381200          9.130333
8             Golf With Your Friends         0.369921          9.048828
9   South Park™: The Stick of Truth™         0.362551          9.493941
