In [1]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import nltk

# Loading the dataset

In [62]:
revs_og = pd.read_csv('kaggle/dataset.csv')
revs_og.head()


KeyboardInterrupt



In [None]:
revs_og.shape

In [None]:
revs_og.info()

# 1. Data Cleaning

**Null Value Checking**

In [None]:
revs_og.isnull().sum()

In [None]:
revs = revs_og[['app_id', 'app_name', 'review_score','review_text']]

In [None]:
revs.isnull().sum()

In [None]:
# Reassign instead of mutating in place: `revs` is a column selection taken from
# `revs_og`, and an in-place dropna on it can trigger SettingWithCopyWarning.
revs = revs.dropna()

In [None]:
revs.head()

In [None]:
revs['review_score'].unique()

In [None]:
revs.shape

In [None]:
# Work on a 50k-review subsample. The seed makes the subsample (and every figure
# and metric derived from it) reproducible; the size is capped so a dataset with
# fewer than 50k rows does not make `sample` raise.
new_df = revs.sample(n=min(50000, len(revs)), random_state=34)

In [None]:
new_df.shape

In [None]:
new_df.isnull().sum()

In [None]:
new_df.info()

In [None]:
new_df.reset_index(drop = True, inplace = True)

In [None]:
new_df.head()

**Checking if there is any duplicated row**

In [None]:
# Print the duplicated
# new_df[new_df.duplicated()]
new_df.duplicated().sum()

**Dropping the duplicated rows**

In [None]:
new_df.drop_duplicates(inplace=True)

In [None]:
new_df.duplicated().sum()

In [None]:
new_df.shape

In [None]:
new_df.rename(columns={'review_score':'target','review_text':'text'},inplace=True)

In [None]:
new_df.head()

# 2. EDA

In [None]:
new_df['target'].value_counts()

In [None]:
# value_counts() orders classes by frequency, so the labels must be derived from
# its index instead of being hard-coded as ['positive', 'negative'].
target_counts = new_df['target'].value_counts()
target_names = {1: 'positive', -1: 'negative'}
plt.pie(
    target_counts,
    labels=[target_names.get(value, str(value)) for value in target_counts.index],
    autopct='%0.2f',
)
plt.title('Share of reviews per sentiment class')
plt.show()

> Data is imbalanced

In [None]:
nltk.download('punkt')

In [None]:
new_df['char_num'] = new_df['text'].apply(len)

In [None]:
new_df.head()

In [None]:
#word nums
new_df['word_num'] = new_df['text'].apply(lambda x: len(nltk.word_tokenize(x)))
new_df.head()

In [None]:
#num of sentences
new_df['sent_num'] = new_df['text'].apply(lambda x: len(nltk.sent_tokenize(x)))
new_df.head()

In [None]:
new_df[['char_num','word_num','sent_num']].describe()

In [None]:
#neg. reviews description
new_df[new_df['target'] == -1][['char_num','word_num','sent_num']].describe()

In [None]:
#pos. reviews description
new_df[new_df['target'] == 1][['char_num','word_num','sent_num']].describe()

In [None]:
import seaborn as sns

In [None]:
# Overlay the character-count distributions of both classes; the labels and
# legend tell the reader which colour is which sentiment.
sns.histplot(new_df[new_df['target'] == 1]['char_num'], label='positive')
sns.histplot(new_df[new_df['target'] == -1]['char_num'], color='red', label='negative')
plt.xlabel('characters per review')
plt.legend()

plt.show()

In [None]:
# Overlay the word-count distributions of both classes; the labels and legend
# tell the reader which colour is which sentiment.
sns.histplot(new_df[new_df['target'] == 1]['word_num'], label='positive')
sns.histplot(new_df[new_df['target'] == -1]['word_num'], color='red', label='negative')
plt.xlabel('words per review')
plt.legend()

plt.show()

In [None]:
# Overlay the sentence-count distributions of both classes; the labels and
# legend tell the reader which colour is which sentiment.
sns.histplot(new_df[new_df['target'] == 1]['sent_num'], label='positive')
sns.histplot(new_df[new_df['target'] == -1]['sent_num'], color='red', label='negative')
plt.xlabel('sentences per review')
plt.legend()

plt.show()

In [None]:
# Pairwise 2D plots of the columns of new_df, with the points coloured by the target value
# sns.pairplot(new_df,hue='target')
# plt.show()

In [None]:
# TODO: new_df.corr() fails because it tries to convert the string columns to float; select the numeric columns first (or pass numeric_only=True)
# sns.heatmap(new_df.corr(),annot=True)
# plt.show()

# 3. Data Preprocessing
* Lower case
* Tokenization
* Removing special characters
* Removing stop words and punctuation
* stemming

In [None]:
new_df.reset_index(drop = True, inplace = True)

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
string.punctuation

In [None]:
#Self-contained function for pandarallel to work
import nltk
nltk.download('stopwords')

def convert_text(text):
    """Normalise a raw review into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``; tokens
    that are English stop words or that occur in ``string.punctuation`` are
    dropped, and the remaining tokens are Porter-stemmed.

    The imports are kept inside the function on purpose so that it stays
    self-contained when it is shipped to pandarallel workers.

    Args:
        text: the review text (a ``str``).

    Returns:
        The stemmed tokens joined by single spaces ('' when nothing is left).
    """
    #Importing libraries to work with pandarallel
    import nltk
    from nltk.corpus import stopwords
    import string
    from nltk.stem.porter import PorterStemmer

    stemmer = PorterStemmer()
    # Load the stop-word list once per call and use a set for O(1) lookups;
    # calling stopwords.words() inside the loop re-read it for every token.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(text.lower())
    kept_tokens = [
        token for token in tokens
        if token not in stop_words and token not in string.punctuation
    ]
    return ' '.join(stemmer.stem(token) for token in kept_tokens)

In [None]:
new_df['text'][8]

In [None]:
from pandarallel import pandarallel
pandarallel.initialize()
new_df['converted_text']=new_df['text'].parallel_apply(convert_text)
# new_df['converted_text']=new_df['text'].apply(convert_text)

In [None]:
new_df.head()

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width = 500,height=500,min_font_size = 10,background_color = 'white')

In [None]:
pos_wc = wc.generate(new_df[new_df['target']==1]['converted_text'].str.cat(sep=" "))
plt.imshow(pos_wc)
plt.show()

In [None]:
neg_wc = wc.generate(new_df[new_df['target']==-1]['converted_text'].str.cat(sep=" "))
plt.imshow(neg_wc)
plt.show()

# 4. Model Building

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer(max_features = 3000)
tfidf = TfidfVectorizer(max_features=5000)

In [None]:
# X_cv = cv.fit_transform(new_df['converted_text'])
# Keep the TF-IDF matrix sparse: a dense (n_reviews x 5000) float array needs
# gigabytes of RAM, and the models enabled below (MultinomialNB, BernoulliNB,
# SGDClassifier) accept scipy sparse input. Only the disabled GaussianNB would
# need `.toarray()`.
X_tfid = tfidf.fit_transform(new_df['converted_text'])
X = X_tfid

In [None]:
X.shape

In [None]:
y = new_df['target'].values

In [None]:
#y

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state = 34)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
X_test.shape

In [None]:
y_test.shape

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, f1_score, recall_score


In [None]:
# Import the class directly: `tree = tree.DecisionTreeClassifier()` replaced the
# imported `sklearn.tree` module with an estimator instance, so re-running the
# cell raised AttributeError.
from sklearn.tree import DecisionTreeClassifier

# gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()
# Seeds make the stochastic estimators reproducible across runs.
tree = DecisionTreeClassifier(random_state=34)
sgd = SGDClassifier(loss="modified_huber", random_state=34)

In [None]:
# Train the models that are currently enabled. GaussianNB and the decision tree
# are switched off, so their "fitted" messages are disabled together with the
# fit calls (printing "fitted" for an unfitted model was misleading).

# gnb.fit(X_train,y_train)
# print("Gaussian Naive Bayes fitted.")
mnb.fit(X_train,y_train)
print("Multinomial Naive Bayes fitted.")
bnb.fit(X_train,y_train)
print("Bernoulli Naive Bayes fitted.")
# tree.fit(X_train, y_train)
# print("Decisions tree fitted.")
sgd.fit(X_train, y_train)
print("Stochastic Gradient Descent fitted.")

In [None]:
stats = {}


def add_stats(algorithm, test, pred):
    """Store the evaluation metrics of one model in the global ``stats`` dict.

    Args:
        algorithm: key under which the metrics are stored (the model's name).
        test: passed as the first argument of each sklearn metric (``y_true``).
        pred: passed as the second argument of each sklearn metric (``y_pred``).

    Each metric is expressed as a percentage rounded to two decimals.
    """
    metric_functions = {
        "F1 %": f1_score,
        "Precision %": precision_score,
        "Recall %": recall_score,
        "Accuracy %": accuracy_score,
    }
    stats[algorithm] = {
        label: round(metric(test, pred) * 100, 2)
        for label, metric in metric_functions.items()
    }

In [None]:
# Predict on the held-out split and record the metrics of every enabled model.
# add_stats expects (algorithm, ground truth, predictions): passing the
# predictions first swaps precision and recall.
# y_pred1 = gnb.predict(X_test)
# add_stats("Gaussian Naive Bayes", y_test, y_pred1)

y_pred2 = mnb.predict(X_test)
add_stats("Multinomial Naive Bayes", y_test, y_pred2)

y_pred3 = bnb.predict(X_test)
add_stats("Bernoulli Naive Bayes", y_test, y_pred3)

# y_pred4 = tree.predict(X_test)
# add_stats("Decisions Tree", y_test, y_pred4)

y_pred5 = sgd.predict(X_test)
add_stats("Stochastic Gradient Descent", y_test, y_pred5)

# Build the summary table, best F1 first
df_stats = pd.DataFrame.from_dict(stats, orient='index')
df_stats = df_stats.sort_values(by="F1 %", ascending=False)
fig, ax = plt.subplots(figsize=(8, 4))

# Hide the axes and the frame so that only the table is drawn
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
ax.set_frame_on(False)
ax.table(cellText=df_stats.values, colLabels=df_stats.columns, rowLabels=df_stats.index, loc='center')

plt.show()

In [None]:
import math


def execute_sentiment_analysis(text, sa_algorithm):
    """Score the sentiment of one review with a fitted classifier.

    Args:
        text: raw review text; ``None``, NaN and the empty string mean
            "no review".
        sa_algorithm: fitted estimator exposing ``predict`` and
            ``predict_proba``.

    Returns:
        0 when there is no text; otherwise the predicted label multiplied by
        the highest class probability, i.e. a signed confidence.
    """
    # isinstance (not `type(...) == float`) so that float subclasses such as
    # numpy.float64 NaN are recognised as missing values too.
    is_missing = text is None or (isinstance(text, float) and math.isnan(text))
    if is_missing or text == "":
        return 0
    cleaned = convert_text(text)
    vectorized = tfidf.transform([cleaned])
    return sa_algorithm.predict(vectorized)[0] * max(sa_algorithm.predict_proba(vectorized)[0])

In [None]:
print(execute_sentiment_analysis('game is a waste of time', sgd))
print(execute_sentiment_analysis('I really loved this game', sgd))
print(execute_sentiment_analysis('', sgd))

In [None]:
#benchmark the algorithms
reviews = [
    ["Imagine waiting for an hour and ten minutes and still not being able to get into a private game.",-1],
    ["I have been playing this game fairly consistently for the past 8 or so years. I also lead an in-game Fleet (Guild) that has had a constant player base that has grown and shrunk over time with sometimes hundreds of people. In other words, I truly love this game and almost always have enjoyed playing it. Which makes what I'm about to say very hard.Over these years the game has steadily become one of the most greediest, dirtiest cash grabs in the current Free-to-Play MMO market. It takes literal months to grind (and I mean GRIND) for a free, premium ship. Even other free ships, like the summer/winter event ones, still require weeks of grinding - and it NEVER changes. They don't change these events, so every year it's the same activities. which becomes extremely dull extremely fast.Not to mention EVERYTHING else in the game requires a purchase. Uniforms, lockbox keys, bridge officers, playable factions, inventory slots, bank slots, ship slots, duty officers, shuttles, equipment upgrades, etc. While some fans may attack me and say that not ALL of these items are only attainable through real money purchases, it is MUCH easier to buy them than to grind for them.Another in-game currency, Dilithium, has had a very small cap that has never changed for countless years. This currency allows you to upgrade your gear, support your Fleet, buy special equipment you unlock through Reputations (which, of course, require a little bit of grinding, but not nearly as bad as the previously mentioned gripes) and other useful items. The cap is insultingly low, and once again, some fans may say 'well, you can refine a little extra from your Fleet mine!' or 'If you're a lifetime member, you can refine an extra 1,000 every few days'. Of course, lifetime subscriptions cost $200-$300, assuming they're on sale. 
Obviously a small price to pay for a little extra dilithium that can only be initiated through a certain mission on a certain planet.This is only scratching the surface of this game's issues. A lot of the older missions have loads of bugs and are simply outdated (think Nimbus), some of the newer ones have some too, the engine has been out of date for years and they show no sign of upgrading it, and there's just so many bugs sometimes. I can recall when expansions would drop and certain missions or activities would literally cause your game to crash. I understand some of these times bugs will happen, but Cryptic never seems to truly test their new content.I am truly saddened and sickened by how awful the monetisation methods have become in this game. Like I said, I still do love this game, but I don't recommend new players to try it right now. I PRAY that they will eventually realize the insanity behind their practices and actually put some love and effort into this game, but until they do, you're better off playing older Star Trek games like Armada. ",-1],
    ["The demo ran flawlessly on Linux via Proton. After release, Denuvo absolutely destroys performance. Micro-stutters abound, the death of a game in this genre. On windows, there are fewer stutters, so it's a bit more playable, but they are still present at times. This game looks like a masterpiece, shame it can't be played normally. ",-1],
    ["I COULDNT GO DOWN THE SLIDE! THIS GAME SUCKED DONKEY BALLS!",-1],
    ["I should have refunded this a bit sooner. ",-1],
    ["It's a good game ruined by jank and the boring loot mechanics. Also no music??. ",-1],
    ["they like turned a sick ass tactics mmo into a crappy mobile game, it's insane",-1],
    ["Server issues aside, Game is solid 8/10",1],
    ["To sum up this game in a way only players will understand: This game has a unique ability to make you think about your next play through while you're still playing your current one.",1],
    ["Don't play if you enjoy remembering to eat or drink, remembering there is an outside or having free time. Other than that amazing game that can keep you entertained for hours on end",1],
    ["Don't spend real money on this. You can easily get to rank 5 by playing nothing but story missions, get some fanservice, fly the Enterprise-E around, do all the things you really want to do as a Star Trek fan, all within a few dozen hours. If, at the end of that journey, you find that you enjoy MMORPGS, then knock yourself out. But if you're a non-MMO-player like me who just really loves the Star Trek franchise, free-to-play is the way to go. ",1],
    ["Phenomenal. The combat is excellent and the level design is on point. This is a soulsborne where the developers did their homework. ",1],
    ["The game is filled with bugs but I'm still recommending it because I got to witness my friends getting scared to death which was a really pleasant experience.",1],
    ["some ♥♥♥♥ called me the n word so I nuked the whole map. i love this game",1],
    ["this is cheaper then drugs and more addictive but with somehow the same damaging potential to your social life but with non of the long lasting effects and with better story telling highly recommend",1],
    ["You must run to collect more pesos",1]
    ]

# (estimator, display name) pairs to benchmark. Only fitted models belong here:
# the decision tree's fit() call is commented out above, so predicting with it
# would raise NotFittedError.
sent_analysis_algs = [
    [mnb, 'MultinomialNB'],
    # [gnb, 'GaussianNB'],
    [bnb, 'BernoulliNB'],
    # [tree, 'tree'],
    [sgd, 'SGD']]

# benchmark[name] = [number of correctly classified reviews, list of misclassified reviews]
benchmark = {}
for alg, name in sent_analysis_algs:
    benchmark[name] = [0, []]
    for rev, expected_res in reviews:
        res = execute_sentiment_analysis(rev, alg)
        # execute_sentiment_analysis returns label * probability (a signed
        # confidence), so compare its sign with the expected +1 / -1 label
        # rather than testing the raw value for equality.
        predicted_label = 1 if res > 0 else (-1 if res < 0 else 0)
        if predicted_label == expected_res:
            benchmark[name][0] +=1
        else:
            benchmark[name][1].append(rev)
benchmark

# 5.Saving the models


In [None]:
from joblib import dump
# TODO: also persist the fitted text vectorizers (tfidf, cv); the saved models cannot score new text without them
# dump(mnb, 'MultinomialNB.joblib')
# # dump(gnb, 'GaussianNB.joblib')
# dump(bnb, 'BernoulliNB.joblib')
# dump(tree, 'tree.joblib')
# dump(sgd, 'SGD.joblib')

# 6.Loading the model

In [None]:
from joblib import load
# TODO: also load the fitted text vectorizers (tfidf, cv); the loaded models cannot score new text without them
# mnb = load('MultinomialNB.joblib')
# # gnb = load('GaussianNB.joblib')
# bnb = load('BernoulliNB.joblib')
# tree = load('tree.joblib')
# sgd = load('SGD.joblib')

# 7.Tests

In [None]:
all_ids = set(revs_og['app_id'])
all_ids

In [None]:
keys = {'app_id', 'app_name'}
all_games = {key: revs_og[key] for key in keys}

In [None]:
text_extracted = revs_og.groupby('app_id')['review_text'].apply(list)

In [None]:
text_extracted.describe()

# 8.Load game categories

In [2]:
#loading game categories

# Read the tag table and make sure it has a plain numeric index.
gameid_categories_transported = pd.read_csv('games_tags.csv')
gameid_categories_transported.reset_index(drop=True, inplace=True)

# melt() turns every column header into a `game_id` value and every cell into a
# `categories` value; empty cells are dropped and the remaining tags are
# collected into one list per game. The id is cast to int64 only after the
# group-by, exactly as before.
gameid_categories = (
    gameid_categories_transported
    .melt(var_name="game_id", value_name="categories")
    .dropna()
    .groupby('game_id')['categories']
    .agg(list)
    .reset_index()
    .astype({'game_id': 'int64'})
    .reset_index(drop=True)
)

# Preview the result
gameid_categories.head()

Unnamed: 0,game_id,categories
0,10,"[Action, FPS, Multiplayer, Shooter, Classic, T..."
1,1002,"[2D Fighter, Martial Arts, Intentionally Awkwa..."
2,100400,"[Animation & Modeling, Software]"
3,10090,"[Zombies, World War II, FPS, Multiplayer, Acti..."
4,100980,"[Animation & Modeling, Utilities, Design & Ill..."


# 9.Load the previous analysis done to calculate the total scores.

In [3]:
game_scores = pd.read_csv("games_score.csv")
game_scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9368 entries, 0 to 9367
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   game_id  9368 non-null   int64  
 1   score    9368 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 146.5 KB


# 10.Load the user's gaming behaviour

## 10.1 Loading the steam api key

In [4]:
 # Get the stored api key
file_path = 'api_key.txt'

try:
    # The key is kept in a local file so it never appears in the notebook itself
    with open(file_path, 'r') as file:
        api_key = file.read().strip()
except FileNotFoundError:
    print(f"Api file '{file_path}' not found.")
    # Stop here: continuing would only fail later with a NameError on `api_key`
    raise

if not api_key:
    raise ValueError(f"Api file '{file_path}' is empty.")
print("Steam API Key found")


Steam API Key found


## 10.2 Requesting the user's total playtime on videogames

In [5]:
import requests

steam_user_id = 76561198055831348
# HTTPS so the API key is not sent in clear text; the query string is built by
# requests from `params` instead of by string formatting.
get_owned_games_url = "https://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/"
get_owned_games_params = {
    "key": api_key,
    "steamid": steam_user_id,
    "format": "json",
    # The flag was previously sent without a value; 1 is presumably what was
    # intended (include played free games) - TODO confirm against the Steam docs.
    "include_played_free_games": 1,
}
http_response = requests.get(get_owned_games_url, params=get_owned_games_params, timeout=30)
if http_response.status_code == 200:
    print("Success")
    # `response` holds the decoded JSON payload used by the next cell
    response = http_response.json()
else:
    print("Something went wrong: ",http_response.status_code,http_response.text)
    # Fail here rather than letting the next cell index an HTTP response object
    raise RuntimeError(f"GetOwnedGames request failed with status {http_response.status_code}")

Success


## 10.3 Extract user's playtime by game

In [6]:
import json
# Fall back to an empty list when the payload has no "games" entry
# (presumably the case for private profiles or empty libraries - TODO confirm).
games_list = response["response"].get("games", [])
appid_playtime_dict = {game["appid"]: game["playtime_forever"] for game in games_list}

# Explicit column names keep the frame well-formed even when no games were
# returned (the index-based construction raised on an empty dict).
appid_playtime = pd.DataFrame(
    list(appid_playtime_dict.items()),
    columns=['game_id', 'playtime_forever'],
).astype('int64')
appid_playtime.head()

Unnamed: 0,game_id,playtime_forever
0,2100,531
1,2130,0
2,4000,1747
3,12900,904
4,19900,156


## 10.4 Calculate the user's category score

# 11. Recommend games that fit the user's tastes, based on the hours the user has spent playing each genre

In [23]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Number of recommendations to keep
TOP_N_RECOMMENDATIONS = 10

# One row per game that has both a score and a list of categories.
combined_data = (
    pd.merge(game_scores, gameid_categories, on='game_id', how='inner')
    .drop_duplicates(subset='game_id')
    .set_index('game_id')
)

# Multi-hot category matrix: one row per game, one column per category.
# explode() yields one row per (game, category); get_dummies one-hot encodes
# them and the group-by collapses them back to a single row per game.
categories = (
    pd.get_dummies(combined_data['categories'].explode())
    .groupby(level=0)
    .max()
    .reindex(combined_data.index)
    .astype(float)
)

# Playtime per owned game (iterate rows, not DataFrame columns).
user_playtime = (
    appid_playtime
    .drop_duplicates(subset='game_id')
    .set_index('game_id')['playtime_forever']
)
# Owned games for which a score and categories are known.
owned_ids = user_playtime.index.intersection(combined_data.index)

# User profile: every category of an owned game is weighted by playtime * score.
# It must be built BEFORE the owned games are filtered out, and it shares the
# column order of `categories` so both vectors are comparable.
game_weights = user_playtime.loc[owned_ids] * combined_data.loc[owned_ids, 'score']
profile_vector = categories.loc[owned_ids].mul(game_weights, axis=0).sum(axis=0)
user_profile = profile_vector.to_dict()

# Candidate games = everything the user does not already own.
features = categories.drop(index=owned_ids)

# Cosine similarity between the user profile and each candidate's category vector.
similarity_scores = cosine_similarity([profile_vector.to_numpy()], features.to_numpy())

# Attach the similarity to the candidate games.
combined_data = (
    combined_data.loc[features.index]
    .assign(similarity_score=similarity_scores[0])
    .reset_index()
)

# Most similar first; the game score breaks ties.
recommended_games = combined_data.sort_values(by=['similarity_score', 'score'], ascending=False)

# Keep the top N recommended games
top_recommendations = recommended_games.head(TOP_N_RECOMMENDATIONS)
top_recommendations


KeyboardInterrupt

