# Libraries import

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import nltk

<br><br><br><br>
# Loading the dataset

In [None]:
revs_og = pd.read_csv('kaggle/dataset.csv')
revs_og.head()

In [None]:
revs_og.shape

In [None]:
revs_og.info()

<br><br><br><br>
# 1 - Data Cleaning

## 1.1 - Null Value checking and removing
This section is dedicated to the removal of rows that contain a null value in any of the selected columns (`app_id`, `app_name`, `review_score`, `review_text`).

In [None]:
revs_og.isnull().sum()

In [None]:
# Take an explicit copy: this frame is mutated in place afterwards, and mutating
# a bare column selection of revs_og raises SettingWithCopyWarning.
revs = revs_og[['app_id', 'app_name', 'review_score', 'review_text']].copy()

In [None]:
revs.dropna(inplace = True)

In [None]:
revs.head()

In [None]:
revs['review_score'].unique()

In [None]:
revs.shape

In [None]:
# Fixed seed so that the 10,000-review subset (and everything derived from it)
# is the same on every Restart & Run All.
new_df = revs.sample(n = 10000, random_state = 34)

In [None]:
new_df.shape

In [None]:
new_df.isnull().sum()

In [None]:
new_df.info()

In [None]:
new_df.reset_index(drop = True, inplace = True)

In [None]:
new_df.head()

<br><br>
## 1.2 - Duplicated values checking and removing
This section is dedicated to the removal of duplicated rows

In [None]:
# Print the duplicated
# new_df[new_df.duplicated()]
new_df.duplicated().sum()

In [None]:
new_df.drop_duplicates(inplace=True)

In [None]:
new_df.duplicated().sum()

In [None]:
new_df.shape

<br><br>
## 1.3 - Renaming columns

In [None]:
new_df.rename(columns={'review_score':'sentiment_target','review_text':'text'},inplace=True)

In [None]:
new_df.head()

<br><br><br><br>
# 2 - EDA

In [None]:
new_df['sentiment_target'].value_counts()

In [None]:
# value_counts() orders the classes by frequency, so derive each wedge label
# from the index instead of assuming the positive class always comes first.
sentiment_counts = new_df['sentiment_target'].value_counts()
sentiment_labels = list(sentiment_counts.index.map({1: 'positive', -1: 'negative'}))
plt.pie(sentiment_counts, labels=sentiment_labels, autopct='%0.2f')
plt.show()

> Data is imbalanced

<br><br>
## 2.1 - Characters, Words and Sentences counters
Let's continue by counting the number of characters, words and sentences used in each of the selected reviews.

In [None]:
nltk.download('punkt')

In [None]:
new_df['char_num'] = new_df['text'].apply(len)
new_df.head()

In [None]:
#word nums
new_df['word_num'] = new_df['text'].apply(lambda x: len(nltk.word_tokenize(x)))
new_df.head()

In [None]:
#num of sentences
new_df['sent_num'] = new_df['text'].apply(lambda x: len(nltk.sent_tokenize(x)))
new_df.head()

In [100]:
new_df[['char_num','word_num','sent_num']].describe()

Unnamed: 0,char_num,word_num,sent_num
count,10000.0,10000.0,10000.0
mean,311.5686,65.3193,3.897
std,618.684208,128.45761,6.311717
min,1.0,1.0,1.0
25%,29.0,7.0,1.0
50%,104.0,22.0,2.0
75%,318.0,67.0,4.0
max,7856.0,1709.0,115.0


In [None]:
#neg. reviews description
new_df[new_df['sentiment_target'] == -1][['char_num','word_num','sent_num']].describe()

In [None]:
#pos. reviews description
new_df[new_df['sentiment_target'] == 1][['char_num','word_num','sent_num']].describe()

## 2.2 - Sentiment plotting

In [None]:
import seaborn as sns

In [None]:
sns.histplot(new_df[new_df['sentiment_target'] == 1]['char_num'])
sns.histplot(new_df[new_df['sentiment_target'] == -1]['char_num'],color='red')

plt.show()

In [None]:
sns.histplot(new_df[new_df['sentiment_target'] == 1]['word_num'])
sns.histplot(new_df[new_df['sentiment_target'] == -1]['word_num'],color='red')

plt.show()

In [None]:
sns.histplot(new_df[new_df['sentiment_target'] == 1]['sent_num'])
sns.histplot(new_df[new_df['sentiment_target'] == -1]['sent_num'],color='red')

plt.show()

In [None]:
# Pairwise 2D plots of the DataFrame's columns, with the points colored by sentiment_target (hue)
sns.pairplot(new_df,hue='sentiment_target')
plt.show()

In [None]:
# DataFrame.corr() tries to convert every column to float and fails on the
# string columns (app_name, text), so correlate the numeric columns only.
numeric_corr = new_df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)
plt.show()

<br><br><br><br>
# 3. Data Preprocessing
This section is dedicated to the following steps:
* Lower case transformation
* Tokenization
* Removal of special characters
* Removal of stop words and punctuation
* Stemming

In [101]:
new_df.reset_index(drop = True, inplace = True)

In [102]:
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gabri\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [103]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [104]:
#Self-contained function for pandarallel to work
import nltk
nltk.download('stopwords')

def convert_text(text):
    """Normalize a review into a space-separated string of stemmed tokens.

    Steps: lower-case the text, tokenize it with ``nltk.word_tokenize``, drop
    English stop words and punctuation tokens, then stem what is left with the
    Porter stemmer.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    # Imports are kept inside the function so it stays self-contained
    # when executed by pandarallel workers.
    import nltk
    from nltk.corpus import stopwords
    import string
    from nltk.stem.porter import PorterStemmer

    stemmer = PorterStemmer()
    # Build the stop-word set once per call instead of reloading the word list
    # (and scanning it linearly) for every single token.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(text.lower())

    # `tok not in string.punctuation` is a substring test against the
    # punctuation string, exactly as in the original filter.
    kept = [
        tok for tok in tokens
        if tok not in stop_words and tok not in string.punctuation
    ]
    return ' '.join(stemmer.stem(tok) for tok in kept)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [105]:
new_df['text'][8]

'The Plan... A range of emotions you may feel playing this. Curiosity Sadness Frustation Fear (Should definately be rated 18+ as death is so brutal) 10/10 IGN would become a fly again '

In [107]:
from pandarallel import pandarallel
pandarallel.initialize()
new_df['converted_text']=new_df['text'].parallel_apply(convert_text)
# new_df['converted_text']=new_df['text'].apply(convert_text)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [None]:
new_df.head()

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width = 500,height=500,min_font_size = 10,background_color = 'white')

In [None]:
pos_wc = wc.generate(new_df[new_df['sentiment_target']==1]['converted_text'].str.cat(sep=" "))
plt.imshow(pos_wc)
plt.show()

In [None]:
neg_wc = wc.generate(new_df[new_df['sentiment_target']==-1]['converted_text'].str.cat(sep=" "))
plt.imshow(neg_wc)
plt.show()

<br><br><br><br>
# 4. Model Building

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer(max_features = 3000)
tfidf = TfidfVectorizer(max_features=5000)

In [None]:
X = cv.fit_transform(new_df['converted_text']).toarray()

In [None]:
X.shape

In [None]:
y = new_df['sentiment_target'].values

In [None]:
#y

In [None]:
from sklearn.model_selection import train_test_split
# The classes are imbalanced, so stratify on y to keep the same
# positive/negative ratio in both the training and the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=34, stratify=y
)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
X_test.shape

In [None]:
y_test.shape

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, f1_score, recall_score


In [None]:
# gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()
# The decision tree and SGD are randomized; seed them so that repeated runs
# produce the same fitted models and the same scores.
clf_tree = tree.DecisionTreeClassifier(random_state=34)
clf_sgd = SGDClassifier(random_state=34)

In [None]:
# Train every active model on the training split (GaussianNB is disabled).

# gnb.fit(X_train,y_train)
# print("Gaussian Naive Bayes fitted.")
for model, done_message in (
    (mnb, "Multinomial Naive Bayes fitted."),
    (bnb, "Bernoulli Naive Bayes fitted."),
    (clf_tree, "Decisions tree fitted."),
    (clf_sgd, "Stochastic Gradient Descent fitted."),
):
    model.fit(X_train, y_train)
    print(done_message)

In [None]:
stats = {}
def add_stats(algorithm, test, pred):
    """Store F1, precision, recall and accuracy for one algorithm in ``stats``.

    ``test`` is forwarded as the first positional argument and ``pred`` as the
    second one to each scikit-learn metric function. Every score is stored as
    a percentage rounded to two decimals under ``stats[algorithm]``.
    """
    def as_percent(score):
        # Scale a raw score to a percentage with two decimals.
        return round(score * 100, 2)

    scores = {}
    scores["F1 %"] = as_percent(f1_score(test, pred))
    scores["Precision %"] = as_percent(precision_score(test, pred))
    scores["Recall %"] = as_percent(recall_score(test, pred))
    scores["Accuracy %"] = as_percent(accuracy_score(test, pred))
    # "Confusion Matrix" : confusion_matrix(test,pred)
    stats[algorithm] = scores

In [None]:
#Predictions and adding stats to the dictionary.
# add_stats takes (algorithm, test, pred): the ground-truth labels go first and
# the predictions second. Passing them the other way round swaps precision and
# recall in the resulting table.
# y_pred1 = gnb.predict(X_test)
# add_stats("Gaussian Native Bayes", y_test, y_pred1)

y_pred2 = mnb.predict(X_test)
add_stats("Multinomial Native Bayes", y_test, y_pred2)

y_pred3 = bnb.predict(X_test)
add_stats("Bernoulli Native Bayes", y_test, y_pred3)

y_pred4 = clf_tree.predict(X_test)
add_stats("Decisions Tree", y_test, y_pred4)

y_pred5 = clf_sgd.predict(X_test)
add_stats("Stochastic Gradient Descent", y_test, y_pred5)

#Building table
df_stats = pd.DataFrame.from_dict(stats, orient='index')
df_stats = df_stats.sort_values(by="F1 %", ascending=False)
fig, ax = plt.subplots(figsize=(8, 4))

# Hide axes
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)

# Hide the frame so that only the table is drawn
ax.set_frame_on(False)
ax.table(cellText=df_stats.values, colLabels=df_stats.columns, rowLabels=df_stats.index, loc='center')

plt.show()

In [None]:
def checker(text, sa_algorithm):
    """Predict the sentiment label of a single raw review.

    The text is preprocessed with ``convert_text``, vectorized with the
    fitted ``cv`` vectorizer and passed to ``sa_algorithm.predict``; the
    first (and only) predicted label is returned.
    """
    features = cv.transform([convert_text(text)])
    predictions = sa_algorithm.predict(features)
    return predictions[0]

In [None]:
print(checker('game is a waste of time', mnb))
print(checker('I really loved this game', mnb))

In [None]:
#benchmark the algorithms
reviews = [
    ["Imagine waiting for an hour and ten minutes and still not being able to get into a private game.",-1],
    ["I have been playing this game fairly consistently for the past 8 or so years. I also lead an in-game Fleet (Guild) that has had a constant player base that has grown and shrunk over time with sometimes hundreds of people. In other words, I truly love this game and almost always have enjoyed playing it. Which makes what I'm about to say very hard.Over these years the game has steadily become one of the most greediest, dirtiest cash grabs in the current Free-to-Play MMO market. It takes literal months to grind (and I mean GRIND) for a free, premium ship. Even other free ships, like the summer/winter event ones, still require weeks of grinding - and it NEVER changes. They don't change these events, so every year it's the same activities. which becomes extremely dull extremely fast.Not to mention EVERYTHING else in the game requires a purchase. Uniforms, lockbox keys, bridge officers, playable factions, inventory slots, bank slots, ship slots, duty officers, shuttles, equipment upgrades, etc. While some fans may attack me and say that not ALL of these items are only attainable through real money purchases, it is MUCH easier to buy them than to grind for them.Another in-game currency, Dilithium, has had a very small cap that has never changed for countless years. This currency allows you to upgrade your gear, support your Fleet, buy special equipment you unlock through Reputations (which, of course, require a little bit of grinding, but not nearly as bad as the previously mentioned gripes) and other useful items. The cap is insultingly low, and once again, some fans may say 'well, you can refine a little extra from your Fleet mine!' or 'If you're a lifetime member, you can refine an extra 1,000 every few days'. Of course, lifetime subscriptions cost $200-$300, assuming they're on sale. 
Obviously a small price to pay for a little extra dilithium that can only be initiated through a certain mission on a certain planet.This is only scratching the surface of this game's issues. A lot of the older missions have loads of bugs and are simply outdated (think Nimbus), some of the newer ones have some too, the engine has been out of date for years and they show no sign of upgrading it, and there's just so many bugs sometimes. I can recall when expansions would drop and certain missions or activities would literally cause your game to crash. I understand some of these times bugs will happen, but Cryptic never seems to truly test their new content.I am truly saddened and sickened by how awful the monetisation methods have become in this game. Like I said, I still do love this game, but I don't recommend new players to try it right now. I PRAY that they will eventually realize the insanity behind their practices and actually put some love and effort into this game, but until they do, you're better off playing older Star Trek games like Armada. ",-1],
    ["The demo ran flawlessly on Linux via Proton. After release, Denuvo absolutely destroys performance. Micro-stutters abound, the death of a game in this genre. On windows, there are fewer stutters, so it's a bit more playable, but they are still present at times. This game looks like a masterpiece, shame it can't be played normally. ",-1],
    ["I COULDNT GO DOWN THE SLIDE! THIS GAME SUCKED DONKEY BALLS!",-1],
    ["I should have refunded this a bit sooner. ",-1],
    ["It's a good game ruined by jank and the boring loot mechanics. Also no music??. ",-1],
    ["they like turned a sick ass tactics mmo into a crappy mobile game, it's insane",-1],
    ["Server issues aside, Game is solid 8/10",1],
    ["To sum up this game in a way only players will understand: This game has a unique ability to make you think about your next play through while you're still playing your current one.",1],
    ["Don't play if you enjoy remembering to eat or drink, remembering there is an outside or having free time. Other than that amazing game that can keep you entertained for hours on end",1],
    ["Don't spend real money on this. You can easily get to rank 5 by playing nothing but story missions, get some fanservice, fly the Enterprise-E around, do all the things you really want to do as a Star Trek fan, all within a few dozen hours. If, at the end of that journey, you find that you enjoy MMORPGS, then knock yourself out. But if you're a non-MMO-player like me who just really loves the Star Trek franchise, free-to-play is the way to go. ",1],
    ["Phenomenal. The combat is excellent and the level design is on point. This is a soulsborne where the developers did their homework. ",1],
    ["The game is filled with bugs but I'm still recommending it because I got to witness my friends getting scared to death which was a really pleasant experience.",1],
    ["some ♥♥♥♥ called me the n word so I nuked the whole map. i love this game",1],
    ["this is cheaper then drugs and more addictive but with somehow the same damaging potential to your social life but with non of the long lasting effects and with better story telling highly recommend",1],
    ["You must run to collect more pesos",1]
    ]

# Each entry pairs a fitted model with the name used as its benchmark key.
sent_analysis_algs = [
    [mnb, 'MultinomialNB'],
    # [gnb, 'GaussianNB'],
    [bnb, 'BernoulliNB'],
    [clf_tree, 'tree'],
    [clf_sgd, 'SGD'],
]

# benchmark[name] = [number of correct predictions, list of misclassified reviews]
benchmark = {}
for alg, name in sent_analysis_algs:
    hits = 0
    misses = []
    for rev, expected_res in reviews:
        res = checker(rev, alg)
        if res != expected_res:
            misses.append(rev)
        else:
            hits += 1
    benchmark[name] = [hits, misses]
benchmark

# 5.Saving the models


In [None]:
from joblib import dump
# Persist the fitted vectorizer too: the classifiers only accept features
# produced by this exact vocabulary, so without it the saved models cannot be
# used in a fresh session.
dump(cv, 'CountVectorizer.joblib')
dump(mnb, 'MultinomialNB.joblib')
# dump(gnb, 'GaussianNB.joblib')
dump(bnb, 'BernoulliNB.joblib')
dump(clf_tree, 'tree.joblib')
dump(clf_sgd, 'SGD.joblib')

# 6.Loading the model

In [None]:
from joblib import load
mnb = load('MultinomialNB.joblib')
# gnb = load('GaussianNB.joblib')
bnb = load('BernoulliNB.joblib')
clf_tree = load('tree.joblib')
clf_sgd = load('SGD.joblib')

# 7.Tests

In [None]:
all_ids = set(new_df['app_id'])
all_ids

In [None]:
keys = {'app_id', 'app_name'}
all_games = {key: new_df[key] for key in keys}

In [None]:
text_extracted = new_df.groupby('app_id')['text'].apply(list)

In [None]:
text_extracted.describe()

# 8.Load game categories

In [None]:
#loading game categories

gameid_tags = pd.read_csv('games_tags.csv')
gameid_tags.info()

# 9.Rank games based on their sentiment analysis

In [None]:
#Weight formula: (sentiment_value x 1.5) (if review_text is not null) + 1.3 x review_score + 1.15 x review_votes

# 10.Given a Steam user, analyze their gaming behaviour

# 11.Take a user and, based on the hours they spent playing a certain genre, try to recommend a game that fits their tastes