# Natural Language Processing

## Imports and Setup

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import time
import pickle

In [3]:
# Importing datasets
games = pd.read_csv('Datasets/games_info_clean.csv')
games_scores = pd.read_csv('Datasets/games_scores_grouped.csv')
comments = pd.read_csv('Datasets/games_comments_cleaned.csv')
games_scores_comments = pd.read_csv('Datasets/avg_commented_userscore.csv')

In [4]:
# Dropping the 'Unnamed: 0' column from all dataframes
games.drop('Unnamed: 0', axis = 1, inplace = True)
games_scores.drop('Unnamed: 0', axis = 1, inplace = True)
comments.drop('Unnamed: 0', axis = 1, inplace = True)
games_scores_comments.drop('Unnamed: 0', axis = 1, inplace = True)

In [5]:
# Defining functions to cleanup and process the comments
def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.

    Args:
        s: The string to be cleaned up.

    Returns:
        A string that has been cleaned up.
    """
    return re.sub(r'  *', ' ', re.sub(r'[^a-z]', ' ', 
                                      re.sub(r'www\.\S*', ' ', re.sub(r'http[s]?://\S*', ' ', s.lower())))).strip()


def tokenize(s):
    """
    Tokenize a string.

    Args:
        s: String to be tokenized.

    Returns:
        A list of words as the result of tokenization.
    """
    return nltk.word_tokenize(s)


def stem_and_lemmatize(l):
    """
    Perform stemming and lemmatization on a list of words.

    Args:
        l: A list of strings.

    Returns:
        A list of strings after being stemmed and lemmatized.
    """
    ps = PorterStemmer()
    wnl = WordNetLemmatizer()
    
    stemmed_list = [ps.stem(w) for w in l]
    lemmed_on_stemmed_list = [wnl.lemmatize(w) for w in stemmed_list]
    
    return lemmed_on_stemmed_list


def remove_stopwords(lst, lang = 'english'):
    """
    Remove English (default) stopwords from a list of strings.

    Args:
        lst: A list of strings.
        l = Language of the stopwords.

    Returns:
        A list of strings after stop words are removed.
    """
    stop_words = stopwords.words(lang)
    return [word for word in lst if word not in stop_words]

## Cleaning up the Comments

In [6]:
# Starting with a small 5k sample to see if it all works
df = comments.sample(5000)
df

Unnamed: 0,Title,Platform,Userscore,Comment,Username
275894,Dungeon Defenders,PC,8,"To me, it's a ""Meh"" game. It does include some...",DragYou
96802,Dota 2,PC,0,"Dead game is dead, even LoL has their harrowee...",LeoneRUSSSIANS
246059,Dragon Age II,PC,5,After having played DA2 for a while and calmin...,Rosiello
155991,Pokemon Moon,3DS,10,Story - Very GoodOnline - All rightMusic - The...,FlaffyJasmine
223547,Bulletstorm,PlayStation3,10,My favorite FPS of all time! A bloody and fun ...,Freddo222
...,...,...,...,...,...
110273,FIFA Soccer 12,PC,6,"The gameplay can be as smooth as possible, but...",Minidivine
174402,Call of Duty: Modern Warfare 2,PC,0,"The lack of dedicated server support, along w...",AlanB.
1059,Grand Theft Auto IV,PlayStation3,10,"Absolutely the best game this year, last year,...",MattWix
57595,Grand Theft Auto: San Andreas,PC,10,The first game I played seriously.And also my ...,John67


In [8]:
# Preparing the text for the analysis
%time  # wanted to try and see execution time - but I don't quite get this output?
df['Comments_Processed'] = df['Comment'].apply(lambda x: remove_stopwords(stem_and_lemmatize(tokenize(clean_up(x)))))
df

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 21 µs


Unnamed: 0,Title,Platform,Userscore,Comment,Username,Comments_Processed
275894,Dungeon Defenders,PC,8,"To me, it's a ""Meh"" game. It does include some...",DragYou,"[meh, game, doe, includ, uniqu, featur, love, ..."
96802,Dota 2,PC,0,"Dead game is dead, even LoL has their harrowee...",LeoneRUSSSIANS,"[dead, game, dead, even, lol, ha, harroween, g..."
246059,Dragon Age II,PC,5,After having played DA2 for a while and calmin...,Rosiello,"[play, da, calm, initi, rage, took, month, bel..."
155991,Pokemon Moon,3DS,10,Story - Very GoodOnline - All rightMusic - The...,FlaffyJasmine,"[stori, veri, goodonlin, rightmus, best, pokem..."
223547,Bulletstorm,PlayStation3,10,My favorite FPS of all time! A bloody and fun ...,Freddo222,"[favorit, fp, time, bloodi, fun, shooter, love..."
...,...,...,...,...,...,...
110273,FIFA Soccer 12,PC,6,"The gameplay can be as smooth as possible, but...",Minidivine,"[gameplay, smooth, possibl, one, problem, game..."
174402,Call of Duty: Modern Warfare 2,PC,0,"The lack of dedicated server support, along w...",AlanB.,"[lack, dedic, server, support, along, player, ..."
1059,Grand Theft Auto IV,PlayStation3,10,"Absolutely the best game this year, last year,...",MattWix,"[absolut, best, game, thi, year, last, year, p..."
57595,Grand Theft Auto: San Andreas,PC,10,The first game I played seriously.And also my ...,John67,"[first, game, play, serious, also, favorit, ma..."


In [9]:
# Creating the 'target' column
## I'm assuming that reviews with a score >= 5 to be positive and with < 5 to be negative
df['Target'] = np.where((df['Userscore'] >= 5), 'Pos', 'Neg')

# Not sure if I need 3 labels or not, if I need this 'neutral' or if I should leave it like this
#df['Target'] = np.where((df['Userscore'] == 5), 'Neutral', df['Target'])

In [10]:
# Checking the values
df['Target'].value_counts()

Pos    4155
Neg     845
Name: Target, dtype: int64

In [11]:
# Creating the bag of words
bow = [word for lst in df['Comments_Processed'] for word in lst]
fdist = FreqDist(bow)

# Getting just the 5k most common words
most_common = fdist.most_common(5000)

# Building the features
def find_features(document, bow):
    text = document.lower()
    features = dict()
    for w, c in bow:
        features[w] = w in text
    return features

def make_matrix(series_text, series_target, bow):
    # initially I had bool(t) in the return, and then I decided to remove it to see if I could actually get the
    ## labels to show up in the most_inf_feats instead of just true and false things, and it worked, but now it
    ## won't work again with bool(t) for some reason, the most_inf_feats always returns without any results
    ## I tried with the same values as in the NLP lab (0, 2, 4 - or just 0 - 4) still didn't work, so yeah
    return [(find_features(s, bow), t) for s, t in zip(series_text.values, series_target.values)]

matrix = make_matrix(df['Comment'], df['Target'], most_common)

# Testing the model
size = int(len(matrix) * 0.2)  # xxx -> 20% of the data

# Training with 80% of the data and testing against the remaining 20%
training_set = matrix[size:]
testing_set = matrix[:size]

classifier = nltk.NaiveBayesClassifier.train(training_set)

# Showing the top 15 most informative features
classifier.show_most_informative_features(15)  # funny how the most informative feats are all negative lol

Most Informative Features
                  refund = True              Neg : Pos    =     24.8 : 1.0
                 exercis = True              Neg : Pos    =     18.2 : 1.0
                 monitor = True              Neg : Pos    =     18.2 : 1.0
                 disgrac = True              Neg : Pos    =     14.9 : 1.0
                 stutter = True              Neg : Pos    =     14.9 : 1.0
                 unenjoy = True              Neg : Pos    =     14.9 : 1.0
                 blatant = True              Neg : Pos    =     14.9 : 1.0
                 lesbian = True              Neg : Pos    =     14.9 : 1.0
                uninstal = True              Neg : Pos    =     14.9 : 1.0
                 billion = True              Neg : Pos    =     14.9 : 1.0
                  insult = True              Neg : Pos    =     13.1 : 1.0
                  redeem = True              Neg : Pos    =     12.1 : 1.0
                  immort = True              Neg : Pos    =     11.6 : 1.0

In [12]:
# Seems like a pretty good accuracy for a first try
print('Model accuracy:', str(round(nltk.classify.accuracy(classifier, testing_set) * 100, 2)) + '%')

Model accuracy: 80.7%


## Training the model in the whole DF (RUN THIS AGAIN LATER AT NIGHT)

In [34]:
# Preparing the text for the analysis
comments['Comments_Processed'] = \
comments['Comment'].apply(lambda x: remove_stopwords(stem_and_lemmatize(tokenize(clean_up(x)))))

# Creating the 'target' column
comments['Target'] = np.where((comments['Userscore'] >= 5), 'Pos', 'Neg')

# gotta check if this 3rd label improves performance or not
#comments['Target'] = np.where((comments['Userscore'] == 5), 'Neutral', comments['Target'])

# Creating the bag of words
bow = [word for lst in comments['Comments_Processed'] for word in lst]
fdist = FreqDist(bow)

# Getting just the 5k most common words
most_common = fdist.most_common(5000)

# Building the features and making the matrix
matrix = make_matrix(comments['Comment'], comments['Target'], most_common)

# Defining the size to use for the training and testing
size = int(len(matrix) * 0.25)  # xxx -> 25% of the data

# Training with 75% of the data and testing against the remaining 25%
training_set = matrix[size:]
testing_set = matrix[:size]

# Initializing and training the model
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Showing the top 15 most informative features
most_inf_feat = classifier.show_most_informative_features(15)
print('')

# Printing the model's accuracy
print('Model accuracy:', str(round(nltk.classify.accuracy(classifier, testing_set) * 100, 2)) + '%')

Most Informative Features
              ticketsnew = True              Neg : Pos    =     30.2 : 1.0
              diretideno = True              Neg : Pos    =     28.6 : 1.0
                 maestra = True              Pos : Neg    =     19.3 : 1.0
               diretideg = True              Neg : Pos    =     19.1 : 1.0
                  refund = True              Neg : Pos    =     18.6 : 1.0
                  pikmin = True              Pos : Neg    =     18.5 : 1.0
                   glado = True              Pos : Neg    =     17.3 : 1.0
                    duda = True              Pos : Neg    =     15.8 : 1.0
                  sonora = True              Pos : Neg    =     14.0 : 1.0
                 disgrac = True              Neg : Pos    =     13.6 : 1.0
                 juegazo = True              Pos : Neg    =     13.2 : 1.0
                    scam = True              Neg : Pos    =     13.1 : 1.0
                  impecc = True              Pos : Neg    =     11.4 : 1.0

In [36]:
'''
I find it strange that after all this significantly bigger training the model's accuracy has barely increase.
I would expect to see at least a 70% there.
On a side note, I find it very odd that almost all the most informative words are all neutral?? That's just so strange
to me, since neutral only accounts for 10k records, whereas positive has over 220k values!!

After I ran everything again the scores changed, however, the point still stands. After training with a huge amount
of data, the scores barely changed. Don't know if it is a good thing or not.

Not sure if I might have a problem with this dataset being imbalanced? Since I have so many pos over negs? Have to
look into it.
'''
comments.Target.value_counts()

Pos    233865
Neg     48336
Name: Target, dtype: int64

In [13]:
# Just taking a look at the final dataframe
comments  # have to re-run the whole training in this DF again later today

Unnamed: 0,Title,Platform,Userscore,Comment,Username
0,The Legend of Zelda: Ocarina of Time,Nintendo64,10,"Everything in OoT is so near at perfection, it...",SirCaestus
1,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I won't bore you with what everyone is already...,Kaistlin
2,The Legend of Zelda: Ocarina of Time,Nintendo64,10,Anyone who gives the masterpiece below a 7 or ...,Jacody
3,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I'm one of those people who think that this is...,doodlerman
4,The Legend of Zelda: Ocarina of Time,Nintendo64,10,This game is the highest rated game on Metacr...,StevenA
...,...,...,...,...,...
282196,Etrian Odyssey Untold: The Millennium Girl,3DS,7,"Extremely similar to EO:4, which obviously isn...",RileyWRussell
282197,Etrian Odyssey Untold: The Millennium Girl,3DS,0,Typical overrated Atlus trash. A game i should...,TemplarGR
282198,Etrian Odyssey Untold: The Millennium Girl,3DS,9,While I find the story mode to have annoying c...,midipon
282199,Etrian Odyssey Untold: The Millennium Girl,3DS,8,"Pretty good, but it certainly lacks the visual...",night4


In [39]:
# Saving the trained algorithm
save_classifier = open('naivebayes.pickle', 'wb')
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [None]:
# To open the classifier and use it
'''classifier_f = open('naivebayes.pickle', 'rb')
classifier = pickle.load(classifier_f)
classifier_f.close()'''

In [17]:
# Resetting the index before the export so it looks better
df.reset_index(drop = True, inplace = True)

# Exporting this DF to always use the same 5k sample in the next notebook for different classifiers (algorithms)
df.to_json('Datasets/comments_5ksample.json')  # exporting as json to maintain the 'comments_processed' data type

In [46]:
# Testing a label change to see if it might be better balanced this way and improve results, or not
## might do this later on, as it will take forever to run this 3 times on the whole DF
print(comments['Target'].value_counts())

comments['Target_2'] = np.where((comments['Userscore'] <= 5), 'Negative', 'Positive')
comments['Target_2'] = np.where(((comments['Userscore'] >= 6) & (comments['Userscore'] <= 8)), 
                                'Neutral', comments['Target_2'])

comments['Target_2'].value_counts()

Pos    233865
Neg     48336
Name: Target, dtype: int64


Positive    164629
Neutral      59164
Negative     58408
Name: Target_2, dtype: int64