# Natural Language Processing

## Imports and Setup

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import swifter

import re
import nltk
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist

import time
import pickle

In [4]:
# Importing the dataset
comments = pd.read_csv('Datasets/games_comments_cleaned.csv')

In [6]:
# Dropping the 'Unnamed: 0' column
comments.drop('Unnamed: 0', axis = 1, inplace = True)

In [7]:
# Defining functions to cleanup and process the comments
def clean_up(s):
    """
    Cleans up numbers, URLs, and special characters from a string.

    Args:
        s: The string to be cleaned up.

    Returns:
        A string that has been cleaned up.
    """
    return re.sub(r'  *', ' ', re.sub(r'[^a-z]', ' ', 
                                      re.sub(r'www\.\S*', ' ', re.sub(r'http[s]?://\S*', ' ', s.lower())))).strip()


def tokenize(s):
    """
    Tokenize a string.

    Args:
        s: String to be tokenized.

    Returns:
        A list of words as the result of tokenization.
    """
    return word_tokenize(s)


'''
After doing some research, I came to the conclusion that this way of doing this, in this function, was not correct, and that I should either choose to stem or lemmatize words, but not do both
on top of each other.

It seems that lemmatization is the better option, although it can take longer as it's more complex, in the way that if tries to always return actual words (the dictionary form of a word),
rather than just remove suffixes and all that, like the stemming method does.

That being said, I will not use this function, and will instead use one to just perform the lemmatization method.


def stem_and_lemmatize(l):
    """
    Perform stemming and lemmatization on a list of words (tokens).

    Args:
        l: A list of strings.

    Returns:
        A list of strings after being stemmed and lemmatized.
    """
    ps = PorterStemmer()
    wnl = WordNetLemmatizer()
    
    stemmed_list = [ps.stem(w) for w in l]
    lemmed_on_stemmed_list = [wnl.lemmatize(w) for w in stemmed_list]
    
    return lemmed_on_stemmed_list
'''

def lemmatize(tokens):
    """
    Performs lemmatization on a list of tokens.
    
    Args:
        tokens: List of tokens
    
    Returns:
        Lemmatized list.
    """
    wnl = WordNetLemmatizer()
    
    # Took this from this website: https://www.guru99.com/stemming-lemmatization-python-nltk.html
    # I was having trouble because the lemmatizer wasn't recognizing the keys that were being returned by the pos_tag function
    # so I found this way of mapping the outputs in a way that was recognized as a wordnet value
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    
    return [wnl.lemmatize(token, tag_map[tag[0]]) for token, tag in pos_tag(tokens)]


def remove_stopwords(lst, lang = 'english'):
    """
    Remove English (default) stopwords from a list of strings.

    Args:
        lst: A list of strings.
        lang = Language of the stopwords.

    Returns:
        A list of strings after stop words are removed.
    """
    stop_words = stopwords.words(lang)
    return [word for word in lst if word not in stop_words]

## Cleaning up the Comments

In [8]:
# Testing the functions created above
# Still a bit unsure if I should remove the stopwords or not

test_str = "This is just a test to see if these functions are working well or not."

print(f'Without stopwords removed: {lemmatize(tokenize(clean_up(test_str)))}')

print(f'\nWith stopwords removed: {remove_stopwords(lemmatize(tokenize(clean_up(test_str))))}')

Without stopwords removed: ['this', 'be', 'just', 'a', 'test', 'to', 'see', 'if', 'these', 'function', 'be', 'work', 'well', 'or', 'not']

With stopwords removed: ['test', 'see', 'function', 'work', 'well']


In [9]:
# Starting with a small 5k sample to see if it all works
df = comments.sample(5000, random_state=18)
df

Unnamed: 0,Title,Platform,Userscore,Comment,Username
69228,Super Smash Bros. for Wii U,WiiU,9,"This game is amazing, improves in every aspect...",Zin_49
274382,Zero Escape: Zero Time Dilemma,3DS,8,A bunch of bootleg robotic versions of the mai...,Techbane
254833,Metro: Last Light,PC,6,I liked the original this is some innovation b...,gstiker5
207122,PlanetSide 2,PC,5,Planetside 2 has battles of epic scale and de...,NasseSeta
140020,Diablo III,PC,1,Game looks good and overall i like the graphic...,JamesLFranco
...,...,...,...,...,...
87033,Tom Clancy's Ghost Recon Advanced Warfighter,Xbox360,5,Definitely a good title for great graphics bu...,CiaranG.
237704,The Sims,PlayStation2,10,This game is tight.,JonathanO.
177351,The Legend of Zelda: Four Swords Adventures,GameCube,10,I think this games very fun and chalenging fo...,OmarD.
17436,Half-Life,PC,10,"Half-Life has a great storyline, it was almos...",JaimyP


In [10]:
%%time
# Preparing the text for the analysis
df['Comments_Processed'] = df['Comment'].apply(lambda x: remove_stopwords(lemmatize(tokenize(clean_up(x)))))  # took 34.5 seconds
df

CPU times: user 34.1 s, sys: 363 ms, total: 34.4 s
Wall time: 34.5 s


Unnamed: 0,Title,Platform,Userscore,Comment,Username,Comments_Processed
69228,Super Smash Bros. for Wii U,WiiU,9,"This game is amazing, improves in every aspect...",Zin_49,"[game, amaze, improves, every, aspect, previou..."
274382,Zero Escape: Zero Time Dilemma,3DS,8,A bunch of bootleg robotic versions of the mai...,Techbane,"[bunch, bootleg, robotic, version, main, chara..."
254833,Metro: Last Light,PC,6,I liked the original this is some innovation b...,gstiker5,"[like, original, innovation, silent, main, cha..."
207122,PlanetSide 2,PC,5,Planetside 2 has battles of epic scale and de...,NasseSeta,"[planetside, battle, epic, scale, decent, gfx,..."
140020,Diablo III,PC,1,Game looks good and overall i like the graphic...,JamesLFranco,"[game, look, good, overall, like, graphic, gam..."
...,...,...,...,...,...,...
87033,Tom Clancy's Ghost Recon Advanced Warfighter,Xbox360,5,Definitely a good title for great graphics bu...,CiaranG.,"[definitely, good, title, great, graphic, game..."
237704,The Sims,PlayStation2,10,This game is tight.,JonathanO.,"[game, tight]"
177351,The Legend of Zelda: Four Swords Adventures,GameCube,10,I think this games very fun and chalenging fo...,OmarD.,"[think, game, fun, chalenging, single, player,..."
17436,Half-Life,PC,10,"Half-Life has a great storyline, it was almos...",JaimyP,"[half, life, great, storyline, almost, movie, ..."


In [11]:
%%time
# Testing now with the swifter method, to hopefully decrease the apply time
df['Comments_Processed'] = df['Comment'].swifter.apply(lambda x: remove_stopwords(lemmatize(tokenize(clean_up(x)))))  # took 43 seconds
df

# Hmm... I would expect it to go faster, and maybe decrease time in half, but it actually increased... lol
# In other tries it actually chose to use Dask's apply, hence the CPU times being very small (but wall time still the same)
# but this time it chose Pandas' apply, so the time increased slightly
# Maybe later with the full dataset I'll see a better decrease in time?

HBox(children=(HTML(value='Pandas Apply'), FloatProgress(value=0.0, max=5000.0), HTML(value='')))


CPU times: user 40.3 s, sys: 747 ms, total: 41.1 s
Wall time: 43 s


Unnamed: 0,Title,Platform,Userscore,Comment,Username,Comments_Processed
69228,Super Smash Bros. for Wii U,WiiU,9,"This game is amazing, improves in every aspect...",Zin_49,"[game, amaze, improves, every, aspect, previou..."
274382,Zero Escape: Zero Time Dilemma,3DS,8,A bunch of bootleg robotic versions of the mai...,Techbane,"[bunch, bootleg, robotic, version, main, chara..."
254833,Metro: Last Light,PC,6,I liked the original this is some innovation b...,gstiker5,"[like, original, innovation, silent, main, cha..."
207122,PlanetSide 2,PC,5,Planetside 2 has battles of epic scale and de...,NasseSeta,"[planetside, battle, epic, scale, decent, gfx,..."
140020,Diablo III,PC,1,Game looks good and overall i like the graphic...,JamesLFranco,"[game, look, good, overall, like, graphic, gam..."
...,...,...,...,...,...,...
87033,Tom Clancy's Ghost Recon Advanced Warfighter,Xbox360,5,Definitely a good title for great graphics bu...,CiaranG.,"[definitely, good, title, great, graphic, game..."
237704,The Sims,PlayStation2,10,This game is tight.,JonathanO.,"[game, tight]"
177351,The Legend of Zelda: Four Swords Adventures,GameCube,10,I think this games very fun and chalenging fo...,OmarD.,"[think, game, fun, chalenging, single, player,..."
17436,Half-Life,PC,10,"Half-Life has a great storyline, it was almos...",JaimyP,"[half, life, great, storyline, almost, movie, ..."


In [13]:
# Creating the 'target' column
## I'm assuming that reviews with a score >= 5 to be positive and with < 5 to be negative
df['Target'] = np.where((df['Userscore'] >= 5), 'Pos', 'Neg')

# Not sure if I need 3 labels or not, if I need this 'neutral' or if I should leave it like this
#df['Target'] = np.where((df['Userscore'] == 5), 'Neutral', df['Target'])

In [14]:
# Checking the values
df['Target'].value_counts()  # this is way too unbalanced

Pos    4131
Neg     869
Name: Target, dtype: int64

In [15]:
%%time

# Creating the bag of words
bow = [word for lst in df['Comments_Processed'] for word in lst]  # normal bow - just a list of words
fdist = FreqDist(bow)  # processed bow - nltk object with the distribution count of each word

# Getting just the 5k most common words
most_common = fdist.most_common(5000)  # final bow - list of tuples with each word and its total count

# Building the features
def find_features(document, bow):
    '''
    'bow' has to be the already processed 'most_common' one, and not a simple list of words.
    'document' is a string, in this case, it'll be a whole comment.
    
    The for loop below is getting the word and its respective count from the bow, and then populating the dictionary that was
    previously created with the key being the word from the bow, and the value either True or False, depending if said word is in
    the 'document/text/string' that is being checked or not.
    
    The final result is a dictionary of all the words in the bow (the 5k most common, in this case), with the key being the word of the bow
    and the value whether that word is in the string being analyzed or not.
    '''
    text = document.lower()
    features = dict()
    
    for word, count in bow:
        features[word] = word in text
        
    return features

# Making the matrix
def make_matrix(series_text, series_target, bow):
    # initially I had bool(t) in the return, and then I decided to remove it to see if I could actually get the
    ## labels to show up in the most_inf_feats instead of just true and false things, and it worked, but now it
    ## won't work again with bool(t) for some reason, the most_inf_feats always returns without any results
    ## I tried with the same values as in the NLP lab (0, 2, 4 - or just 0 - 4) still didn't work, so yeah
    return [(find_features(comment, bow), target) for comment, target in zip(series_text.values, series_target.values)]

matrix = make_matrix(df['Comment'], df['Target'], most_common)

# Defining the size to use for the training and testing
size = int(len(matrix) * 0.2)  # this is 20% of the data

# Training with 80% of the data and testing against the remaining 20%
training_set = matrix[size:]
testing_set = matrix[:size]

# Initializing and training the model
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Showing the top 15 most informative features
classifier.show_most_informative_features(15)  # funny how the most informative feats are all negative, except for 1, even though the neg comments are much less in quantity
print('')

# Checking the model's accuracy
print('Model accuracy:', str(round(nltk.classify.accuracy(classifier, testing_set) * 100, 2)) + '%', '\n')
# Seems like a pretty good accuracy for a first try
# However, it's probably high 'cause the data is very unbalanced

Most Informative Features
                   greed = True              Neg : Pos    =     29.4 : 1.0
                  refund = True              Neg : Pos    =     27.5 : 1.0
                lifeless = True              Neg : Pos    =     26.9 : 1.0
                   false = True              Neg : Pos    =     23.7 : 1.0
                  delete = True              Neg : Pos    =     20.5 : 1.0
                defender = True              Neg : Pos    =     17.4 : 1.0
             maintenance = True              Neg : Pos    =     17.4 : 1.0
                 garbage = True              Neg : Pos    =     16.8 : 1.0
                  greedy = True              Neg : Pos    =     16.1 : 1.0
                 fanbase = True              Neg : Pos    =     16.1 : 1.0
                    glen = True              Neg : Pos    =     14.2 : 1.0
                customer = True              Neg : Pos    =     13.4 : 1.0
                    turd = True              Neg : Pos    =     12.3 : 1.0

## Training the model in the whole DF

In [16]:
%%time
# Preparing the text for the analysis
## swifter still decided to use pandas apply here, for some reason, so I just decided to stop using it
comments['Comments_Processed'] = comments['Comment'].apply(lambda x: remove_stopwords(lemmatize(tokenize(clean_up(x))))) # 33mins to finish

# Creating the 'target' column
comments['Target'] = np.where((comments['Userscore'] >= 5), 'Pos', 'Neg')

CPU times: user 32min 41s, sys: 32 s, total: 33min 13s
Wall time: 33min 38s


In [23]:
# Downcasting the column types just because, to save some memory
comments.dtypes

for col in comments.columns:
    if col not in ['Userscore', 'Comments_Processed']:
        comments[col] = comments[col].astype('string')  # unfortunately, seems like the comments_processed col cannot be changed into string

comments['Userscore'] = pd.to_numeric(comments['Userscore'], downcast='integer')

In [24]:
# Checking final memory usage
comments.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282201 entries, 0 to 282200
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Title               282201 non-null  string
 1   Platform            282201 non-null  string
 2   Userscore           282201 non-null  int8  
 3   Comment             282201 non-null  string
 4   Username            282201 non-null  string
 5   Comments_Processed  282201 non-null  object
 6   Target              282201 non-null  string
dtypes: int8(1), object(1), string(5)
memory usage: 217.4 MB


In [25]:
%%time
# Creating the bag of words
bow = [word for lst in comments['Comments_Processed'] for word in lst]
fdist = FreqDist(bow)

# Getting just the 5k most common words
most_common = fdist.most_common(5000)

# Building the features and making the matrix
matrix = make_matrix(comments['Comment'], comments['Target'], most_common)

# Defining the size to use for the training and testing
size = int(len(matrix) * 0.2)  # xxx -> 20% of the data

# Training with 80% of the data and testing against the remaining 20%
training_set = matrix[size:]
testing_set = matrix[:size]

# Initializing and training the model
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Showing the top 15 most informative features
most_inf_feat = classifier.show_most_informative_features(15)
print('')

# Checking the model's accuracy
print('Model accuracy:', str(round(nltk.classify.accuracy(classifier, testing_set) * 100, 2)) + '%', '\n')  # took 53 mins

Most Informative Features
              ticketsnew = True              Neg : Pos    =     31.3 : 1.0
            diretidegive = True              Neg : Pos    =     24.9 : 1.0
                 maestra = True              Pos : Neg    =     23.1 : 1.0
               excelente = True              Pos : Neg    =     22.8 : 1.0
                  refund = True              Neg : Pos    =     18.7 : 1.0
                  pikmin = True              Pos : Neg    =     18.0 : 1.0
                 mejores = True              Pos : Neg    =     17.8 : 1.0
                    duda = True              Pos : Neg    =     16.4 : 1.0
                  glados = True              Pos : Neg    =     16.4 : 1.0
                  sonora = True              Pos : Neg    =     14.9 : 1.0
             uninstalled = True              Neg : Pos    =     14.0 : 1.0
                disgrace = True              Neg : Pos    =     13.8 : 1.0
                    scam = True              Neg : Pos    =     13.2 : 1.0

In [26]:
'''
I find it strange that after all this significantly bigger training the model's accuracy has barely increase.
I would expect to see at least a 70% there.
On a side note, I find it very odd that almost all the most informative words are all neutral?? That's just so strange
to me, since neutral only accounts for 10k records, whereas positive has over 220k values!!

After I ran everything again the scores changed, however, the point still stands. After training with a huge amount
of data, the scores barely changed. Don't know if it is a good thing or not.

Pretty sure I might have a problem with this dataset being super unbalanced, since I have so many pos over negs. Have to look into it.
'''
comments['Target'].value_counts()

Pos    233865
Neg     48336
Name: Target, dtype: Int64

In [28]:
# Just taking a look at the final dataframe
comments.head()

Unnamed: 0,Title,Platform,Userscore,Comment,Username,Comments_Processed,Target
0,The Legend of Zelda: Ocarina of Time,Nintendo64,10,"Everything in OoT is so near at perfection, it...",SirCaestus,"[everything, oot, near, perfection, really, wo...",Pos
1,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I won't bore you with what everyone is already...,Kaistlin,"[win, bore, everyone, already, say, amazing, g...",Pos
2,The Legend of Zelda: Ocarina of Time,Nintendo64,10,Anyone who gives the masterpiece below a 7 or ...,Jacody,"[anyone, give, masterpiece, either, hate, asto...",Pos
3,The Legend of Zelda: Ocarina of Time,Nintendo64,10,I'm one of those people who think that this is...,doodlerman,"[one, people, think, great, game, time, matter...",Pos
4,The Legend of Zelda: Ocarina of Time,Nintendo64,10,This game is the highest rated game on Metacr...,StevenA,"[game, high, rated, game, metacritic, good, re...",Pos


In [29]:
# Saving the trained algorithm
save_classifier = open('naivebayes_original_pt2_full_dataset.pickle', 'wb')
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [None]:
# To open the classifier and use it
'''classifier_f = open('naivebayes.pickle', 'rb')
classifier = pickle.load(classifier_f)
classifier_f.close()'''

In [30]:
# Resetting the index before the export so it looks better
df.reset_index(drop = True, inplace = True)

# Exporting this DF to always use the same 5k sample in the next notebook for different classifiers (algorithms)
df.to_json('Datasets/comments_5ksample.json')  # exporting as json to maintain the 'comments_processed' data type