In [31]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
# import en_core_web_lg

from tqdm import notebook as tqdm
from pprint import pprint

import pyLDAvis
import pyLDAvis.gensim 
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
nlp = spacy.load("en")

In [8]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

# Load Data

In [11]:
train_df = pd.read_csv('../data/cats/train.csv', index_col=0)
valid_df = pd.read_csv('../data/cats/valid.csv', index_col=0)
test_df = pd.read_csv('../data/cats/test.csv', index_col=0)

print(len(train_df), len(valid_df), len(test_df), len(train_df)+ len(valid_df)+ len(test_df))

train_df.head()

10754 3068 1549 15371


Unnamed: 0,game_id,description,Card_Game,Wargame,Fantasy,Party_Game,Dice,Fighting,Science_Fiction,Childrens_Game,Abstract_Strategy,Economic,rand
3,224517,Brass: Birmingham is an economic strategy game...,0,0,0,0,0,0,0,0,0,1,0.327588
5,233078,Twilight Imperium (Fourth Edition) is a game o...,0,1,0,0,0,0,1,0,0,1,0.21512
6,187645,From the publisher:<br/><br/>Star Wars: Rebell...,0,1,0,0,0,1,1,0,0,0,0.434723
7,12333,"&quot;Now the trumpet summons us again, not as...",0,1,0,0,0,0,0,0,0,0,0.567444
8,220308,Gaia Project is a new game in the line of Terr...,0,0,0,0,0,0,1,0,0,1,0.474764


In [15]:
train_df['description'] = train_df.description.apply(lambda x: strip_tags(x))
valid_df['description'] = valid_df.description.apply(lambda x: strip_tags(x))
test_df['description'] = test_df.description.apply(lambda x: strip_tags(x))

train_df.head()

Unnamed: 0,game_id,description,Card_Game,Wargame,Fantasy,Party_Game,Dice,Fighting,Science_Fiction,Childrens_Game,Abstract_Strategy,Economic,rand
3,224517,Brass: Birmingham is an economic strategy game...,0,0,0,0,0,0,0,0,0,1,0.327588
5,233078,Twilight Imperium (Fourth Edition) is a game o...,0,1,0,0,0,0,1,0,0,1,0.21512
6,187645,From the publisher:Star Wars: Rebellion is a b...,0,1,0,0,0,1,1,0,0,0,0.434723
7,12333,"""Now the trumpet summons us again, not as a ca...",0,1,0,0,0,0,0,0,0,0,0.567444
8,220308,Gaia Project is a new game in the line of Terr...,0,0,0,0,0,0,1,0,0,1,0.474764


In [16]:
# Iterates over the words in the stop words list and resets the "is_stop" flag.
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

In [17]:
def lemmatizer(doc):
    # This takes in a doc of tokens from the NER and lemmatizes them. 
    # Pronouns (like "I" and "you" get lemmatized to '-PRON-', so I'm removing those.
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)
    
def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [19]:
doc_list = []
# Iterates through each article in the corpus.
for doc in tqdm(train_df.description):
    # Passes that article through the pipeline and adds to a new list.
    pr = nlp(doc)
    doc_list.append(pr)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=10754.0), HTML(value='')))




In [22]:
# Creates, which is a mapping of word IDs to words.
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

In [23]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [24]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.081*"ship" + 0.043*"treasure" + 0.031*"pirate" + 0.017*"Star" + '
  '0.016*"crew" + 0.015*"island" + 0.010*"boat" + 0.009*"coin" + 0.009*"sea" + '
  '0.009*"captain"'),
 (1,
  '0.033*"game" + 0.030*"player" + 0.008*"good" + 0.006*"time" + '
  '0.006*"question" + 0.006*"money" + 0.006*"word" + 0.006*"win" + '
  '0.006*"answer" + 0.006*"build"'),
 (2,
  '0.077*"card" + 0.071*"player" + 0.031*"game" + 0.024*"play" + 0.017*"point" '
  '+ 0.017*"turn" + 0.013*"win" + 0.012*"place" + 0.012*"dice" + '
  '0.011*"number"'),
 (3,
  '0.040*"game" + 0.023*"card" + 0.017*"character" + 0.015*"play" + '
  '0.014*"player" + 0.008*"Game" + 0.007*"deck" + 0.007*"     " + 0.006*"new" '
  '+ 0.006*"use"'),
 (4,
  '0.017*"de" + 0.010*"des" + 0.008*"et" + 0.007*"la" + 0.007*"le" + 0.006*"à" '
  '+ 0.006*"les" + 0.004*"di" + 0.004*"une" + 0.004*"Caesar"'),
 (5,
  '0.020*"player" + 0.017*"game" + 0.008*"power" + 0.008*"control" + '
  '0.007*"use" + 0.007*"battle" + 0.007*"attack" + 0.006*"fight" + '

In [29]:
# Compute Perplexity
# a measure of how good the model is. lower the better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  


Perplexity:  -8.644986829102502


In [33]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, words)
vis

# With TF-IDF

In [35]:
tfidf = gensim.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [38]:
lda_model_tfidf = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [39]:
# Print the Keyword in the 10 topics
pprint(lda_model_tfidf.print_topics())
doc_lda = lda_model_tfidf[corpus_tfidf]

[(0,
  '0.004*"letter" + 0.002*"ball" + 0.002*"spinner" + 0.002*"trivia" + '
  '0.002*"spin" + 0.001*"Pursuit" + 0.001*"booklet" + 0.001*"Trivial" + '
  '0.001*"wheel" + 0.001*"soviet"'),
 (1,
  '0.006*"Monopoly" + 0.002*"ingredient" + 0.002*"Disney" + 0.002*"200" + '
  '0.002*"trump" + 0.002*"phrase" + 0.001*"3d" + 0.001*"chicken" + '
  '0.001*"pencil" + 0.001*"sentence"'),
 (2,
  '0.001*"butterfly" + 0.001*"honey" + 0.000*"hive" + 0.000*"Jeux" + '
  '0.000*"Créateurs" + 0.000*"Société" + 0.000*"Concours" + 0.000*"bee" + '
  '0.000*"Ligretto" + 0.000*"Blokus"'),
 (3,
  '0.000*"iceberg" + 0.000*"antique" + 0.000*"Titanic" + 0.000*"Sheep" + '
  '0.000*"LEGO" + 0.000*"Hobbit" + 0.000*"Unexpected" + 0.000*"bell" + '
  '0.000*"Forward" + 0.000*"baby"'),
 (4,
  '0.001*"Stories" + 0.001*"penguin" + 0.001*"fiddle" + 0.001*"reconstruct" + '
  '0.001*"spooky" + 0.000*"morbid" + 0.000*"Vader" + 0.000*"fiddly" + '
  '0.000*"teenager" + 0.000*"Darth"'),
 (5,
  '0.006*"card" + 0.006*"  " + 0.003*"p

In [40]:
# Compute Perplexity
# a measure of how good the model is. lower the better.
print('\nPerplexity: ', lda_model_tfidf.log_perplexity(corpus_tfidf))  


Perplexity:  -11.701924723028032


In [41]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, words)
vis