In [1]:
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
# import en_core_web_lg

import tqdm
from pprint import pprint

import pyLDAvis
import pyLDAvis.gensim 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
nlp = spacy.load("en")

# Load Data

In [3]:
data_df = pd.read_csv('../data/bgg_data_mod.csv', index_col=0)

data_df = data_df[data_df.game_desc.str.len() > 5]

print(data_df.shape)
data_df.head()

(18693, 266)


Unnamed: 0,name,game_desc,boardgamecategorys,boardgamemechanics,game_id,CAT:,CAT:Abstract_Strategy,CAT:Action/Dexterity,CAT:Adventure,CAT:Age_of_Reason,...,mech:Variable_Phase_Order,mech:Variable_Player_Powers,mech:Variable_Setup,mech:Victory_Points_as_a_Resource,mech:Voting,mech:Worker_Placement,mech:Worker_Placement_with_Dice_Workers,mech:You_Choose,mech:Zone_of_Control,mech:and_Pool_Building
0,Gloomhaven,Gloomhaven is a game of Euro-inspired tactica...,"['Adventure', 'Exploration', 'Fantasy', 'Fight...","['Action Retrieval', 'Campaign / Battle Card D...",174430,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
1,Pandemic Legacy: Season 1,Pandemic Legacy is a co-operative campaign gam...,"['Environmental', 'Medical']","['Action Points', 'Cooperative Game', 'Hand Ma...",161936,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Terraforming Mars,"In the 2400s, mankind begins to terraform the ...","['Economic', 'Environmental', 'Industry / Manu...","['Card Drafting', 'End Game Bonuses', 'Hand Ma...",167791,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,Brass: Birmingham,Brass: Birmingham is an economic strategy game...,"['Economic', 'Industry / Manufacturing', 'Tran...","['Connections', 'Hand Management', 'Income', '...",224517,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,Through the Ages: A New Story of Civilization,Through the Ages: A New Story of Civilization ...,"['Card Game', 'Civilization', 'Economic']","['Action Points', 'Auction/Bidding', 'Auction:...",182028,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


train_df = pd.read_csv('../data/cats/train.csv', index_col=0)
valid_df = pd.read_csv('../data/cats/valid.csv', index_col=0)
test_df = pd.read_csv('../data/cats/test.csv', index_col=0)

print(len(train_df), len(valid_df), len(test_df), len(train_df)+ len(valid_df)+ len(test_df))

train_df.head()

train_df['description'] = train_df.description.apply(lambda x: strip_tags(x))
valid_df['description'] = valid_df.description.apply(lambda x: strip_tags(x))
test_df['description'] = test_df.description.apply(lambda x: strip_tags(x))

train_df.head()

In [4]:
# Iterates over the words in the stop words list and resets the "is_stop" flag.
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

In [5]:
def lemmatizer(doc):
    # This takes in a doc of tokens from the NER and lemmatizes them. 
    # Pronouns (like "I" and "you" get lemmatized to '-PRON-', so I'm removing those.
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)
    
def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

# The add_pipe function appends our functions to the default pipeline.
nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

In [6]:
doc_list = []
# Iterates through each article in the corpus.
for doc in tqdm.tqdm(data_df.description):
    # Passes that article through the pipeline and adds to a new list.
    pr = nlp(doc)
    doc_list.append(pr)

100%|████████████████████████████████████████████████████████████████████████████| 18693/18693 [07:49<00:00, 39.85it/s]


In [7]:
# Creates, which is a mapping of word IDs to words.
words = corpora.Dictionary(doc_list)

# Turns each document into a bag of words.
corpus = [words.doc2bow(doc) for doc in doc_list]

In [8]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [9]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.064*"tile" + 0.051*"board" + 0.050*"piece" + 0.024*"place" + '
  '0.022*"space" + 0.019*"animal" + 0.017*"color" + 0.016*"square" + '
  '0.016*"pawn" + 0.015*"player"'),
 (1,
  '0.083*"card" + 0.080*"player" + 0.033*"game" + 0.026*"play" + 0.018*"point" '
  '+ 0.018*"  " + 0.017*"turn" + 0.014*"win" + 0.013*"number" + 0.012*"dice"'),
 (2,
  '0.061*"de" + 0.021*"la" + 0.015*"que" + 0.015*"en" + 0.014*"y" + 0.013*"el" '
  '+ 0.012*"un" + 0.009*"los" + 0.007*"se" + 0.007*"para"'),
 (3,
  '0.211*"    " + 0.013*"robot" + 0.011*"Dé" + 0.011*"circuit" + '
  '0.009*"implement" + 0.005*"Rock" + 0.005*"Formule" + 0.005*"Paper" + '
  '0.005*"Devil" + 0.005*"№"'),
 (4,
  '0.043*"game" + 0.019*"  " + 0.014*"player" + 0.008*"question" + '
  '0.008*"team" + 0.007*"answer" + 0.007*"play" + 0.007*"word" + 0.007*"try" + '
  '0.007*"board"'),
 (5,
  '0.027*"money" + 0.020*"game" + 0.017*"player" + 0.015*"buy" + 0.014*"city" '
  '+ 0.011*"sell" + 0.009*"property" + 0.009*"company" + 0.009*"count

In [10]:
# Compute Perplexity
# a measure of how good the model is. lower the better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  


Perplexity:  -8.690066921572363


In [11]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, words)
vis

# With TF-IDF

In [12]:
tfidf = gensim.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [13]:
lda_model_tfidf = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                           id2word=words,
                                           num_topics=10, 
                                           random_state=2,
                                           update_every=1,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [14]:
# Print the Keyword in the 10 topics
pprint(lda_model_tfidf.print_topics())
doc_lda = lda_model_tfidf[corpus_tfidf]

[(0,
  '0.000*"Rory" + 0.000*"Wonderland" + 0.000*"telling" + 0.000*"Alice" + '
  '0.000*"Carrom" + 0.000*"librarian" + 0.000*"SonhandoSonhando" + '
  '0.000*"Brady" + 0.000*"Dés" + 0.000*"Merveilles"'),
 (1,
  '0.000*"burger" + 0.000*"Burger" + 0.000*"Cry" + 0.000*"Havoc" + '
  '0.000*"Railways" + 0.000*"cookie" + 0.000*"dino" + 0.000*"Shogi" + '
  '0.000*"mustard" + 0.000*"Ketchup"'),
 (2,
  '0.000*"Turtles" + 0.000*"turtle" + 0.000*"Créateurs" + 0.000*"Concours" + '
  '0.000*"Société" + 0.000*"Teenage" + 0.000*"Jeux" + 0.000*"Ogre" + '
  '0.000*"TMNT" + 0.000*"Ali"'),
 (3,
  '0.002*"ingredient" + 0.002*"=" + 0.002*"ghost" + 0.002*"Disney" + '
  '0.002*"memory" + 0.002*"dog" + 0.002*"judge" + 0.002*"pop" + 0.002*"brain" '
  '+ 0.002*"british"'),
 (4,
  '0.003*"$" + 0.002*"hotel" + 0.002*"monopoly" + 0.002*"frog" + 0.002*"trump" '
  '+ 0.001*"tip" + 0.001*"electronic" + 0.001*"pad" + 0.001*"mouth" + '
  '0.001*"sword"'),
 (5,
  '0.001*"ant" + 0.000*"ladybug" + 0.000*"anthill" + 0.000*

In [15]:
# Compute Perplexity
# a measure of how good the model is. lower the better.
print('\nPerplexity: ', lda_model_tfidf.log_perplexity(corpus_tfidf))  


Perplexity:  -11.715382635112029


In [17]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, words)
vis