# Pre Processing Step
We want to pre-process the data for LDA as follows:
- Keep NOUNS only, as they are the elements that mostly carry aspect information
- Split reviews into sentences, to avoid modelling topics at too broad a level (https://aclanthology.org/N10-1122.pdf)

In [3]:
import pandas as pd
from gensim import corpora

# Words that occur in a large share of the documents give LDA nothing to separate topics
# with, so we measure each token's document frequency to spot corpus-specific stop words.
corpus_path = "../dataset/output/default_sentences/pre_processed.80k.csv"
ds = pd.read_csv(corpus_path)['comments'].apply(lambda x: x.split(' '))

dictionary = corpora.Dictionary(ds)

# `Dictionary.dfs` maps token id -> number of documents containing that token and is
# filled while the dictionary is built. Reading it avoids re-scanning the whole corpus
# once per vocabulary entry (O(vocabulary * documents)) and yields the same counts.
doc_frequency = {
    dictionary[token_id]: dictionary.dfs[token_id]
    for token_id in range(len(dictionary))
}

# Share of documents in which each token appears.
pd.DataFrame.from_dict(doc_frequency, orient='index')[0].map(lambda x: x / len(ds))

{'game': 28115,
 'german': 110,
 'involve': 286,
 'long': 1689,
 'politic': 24,
 'political': 70,
 'hard': 1027,
 'intrigue': 39,
 'love': 2026,
 'play': 13867,
 'player': 6769,
 'try': 1797,
 'drawback': 37,
 'incredible': 72,
 'length': 266,
 'rule': 2633,
 'translation': 26,
 'cardboard': 96,
 'excellent': 604,
 'nice': 1704,
 'piece': 525,
 'token': 352,
 'balanced': 254,
 'fairly': 375,
 'sister': 22,
 'tend': 158,
 'collection': 658,
 'core': 256,
 'mid': 99,
 'strategy': 1621,
 'elegant': 159,
 'highly': 276,
 'score': 820,
 'simple': 1577,
 'abstract': 469,
 'board': 1902,
 'charm': 42,
 'island': 84,
 'japanese': 34,
 'paste': 54,
 'shape': 77,
 'sort': 267,
 'theme': 2257,
 'true': 152,
 'work': 853,
 'badly': 54,
 'draw': 749,
 'like': 6490,
 'screw': 115,
 'think': 2338,
 'tile': 1237,
 'big': 978,
 'fan': 526,
 'luck': 1294,
 'ruin': 86,
 'sale': 84,
 'trade': 348,
 'available': 308,
 'bean': 23,
 'card': 5601,
 'edition': 649,
 'miss': 358,
 'name': 60,
 'number': 615,
 '

In [9]:
# Turn the absolute document counts into the share of documents containing each token.
(
    pd.DataFrame
    .from_dict(doc_frequency, orient='index')[0]
    .map(lambda doc_count: doc_count / len(ds))
)

game         0.411098
german       0.001608
involve      0.004182
long         0.024697
politic      0.000351
               ...   
muppet       0.000015
pigs         0.000015
coerce       0.000015
hibernate    0.000015
improper     0.000015
Name: 0, Length: 19749, dtype: float64

Are 'game' (40%), 'play' (20%) and '<game_name>' (11%) stopwords? <br>
'game' is certainly a stopword; what about the other two? I believe they bring no context either.
We remove all three!

In [1]:
# Corpus-specific stop words chosen from the document-frequency analysis above.
stop_words = [
    'game',
    'play',
    '<game_name>',
]

In [2]:
# Baseline model: train LDA on the sentence-level corpus (all words kept), working on
# sentences rather than whole reviews to avoid recognising only global topics.
from model import LdaModelGenerator, LdaGeneratorConfig

# Pre-processed training corpus from the `default_sentences` output folder.
corpus_path = "../dataset/output/default_sentences/pre_processed.80k.csv"
# Only the corpus path is passed; no other generator option is set here.
default_config = LdaGeneratorConfig(corpus_file_path=corpus_path)
# `stop_words` comes from the previous cell. `make_model()` returns the model together with
# the dictionary used later for `doc2bow`; note that this rebinds the name `dictionary`
# that the document-frequency cell used for its own gensim dictionary.
lda, dictionary = LdaModelGenerator(default_config, stop_words).make_model()

Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/68390 [00:00<?, ?it/s]

In [3]:
from gensim.models import CoherenceModel
import pandas as pd

# Held-out sentences used to score the baseline model.
test_ds = pd.read_csv("../dataset/output/default_sentences/pre_processed.80k.test.csv")

# Tokenised sentences; these are handed to the c_v coherence model as `texts`.
texts = test_ds['comments'].apply(lambda sentence: sentence.split(' '))

# Number of top words per topic considered by the coherence evaluation.
topn = 10
coh = CoherenceModel(model=lda, texts=texts, coherence='c_v', topn=topn)

# Bag-of-words view of the same held-out sentences, as required by `log_perplexity`.
# NOTE(review): gensim documents `log_perplexity` as a per-word likelihood bound, not the
# perplexity itself, so the "Model perplexity" label below may be misleading - confirm.
test_bow = [dictionary.doc2bow(tokens) for tokens in texts]

print(f"Model perplexity: {lda.log_perplexity(test_bow)}")
print(f"On topn={topn} we have:")
print(f"Model coherence of: {coh.get_coherence()}")
print("Coherence per topic:")
coh.get_coherence_per_topic()

Model perplexity: -28.539873341336463
On topn=10 we have:
Model coherence of: 0.3806187801058944
Coherence per topic:


[0.30619673418016713,
 0.625321044315345,
 0.3451918285525357,
 0.3961520295908048,
 0.3716536346661269,
 0.31241651938227855,
 0.38341617253487703,
 0.37256915384219746,
 0.3643398730431328,
 0.462646736713212,
 0.37025132057848176,
 0.24999385809652908,
 0.4016976786890141,
 0.3668163372978198]

In [4]:
# Inspect up to 20 topics of the baseline model, 20 highest-weighted words each.
lda.show_topics(num_topics=20, num_words=20)

[(0,
  '0.056*"know" + 0.049*"use" + 0.038*"thing" + 0.030*"similar" + 0.029*"actually" + 0.025*"option" + 0.021*"worth" + 0.021*"series" + 0.020*"good" + 0.018*"player" + 0.017*"copy" + 0.017*"die" + 0.017*"see" + 0.016*"sell" + 0.015*"update" + 0.015*"fact" + 0.015*"word" + 0.014*"plus" + 0.014*"basically" + 0.014*"available"'),
 (1,
  '0.234*"card" + 0.057*"board" + 0.039*"box" + 0.029*"big" + 0.028*"draw" + 0.025*"hand" + 0.025*"hard" + 0.024*"small" + 0.019*"ability" + 0.018*"player" + 0.018*"piece" + 0.017*"pick" + 0.017*"setup" + 0.017*"need" + 0.015*"story" + 0.015*"action" + 0.015*"track" + 0.014*"opponent" + 0.013*"special" + 0.012*"run"'),
 (2,
  '0.082*"lot" + 0.065*"long" + 0.061*"new" + 0.052*"take" + 0.035*"group" + 0.034*"design" + 0.032*"time" + 0.028*"player" + 0.021*"good" + 0.020*"fun" + 0.019*"rule" + 0.018*"write" + 0.017*"say" + 0.016*"problem" + 0.016*"leave" + 0.016*"deep" + 0.015*"bit" + 0.014*"campaign" + 0.013*"expect" + 0.012*"trick"'),
 (3,
  '0.083*"point

### POS: Nouns only

Now we check whether the noun-only approach works better. <br>
We keep only the nouns of the POS-tagged dataset:

In [29]:
# Now we try to work on nouns only to see if we get a good representation.
# Suffix the POS tagger appends to every noun token.
NOUN_TAG = '__noun'


def keep_nouns_only(pos_tagged_path, noun_only_path):
    """Write a noun-only version of a POS-tagged corpus and return it.

    Reads the 'comments' column of the CSV at `pos_tagged_path`, splits every document
    on single spaces and keeps only the tokens ending with `NOUN_TAG` (tag removed).
    Documents left with fewer than two nouns are discarded, duplicated documents are
    dropped, and the space-joined result is written to `noun_only_path` without index.

    Args:
        pos_tagged_path: path of the POS-tagged CSV to read.
        noun_only_path: path of the noun-only CSV to write.

    Returns:
        The pandas Series of noun-only documents that was written.
    """
    pos_tagged = pd.read_csv(pos_tagged_path)['comments']

    # NOTE(review): the `.swifter` accessor exists only once `swifter` has been imported;
    # no such import is visible in this notebook - presumably `model` pulls it in, confirm.
    tokens = pos_tagged.swifter.apply(lambda x: x.split(' '))
    # Slice the tag off the end instead of splitting on it, so a token that contains the
    # tag text somewhere before its end is not truncated.
    nouns = tokens.swifter.apply(lambda x: [w[:-len(NOUN_TAG)] for w in x if w.endswith(NOUN_TAG)])
    noun_docs = nouns[nouns.map(len) > 1].map(lambda x: ' '.join(x)).drop_duplicates()

    noun_docs.to_csv(noun_only_path, index=False)
    return noun_docs


# Training set.
corpus_path = "../dataset/output/pos_tagged/pre_processed.80k.noun_only.csv"
keep_nouns_only("../dataset/output/pos_tagged/pre_processed.80k.csv", corpus_path)

# Now we also make the test set, with exactly the same processing.
test_corpus_path = "../dataset/output/pos_tagged/pre_processed.80k.noun_only.test.csv"
ds = keep_nouns_only("../dataset/output/pos_tagged/pre_processed.80k.test.csv", test_corpus_path)

Pandas Apply:   0%|          | 0/60701 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/60701 [00:00<?, ?it/s]

In [9]:
import pandas as pd
from gensim import corpora

# Same document-frequency check as for the full corpus, this time on the noun-only
# documents: tokens present in a large share of documents are stop-word candidates.
corpus_path = "../dataset/output/pos_tagged/pre_processed.80k.noun_only.csv"
ds = pd.read_csv(corpus_path)['comments'].apply(lambda x: x.split(' '))

count_dictionary = corpora.Dictionary(ds)

# `Dictionary.dfs` maps token id -> number of documents containing that token and is
# filled while the dictionary is built. Reading it avoids re-scanning the whole corpus
# once per vocabulary entry (O(vocabulary * documents)) and yields the same counts.
doc_frequency = {
    count_dictionary[token_id]: count_dictionary.dfs[token_id]
    for token_id in range(len(count_dictionary))
}

# Share of documents in which each token appears.
pd.DataFrame.from_dict(doc_frequency, orient='index')[0].map(lambda x: x / len(ds))

drawback       0.001420
game           0.683538
length         0.008883
translation    0.001170
piece          0.018054
token          0.011013
fun            0.116711
sister         0.000940
board          0.061569
charm          0.001497
Name: 0, dtype: float64

In [6]:
# Same default generator configuration as the baseline, but trained on the noun-only
# corpus written by the POS-filtering cell.
from model import LdaModelGenerator, LdaGeneratorConfig

# Same corpus-specific stop words as for the baseline model.
stop_words = ['game', 'play', '<game_name>']
corpus_path = "../dataset/output/pos_tagged/pre_processed.80k.noun_only.csv"
# Only the corpus path is passed; no other generator option is set here.
default_config = LdaGeneratorConfig(corpus_file_path=corpus_path)
# Kept under separate names so the baseline `lda` / `dictionary` stay available.
noun_lda, noun_dictionary = LdaModelGenerator(default_config, stop_words).make_model()

Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/52120 [00:00<?, ?it/s]

In [7]:
from gensim.models import CoherenceModel
import pandas as pd

# Held-out noun-only documents used to score the noun-only model.
test_corpus_path = "../dataset/output/pos_tagged/pre_processed.80k.noun_only.test.csv"
test_ds = pd.read_csv(test_corpus_path)

# Tokenised documents; these are handed to the c_v coherence model as `texts`.
texts = test_ds['comments'].apply(lambda sentence: sentence.split(' '))

# Number of top words per topic considered by the coherence evaluation.
topn = 10
coh = CoherenceModel(model=noun_lda, texts=texts, coherence='c_v', topn=topn)

# Bag-of-words view of the same held-out documents, as required by `log_perplexity`.
# NOTE(review): gensim documents `log_perplexity` as a per-word likelihood bound, not the
# perplexity itself, so the "Model perplexity" label below may be misleading - confirm.
test_bow = [noun_dictionary.doc2bow(tokens) for tokens in texts]

print(f"Model perplexity: {noun_lda.log_perplexity(test_bow)}")
print(f"On topn={topn} we have:")
print(f"Model coherence of: {coh.get_coherence()}")
print("Coherence per topic:")
coh.get_coherence_per_topic()

Model perplexity: -20.254782829421483
On topn=10 we have:
Model coherence of: 0.521321437417917
Coherence per topic:


[0.41723452087712937,
 0.8103165457515475,
 0.6801578855107604,
 0.3912342857105625,
 0.5440103502257652,
 0.39467163953274637,
 0.5219267761869469,
 0.4897019448296467,
 0.6979860354360248,
 0.4267472846778877,
 0.4671028205934368,
 0.5115350579793349,
 0.49584457938416193,
 0.45003039715488774]

In [49]:
# Inspect up to 20 topics of the noun-only model, 20 highest-weighted words each.
noun_lda.show_topics(num_topics=20, num_words=20)

[(0,
  '0.170*"game" + 0.144*"rule" + 0.102*"play" + 0.059*"rating" + 0.030*"lot" + 0.020*"mode" + 0.019*"system" + 0.016*"brain" + 0.016*"battle" + 0.016*"series" + 0.015*"head" + 0.015*"designer" + 0.014*"enemy" + 0.013*"book" + 0.012*"depth" + 0.012*"rulebook" + 0.011*"wargame" + 0.011*"way" + 0.011*"house" + 0.011*"company"'),
 (1,
  '0.148*"game" + 0.143*"<game_name>" + 0.104*"version" + 0.097*"worker" + 0.095*"placement" + 0.074*"strategy" + 0.062*"art" + 0.027*"euro" + 0.024*"work" + 0.022*"idea" + 0.022*"taste" + 0.018*"aspect" + 0.016*"randomness" + 0.015*"half" + 0.014*"memory" + 0.014*"reason" + 0.011*"racing" + 0.011*"mechanism" + 0.010*"fight" + 0.008*"chess"'),
 (2,
  '0.074*"game" + 0.071*"collection" + 0.057*"map" + 0.048*"space" + 0.047*"board" + 0.038*"bonus" + 0.034*"weight" + 0.033*"drafting" + 0.028*"price" + 0.024*"addition" + 0.023*"word" + 0.021*"world" + 0.021*"trade" + 0.020*"improvement" + 0.020*"train" + 0.019*"wife" + 0.017*"text" + 0.017*"change" + 0.016*"

In [None]:
# The model now has a higher coherence, which is good, but are the extracted aspects good?
# Probably not.

In [5]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics of the baseline (all words) model.
# The bag-of-words corpus is rebuilt from the baseline test set instead of reusing the
# notebook-global `texts`: at this point of a top-to-bottom run `texts` holds the
# noun-only test set, which would silently pair the baseline model with the wrong corpus.
default_test_texts = (
    pd.read_csv("../dataset/output/default_sentences/pre_processed.80k.test.csv")['comments']
    .apply(lambda x: x.split(' '))
)
default_test_corpus = default_test_texts.apply(lambda x: dictionary.doc2bow(x)).tolist()

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda, default_test_corpus, dictionary)
vis

In [8]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Interactive pyLDAvis view of the noun-only model.
# `texts` is the tokenised noun-only test set loaded by the evaluation cell above.
pyLDAvis.enable_notebook()
noun_test_bow = [noun_dictionary.doc2bow(tokens) for tokens in texts]
vis = gensimvis.prepare(noun_lda, noun_test_bow, noun_dictionary)
vis

In [None]:
# Topic 1 identifies Downtime, for example. Yet it may also clash with bookkeeping.
# We have to tune the parameters better to check whether we can create better communities.

Following these experiments, I decided to build the documents from nouns only. <br>
Hyperparameter tuning is the next step.