In [1]:
# Common default imports
from pprint import pprint
from model_manager import LDAManager
from main.lda.config import LdaGeneratorConfig
from gensim import corpora
import pandas as pd
from gensim.corpora import Dictionary
# Plotting utility for the LDA models
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Corpus-specific stop words; they were picked from the document-frequency analysis in the following cells
stop_words = [
    'game',
    'play',
    '<game_name>',
]

# Pre Processing Step
We want to process the data for LDA as follows:
- Keep NOUNS only, as they are the elements that mostly hold aspect value
- Split reviews into sentences, to avoid topic modelling on overly broad documents (https://aclanthology.org/N10-1122.pdf)

In [None]:
# LDA does not benefit from words repeated across many documents: they act as stop words and should be removed.
corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
ds = pd.read_csv(corpus_path)['comments'].apply(lambda x: x.split(' '))

dictionary = corpora.Dictionary(ds)

# `Dictionary.dfs` already stores, per token id, the number of documents that contain
# the token. Reading it avoids rescanning the whole corpus once per vocabulary entry.
doc_frequency = {
    dictionary[token_id]: dictionary.dfs[token_id]
    for token_id in range(len(dictionary))
}

# Share of documents containing each token, most widespread tokens first.
pd.Series(doc_frequency, name=0).div(len(ds)).sort_values(ascending=False)

Are 'game' (40%), 'play' (20%) and '\<game_name\>' (11%) stop words? <br>
'game' surely is! What about the other two? I believe they bring no context either.
We remove all three!

### Default: Sentences split

In [6]:
# 80k subset: train on the training split, then score on the held-out test split
corpus_path = "../dataset/output/default_sentences/pre_processed.80k.csv"
test_corpus_path = "../dataset/output/default_sentences/pre_processed.80k.test.csv"

lda_config = LdaGeneratorConfig(name="default_sentences_80k")
model_manager = LDAManager.from_config(lda_config, stop_words)
# NOTE(review): the two positional booleans are opaque from here; confirm their meaning against LDAManager.get_model
model = model_manager.get_model(corpus_path, True, True)
res = model_manager.evaluate(test_corpus_path)

pprint(res)

Generating a new compiled model from fs
Model not found. Making a new one.
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/68390 [00:00<?, ?it/s]

{'coherence': [-17.177862702834112,
               -17.02777543298526,
               -17.290263201442876,
               -16.936958294632383],
 'perplexity': -8.588481933552947,
 'topn': [3, 5, 10, 20]}


In [7]:
# id -> word mapping stored on the trained model
dictionary: Dictionary = model.id2word
# Load the test texts as lists of space-separated tokens
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))
# Bag-of-words encoding of every test document
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(model, bow_corpus, dictionary)
# Render the plot
vis

Now the bigger 310k dataset:

In [2]:
# Full 310k dataset: train on the training split, then score on the held-out test split
corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
test_corpus_path = "../dataset/output/default_sentences/pre_processed.310k.test.csv"

lda_config = LdaGeneratorConfig(name="default_sentences")
model_manager = LDAManager.from_config(lda_config, stop_words)
# NOTE(review): the two positional booleans are opaque from here; confirm their meaning against LDAManager.get_model
model = model_manager.get_model(corpus_path, True, True)
res = model_manager.evaluate(test_corpus_path)

pprint(res)

Generating a new compiled model from fs
Model not found. Making a new one.
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/233897 [00:00<?, ?it/s]

{'coherence': [-14.345296267821746,
               -13.742494545430045,
               -14.779922291032763,
               -15.116752158504827],
 'perplexity': -8.19623024177193,
 'topn': [3, 5, 10, 20]}


In [3]:
# id -> word mapping stored on the trained model
dictionary: Dictionary = model.id2word
# Load the test texts as lists of space-separated tokens
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))
# Bag-of-words encoding of every test document
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(model, bow_corpus, dictionary)
# Render the plot
vis

### NOUN-sentences ds
Filter out every non-noun token from the dataset using the POS-tagged processing pipeline

In [None]:
from pre_processing import extract_pos_ds
import pandas as pd

# Export the "__noun"-filtered copy of the training split first, then of the test split.
# Each entry is (banner to print, POS-tagged input csv, destination csv).
for banner, corpus_path, store_path in (
    (
        "Creating the __noun filtered ds:",
        "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.csv",
        "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.csv",
    ),
    (
        "Creating the __noun filtered test ds:",
        "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.test.csv",
        "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.test.csv",
    ),
):
    print(banner)
    extract_pos_ds(pd.read_csv(corpus_path)['comments'], "__noun", store_path)
    print("ds created under: " + store_path)

Now we check if the NOUN only approach works better. <br>
Take nouns only of the POS tagged ds:

In [None]:
# Noun-only, sentence-level corpus: train on the training split, evaluate on the test split
corpus_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.csv"
test_corpus_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.test.csv"

# Use a dedicated model name. The previous name ("default_sentences") is the one used for the
# default sentence model; since models appear to be stored under a path derived from the
# config name, reusing it would mix up (or overwrite) the two saved models.
model_manager = LDAManager.from_config(LdaGeneratorConfig(name="noun_sentences"), stop_words)
# NOTE(review): the positional booleans are opaque from here; confirm their meaning against LDAManager.get_model
model = model_manager.get_model(corpus_path, False, True)
res = model_manager.evaluate(test_corpus_path)

pprint(res)

In [14]:
# id -> word mapping stored on the trained model
dictionary: Dictionary = model.id2word
# Load the test texts as lists of space-separated tokens
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))
# Bag-of-words encoding of every test document
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(model, bow_corpus, dictionary)
vis

Now that the dataset has been built and stored, train an LDA model on it:

In [4]:
# Noun-only corpus: train on the training split, evaluate on the test split
corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv"
test_corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv"

# Use a dedicated model name. The previous name ("default_sentences") is the one used for the
# default sentence model; since models appear to be stored under a path derived from the
# config name, reusing it would mix up (or overwrite) the two saved models.
model_manager = LDAManager.from_config(LdaGeneratorConfig(name="noun_only"), stop_words)
# NOTE(review): the positional booleans are opaque from here; confirm their meaning against LDAManager.get_model
model = model_manager.get_model(corpus_path, False, True)
res = model_manager.evaluate(test_corpus_path)

pprint(res)

Generating a new compiled model from scratch
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/195789 [00:00<?, ?it/s]

{'coherence': [-8.206318645320582,
               -8.092144166145504,
               -8.572682666308848,
               -9.292421386724754],
 'perplexity': -7.193918552296681,
 'topn': [3, 5, 10, 20]}


In [5]:
# id -> word mapping stored on the trained model
dictionary: Dictionary = model.id2word
# Load the test texts as lists of space-separated tokens
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))
# Bag-of-words encoding of every test document
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(model, bow_corpus, dictionary)

vis

Following the experiments, I decided to go with documents composed of nouns only. <br>
Hyperparameter tuning is the next step.

The noun-only model on sentences performs way worse; let's see if a non-LocalLDA approach that still focuses on NOUNS only is viable:

# Hyperparameter tuning results applied
We found the best $K$ for the LDA models applied to our datasets. <br>
As stated in the tuning notebook, the NOUN-on-sentences pipeline produced low-coherence models, which is why that approach was dropped.

### Default sentences
Best found $K \in \{9, 11\}$ <br>
Let's try training two models on the full data and compare them.

In [2]:
corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
test_corpus_path = "../dataset/output/default_sentences/pre_processed.310k.test.csv"

# One record per candidate number of topics; 'results' and 'model' are filled in by the training loop
runs = [
    dict(
        config=LdaGeneratorConfig(name=f"sentences-K{num_topics}", topics=num_topics),
        results=[],
        model=None,
    )
    for num_topics in (9, 11)
]

In [3]:
# Train and evaluate one model per configured run, keeping both on the run record
for run in runs:
    model_manager = LDAManager.from_config(run['config'], stop_words)
    print(model_manager.config)
    trained_model = model_manager.get_model(corpus_path)
    run['model'] = trained_model
    evaluation = model_manager.evaluate(test_corpus_path)
    run['results'] = evaluation
    # Echo the scores so the runs can be compared at a glance
    pprint(evaluation)

LdaGeneratorConfig(name='sentences-K9', topics=9, random_state=42, passes=10, alpha='symmetric', eta=None, output_folder='./output')
Generating a new compiled model from scratch
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/233897 [00:00<?, ?it/s]

{'coherence': [-16.814645681233134,
               -15.08919057759007,
               -14.764100315046967,
               -14.83918106863862],
 'perplexity': -8.022003243917984,
 'topn': [3, 5, 10, 20]}
LdaGeneratorConfig(name='sentences-K11', topics=11, random_state=42, passes=10, alpha='symmetric', eta=None, output_folder='./output')
Generating a new compiled model from scratch
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/233897 [00:00<?, ?it/s]

{'coherence': [-13.430330864577495,
               -14.203485979641725,
               -14.612426153560664,
               -15.049984868299163],
 'perplexity': -8.116019899285954,
 'topn': [3, 5, 10, 20]}


Compare the two visualizations

In [4]:
# Turn on inline notebook display of pyLDAvis visualizations for the cells below
pyLDAvis.enable_notebook()

In [5]:
# First run of the comparison; its model carries the id -> word mapping
lda_model = runs[0]['model']
dictionary: Dictionary = lda_model.id2word
# Load the test texts as lists of space-separated tokens
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))
# Bag-of-words encoding of every test document
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Visualize the topics
vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
vis

In [6]:
# Second run of the comparison; its model carries the id -> word mapping
lda_model = runs[1]['model']
dictionary: Dictionary = lda_model.id2word
# Load the test texts as lists of space-separated tokens
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))
# Bag-of-words encoding of every test document
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Visualize the topics
vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
vis

Of the two trained model configurations, the one with the lower variance yields the better model.<br>
The $K=9$ configuration, while having a better average, does not produce the most valid model. <br>
For increasing top-n, the $K=11$ model scores better and surpasses the other one.

In [25]:
from gensim.models import LdaModel, LdaMulticore

# We work on the best in coherence only: reload the stored K=11 sentence model from disk
m1: LdaModel = LdaMulticore.load('./output/sentences-K11/sentences-K11.model')
model = m1
# Show up to 20 topics, with the 20 highest-weighted words of each
m1.print_topics(num_topics=20, num_words=20)

[(0,
  '0.070*"player" + 0.038*"expansion" + 0.023*"high" + 0.014*"base" + 0.014*"want" + 0.012*"count" + 0.011*"rate" + 0.011*"point" + 0.011*"good" + 0.011*"low" + 0.010*"buy" + 0.010*"board" + 0.010*"number" + 0.009*"include" + 0.009*"recommend" + 0.009*"turn" + 0.008*"money" + 0.007*"score" + 0.007*"room" + 0.007*"interaction"'),
 (1,
  '0.027*"rule" + 0.025*"player" + 0.014*"go" + 0.014*"time" + 0.013*"turn" + 0.013*"end" + 0.013*"look" + 0.012*"get" + 0.012*"rating" + 0.011*"start" + 0.011*"hour" + 0.011*"easy" + 0.010*"know" + 0.009*"couple" + 0.008*"scenario" + 0.008*"lose" + 0.008*"try" + 0.008*"forward" + 0.008*"<number>" + 0.007*"win"'),
 (2,
  '0.064*"like" + 0.061*"feel" + 0.037*"lot" + 0.036*"little" + 0.034*"bit" + 0.017*"fun" + 0.016*"player" + 0.013*"thing" + 0.012*"way" + 0.011*"different" + 0.009*"take" + 0.009*"think" + 0.009*"get" + 0.007*"actually" + 0.007*"interaction" + 0.006*"long" + 0.006*"interesting" + 0.006*"point" + 0.006*"work" + 0.005*"boring"'),
 (3,
  

In [None]:
# Gold-standard aspect labels, and the manual assignment of each LDA topic id
# (as a string key) of the K=11 sentence model to one of them.
gold_aspects = ["luck", "bookkeeping", "downtime", "interaction", "bash", "complex/complicated", "misc"]
mapped_topics = {
    "0": "misc",  # Recommendation / Own / Want
    "1": "bash",  # Could also be downtime but bash seems to fit
    "2": "downtime",  # Seems noisy
    "3": "misc",  # Value (in terms of the game)
    "4": "misc",  # ???
    "5": "complex/complicated",  # Light + Family + Party -> Indicate complexity low or high
    "6": "bash",  # Directly act towards pushing down the player
    "7": "complex/complicated",  #
    "8": "bookkeeping",  # Resource management
    "9": "downtime",  # Temporal references and boring + ap
    "10": "interaction",  # Mechanics (I believe it to be interaction)
}

# The mapping is maintained by hand, so fail fast on a mistyped label or a forgotten topic id
unknown_aspects = set(mapped_topics.values()) - set(gold_aspects)
assert not unknown_aspects, f"labels missing from gold_aspects: {sorted(unknown_aspects)}"
assert set(mapped_topics) == {str(topic_id) for topic_id in range(11)}, "every topic of the K=11 model needs a label"

# With this mapping we can infer the aspect by label after classifying with LDA

Map to gold standards and/or recognize the associated aspect

### NOUN only
Best found $K \in \{7, 9\}$ <br>
Let's try training two models on the full data and compare them.


In [15]:
corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv"
test_corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv"

# One record per candidate number of topics; 'results' and 'model' are filled in by the training loop
runs = [
    dict(
        config=LdaGeneratorConfig(name=f"noun-K{num_topics}", topics=num_topics),
        results=[],
        model=None,
    )
    for num_topics in (7, 9)
]

In [8]:
# Train and evaluate one model per configured run, keeping both on the run record
for run in runs:
    model_manager = LDAManager.from_config(run['config'], stop_words)
    trained_model = model_manager.get_model(corpus_path)
    run['model'] = trained_model
    evaluation = model_manager.evaluate(test_corpus_path)
    run['results'] = evaluation
    # Echo the scores so the runs can be compared at a glance
    pprint(evaluation)

Generating a new compiled model from scratch
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/195789 [00:00<?, ?it/s]

{'coherence': [-7.948833081292833,
               -6.802857928452987,
               -6.95858608877034,
               -7.416240587926887],
 'perplexity': -6.993845573618511,
 'topn': [3, 5, 10, 20]}
Generating a new compiled model from scratch
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/195789 [00:00<?, ?it/s]

{'coherence': [-8.906816819996525,
               -7.345944142314569,
               -7.23757389847958,
               -8.497291086234405],
 'perplexity': -7.057565041995237,
 'topn': [3, 5, 10, 20]}


Visualize the results:

In [9]:
# Turn on inline notebook display of pyLDAvis visualizations for the cells below
pyLDAvis.enable_notebook()

In [10]:
# First run of the comparison; its model carries the id -> word mapping
lda_model = runs[0]['model']
dictionary: Dictionary = lda_model.id2word
# Load the test texts as lists of space-separated tokens
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))
# Bag-of-words encoding of every test document
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Visualize the topics
vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
vis

In [11]:
# Second run of the comparison; its model carries the id -> word mapping
lda_model = runs[1]['model']
dictionary: Dictionary = lda_model.id2word
# Load the test texts as lists of space-separated tokens
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))
# Bag-of-words encoding of every test document
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Visualize the topics
vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
vis

The model that is best in terms of coherence is also the best one overall, as it has both the lower variance and the best overall average. <br>
$K=9$ was also chosen because it performed better at top-3, but the first one ($K=7$) is more robust.

Map to gold standards and/or recognize the associated aspect

In [14]:
from gensim.models import LdaModel, LdaMulticore

# Only the run with the best coherence (the first one) is inspected
m1: LdaModel = runs[0]['model']
# Show up to 20 topics, with the 20 highest-weighted words of each
m1.print_topics(num_topics=20, num_words=20)

[(0,
  '0.037*"action" + 0.034*"card" + 0.028*"point" + 0.023*"building" + 0.019*"worker" + 0.019*"turn" + 0.019*"resource" + 0.017*"placement" + 0.012*"round" + 0.012*"player" + 0.012*"area" + 0.011*"way" + 0.011*"lot" + 0.011*"board" + 0.011*"engine" + 0.010*"end" + 0.009*"mechanic" + 0.009*"opponent" + 0.009*"order" + 0.009*"scoring"'),
 (1,
  '0.147*"card" + 0.027*"deck" + 0.026*"player" + 0.017*"luck" + 0.014*"hand" + 0.013*"filler" + 0.013*"gamer" + 0.012*"time" + 0.011*"co" + 0.010*"trick" + 0.009*"minute" + 0.009*"value" + 0.008*"way" + 0.008*"ability" + 0.008*"decision" + 0.008*"op" + 0.007*"draw" + 0.007*"round" + 0.007*"set" + 0.007*"character"'),
 (2,
  '0.058*"rule" + 0.028*"time" + 0.025*"player" + 0.019*"theme" + 0.018*"hour" + 0.016*"mechanic" + 0.016*"system" + 0.014*"scenario" + 0.011*"lot" + 0.009*"rating" + 0.009*"combat" + 0.008*"battle" + 0.008*"war" + 0.008*"rulebook" + 0.008*"wargame" + 0.007*"component" + 0.007*"book" + 0.007*"way" + 0.007*"copy" + 0.006*"gamep

In [None]:
# Gold-standard aspect labels, and the manual assignment of each LDA topic id
# (as a string key) of the K=7 noun-only model to one of them.
gold_aspects = ["luck", "bookkeeping", "downtime", "interaction", "bash", "complex/complicated", "misc"]
mapped_topics = {
    "0": "interaction",  # Strategy/Planning
    "1": "luck",  # Direct luck references as draw and cards and op + Ability (which can indicate low luck)
    "2": "downtime",  # Downtime seems most fitting
    "3": "complex/complicated",  # Strategy - Table
    "4": "misc",  # Components
    "5": "misc",  # Player Experience
    "6": "bash",  # I am not convinced over this one, could be interaction
}

# The mapping is maintained by hand, so fail fast on a mistyped label or a forgotten topic id
unknown_aspects = set(mapped_topics.values()) - set(gold_aspects)
assert not unknown_aspects, f"labels missing from gold_aspects: {sorted(unknown_aspects)}"
assert set(mapped_topics) == {str(topic_id) for topic_id in range(7)}, "every topic of the K=7 model needs a label"

# With this mapping we can infer the aspect by label after classifying with LDA

While NOUN processing was better in terms of coherence, the overall feeling I get from its aspects is that they are less understandable than the ones from the sentence level. <br>
It actually does have less noise and more conceptual separation.