# Pre Processing Step
We want to pre-process the data for LDA as follows:
- Keep NOUNS only, as they are the elements that mostly hold aspect value
- Split reviews into sentences (to avoid topic modelling in too broad a sense) (https://aclanthology.org/N10-1122.pdf)

In [None]:
import pandas as pd
from gensim import corpora

# Words that appear in a large share of the documents carry little topical signal for LDA,
# so we measure each token's document frequency to spot corpus-specific stopwords.
corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
ds = pd.read_csv(corpus_path)['comments'].apply(lambda x: x.split(' '))

dictionary = corpora.Dictionary(ds)

# `Dictionary.dfs` already maps token id -> number of documents containing that token,
# so there is no need to rescan the whole corpus once per vocabulary entry.
doc_frequency = {
    dictionary[token_id]: dictionary.dfs[token_id]
    for token_id in range(len(dictionary))
}

In [None]:
# Share of documents containing each token, most frequent first.
corpus_size = len(ds)
doc_counts = pd.DataFrame.from_dict(doc_frequency, orient='index')[0]
(doc_counts / corpus_size).sort_values(ascending=False)

Are 'game'(40%), 'play'(20%) and '\<game_name\>'(11%) stopwords? <br>
For sure 'game' is! What about the other two? I believe they bring no context.
We remove them!

In [1]:
# Corpus-specific stopwords picked from the document-frequency ranking.
stop_words = [
    'game',
    'play',
    '<game_name>',
]

In [2]:
# Default approach on the sentences only to avoid global topic recognition
from model import LdaModelGenerator
from main.lda.config import LdaGeneratorConfig

# Build the LDA model (and its dictionary) with the default configuration
# on the sentence-level corpus, filtering out the chosen stopwords.
corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
default_config = LdaGeneratorConfig(name="default")
default_generator = LdaModelGenerator(default_config, stop_words)
lda, dictionary = default_generator.make_model(corpus_path)

Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/233897 [00:00<?, ?it/s]

In [3]:
from gensim.models import CoherenceModel
import pandas as pd

# Held-out sentences used to evaluate the default model.
test_ds = pd.read_csv("../dataset/output/default_sentences/pre_processed.310k.test.csv")

# The coherence models below are fed the tokenised test comments.
texts = test_ds['comments'].apply(lambda comment: comment.split(' '))

# Number of top words per topic considered by the coherence evaluation.
topn = 10
coherence_kwargs = dict(texts=texts, topn=topn)
cv_coh = CoherenceModel(lda, coherence='c_v', **coherence_kwargs)
npmi_coh = CoherenceModel(lda, coherence='c_npmi', **coherence_kwargs)

In [4]:
# Report the evaluation metrics of the default model on the held-out texts.
# NOTE: `log_perplexity` returns the per-word likelihood bound (a negative number), not the
# perplexity itself; gensim derives its perplexity estimate as 2 ** (-bound).
test_corpus = [dictionary.doc2bow(tokens) for tokens in texts]
per_word_bound = lda.log_perplexity(test_corpus)
print(
    f"Per-word likelihood bound (log perplexity): {per_word_bound}\n"
    f"With topn = {topn} we have: \n"
    f" - CV coherence: {cv_coh.get_coherence()}\n"
    f" - NPMI coherence: {npmi_coh.get_coherence()}\n"
    "CV coherence per topic:"
)
cv_coh.get_coherence_per_topic()

Model perplexity: -31.480142717806686
With topn = 10 we have: 
 - CV coherence: 0.40848540031780045
 - NPMI coherence: 0.006683881671522012: 
CV coherence per topic:


[0.27254764741517806,
 0.4486105365601417,
 0.38699135879672514,
 0.36609561601874974,
 0.4000108767175522,
 0.4858056115886423,
 0.3388149335408939,
 0.4785679438306373,
 0.4978451360173294,
 0.5357060026370613,
 0.4060204725685014,
 0.33383785820077666,
 0.40856793362175825,
 0.3593736769352586]

In [5]:
# Inspect up to 20 topics of the default model, 20 words each.
lda.show_topics(num_topics=20, num_words=20)

[(0,
  '0.281*"player" + 0.052*"expansion" + 0.027*"try" + 0.023*"count" + 0.019*"year" + 0.017*"low" + 0.017*"well" + 0.017*"hour" + 0.017*"quickly" + 0.016*"read" + 0.016*"new" + 0.016*"old" + 0.015*"party" + 0.014*"goal" + 0.014*"good" + 0.014*"move" + 0.014*"say" + 0.013*"high" + 0.012*"figure" + 0.012*"spend"'),
 (1,
  '0.048*"turn" + 0.047*"building" + 0.045*"round" + 0.043*"end" + 0.041*"box" + 0.039*"big" + 0.038*"build" + 0.035*"work" + 0.030*"small" + 0.022*"power" + 0.019*"super" + 0.018*"choose" + 0.017*"balance" + 0.017*"couple" + 0.016*"perfect" + 0.016*"recommend" + 0.016*"create" + 0.015*"leave" + 0.015*"combo" + 0.015*"get"'),
 (2,
  '0.102*"lot" + 0.080*"bit" + 0.068*"little" + 0.055*"well" + 0.045*"take" + 0.036*"think" + 0.034*"long" + 0.028*"fun" + 0.027*"buy" + 0.026*"original" + 0.026*"get" + 0.019*"solid" + 0.017*"sure" + 0.016*"actually" + 0.014*"good" + 0.013*"similar" + 0.013*"sell" + 0.013*"thing" + 0.012*"way" + 0.011*"instead"'),
 (3,
  '0.086*"different" 

### POS: Nouns only

Now we check whether the NOUN-only approach works better. <br>
Keep only the nouns of the POS-tagged ds:

In [6]:
from pre_processing import extract_pos_ds
import pandas as pd

# Build the noun-only train and test corpora from the POS-tagged sentence-level data.
# Each entry is (banner to print, POS-tagged input csv, noun-only output csv).
noun_jobs = [
    (
        "Creating the __noun filtered ds:",
        "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.csv",
        "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.csv",
    ),
    (
        "Creating the __noun filtered test ds:",
        "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.test.csv",
        "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.test.csv",
    ),
]

for banner, corpus_path, store_path in noun_jobs:
    print(banner)
    tagged_comments = pd.read_csv(corpus_path)['comments']
    extract_pos_ds(tagged_comments, "__noun", store_path)
    print("ds created under: " + store_path)

Creating the __noun filtered ds:


Pandas Apply:   0%|          | 0/234249 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/234249 [00:00<?, ?it/s]

ds created under: ../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.csv
Creating the __noun filtered test ds:


Pandas Apply:   0%|          | 0/78083 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/78083 [00:00<?, ?it/s]

ds created under: ../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.test.csv


In [None]:
# This logic is translated in:
from pre_processing import extract_pos_ds

In [None]:
import pandas as pd
from gensim import corpora

# Words that appear in a large share of the documents carry little topical signal for LDA,
# so we measure each token's document frequency on the noun-only corpus as well.
corpus_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.csv"
ds = pd.read_csv(corpus_path)['comments'].apply(lambda x: x.split(' '))

count_dictionary = corpora.Dictionary(ds)

# `Dictionary.dfs` already maps token id -> number of documents containing that token,
# so there is no need to rescan the whole corpus once per vocabulary entry.
doc_frequency = {
    count_dictionary[token_id]: count_dictionary.dfs[token_id]
    for token_id in range(len(count_dictionary))
}

# Share of documents in which each token appears.
pd.DataFrame.from_dict(doc_frequency, orient='index')[0].map(lambda x: x / len(ds))

In [4]:
# Default approach on the sentences only to avoid global topic recognition
from model import LdaModelGenerator, LdaGeneratorConfig

# Build the default-configuration LDA model on the noun-only sentence-level corpus.
stop_words = [
    'game',
    'play',
    '<game_name>',
]
corpus_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.csv"
default_config = LdaGeneratorConfig(name="default")
noun_generator = LdaModelGenerator(default_config, stop_words)
noun_lda, noun_dictionary = noun_generator.make_model(corpus_path)

Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/167602 [00:00<?, ?it/s]

In [5]:
from gensim.models import CoherenceModel
import pandas as pd

# Held-out noun-only sentences used to evaluate the noun model.
test_corpus_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.test.csv"
test_ds = pd.read_csv(test_corpus_path)

# The coherence models below are fed the tokenised test comments.
texts = test_ds['comments'].apply(lambda comment: comment.split(' '))

# Number of top words per topic considered by the coherence evaluation.
topn = 10
coherence_kwargs = dict(texts=texts, topn=topn)
cv_coh = CoherenceModel(noun_lda, coherence='c_v', **coherence_kwargs)
npmi_coh = CoherenceModel(noun_lda, coherence='c_npmi', **coherence_kwargs)

In [6]:
# Report the evaluation metrics of the noun-only model on the held-out texts.
# NOTE: `log_perplexity` returns the per-word likelihood bound (a negative number), not the
# perplexity itself; gensim derives its perplexity estimate as 2 ** (-bound).
noun_test_corpus = [noun_dictionary.doc2bow(tokens) for tokens in texts]
per_word_bound = noun_lda.log_perplexity(noun_test_corpus)
print(
    f"Per-word likelihood bound (log perplexity): {per_word_bound}\n"
    f"With topn = {topn} we have: \n"
    f" - CV coherence: {cv_coh.get_coherence()}\n"
    f" - NPMI coherence: {npmi_coh.get_coherence()}\n"
    "CV coherence per topic:"
)
cv_coh.get_coherence_per_topic()

Model perplexity: -30.785678623952037
With topn = 10 we have: 
 - CV coherence: 0.2885869000600669
 - NPMI coherence: -0.03455960748217064: 
CV coherence per topic:


[0.32972073815509945,
 0.38794153062224623,
 0.2586327140975048,
 0.2619792147532881,
 0.31468470514702435,
 0.2315422886045449,
 0.19522473098956622,
 0.40443356934457364,
 0.22090880887349415,
 0.3502331299174756,
 0.28964400649838845,
 0.2626494647303481,
 0.2935958395068493,
 0.23902585960053319]

In [10]:
# Inspect up to 20 topics of the noun-only model, 20 words each.
noun_lda.show_topics(num_topics=20, num_words=20)

[(0,
  '0.233*"bit" + 0.095*"character" + 0.095*"component" + 0.090*"box" + 0.083*"art" + 0.044*"quality" + 0.039*"depth" + 0.033*"filler" + 0.030*"twist" + 0.029*"series" + 0.027*"use" + 0.025*"count" + 0.015*"company" + 0.014*"field" + 0.012*"fantasy" + 0.012*"memory" + 0.012*"learning" + 0.012*"damage" + 0.010*"favorite" + 0.009*"curve"'),
 (1,
  '0.149*"fun" + 0.097*"lot" + 0.072*"decision" + 0.068*"placement" + 0.066*"worker" + 0.065*"people" + 0.052*"table" + 0.044*"scoring" + 0.043*"option" + 0.042*"space" + 0.035*"family" + 0.031*"opponent" + 0.025*"issue" + 0.016*"day" + 0.016*"ton" + 0.014*"light" + 0.012*"fiddly" + 0.011*"leader" + 0.011*"session" + 0.011*"review"'),
 (2,
  '0.562*"player" + 0.051*"engine" + 0.034*"goal" + 0.029*"combat" + 0.026*"ship" + 0.026*"chance" + 0.023*"movement" + 0.022*"room" + 0.020*"dungeon" + 0.017*"move" + 0.016*"difference" + 0.016*"line" + 0.015*"number" + 0.015*"skill" + 0.013*"interest" + 0.012*"situation" + 0.012*"rest" + 0.010*"position" 

In [None]:
# The model now has high coherence, which is good, but are the extracted aspects good?
# Probably not.

In [11]:
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics of the default sentence-level model.
# `texts` has been rebound to the noun-only test set by the evaluation cells above, so the
# default model's own held-out sentences are reloaded here instead of reusing that variable.
pyLDAvis.enable_notebook()
default_test_path = "../dataset/output/default_sentences/pre_processed.310k.test.csv"
default_texts = pd.read_csv(default_test_path)['comments'].apply(lambda x: x.split(' '))
default_corpus = [dictionary.doc2bow(tokens) for tokens in default_texts]
vis = gensimvis.prepare(lda, default_corpus, dictionary)
vis

In [12]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Render the interactive pyLDAvis view of the noun-only model on its test texts.
pyLDAvis.enable_notebook()
noun_test_corpus = [noun_dictionary.doc2bow(tokens) for tokens in texts]
vis = gensimvis.prepare(noun_lda, noun_test_corpus, noun_dictionary)
vis

In [None]:
# Topic 1 identifies Downtime, for example. Yet it may also clash (overlap) with bookkeeping.
# We have to tune the parameters better to check whether we can create better communities.

Following the experiments, I decided to go with documents composed of nouns only. <br>
Hyperparameter tuning is the next step.

The noun-only model on sentences performs way worse; let's see whether dropping LocalLDA (sentence-level documents) while still keeping NOUNS only is viable:

In [13]:
from pre_processing import extract_pos_ds
import pandas as pd

# Build the noun-only train and test corpora from the POS-tagged data under `pos_tagged`.
# Each entry is (banner to print, POS-tagged input csv, noun-only output csv).
noun_jobs = [
    (
        "Creating the __noun filtered ds:",
        "../dataset/output/pos_tagged/pre_processed.310k.csv",
        "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv",
    ),
    (
        "Creating the __noun filtered test ds:",
        "../dataset/output/pos_tagged/pre_processed.310k.test.csv",
        "../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv",
    ),
]

for banner, corpus_path, store_path in noun_jobs:
    print(banner)
    tagged_comments = pd.read_csv(corpus_path)['comments']
    extract_pos_ds(tagged_comments, "__noun", store_path)
    print("ds created under: " + store_path)

Creating the __noun filtered ds:


Pandas Apply:   0%|          | 0/233777 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/233777 [00:00<?, ?it/s]

ds created under: ../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv
Creating the __noun filtered test ds:


Pandas Apply:   0%|          | 0/77926 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/77926 [00:00<?, ?it/s]

ds created under: ../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv


In [14]:
# Default configuration on the noun-only corpus built from the `pos_tagged` data
# (the non-LocalLDA variant, i.e. not the sentence-level corpus).
from model import LdaModelGenerator, LdaGeneratorConfig

stop_words = [
    'game',
    'play',
    '<game_name>',
]
corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv"
default_config = LdaGeneratorConfig(name='default')
noun_generator = LdaModelGenerator(default_config, stop_words)
noun_lda, noun_dictionary = noun_generator.make_model(corpus_path)

Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/195789 [00:00<?, ?it/s]

In [15]:
from gensim.models import CoherenceModel
import pandas as pd

# Held-out noun-only documents used to evaluate the noun model.
test_corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv"
test_ds = pd.read_csv(test_corpus_path)

# The coherence models below are fed the tokenised test comments.
texts = test_ds['comments'].apply(lambda comment: comment.split(' '))

# Number of top words per topic considered by the coherence evaluation.
topn = 10
coherence_kwargs = dict(texts=texts, topn=topn)
cv_coh = CoherenceModel(noun_lda, coherence='c_v', **coherence_kwargs)
npmi_coh = CoherenceModel(noun_lda, coherence='c_npmi', **coherence_kwargs)

In [16]:
# Report the evaluation metrics of the noun-only model on the held-out texts.
# NOTE: `log_perplexity` returns the per-word likelihood bound (a negative number), not the
# perplexity itself; gensim derives its perplexity estimate as 2 ** (-bound).
noun_test_corpus = [noun_dictionary.doc2bow(tokens) for tokens in texts]
per_word_bound = noun_lda.log_perplexity(noun_test_corpus)
print(
    f"Per-word likelihood bound (log perplexity): {per_word_bound}\n"
    f"With topn = {topn} we have: \n"
    f" - CV coherence: {cv_coh.get_coherence()}\n"
    f" - NPMI coherence: {npmi_coh.get_coherence()}\n"
    "CV coherence per topic:"
)
cv_coh.get_coherence_per_topic()

Model perplexity: -23.98261823214068
With topn = 10 we have: 
 - CV coherence: 0.5975213978908255
 - NPMI coherence: 0.009577051016547825: 
CV coherence per topic:


[0.5551108165702432,
 0.713715061668484,
 0.5962325548829728,
 0.7550304784445258,
 0.6024835953397344,
 0.532905401010275,
 0.5991436978554352,
 0.47656312764646813,
 0.6019909845184954,
 0.5737234016458104,
 0.6269564511945285,
 0.58992218149521,
 0.6158909102359408,
 0.5256309079634304]

In [17]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Render the interactive pyLDAvis view of the noun-only model on its test texts.
pyLDAvis.enable_notebook()
noun_test_corpus = [noun_dictionary.doc2bow(tokens) for tokens in texts]
vis = gensimvis.prepare(noun_lda, noun_test_corpus, noun_dictionary)
vis