In [1]:
# Common default imports
from pprint import pprint

import numpy as np
import pandas as pd
# Plotting utility for the LDA models
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import corpora
from gensim.corpora import Dictionary

from main.lda.config import LdaGeneratorConfig
from model_manager import LDAManager
from pre_processing import extract_pos_ds

# Corpus-specific stop words; they were picked from the document-frequency
# analysis performed in the following cells.
stop_words = [
    'game',
    'play',
    '<game_name>',
]

# Pre Processing Step
We want to process the data for LDA as follows:
- Keep NOUNS only, as they are the elements that mostly carry aspect value
- Split reviews into sentences, to avoid modelling topics at too broad a level (https://aclanthology.org/N10-1122.pdf)

In [None]:
# LDA does not benefit from words that are repeated across many documents, so
# we look for them here and treat the most widespread ones as stop words.
corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
ds = pd.read_csv(corpus_path)['comments'].apply(lambda x: x.split(' '))

dictionary = corpora.Dictionary(ds)

# `Dictionary.dfs` already maps token id -> number of documents containing the
# token (collected while the dictionary is built), so there is no need to scan
# the whole corpus once per vocabulary entry.
doc_frequency = {
    dictionary[token_id]: dictionary.dfs[token_id]
    for token_id in range(len(dictionary))
}

# Share of documents each token appears in, most widespread tokens first.
pd.DataFrame.from_dict(doc_frequency, orient='index')[0].map(lambda x: x / len(ds)).sort_values(ascending=False)

Are 'game'(40%), 'play'(20%) and '\<game_name\>'(11%) stopwords? <br>
For sure 'game' is! What about the other two? I believe they bring no context.
We remove them!

### Default: Sentences split

In [6]:
# Quick experiment on the 80k sample of the sentence-level corpus.
corpus_path = "../dataset/output/default_sentences/pre_processed.80k.csv"
test_corpus_path = "../dataset/output/default_sentences/pre_processed.80k.test.csv"

# Build the manager from its config, obtain the model, then score it on the
# test split. The two positional flags are passed through unchanged.
lda_config = LdaGeneratorConfig(name="default_sentences_80k")
model_manager = LDAManager.from_scratch(lda_config, stop_words)
model = model_manager.get_model(corpus_path, True, True)
res = model_manager.evaluate(test_corpus_path)

pprint(res)

Generating a new compiled model from fs
Model not found. Making a new one.
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/68390 [00:00<?, ?it/s]

{'coherence': [-17.177862702834112,
               -17.02777543298526,
               -17.290263201442876,
               -16.936958294632383],
 'perplexity': -8.588481933552947,
 'topn': [3, 5, 10, 20]}


In [7]:
# Vocabulary attached to the trained model.
dictionary: Dictionary = model.id2word
# Load the test comments and tokenise each one on single spaces.
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))

# Visualize the topics: turn every tokenised comment into its bag-of-words
# form and hand the result to pyLDAvis.
pyLDAvis.enable_notebook()
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]
vis = gensimvis.prepare(model, bow_corpus, dictionary)
# Keep `vis` as the last expression so the notebook renders the plot.
vis

Now the bigger dataset (310k):

In [2]:
# Full 310k sentence-level corpus and its test split.
corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
test_corpus_path = "../dataset/output/default_sentences/pre_processed.310k.test.csv"

# Same pipeline as the 80k run: manager -> model -> evaluation on the test set.
lda_config = LdaGeneratorConfig(name="default_sentences")
model_manager = LDAManager.from_scratch(lda_config, stop_words)
model = model_manager.get_model(corpus_path, True, True)
res = model_manager.evaluate(test_corpus_path)

pprint(res)

Generating a new compiled model from fs
Model not found. Making a new one.
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/233897 [00:00<?, ?it/s]

{'coherence': [-14.345296267821746,
               -13.742494545430045,
               -14.779922291032763,
               -15.116752158504827],
 'perplexity': -8.19623024177193,
 'topn': [3, 5, 10, 20]}


In [3]:
# Vocabulary attached to the trained model.
dictionary: Dictionary = model.id2word
# Read the test comments and split each of them into its space-separated tokens.
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))

# Visualize the topics
pyLDAvis.enable_notebook()
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]
vis = gensimvis.prepare(model, bow_corpus, dictionary)
# Render the plot (must stay the last expression of the cell).
vis

### NOUN-sentences ds
Filter out any non-noun token from the dataset using the POS-tagged processing pipeline

In [None]:
# Build the noun-only variants of the sentence-level POS-tagged train and test
# sets. `extract_pos_ds` and `pd` come from the notebook's top import cell.
# Both splits go through exactly the same steps, so they share one loop
# instead of two copy-pasted blocks.
for split_label, corpus_path, store_path in (
    (
        "ds",
        "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.csv",
        "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.csv",
    ),
    (
        "test ds",
        "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.test.csv",
        "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.test.csv",
    ),
):
    print(f"Creating the __noun filtered {split_label}:")
    # Filter the 'comments' column by the "__noun" tag and store the result
    # at `store_path`.
    extract_pos_ds(pd.read_csv(corpus_path)['comments'], "__noun", store_path)
    print("ds created under: " + store_path)

Now we check if the NOUN only approach works better. <br>
Take nouns only of the POS tagged ds:

In [None]:
# Noun-only, sentence-level corpus and its test split.
corpus_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.csv"
test_corpus_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.test.csv"

# Use a dedicated model name for this dataset. The previous name,
# "default_sentences", is the one used for the full-vocabulary sentence model;
# reusing it here would presumably overwrite that model's stored artefact
# (NOTE(review): confirm how LDAManager derives the storage location from the
# config name).
model_manager = LDAManager.from_scratch(LdaGeneratorConfig(name="noun_sentences"), stop_words)
model = model_manager.get_model(corpus_path, False, True)
res = model_manager.evaluate(test_corpus_path)

pprint(res)

In [14]:
# Vocabulary attached to the trained model.
dictionary: Dictionary = model.id2word
# Tokenise every test comment on single spaces.
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))

# Visualize the topics
pyLDAvis.enable_notebook()
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]
vis = gensimvis.prepare(model, bow_corpus, dictionary)
vis

## NOUN only
On the full review this time so no sentence splitting is performed

In [None]:
# Build the noun-only variants of the review-level POS-tagged train and test
# sets. `extract_pos_ds` and `pd` come from the notebook's top import cell.
# Both splits go through exactly the same steps, so they share one loop
# instead of two copy-pasted blocks.
for split_label, corpus_path, store_path in (
    (
        "ds",
        "../dataset/output/pos_tagged/pre_processed.310k.csv",
        "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv",
    ),
    (
        "test ds",
        "../dataset/output/pos_tagged/pre_processed.310k.test.csv",
        "../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv",
    ),
):
    print(f"Creating the __noun filtered {split_label}:")
    # Filter the 'comments' column by the "__noun" tag and store the result
    # at `store_path`.
    extract_pos_ds(pd.read_csv(corpus_path)['comments'], "__noun", store_path)
    print("ds created under: " + store_path)

80k test run

In [11]:
# Trial run on the 80k noun-only sample before training on the full corpus.
corpus_path = "../dataset/output/pos_tagged/pre_processed.80k.noun_only.csv"
test_corpus_path = "../dataset/output/pos_tagged/pre_processed.80k.noun_only.test.csv"

# Nine topics for this trial; the model is then scored on the test split.
lda_config = LdaGeneratorConfig(name="noun_only9_80k", topics=9)
model_manager = LDAManager.from_scratch(lda_config, stop_words)
model = model_manager.get_model(corpus_path, True, True)
res = model_manager.evaluate(test_corpus_path)

pprint(res)

Generating a new compiled model from fs
Model not found. Making a new one.
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/51995 [00:00<?, ?it/s]

{'coherence': [-12.200767914223142,
               -12.209822157148004,
               -11.71934405929391,
               -12.08168787494773],
 'perplexity': -7.370057944179688,
 'topn': [3, 5, 10, 20]}


In [12]:
# Vocabulary attached to the trained model.
dictionary: Dictionary = model.id2word
# Load the test comments as lists of space-separated tokens.
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))

# Visualize the topics
pyLDAvis.enable_notebook()
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]
vis = gensimvis.prepare(model, bow_corpus, dictionary)

# Last expression of the cell: this is what the notebook renders.
vis

Now that the dataset has been built and stored, train an LDA model on it:

In [4]:
# Noun-only, review-level corpus and its test split.
corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv"
test_corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv"

# Use a dedicated model name for this dataset. The previous name,
# "default_sentences", is the one used for the full-vocabulary sentence model;
# reusing it here would presumably overwrite that model's stored artefact
# (NOTE(review): confirm how LDAManager derives the storage location from the
# config name).
model_manager = LDAManager.from_scratch(LdaGeneratorConfig(name="noun_only"), stop_words)
model = model_manager.get_model(corpus_path, False, True)
res = model_manager.evaluate(test_corpus_path)

pprint(res)

Generating a new compiled model from scratch
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/195789 [00:00<?, ?it/s]

{'coherence': [-8.206318645320582,
               -8.092144166145504,
               -8.572682666308848,
               -9.292421386724754],
 'perplexity': -7.193918552296681,
 'topn': [3, 5, 10, 20]}


In [5]:
# Vocabulary attached to the trained model.
dictionary: Dictionary = model.id2word
# Load the test comments as lists of space-separated tokens.
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))

# Visualize the topics
pyLDAvis.enable_notebook()
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]
vis = gensimvis.prepare(model, bow_corpus, dictionary)

# Last expression of the cell: this is what the notebook renders.
vis

Following the experiments, I decided to go with documents composed of nouns only. <br>
Hyperparameter tuning is the next step.

The noun-only model on sentences performs way worse; let's see if dropping LocalLDA while still focusing on NOUNS only is viable:

# Hyperparameter tuning results applied
We found the best $K$ for our LDA models applied on the datasets. <br>
As stated in the tuning notebook, the NOUN-on-sentences pipeline produced low-coherence models, which is why that approach was dropped.

### Default sentences
Best found $K \in \{7, 9\}$ <br>
Let's try training two models on the full data and compare them.

In [21]:
# Sentence-level corpus used for the final K comparison, plus its test split.
corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
test_corpus_path = "../dataset/output/default_sentences/pre_processed.310k.test.csv"

# One record per candidate K; 'results' and 'model' are filled in by the
# training loop further down.
runs = [
    {
        'config': LdaGeneratorConfig(name="sentences-K7", topics=7),
        'K': 7,
        'results': [],
        'model': None,
    },
    {
        'config': LdaGeneratorConfig(name="sentences-K9", topics=9),
        'K': 9,
        'results': [],
        'model': None,
    },
]

In [22]:
# Build the training frame by cutting the corpus into contiguous folds and
# leaving one of them out. `np` and `pd` come from the notebook's top import cell.
n_folds = 5
held_out_fold = 2

corpus = pd.read_csv("../dataset/output/default_sentences/pre_processed.310k.csv")

# Split the row positions rather than the DataFrame itself: calling
# np.array_split directly on a DataFrame goes through a deprecated pandas code
# path on some pandas/NumPy versions. The resulting folds are the same
# contiguous, near-equal row slices.
fold_positions = np.array_split(np.arange(len(corpus)), n_folds)
folds = [corpus.iloc[positions] for positions in fold_positions]

# Every fold except the held-out one, in the original row order.
train = pd.concat([fold for index, fold in enumerate(folds) if index != held_out_fold])
train

Unnamed: 0,comments
0,like art work edition
1,provisional rating base play
2,<game_name> game try like success
3,theme present certainly engage
4,intense strategic game goa aos <game_name> pri...
...,...
233892,pretty clear spirit create equal potentially s...
233893,say doubt example point inequality look make c...
233894,kind randomness okay like <game_name> fine mer...
233895,hit table stay game


In [24]:
# Train one model per configured run on the `train` frame and keep both the
# model and its evaluation scores on the run record.
for run in runs:
    model_manager = LDAManager.from_scratch(run['config'], stop_words)
    # Show which configuration is about to be trained.
    print(model_manager.config)
    run['model'] = model_manager.get_model(train)
    scores = model_manager.evaluate(test_corpus_path)
    run['results'] = scores
    # Echo the scores so they are visible in the cell output.
    pprint(scores)

LdaGeneratorConfig(name='sentences-K7', topics=7, random_state=42, passes=10, alpha='symmetric', eta=None, output_folder='./output')
LdaGeneratorConfig(name='fs', topics=7, random_state=42, passes=10, alpha='symmetric', eta=None, output_folder='./output')
LdaGeneratorConfig(name='sentences-K7', topics=7, random_state=42, passes=10, alpha='symmetric', eta=None, output_folder='./output')
Generating a new compiled model from scratch
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/187118 [00:00<?, ?it/s]

KeyboardInterrupt: 

Compare the two visualizations

In [19]:
# Switch pyLDAvis to notebook display mode once, before the visualization cells below.
pyLDAvis.enable_notebook()

In [20]:
# First run of the comparison and the vocabulary attached to its model.
lda_model = runs[0]['model']
dictionary: Dictionary = lda_model.id2word
# Load the test comments as lists of space-separated tokens.
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))

# Visualize the topics
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]
vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
vis

In [21]:
# Second run of the comparison and the vocabulary attached to its model.
lda_model = runs[1]['model']
dictionary: Dictionary = lda_model.id2word
# Load the test comments as lists of space-separated tokens.
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))

# Visualize the topics
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]
vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
vis

Map to gold standards and/or recognize the associated aspect

### NOUN only
The best found $K=?$ <br>
Let's try training two models on the full data and compare them.


In [4]:
# Noun-only, review-level corpus used for the final K comparison, plus its
# test split.
corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv"
test_corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv"

# One record per candidate K; 'results' and 'model' are filled in by the
# training loop in the next cell. The second run trains 10 topics, so its name
# now says K10 (it was labelled "noun-K9" while using topics=10 / K=10).
runs = [
    dict(config=LdaGeneratorConfig(name="noun-K7", topics=7), K=7, results=[], model=None),
    dict(config=LdaGeneratorConfig(name="noun-K10", topics=10), K=10, results=[], model=None),
]

In [6]:
# Train one model per configured run on the noun-only corpus and keep both the
# model and its evaluation scores on the run record.
for run in runs:
    model_manager = LDAManager.from_scratch(run['config'], stop_words)
    run['model'] = model_manager.get_model(corpus_path)
    scores = model_manager.evaluate(test_corpus_path)
    run['results'] = scores
    # Echo the scores so they are visible in the cell output.
    pprint(scores)

Generating a new compiled model from scratch
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/195789 [00:00<?, ?it/s]

{'coherence': [-7.948833081292833,
               -6.802857928452987,
               -6.95858608877034,
               -7.2892432162160405],
 'perplexity': -6.99375285229275,
 'topn': [3, 5, 10, 20]}
Generating a new compiled model from scratch
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/195789 [00:00<?, ?it/s]

{'coherence': [-8.906816819996525,
               -7.403514085571197,
               -7.1920301854734685,
               -8.324668992793418],
 'perplexity': -7.056838863124454,
 'topn': [3, 5, 10, 20]}


Visualize the results:

In [10]:
# Switch pyLDAvis to notebook display mode once, before the visualization cells below.
pyLDAvis.enable_notebook()

In [12]:
# First run of the comparison and the vocabulary attached to its model.
lda_model = runs[0]['model']
dictionary: Dictionary = lda_model.id2word
# Load the test comments as lists of space-separated tokens.
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))

# Visualize the topics
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]
vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
vis

In [13]:
# Second run of the comparison and the vocabulary attached to its model.
lda_model = runs[1]['model']
dictionary: Dictionary = lda_model.id2word
# Load the test comments as lists of space-separated tokens.
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))

# Visualize the topics
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]
vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
vis

Map to gold standards and/or recognize the associated aspect