In [1]:
# Common default imports
from pprint import pprint

import pandas as pd
# Plotting utility for the LDA models
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import corpora
from gensim.corpora import Dictionary

from main.lda.config import LdaGeneratorConfig
from main.lda.pre_processing import extract_pos_ds
from model_manager import LDAManager

# Corpus-specific stop words; they were picked from the document-frequency
# analysis performed in the following cells.
stop_words = [
    'game',
    'play',
    '<game_name>',
]

# Pre Processing Step
We want to pre-process the data for LDA as follows:
- Keep NOUNS only, as they are the elements that carry most of the aspect value
- Split reviews into sentences (to avoid topic modelling over too broad a context) (https://aclanthology.org/N10-1122.pdf)

In [None]:
# LDA does not benefit from words that are repeated across many documents, so we
# look for tokens with a high document frequency and treat them as stop words.
corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
# Each comment is a space-separated string of tokens; turn it into a token list.
ds = pd.read_csv(corpus_path)['comments'].apply(lambda x: x.split(' '))

# Building the Dictionary already counts, for every token id, the number of
# documents that contain it (`dictionary.dfs`). Reading those counts avoids
# rescanning the whole corpus once per vocabulary entry.
dictionary = corpora.Dictionary(ds)
doc_frequency = {dictionary[token_id]: n_docs for token_id, n_docs in dictionary.dfs.items()}

# Share of documents containing each token, most frequent first.
(pd.Series(doc_frequency) / len(ds)).sort_values(ascending=False)

Are 'game' (40%), 'play' (20%) and '\<game_name\>' (11%) stop words? <br>
'game' certainly is! What about the other two? I believe they bring no context either.
We remove all three!

### Default: Sentences split

In [6]:
# 80k: model for the sentence-split corpus, evaluated on its test file.
corpus_path = "../dataset/output/default_sentences/pre_processed.80k.csv"
test_corpus_path = "../dataset/output/default_sentences/pre_processed.80k.test.csv"

# The manager is created from a named config plus the notebook-level stop words.
lda_config = LdaGeneratorConfig(name="default_sentences_80k")
model_manager = LDAManager.from_config(lda_config, stop_words)

# NOTE(review): the meaning of the two positional booleans is defined by
# LDAManager.get_model, which is not visible from this notebook - confirm there.
model = model_manager.get_model(corpus_path, True, True)

# Evaluation result for the test corpus, pretty-printed below.
res = model_manager.evaluate(test_corpus_path)
pprint(res)

Generating a new compiled model from fs
Model not found. Making a new one.
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/68390 [00:00<?, ?it/s]

{'coherence': [-17.177862702834112,
               -17.02777543298526,
               -17.290263201442876,
               -16.936958294632383],
 'perplexity': -8.588481933552947,
 'topn': [3, 5, 10, 20]}


In [7]:
# Dictionary attached to the model obtained in the previous cell.
dictionary: Dictionary = model.id2word
# Load the test documents as lists of tokens (comments are space-separated).
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))

# Bag-of-words encoding of every test document.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(model, bow_corpus, dictionary)
# Render the plot
vis

Now the same experiment on the bigger 310k dataset:

In [2]:
# 310k: model for the sentence-split corpus, evaluated on its test file.
corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
test_corpus_path = "../dataset/output/default_sentences/pre_processed.310k.test.csv"

# The manager is created from a named config plus the notebook-level stop words.
lda_config = LdaGeneratorConfig(name="default_sentences")
model_manager = LDAManager.from_config(lda_config, stop_words)

# NOTE(review): the meaning of the two positional booleans is defined by
# LDAManager.get_model, which is not visible from this notebook - confirm there.
model = model_manager.get_model(corpus_path, True, True)

# Evaluation result for the test corpus, pretty-printed below.
res = model_manager.evaluate(test_corpus_path)
pprint(res)

Generating a new compiled model from fs
Model not found. Making a new one.
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/233897 [00:00<?, ?it/s]

{'coherence': [-14.345296267821746,
               -13.742494545430045,
               -14.779922291032763,
               -15.116752158504827],
 'perplexity': -8.19623024177193,
 'topn': [3, 5, 10, 20]}


In [3]:
# Dictionary attached to the model obtained in the previous cell.
dictionary: Dictionary = model.id2word
# Load the test documents as lists of tokens (comments are space-separated).
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))

# Bag-of-words encoding of every test document.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(model, bow_corpus, dictionary)
# Render the plot
vis

Now we check whether the noun-only approach works better. <br>
First, keep only the nouns of the POS-tagged dataset (first cell below).

Once the noun-only dataset has been built and stored, train an LDA on it (second cell below):

In [None]:
# Build the noun-only variants of the POS-tagged train and test datasets.
# `extract_pos_ds` and `pd` come from the import cell at the top of the notebook.
# Each job is (label used in the progress message, input csv, output csv).
noun_only_jobs = [
    (
        "ds",
        "../dataset/output/pos_tagged/pre_processed.310k.csv",
        "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv",
    ),
    (
        "test ds",
        "../dataset/output/pos_tagged/pre_processed.310k.test.csv",
        "../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv",
    ),
]

for label, corpus_path, store_path in noun_only_jobs:
    print("Creating the __noun filtered " + label + ":")
    # The helper receives the 'comments' column, the "__noun" tag and the output path.
    extract_pos_ds(pd.read_csv(corpus_path)['comments'], "__noun", store_path)
    print("ds created under: " + store_path)

In [4]:
# 310k, nouns only: model for the noun-filtered corpus, evaluated on its test file.
corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv"
test_corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv"

# Use a config name of its own: the all-words 310k experiment above already uses
# "default_sentences", so sharing that name makes the two experiments collide.
# NOTE(review): presumably the config name identifies the stored model - confirm
# in LDAManager, and update any consumer that expects the old name.
lda_config = LdaGeneratorConfig(name="noun_only_sentences")
model_manager = LDAManager.from_config(lda_config, stop_words)

# NOTE(review): the meaning of the two positional booleans is defined by
# LDAManager.get_model, which is not visible from this notebook - confirm there.
model = model_manager.get_model(corpus_path, False, True)

# Evaluation result for the test corpus, pretty-printed below.
res = model_manager.evaluate(test_corpus_path)
pprint(res)

Generating a new compiled model from scratch
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/195789 [00:00<?, ?it/s]

{'coherence': [-8.206318645320582,
               -8.092144166145504,
               -8.572682666308848,
               -9.292421386724754],
 'perplexity': -7.193918552296681,
 'topn': [3, 5, 10, 20]}


In [5]:
# Dictionary attached to the model obtained in the previous cell.
dictionary: Dictionary = model.id2word
# Load the test documents as lists of tokens (comments are space-separated).
texts = pd.read_csv(test_corpus_path)['comments'].map(lambda comment: comment.split(' '))

# Bag-of-words encoding of every test document.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(model, bow_corpus, dictionary)

# Render the plot
vis

Following these experiments, I decided to go with documents composed of nouns only. <br>
Hyperparameter tuning is the next step.