# Pre Processing Step
We want to process the data for LDA like:
- Keep NOUNS only as they are the elements that mostly hold aspect value
- Split reviews in sentences (Avoid topic modelling in a too broad sense) (https://aclanthology.org/N10-1122.pdf)

In [3]:
from main.lda.config import LdaGeneratorConfig
import pandas as pd
# Required imports for the coming cells
from model import LdaModelGenerator
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.models import CoherenceModel

def run_procedure(corpus_path: str, test_ds_path: str, config: LdaGeneratorConfig):
    stop_words = ['game', 'play', '<game_name>']
    lda, dictionary = LdaModelGenerator(config, stop_words).make_model(corpus_path)

    # Evaluate the model
    split_texts = pd.read_csv(test_ds_path)['comments'].apply(lambda x: x.split(' '))
    results = dict(cv_coh=[], npmi_coh=[], topn=[3, 10, 25])
    results['perplexity'] = lda.log_perplexity(split_texts.apply(lambda x: dictionary.doc2bow(x)).tolist())

    for topn in results['topn']:
        cv_model = CoherenceModel(lda, texts=split_texts, coherence='c_v', topn=topn)
        npmi_model = CoherenceModel(lda, texts=split_texts, coherence='c_npmi', topn=topn)
        results['cv_coh'].append(cv_model.get_coherence())
        results['npmi_coh'].append(npmi_model.get_coherence())

    print(
        f"Model perplexity: {results['perplexity']}\n"
        f"With topn = {results['topn']} we have: \n"
        f" - CV coherence: {results['cv_coh']}\n"
        f" - NPMI coherence: {results['npmi_coh']}\n"
    )

    return results, lda, dictionary


In [None]:
from gensim import corpora

# LDA does not benefit from repeated words among many documents. So we should clean them as they are stopwords.
corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
ds = pd.read_csv(corpus_path)['comments'].apply(lambda x: x.split(' '))

dictionary = corpora.Dictionary(ds)
doc_frequency = {}

for index in range(len(dictionary)):
    doc_frequency[dictionary.get(index)] = len(ds[[dictionary.get(index) in x for x in ds]])

In [None]:
pd.DataFrame.from_dict(doc_frequency, orient='index')[0].map(lambda x: x / len(ds)).sort_values(ascending=False)

Are 'game'(40%), 'play'(20%) and '\<game_name\>'(11%) stopwords? <br>
For sure 'game' is! What about the other two? I believe they bring no context.
We remove them!

In [None]:
stop_words = ['game', 'play', '<game_name>']

corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
default_config = LdaGeneratorConfig(name="default")
lda, dictionary = LdaModelGenerator(default_config, stop_words).make_model(corpus_path)

In [None]:
from gensim.models import CoherenceModel
import pandas as pd

test_ds = pd.read_csv("../dataset/output/default_sentences/pre_processed.310k.test.csv")

# For the c_v model
texts = test_ds['comments'].apply(lambda x: x.split(' '))

topn = 10  # For the coherence evaluation.
cv_coh = CoherenceModel(lda, texts=texts, coherence='c_v', topn=topn)
npmi_coh = CoherenceModel(lda, texts=texts, coherence='c_npmi', topn=topn)

In [None]:
print(
    f"Model perplexity: {lda.log_perplexity(texts.apply(lambda x: dictionary.doc2bow(x)).tolist())}\n"
    f"With topn = {topn} we have: \n"
    f" - CV coherence: {cv_coh.get_coherence()}\n"
    f" - NPMI coherence: {npmi_coh.get_coherence()}: \n"
    "CV coherence per topic:"
)
cv_coh.get_coherence_per_topic()

In [None]:
lda.show_topics(20, 20)

### POS: Nouns only

Now we check if the NOUN only approach works better. <br>
Take nouns only of the POS tagged ds:

In [None]:
from pre_processing import extract_pos_ds
import pandas as pd

print("Creating the __noun filtered ds:")
corpus_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.csv"
store_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.csv"
extract_pos_ds(pd.read_csv(corpus_path)['comments'], "__noun", store_path)
print("ds created under: " + store_path)

print("Creating the __noun filtered test ds:")
corpus_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.test.csv"
store_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.test.csv"
extract_pos_ds(pd.read_csv(corpus_path)['comments'], "__noun", store_path)
print("ds created under: " + store_path)

In [None]:
# This logic is translated in:
from pre_processing import extract_pos_ds

In [None]:
import pandas as pd
from gensim import corpora

# LDA does not benefit from repeated words among many documents. So we should clean them as they are stopwords.
corpus_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.csv"
ds = pd.read_csv(corpus_path)['comments'].apply(lambda x: x.split(' '))

count_dictionary = corpora.Dictionary(ds)
doc_frequency = {}

for index in range(len(count_dictionary)):
    doc_frequency[count_dictionary.get(index)] = len(ds[[count_dictionary.get(index) in x for x in ds]])

pd.DataFrame.from_dict(doc_frequency, orient='index')[0].map(lambda x: x / len(ds))

In [None]:
# Default approach on the sentences only to avoid global topic recognition
from model import LdaModelGenerator, LdaGeneratorConfig

stop_words = ['game', 'play', '<game_name>']
corpus_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.csv"
default_config = LdaGeneratorConfig(name="default")
noun_lda, noun_dictionary = LdaModelGenerator(default_config, stop_words).make_model(corpus_path)

In [None]:
from gensim.models import CoherenceModel
import pandas as pd

test_corpus_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.test.csv"
test_ds = pd.read_csv(test_corpus_path)

# For the c_v model
texts = test_ds['comments'].apply(lambda x: x.split(' '))

topn = 10  # For the coherence evaluation.
cv_coh = CoherenceModel(noun_lda, texts=texts, coherence='c_v', topn=topn)
npmi_coh = CoherenceModel(noun_lda, texts=texts, coherence='c_npmi', topn=topn)

In [None]:
print(
    f"Model perplexity: {noun_lda.log_perplexity(texts.apply(lambda x: noun_dictionary.doc2bow(x)).tolist())}\n"
    f"With topn = {topn} we have: \n"
    f" - CV coherence: {cv_coh.get_coherence()}\n"
    f" - NPMI coherence: {npmi_coh.get_coherence()}: \n"
    "CV coherence per topic:"
)
cv_coh.get_coherence_per_topic()

In [None]:
noun_lda.show_topics(20, 20)

In [None]:
# Now the model has high coherence which is good but are the extracted aspects good?
# Probably not.

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda, texts.apply(lambda x: dictionary.doc2bow(x)).tolist(), dictionary)
vis

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(noun_lda, texts.apply(lambda x: noun_dictionary.doc2bow(x)).tolist(), noun_dictionary)
vis

In [None]:
# Topic 1 identifies Downtime for example. Yet it also does crash with maybe with bookkeeping.
# We have to better tune the parameters to check if we can create better communities

Following the experiments I decided to follow the path of nouns only composition document. <br>
The hyperparameters tuning is now the next step.

Noun only on sentences model performs way worse, let's see if going for non LocalLDA but focusing on NOUNS only is viable:

In [None]:
from pre_processing import extract_pos_ds
import pandas as pd

print("Creating the __noun filtered ds:")
corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.csv"
store_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv"
extract_pos_ds(pd.read_csv(corpus_path)['comments'], "__noun", store_path)
print("ds created under: " + store_path)

print("Creating the __noun filtered test ds:")
corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.test.csv"
store_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv"
extract_pos_ds(pd.read_csv(corpus_path)['comments'], "__noun", store_path)
print("ds created under: " + store_path)

In [None]:
# Default approach on the sentences only to avoid global topic recognition
from model import LdaModelGenerator, LdaGeneratorConfig

stop_words = ['game', 'play', '<game_name>']
corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv"
default_config = LdaGeneratorConfig(name='default')
noun_lda, noun_dictionary = LdaModelGenerator(default_config, stop_words).make_model(corpus_path)

In [None]:
from gensim.models import CoherenceModel
import pandas as pd

test_corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv"
test_ds = pd.read_csv(test_corpus_path)

# For the c_v model
texts = test_ds['comments'].apply(lambda x: x.split(' '))

topn = 10  # For the coherence evaluation.
cv_coh = CoherenceModel(noun_lda, texts=texts, coherence='c_v', topn=topn)
npmi_coh = CoherenceModel(noun_lda, texts=texts, coherence='c_npmi', topn=topn)

In [None]:
print(
    f"Model perplexity: {noun_lda.log_perplexity(texts.apply(lambda x: noun_dictionary.doc2bow(x)).tolist())}\n"
    f"With topn = {topn} we have: \n"
    f" - CV coherence: {cv_coh.get_coherence()}\n"
    f" - NPMI coherence: {npmi_coh.get_coherence()}: \n"
    "CV coherence per topic:"
)
cv_coh.get_coherence_per_topic()

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(noun_lda, texts.apply(lambda x: noun_dictionary.doc2bow(x)).tolist(), noun_dictionary)
vis

## Tuned best found K values
Hyperparameters tuning yielded on average the following best topics:
- Default sentences: [10]
- NOUN only full review: [7, 12]

### Default sentences:

In [None]:
# Default approach on the sentences only to avoid global topic recognition
from model import LdaModelGenerator
from main.lda.config import LdaGeneratorConfig

stop_words = ['game', 'play', '<game_name>']

corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
config = LdaGeneratorConfig(name="sentences_k_10", topics=10)
lda, dictionary = LdaModelGenerator(config, stop_words).make_model(corpus_path)

In [None]:
from gensim.models import CoherenceModel
import pandas as pd

test_ds = pd.read_csv("../dataset/output/default_sentences/pre_processed.310k.test.csv")

# For the c_v model
texts = test_ds['comments'].apply(lambda x: x.split(' '))

perplexity = lda.log_perplexity(texts.apply(lambda x: dictionary.doc2bow(x)).tolist())
results = dict(cv_coh=[], npmi_coh=[], topn=[3, 10, 25], perplexity=perplexity)

for topn in results['topn']:
    cv_coh = CoherenceModel(lda, texts=texts, coherence='c_v', topn=topn)
    npmi_coh = CoherenceModel(lda, texts=texts, coherence='c_npmi', topn=topn)
    results['cv_coh'].append(cv_coh.get_coherence())
    results['npmi_coh'].append(npmi_coh.get_coherence())

In [None]:
# todo save lda model

In [None]:
print(
    f"Model perplexity: {results['perplexity']}\n"
    f"With topn = {results['topn']} we have: \n"
    f" - CV coherence: {results['cv_coh']}\n"
    f" - NPMI coherence: {results['npmi_coh']}\n"
)

Visualize the result:

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda, texts.apply(lambda x: dictionary.doc2bow(x)).tolist(), dictionary)
vis

## NOUNs only

In [None]:
 # Default approach on the sentences only to avoid global topic recognition
from model import LdaModelGenerator
from main.lda.config import LdaGeneratorConfig

stop_words = ['game', 'play', '<game_name>']

corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv"
config = LdaGeneratorConfig(name="sentences_k_10", topics=12)
lda, dictionary = LdaModelGenerator(config, stop_words).make_model(corpus_path)

In [None]:
from gensim.models import CoherenceModel
import pandas as pd

test_ds = pd.read_csv("../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv")

# For the c_v model
texts = test_ds['comments'].apply(lambda x: x.split(' '))

perplexity = lda.log_perplexity(texts.apply(lambda x: dictionary.doc2bow(x)).tolist())
results = dict(cv_coh=[], npmi_coh=[], topn=[3, 10, 25], perplexity=perplexity)

for topn in results['topn']:
    cv_coh = CoherenceModel(lda, texts=texts, coherence='c_v', topn=topn)
    npmi_coh = CoherenceModel(lda, texts=texts, coherence='c_npmi', topn=topn)
    results['cv_coh'].append(cv_coh.get_coherence())
    results['npmi_coh'].append(npmi_coh.get_coherence())

In [None]:
print(
    f"Model perplexity: {results['perplexity']}\n"
    f"With topn = {results['topn']} we have: \n"
    f" - CV coherence: {results['cv_coh']}\n"
    f" - NPMI coherence: {results['npmi_coh']}\n"
)

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda, texts.apply(lambda x: dictionary.doc2bow(x)).tolist(), dictionary)
vis

In [4]:
# Run for K = 7
corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv"
test_ds_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv"
res, lda, dictionary = run_procedure(corpus_path, test_ds_path, LdaGeneratorConfig(name="sentences_k_7", topics=12))

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda, texts.apply(lambda x: dictionary.doc2bow(x)).tolist(), dictionary)
vis

Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/195789 [00:00<?, ?it/s]

Model perplexity: -8.208558033118516
With topn = [3, 10, 25] we have: 
 - CV coherence: [0.7180530229398828, 0.6123247263225092, 0.5880882077576267]
 - NPMI coherence: [0.05778951271585514, 0.023637662956389766, 0.005265168985537204]



NameError: name 'texts' is not defined