In [1]:
from pprint import pprint
# Required imports.
from main.abae.config import ABAEManagerConfig
from main.abae.model_manager import ABAEManager

To avoid the long, noisy log output that gensim prints for Word2Vec (and that other libraries print as well), we silence it:

In [2]:
import logging

# Suppress every log record of severity INFO and below (this includes DEBUG)
# for all loggers in the process. `logging.disable` only remembers the level
# passed to the most recent call, so one call gives the same end state as the
# previous sequence of three calls, where the last one (INFO) won.
logging.disable(logging.INFO)

Let's make a first attempt on the various pre-processing pipelines:

### Full-reviews
Run the procedure on the default implementation of the pipeline. It is the default because it was designed as a simple version specifically for ABAE.

## Preprocessing
Unlike with LDA, we should not alter the sentence structure too much: ABAE uses word embeddings and needs the sequence information to weight terms based on their surrounding context. One question remains:

**Should we work on sentence level or full reviews? Let's try a first simple comparison**

In [None]:
# Full-review experiment: corpus produced by the "minimal_replacement"
# pre-processing, together with its test counterpart.
corpus_path = "../dataset/output/minimal_replacement/pre_processed.310k.csv"
test_corpus_path = "../dataset/output/minimal_replacement/pre_processed.310k.test.csv"

# Build the manager from scratch (override=True) with a single-epoch config.
min_replacement_config = ABAEManagerConfig('min_replacement', epochs=1)
manager = ABAEManager.from_scratch(
    min_replacement_config,
    corpus_path,
    override=True,
)

# Fit on the training corpus, then evaluate against the test corpus.
manager.train(corpus_path, verbose=2)
res = manager.evaluate(
    [25, 10, 3],
    test_corpus=test_corpus_path,
)

In [4]:
# Show the metrics dict returned by `manager.evaluate` for the full-review run.
print(res)

{'coherence': [-13.770155006493448, -12.955013135290317, -12.312635854649255], 'top': [25, 10, 3], 'loss': [5.155586242675781, 5.15452241897583], 'silhouette_score': 0.05925522372126579}


## Sentence-split reviews

In [None]:
# Sentence-level experiment: corpus from the "default_sentences" output folder,
# together with its test counterpart.
corpus_path = "../dataset/output/default_sentences/pre_processed.80k.csv"
test_corpus_path = "../dataset/output/default_sentences/pre_processed.80k.test.csv"

# Build the manager from scratch using the config's default settings.
sentence_config = ABAEManagerConfig('sentence_default')
manager = ABAEManager.from_scratch(sentence_config, corpus_path)

# Fit on the training corpus, then evaluate against the test corpus.
manager.train(corpus_path, verbose=2)
res = manager.evaluate(
    [20, 10, 5, 3],
    test_corpus=test_corpus_path,
)

pprint(res)

Pandas Apply:   0%|          | 0/68390 [00:00<?, ?it/s]

[WinError 2] Das System kann die angegebene Datei nicht finden
  File "D:\PycharmProjects\nlp-course-project\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\jacop\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\jacop\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\jacop\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Creating new Aspect embedding model

Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/68390 [00:00<?, ?it/s]

Max sequence length calculation in progress...
We loose information on 8(0.011697616610615587% of ds).
Generating a new compiled model from scratch
Training is starting:
Epoch 1/15


Results for the run:
```
{
    'cv_coh': [0.6656065043812275, 0.4116552201333862, 0.251877270410466],
    'loss': [5.250680923461914, 5.257223606109619],
    'npmi_coh': [-0.36774750155270663, -0.3153002812427124, -0.26334968605114956],
    'silhouette_score': 0.102481075,
    'top': [25, 10, 3]
}
```

I know that a comparison based on a single run is not very meaningful. <br>
I could use k-fold cross-validation to estimate the expected model loss and get a valid analysis. <br>
But for the sake of the experiment, we consider this good enough.

## NOUN only
I expect this to fail utterly at the task, as we lose the sentence structure that ABAE seems to rely on heavily. <br>
Since ABAE is based on word2vec embeddings, I suppose this kind of pre-processing should be harmful.

In [5]:
from main.lda.pre_processing import extract_pos_ds
import pandas as pd

# One entry per split: (banner to print, POS-tagged input CSV, output CSV).
# The train split is processed first, then the test split.
noun_jobs = [
    (
        "Creating the __noun filtered ds:",
        "../dataset/output/pos_tagged/pre_processed.80k.csv",
        "../dataset/output/pos_tagged/pre_processed.80k.noun_only.csv",
    ),
    (
        "Creating the __noun filtered test ds:",
        "../dataset/output/pos_tagged/pre_processed.80k.test.csv",
        "../dataset/output/pos_tagged/pre_processed.80k.noun_only.test.csv",
    ),
]

for banner, corpus_path, store_path in noun_jobs:
    print(banner)
    # Pass the 'comments' column, the "__noun" tag and the output path to the
    # project helper that builds the filtered dataset.
    extract_pos_ds(pd.read_csv(corpus_path)['comments'], "__noun", store_path)
    print("ds created under: " + store_path)

Creating the __noun filtered ds:


Pandas Apply:   0%|          | 0/60701 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/60701 [00:00<?, ?it/s]

ds created under: ../dataset/output/pos_tagged/pre_processed.80k.noun_only.csv
Creating the __noun filtered test ds:


Pandas Apply:   0%|          | 0/20234 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/20234 [00:00<?, ?it/s]

ds created under: ../dataset/output/pos_tagged/pre_processed.80k.noun_only.test.csv


In [None]:
# TODO: redo this run.
# Noun-only experiment: the filtered corpus and its test counterpart.
corpus_path = "../dataset/output/pos_tagged/pre_processed.80k.noun_only.csv"
test_corpus_path = "../dataset/output/pos_tagged/pre_processed.80k.noun_only.test.csv"

# Build the manager from scratch using the config's default settings.
noun_only_config = ABAEManagerConfig('noun_only')
manager = ABAEManager.from_scratch(noun_only_config, corpus_path)

# Fit on the training corpus, then evaluate against the test corpus.
manager.train(corpus_path, verbose=2)
res = manager.evaluate(
    [25, 10, 3],
    test_corpus=test_corpus_path,
)
pprint(res)

For the future work and as one by the proposed ABAE paper we won't be splitting up reviews in sentences but use the full review as the model does not increase much in performance if not done like this.

## Hyperparameters tuning results

In [None]:
import json

file_path = "./output/config/abae_configurations_results.json"

# Parse the tuning results once. The `with` block closes the file handle
# deterministically instead of leaving one open handle per lookup.
with open(file_path) as results_file:
    tuning_results = pd.DataFrame(json.load(results_file))

# Row labels of the configurations selected for the final runs.
selected_rows = [
    11,
    20,
    # These two perform worse but have a lower variance which might result in a more robust solution
    14,
    15,
]
configs = [tuning_results.at[row, 'config'] for row in selected_rows]

# One run record per selected configuration; `results` and `model` are
# placeholders to be filled in once the configuration has been trained.
runs = [
    dict(config=ABAEManagerConfig.from_configuration(f"final_full_{i}", config), results=[], model=None)
    for i, config in enumerate(configs)
]

Run on configs

In [None]:
# Paths of the "default" pre-processed 310k corpus and of its test counterpart.
corpus = "../dataset/output/default/pre_processed.310k.csv"
test_corpus_path = "../dataset/output/default/pre_processed.310k.test.csv"

In [None]:
# Train and evaluate one ABAE model per selected configuration, storing the
# metrics and the compiled model back on the corresponding run record.
for run in runs:
    # Each run is a plain dict, so the config has to be read by key:
    # attribute access (`run.config`) raises AttributeError.
    # Train on `corpus`, the path defined for the final runs; `corpus_path`
    # is a leftover from an earlier experiment cell and would point the
    # training at a different dataset than the test corpus used below.
    manager = ABAEManager.from_scratch(run['config'], corpus, override=True)
    manager.train(corpus, verbose=2)
    run['results'] = manager.evaluate([25, 10, 3], test_corpus=test_corpus_path)
    run['model'] = manager.get_compiled_model(load_existing=True)

    # Still print the results in output.
    pprint(run['results'])