In [1]:
# Required imports.
from main.abae.config import ABAEManagerConfig
from main.abae.runner import ABAERunner
from main.abae.model_manager import ABAEManager

Optionally, silence the verbose log output produced by gensim (e.g. while training Word2Vec) and by other libraries:

In [2]:
import logging

# Suppress every log record at INFO severity or below (this is where gensim's
# Word2Vec progress messages land). logging.disable() is not cumulative: each call
# replaces the previous threshold, so the earlier calls with CRITICAL (the default)
# and DEBUG were overridden anyway. One call with the final level is equivalent.
logging.disable(logging.INFO)

Let's make a first attempt on the various pre-processing pipelines:

### Full-reviews
Run the procedure on the default implementation of the pre-processing pipeline. It is the default because it was designed as a simple pipeline specifically for ABAE.

## Preprocessing
Unlike LDA we should not toy too much with the sentence structure as ABAE uses word embeddings and needs the sequence information to weight the terms based on the surrounding context. One question remains:

**Should we work on sentence level or full reviews? Let's try a first simple comparison**

In [None]:
# Full-review variant of the pre-processed corpus: training split and test split.
corpus_path = "../dataset/output/default/pre_processed.80k.csv"
test_corpus_path = "../dataset/output/default/pre_processed.80k.test.csv"

# Short run: 2 epochs with a batch size of 512 on the 'default' configuration.
full_review_config = ABAEManagerConfig('default', epochs=2, batch_size=512)
manager = ABAEManager.from_scratch(full_review_config, corpus_path)
manager.train(corpus_path, verbose=2)
# Kept as the last expression so the evaluation result is shown as the cell output.
manager.evaluate([3, 10], test_corpus_path=test_corpus_path)

Pandas Apply:   0%|          | 0/60640 [00:00<?, ?it/s]

Loading the existing found model as requested in path ./output/default/default.aspect_embeddings.model
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/60640 [00:00<?, ?it/s]

Max sequence length calculation in progress...
We loose information on 1340(2.2097625329815305% of ds).
Generating a new compiled model from scratch
Training is starting:
Epoch 1/2


Latest run result:
```
Max Margin loss: [4.6614, 4.6604]
```

Results of the latest run:
```
NPMI coherence: -0.23037988688672537
CV score: 0.5646751917101251
Max margin reconstruction result: [4.745960235595703, 4.744936943054199]
```

## Sentence-split reviews

In [None]:
from main.abae.model_manager import ABAEManagerConfig, ABAEManager

# Sentence-split variant of the corpus (see the "default_sentences" directory).
corpus_path = "../dataset/output/default_sentences/pre_processed.80k.csv"
test_corpus_path = "../dataset/output/default_sentences/pre_processed.80k.test.csv"

# Plain 'default' configuration; the runner is asked to override existing artefacts.
sentence_config = ABAEManagerConfig('default')
runner = ABAERunner(corpus_path, test_corpus_path, config=sentence_config, override_existing=True)
runner.default_full_run()

Results for the run:
```
NPMI coherence: -0.3117940799781337
CV score: 0.6031670887985386
Max margin reconstruction result: [5.261548042297363, 5.262136936187744]
```

I know that a comparison based on a single run is not very meaningful. <br>
I could use k-fold cross-validation to estimate the expected model loss and get a valid analysis. <br>
But for the sake of this experiment we consider it good enough.

## NOUN only
I expect this to fail utterly at the task, as we lose the sentence structure that ABAE seems to rely on heavily. <br>
Since ABAE is based on word2vec embeddings, I suppose this kind of pre-processing is harmful.

In [None]:
from main.lda.pre_processing import extract_pos_ds
import pandas as pd

# One entry per split, train first and then test:
# (announcement, POS-tagged input CSV, noun-only output CSV).
noun_only_jobs = (
    (
        "Creating the __noun filtered ds:",
        "../dataset/output/pos_tagged/pre_processed.80k.csv",
        "../dataset/output/pos_tagged/pre_processed.80k.noun_only.csv",
    ),
    (
        "Creating the __noun filtered test ds:",
        "../dataset/output/pos_tagged/pre_processed.80k.test.csv",
        "../dataset/output/pos_tagged/pre_processed.80k.noun_only.test.csv",
    ),
)

for banner, corpus_path, store_path in noun_only_jobs:
    print(banner)
    # Only the 'comments' column is handed over, filtered on the "__noun" tag.
    extract_pos_ds(pd.read_csv(corpus_path)['comments'], "__noun", store_path)
    print("ds created under: " + store_path)

In [None]:
from main.abae.model_manager import ABAEManagerConfig, ABAEManager

# Noun-only variant of the POS-tagged corpus: training split and test split.
corpus_path = "../dataset/output/pos_tagged/pre_processed.80k.noun_only.csv"
test_corpus_path = "../dataset/output/pos_tagged/pre_processed.80k.noun_only.test.csv"

# Full default run on the 'default' configuration, overriding existing artefacts.
runner = ABAERunner(
    corpus_path,
    test_corpus_path,
    config=ABAEManagerConfig('default'),
    override_existing=True,
)
runner.default_full_run()

For future work, and as done in the ABAE paper, we won't split reviews into sentences but will use the full review, as the model does not improve much when the reviews are split.

Results of HP tuning

In [None]:
import json

# Identifiers of the best tuning configurations.
best_config_ids = ['31479139-17f6-4f6d-aa8b-494cbc8f183b', '3f192c54-6623-48a7-b01b-2d5019dad186']

# Read the tuning results through a context manager so the file handle is closed
# as soon as parsing is done (a bare json.load(open(...)) leaves it open).
with open("./output/config/abae_configurations_results.json") as results_file:
    results = json.load(results_file)

# Keep the 'config' payload of every entry whose 'id' is one of the best ones.
# The resulting order follows the results file, not the order of best_config_ids.
configs = [entry['config'] for entry in results if entry['id'] in best_config_ids]

Run on configs

In [None]:
# Display the first of the selected tuning configurations.
configs[0]

In [None]:
import keras
from main.abae.model_manager import ABAEManager
from main.abae.config import ABAEManagerConfig

# To boost training: make "mixed_float16" the global Keras dtype policy.
keras.mixed_precision.set_global_policy("mixed_float16")

# The 310k pre-processed corpus is used both to build the manager and to train.
corpus = "../dataset/output/default/pre_processed.310k.csv"
# Manager configuration named "abae_best_0", built from the first selected tuning config.
default_config = ABAEManagerConfig.from_configuration("abae_best_0", configs[0])
abae_manager = ABAEManager.from_scratch(default_config, corpus)
# Train; only the first element of the returned pair is kept, the second is discarded.
history, _ = abae_manager.train(corpus)
# NOTE(review): refresh=False presumably returns the model that was just trained
# instead of compiling a new one - confirm against ABAEManager.get_compiled_model.
model = abae_manager.get_compiled_model(refresh=False)

In [None]:
# Display the `history` attribute of the object returned first by abae_manager.train().
# NOTE(review): presumably the per-epoch training metrics - confirm.
history.history

In [None]:
from evaluation import ABAEEvaluationProcessor
import pandas as pd

# Test split that goes with the 310k corpus.
test_corpus_path = "../dataset/output/default/pre_processed.310k.test.csv"
# index_to_key table taken from the word vectors of the manager's embedding model.
inv_vocab = abae_manager.generator.emb_model.model.wv.index_to_key
# Load the test split once and hand it, together with the manager, to the processor.
test_frame = pd.read_csv(test_corpus_path)
processor = ABAEEvaluationProcessor(abae_manager, test_frame)

In [None]:
from main.abae.dataset import PositiveNegativeABAEDataset
import pandas as pd
from torch.utils.data import DataLoader

# NPMI coherence from the evaluation processor, requested with top_n=10.
# NOTE(review): top_n is presumably the number of top words per aspect - confirm.
npmi_coh = processor.c_npmi_coherence_model(top_n=10)
npmi_coherence = npmi_coh.get_coherence()

# C_V coherence, requested with top_n=100 (a different top_n than the NPMI score above).
cv_coh = processor.c_v_coherence_model(top_n=100)
cv_coherence = cv_coh.get_coherence()

# Build the evaluation dataset from the test CSV, the embedding model's vocabulary
# and the sequence-length / negative-sample settings of the active configuration.
# NOTE(review): the same CSV is also read when `processor` is created - the frame could be shared.
df = pd.read_csv(test_corpus_path)
vocabulary = abae_manager.generator.emb_model.vocabulary()
max_seq_len = default_config.max_seq_len
negative_sample_size = default_config.negative_sample_size
test_ds = PositiveNegativeABAEDataset(df, vocabulary, max_seq_len, negative_sample_size)

# Evaluate the model on the test dataset, batched with the configured batch size.
# NOTE(review): this rebinds `results`, which earlier held the parsed tuning-results JSON.
results = model.evaluate(DataLoader(test_ds, batch_size=default_config.batch_size))

In [None]:
# Report each metric exactly once. The reconstruction result used to be printed both
# first and last; it is kept last, matching the order of the result blocks recorded
# earlier in this notebook (NPMI coherence, CV score, max margin reconstruction).
print(f"NPMI coherence: {npmi_coherence}")
print(f"CV score: {cv_coherence}")
print(f"Max margin reconstruction result: {results}")

In [None]:
from main.abae.model_manager import ABAEManager
from main.abae.config import ABAEManagerConfig

# Same 310k pre-processed corpus, this time with the second selected tuning config.
# NOTE(review): this cell does not set the mixed-precision policy itself; it presumably
# relies on the global policy set in an earlier cell of the same kernel - confirm.
corpus = "../dataset/output/default/pre_processed.310k.csv"
# Manager configuration named "abae_best_1", built from the second selected tuning config.
default_config = ABAEManagerConfig.from_configuration("abae_best_1", configs[1])
# Rebinds default_config, abae_manager, history and model from the "abae_best_0" run.
abae_manager = ABAEManager.from_scratch(default_config, corpus)
# Train; only the first element of the returned pair is kept, the second is discarded.
history, _ = abae_manager.train(corpus)
# NOTE(review): refresh=False presumably returns the model that was just trained - confirm.
model = abae_manager.get_compiled_model(refresh=False)