In [1]:
corpus_file = "../../data/corpus.csv"
test_fraction = 0.25  # Fraction of samples used to generate a test set.
# (size, random_state) pairs, one per dataset to generate.
# Kept as a list rather than a bare zip(): a zip object is a one-shot iterator, so only the
# first process(...) call would see any setup and every later cell would silently do nothing.
generation_setups = list(zip([80000, 310000], [2, 8]))

In [2]:
from collections.abc import Iterable

from pre_processing import PreProcessingServiceFactory
from dataset_sampler import BggDatasetRandomBalancedSampler

## Pre-processing
We produce two datasets: a "scouting" one for the first steps, and one for the final models, which are eventually hyperparameter-tuned.

In [4]:
import pandas as pd

# Game names that get replaced with the <GAME_NAME> tag: the "Name" column of the
# resource CSV, followed by two hand-added entries.
game_names = pd.read_csv("../../resources/2024-08-18.csv")["Name"].tolist() + ["Quick", "Catan"]
print(f"We have a total of {len(game_names)} different game names.")

We have a total of 25901 different game names.


In [5]:
def process(service, generation_setup: Iterable[tuple[int, int]], corpus_file_path: str):
    """Run ``service.process_dataset`` once per ``(size, random_state)`` setup.

    Args:
        service: Object exposing ``process_dataset(size, sampler)``.
        generation_setup: Iterable of ``(size, random_state)`` pairs. Prefer a list:
            a ``zip``/generator can only be consumed once, so passing the same one to
            a second call yields nothing.
        corpus_file_path: Path of the corpus CSV handed to the sampler.

    Returns:
        None.

    Warns:
        UserWarning: if ``generation_setup`` yielded no pair, i.e. nothing was processed.
    """
    import warnings

    processed = 0
    for size, random_state in generation_setup:
        # The sampler's first argument is one tenth of the target size
        # (presumably its per-round sample size -- confirm in dataset_sampler).
        sampler = BggDatasetRandomBalancedSampler(int(size / 10), corpus_file_path, random_state)
        service.process_dataset(int(size), sampler)
        processed += 1

    if processed == 0:
        # Surface the "exhausted iterator" case instead of finishing silently.
        warnings.warn(
            "process() received no generation setups, so nothing was generated. "
            "If generation_setup is a zip/generator it may already have been consumed "
            "by an earlier call; pass a list instead.",
            stacklevel=2,
        )

## Default

In [None]:
# Build the default pre-processing service and run it for each entry of generation_setups.
ps = PreProcessingServiceFactory.default(
    game_names,
    "./output/default",
    test_fraction,
)
process(ps, generation_setups, corpus_file)

## Sentence-split

In [None]:
# Build the sentence-split variant of the default service and run it for each entry
# of generation_setups.
sentence_ps = PreProcessingServiceFactory.default_sentences(
    game_names,
    "./output/default_sentences",
    test_fraction,
)
process(sentence_ps, generation_setups, corpus_file)

### POS-tagged

In [19]:
# Build the POS-tagged service and run it for each entry of generation_setups.
pos_tag_ps = PreProcessingServiceFactory.pos_tagged(
    game_names,
    "./output/pos_tagged",
    test_fraction,
)
process(pos_tag_ps, generation_setups, corpus_file)

I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 5886/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 11779/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 17670/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 23511/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 29253/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 35092/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 40879/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 46755/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 52442/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 58110/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 63857/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 69588/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 75303/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 80935/80000
Processing terminated. We are storing the work ready file now...
Test subset created with success as ./output/pos_tagged/pre_processed.80k.test.csv
File created with success as ./output/pos_tagged/pre_processed.80k.csv
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 20590/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 40943/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 61045/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 81005/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 100723/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 120193/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 139659/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 158760/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 177863/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 196901/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31071 [00:00<?, ?it/s]

ds_size: 215479/310000
I have a total of 2218 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31022 [00:00<?, ?it/s]

ds_size: 234051/310000
I have a total of 2211 games with reviews. We take 15 reviews per game.


Pandas Apply:   0%|          | 0/33115 [00:00<?, ?it/s]

ds_size: 253737/310000
I have a total of 2206 games with reviews. We take 15 reviews per game.


Pandas Apply:   0%|          | 0/33026 [00:00<?, ?it/s]

ds_size: 273279/310000
I have a total of 2198 games with reviews. We take 15 reviews per game.


Pandas Apply:   0%|          | 0/32916 [00:00<?, ?it/s]

ds_size: 292741/310000
I have a total of 2188 games with reviews. We take 15 reviews per game.


Pandas Apply:   0%|          | 0/32717 [00:00<?, ?it/s]

ds_size: 311703/310000
Processing terminated. We are storing the work ready file now...
Test subset created with success as ./output/pos_tagged/pre_processed.310k.test.csv
File created with success as ./output/pos_tagged/pre_processed.310k.csv


Now a POS-tagged variant at the sentence level.

In [6]:
# Build the sentence-level POS-tagged service and run it for each entry of generation_setups.
out_path = "./output/pos_tagged_sentence_level"
pos_tag_sentence_ps = PreProcessingServiceFactory.pos_tagged_sentence_level(
    game_names,
    out_path,
    test_fraction,
)
process(pos_tag_sentence_ps, generation_setups, corpus_file)

In [5]:
# Build the "no replacement" service (this factory takes no game-name list) and run it
# for each entry of generation_setups.
# Open question: could removing dates, numbers, etc. be harmful?
out_path = "./output/sentences_no_replacement"
sentence_ps = PreProcessingServiceFactory.default_no_replacement(
    out_path,
    test_fraction,
)
process(sentence_ps, generation_setups, corpus_file)

I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 5911/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 11816/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 17718/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 23574/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 29331/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 35179/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 40976/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 46864/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 52569/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 58243/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 64001/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 69746/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 75469/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 81104/80000
Processing terminated. We are storing the work ready file now...
Test subset created with success as ./output/sentences_no_replacement/pre_processed.80k.test.csv
File created with success as ./output/sentences_no_replacement/pre_processed.80k.csv
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 20633/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 41029/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 61174/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 81178/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 100932/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 120420/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 139906/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 159029/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 178166/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31080 [00:00<?, ?it/s]

ds_size: 197237/310000
I have a total of 2220 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31071 [00:00<?, ?it/s]

ds_size: 215835/310000
I have a total of 2218 games with reviews. We take 14 reviews per game.


Pandas Apply:   0%|          | 0/31022 [00:00<?, ?it/s]

ds_size: 234424/310000
I have a total of 2211 games with reviews. We take 15 reviews per game.


Pandas Apply:   0%|          | 0/33115 [00:00<?, ?it/s]

ds_size: 254132/310000
I have a total of 2206 games with reviews. We take 15 reviews per game.


Pandas Apply:   0%|          | 0/33026 [00:00<?, ?it/s]

ds_size: 273703/310000
I have a total of 2198 games with reviews. We take 15 reviews per game.


Pandas Apply:   0%|          | 0/32916 [00:00<?, ?it/s]

ds_size: 293190/310000
I have a total of 2188 games with reviews. We take 15 reviews per game.


Pandas Apply:   0%|          | 0/32717 [00:00<?, ?it/s]

ds_size: 312178/310000
Processing terminated. We are storing the work ready file now...
Test subset created with success as ./output/sentences_no_replacement/pre_processed.310k.test.csv
File created with success as ./output/sentences_no_replacement/pre_processed.310k.csv


Since ABAE mysteriously works better on less data, let's check whether it was just a lucky seed:

In [3]:
# 22 ~ "We may not have much in common, you and I. Still, I consider you as a friend"
# Materialised as a list (not a bare zip) so that more than one process(...) call can
# iterate it; a zip object would be empty after the first call.
generation_setups = list(zip([80000], [22]))  # list of (size, random_state)

In [None]:
# Default service again, writing to a separate "-22" output directory, run for each
# entry of the current generation_setups.
ps = PreProcessingServiceFactory.default(
    game_names,
    "./output/default-22",
    test_fraction,
)
process(ps, generation_setups, corpus_file)

In [7]:
# Sentence-split service again, writing to a separate "-22" output directory, run for
# each entry of the current generation_setups.
sentence_ps = PreProcessingServiceFactory.default_sentences(
    game_names,
    "./output/default_sentences-22",
    test_fraction,
)
process(sentence_ps, generation_setups, corpus_file)

In [8]:
# Same 80k target size, this time with random_state 8, written to "./output/default-8".
# Kept as a list (not a bare zip) so the setup is not exhausted after one process(...) call.
generation_setups = list(zip([80000], [8]))  # list of (size, random_state)
ps = PreProcessingServiceFactory.default(game_names, "./output/default-8", test_fraction)
process(ps, generation_setups, corpus_file)

I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 5912/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 11787/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 17654/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 23483/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 29281/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 35132/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 40991/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 46736/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 52470/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 58123/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 63860/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 69539/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 75238/80000
I have a total of 2220 games with reviews. We take 4 reviews per game.


Pandas Apply:   0%|          | 0/8880 [00:00<?, ?it/s]

ds_size: 80920/80000
Processing terminated. We are storing the work ready file now...
Test subset created with success as ./output/default-8/pre_processed.80k.test.csv
File created with success as ./output/default-8/pre_processed.80k.csv
