### What hyperparameters to tune:
Take a look here:
> https://stats.stackexchange.com/questions/349761/reasonable-hyperparameter-range-for-latent-dirichlet-allocation

So our go-to hyperparameters are:
- Number of topics
- alpha: Document-Topic Density
- beta: Word-Topic Density


### Parameters definition

In [1]:
from main.hp_tuning import UniqueParametersConfigFsGenerator, RandomTunableOffsetParameter

# Seed handed to the tunable parameter below (presumably to make its random
# sampling repeatable; confirm in main.hp_tuning).
seed = 1408
# Directory holding the file of configurations that were already seen.
config_path = "./output/config"

# Configuration generator for the noun-only corpus. `patience` presumably caps
# the attempts made to find a configuration that was not seen before (TODO confirm).
config_generator = UniqueParametersConfigFsGenerator(
    patience=100,
    seen_configurations_path=config_path,
    seen_configurations_filename="seen_configurations.noun_only.json",
)

# Only the number of topics is tuned for now: it is by far the most relevant
# hyperparameter; alpha and beta may follow later.
config_generator.add_parameter(
    'topics',
    RandomTunableOffsetParameter(value_range=(7, 72), step=5, seed=seed),
)

In [2]:
# config_generator.add_parameter('alpha', RandomTunableOffsetParameter(value_range=(0.005, 1.0), step=0.5, seed=seed))
# config_generator.add_parameter('beta', RandomTunableDiscreteParameter(values_list=beta, seed=seed))

In [3]:
import pandas as pd
import numpy as np

# Load the pre-processed corpus (noun-only, POS-tagged variant, judging by the path).
corpus = pd.read_csv("../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv")

# Split the rows into 5 contiguous, near-equal folds for cross-validation.
# np.array_split is applied to the row positions rather than to the DataFrame
# itself: handing it a DataFrame makes NumPy go through the deprecated
# DataFrame.swapaxes, which emits a FutureWarning on recent pandas versions.
# The fold boundaries are the same as those np.array_split(corpus, 5) produces.
folds = [corpus.iloc[fold_rows] for fold_rows in np.array_split(np.arange(len(corpus)), 5)]

  return bound(*args, **kwds)


In [None]:
from pathlib import Path
import json
from gensim.models import CoherenceModel
from main.lda.model import LdaGeneratorConfig, LdaModelGenerator

# Hyperparameter search with K-fold cross-validation on the noun-only corpus.
# Relies on `config_generator` and `folds` created by the cells above.
results = []
n_folds = 5
n_configurations = 10  # How many different configurations we want to see
tops = [3, 10, 25]  # `topn` values at which topic coherence is measured
# This script can be re-run as often as desired as the history is persisted
for i in range(n_configurations):

    config = next(config_generator)
    if config is None:
        break  # We could not find a new configuration in patience time

    print(f"Running configuration = {config} ({i + 1}/{n_configurations})")
    # One record per configuration; every metric collects one value per fold.
    run_result = dict(
        config=config, cv_coh={top: [] for top in tops}, npmi_coh={top: [] for top in tops}, perplexity=[]
    )
    for k in range(n_folds):  # K-fold CV
        # Build the correct data splits
        validation_split = folds[k]  # On what to compute the validation metrics
        train = pd.concat([folds[index] for index in range(len(folds)) if index != k])
        print(f"Running fold = {k}")
        model, dictionary = LdaModelGenerator(LdaGeneratorConfig().from_dict(config)).make_model(train)
        print("Model generation over, evaluating...")
        # Validation part: tokenise the held-out comments by splitting on single spaces
        texts = validation_split['comments'].apply(lambda x: x.split(' '))

        # Metrics tracking
        # NOTE(review): despite the key name, this stores whatever
        # model.log_perplexity returns; for a gensim LdaModel that is a per-word
        # likelihood bound rather than the perplexity itself — confirm the model type.
        perplexity = model.log_perplexity([dictionary.doc2bow(tokens) for tokens in texts])
        run_result['perplexity'].append(perplexity)

        for top in tops:
            cv_coh = CoherenceModel(model, texts=texts, coherence='c_v', topn=top)
            npmi_coh = CoherenceModel(model, texts=texts, coherence='c_npmi', topn=top)
            run_result['cv_coh'][top].append(cv_coh.get_coherence())
            run_result['npmi_coh'][top].append(npmi_coh.get_coherence())

    results.append(run_result)


results_path = "./output/config/hp_tuning_results.noun_only.json"
# Put this run's results in front of whatever earlier runs already stored.
# json.dump writes the integer `topn` keys as strings, so entries reloaded from
# disk are keyed by '3', '10' and '25'.
if Path(results_path).is_file():
    with open(results_path) as results_file:
        existing_res = json.load(results_file)
    results = results + existing_res
# Context managers make sure the file handles are closed (and the data flushed).
with open(results_path, 'w') as results_file:
    json.dump(results, results_file)

In [None]:
from main.hp_tuning import UniqueParametersConfigFsGenerator, RandomTunableOffsetParameter

# Seed handed to the tunable parameter below (presumably to make its random
# sampling repeatable; confirm in main.hp_tuning).
seed = 1408
# Directory holding the file of configurations that were already seen.
config_path = "./output/config"

# Configuration generator for the sentence-level noun-only corpus. `patience`
# presumably caps the attempts made to find an unseen configuration (TODO confirm).
config_generator = UniqueParametersConfigFsGenerator(
    patience=100,
    seen_configurations_path=config_path,
    seen_configurations_filename="seen_configurations.noun_only_sentence.json",
)

# Only the number of topics is tuned for now: it is by far the most relevant
# hyperparameter; alpha and beta may follow later.
config_generator.add_parameter(
    'topics',
    RandomTunableOffsetParameter(value_range=(7, 72), step=5, seed=seed),
)

In [None]:
import pandas as pd
import numpy as np

# Load the pre-processed corpus (noun-only, sentence-level POS-tagged variant,
# judging by the path).
corpus = pd.read_csv("../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.csv")

# Split the rows into 5 contiguous, near-equal folds for cross-validation.
# np.array_split is applied to the row positions rather than to the DataFrame
# itself: handing it a DataFrame makes NumPy go through the deprecated
# DataFrame.swapaxes, which emits a FutureWarning on recent pandas versions.
# The fold boundaries are the same as those np.array_split(corpus, 5) produces.
folds = [corpus.iloc[fold_rows] for fold_rows in np.array_split(np.arange(len(corpus)), 5)]

In [None]:
from pathlib import Path
import json
from gensim.models import CoherenceModel
from main.lda.model import LdaGeneratorConfig, LdaModelGenerator

# Hyperparameter search with K-fold cross-validation on the sentence-level
# noun-only corpus. Relies on `config_generator` and `folds` created by the
# cells above.
results = []
n_folds = 5
n_configurations = 10  # How many different configurations we want to see
tops = [3, 10, 25]  # `topn` values at which topic coherence is measured
# This script can be re-run as often as desired as the history is persisted
for i in range(n_configurations):

    config = next(config_generator)
    if config is None:
        break  # We could not find a new configuration in patience time

    print(f"Running configuration = {config} ({i + 1}/{n_configurations})")
    # One record per configuration; every metric collects one value per fold.
    run_result = dict(
        config=config, cv_coh={top: [] for top in tops}, npmi_coh={top: [] for top in tops}, perplexity=[]
    )
    for k in range(n_folds):  # K-fold CV
        # Build the correct data splits
        validation_split = folds[k]  # On what to compute the validation metrics
        train = pd.concat([folds[index] for index in range(len(folds)) if index != k])
        print(f"Running fold = {k}")
        model, dictionary = LdaModelGenerator(LdaGeneratorConfig().from_dict(config)).make_model(train)
        print("Model generation over, evaluating...")
        # Validation part: tokenise the held-out comments by splitting on single spaces
        texts = validation_split['comments'].apply(lambda x: x.split(' '))

        # Metrics tracking
        # NOTE(review): despite the key name, this stores whatever
        # model.log_perplexity returns; for a gensim LdaModel that is a per-word
        # likelihood bound rather than the perplexity itself — confirm the model type.
        perplexity = model.log_perplexity([dictionary.doc2bow(tokens) for tokens in texts])
        run_result['perplexity'].append(perplexity)

        for top in tops:
            cv_coh = CoherenceModel(model, texts=texts, coherence='c_v', topn=top)
            npmi_coh = CoherenceModel(model, texts=texts, coherence='c_npmi', topn=top)
            run_result['cv_coh'][top].append(cv_coh.get_coherence())
            run_result['npmi_coh'][top].append(npmi_coh.get_coherence())

    results.append(run_result)


results_path = "./output/config/hp_tuning_results.noun_only_sent.json"
# Put this run's results in front of whatever earlier runs already stored.
# json.dump writes the integer `topn` keys as strings, so entries reloaded from
# disk are keyed by '3', '10' and '25'.
if Path(results_path).is_file():
    with open(results_path) as results_file:
        existing_res = json.load(results_file)
    results = results + existing_res
# Context managers make sure the file handles are closed (and the data flushed).
with open(results_path, 'w') as results_file:
    json.dump(results, results_file)

In [1]:
from main.hp_tuning import UniqueParametersConfigFsGenerator, RandomTunableOffsetParameter

# Seed handed to the tunable parameter below (presumably to make its random
# sampling repeatable; confirm in main.hp_tuning).
seed = 1408
# Directory holding the file of configurations that were already seen.
config_path = "./output/config"

# Configuration generator for the default sentence-level corpus. `patience`
# presumably caps the attempts made to find an unseen configuration (TODO confirm).
config_generator = UniqueParametersConfigFsGenerator(
    patience=100,
    seen_configurations_path=config_path,
    seen_configurations_filename="seen_configurations.default_sentence.json",
)

# Only the number of topics is tuned for now: it is by far the most relevant
# hyperparameter; alpha and beta may follow later.
config_generator.add_parameter(
    'topics',
    RandomTunableOffsetParameter(value_range=(7, 72), step=5, seed=seed),
)

Loading previous configurations from: ./output/config/seen_configurations.default_sentence.json


In [2]:
import pandas as pd
import numpy as np

# Load the pre-processed corpus (default sentence-level variant, judging by the path).
corpus = pd.read_csv("../dataset/output/default_sentences/pre_processed.310k.csv")

# Split the rows into 5 contiguous, near-equal folds for cross-validation.
# np.array_split is applied to the row positions rather than to the DataFrame
# itself: handing it a DataFrame makes NumPy go through the deprecated
# DataFrame.swapaxes, which emits a FutureWarning on recent pandas versions.
# The fold boundaries are the same as those np.array_split(corpus, 5) produces.
folds = [corpus.iloc[fold_rows] for fold_rows in np.array_split(np.arange(len(corpus)), 5)]

  return bound(*args, **kwds)


In [5]:
from pathlib import Path
import json
from gensim.models import CoherenceModel
from main.lda.model import LdaGeneratorConfig, LdaModelGenerator

# TODO: wrap this in a function so I don't go crazy over the per-file copies
# Hyperparameter search with K-fold cross-validation on the default
# sentence-level corpus. Relies on `config_generator` and `folds` created by
# the cells above.
results = []
n_folds = 5
n_configurations = 10  # How many different configurations we want to see
tops = [3, 10, 25]  # `topn` values at which topic coherence is measured
# This script can be re-run as often as desired as the history is persisted
for i in range(n_configurations):

    config = next(config_generator)
    if config is None:
        break  # We could not find a new configuration in patience time

    print(f"Running configuration = {config} ({i + 1}/{n_configurations})")
    # One record per configuration; every metric collects one value per fold.
    run_result = dict(
        config=config, cv_coh={top: [] for top in tops}, npmi_coh={top: [] for top in tops}, perplexity=[]
    )
    for k in range(n_folds):  # K-fold CV
        # Build the correct data splits
        validation_split = folds[k]  # On what to compute the validation metrics
        train = pd.concat([folds[index] for index in range(len(folds)) if index != k])
        print(f"Running fold = {k}")
        model, dictionary = LdaModelGenerator(LdaGeneratorConfig().from_dict(config)).make_model(train)
        print("Model generation over, evaluating...")
        # Validation part: tokenise the held-out comments by splitting on single spaces
        texts = validation_split['comments'].apply(lambda x: x.split(' '))

        # Metrics tracking
        # NOTE(review): despite the key name, this stores whatever
        # model.log_perplexity returns; for a gensim LdaModel that is a per-word
        # likelihood bound rather than the perplexity itself — confirm the model type.
        perplexity = model.log_perplexity([dictionary.doc2bow(tokens) for tokens in texts])
        run_result['perplexity'].append(perplexity)

        for top in tops:
            cv_coh = CoherenceModel(model, texts=texts, coherence='c_v', topn=top)
            npmi_coh = CoherenceModel(model, texts=texts, coherence='c_npmi', topn=top)
            run_result['cv_coh'][top].append(cv_coh.get_coherence())
            run_result['npmi_coh'][top].append(npmi_coh.get_coherence())

    results.append(run_result)

results_path = "./output/config/hp_tuning_results.default_sentence.json"
# Put this run's results in front of whatever earlier runs already stored.
# json.dump writes the integer `topn` keys as strings, so entries reloaded from
# disk are keyed by '3', '10' and '25'.
if Path(results_path).is_file():
    with open(results_path) as results_file:
        existing_res = json.load(results_file)
    results = results + existing_res
# Context managers make sure the file handles are closed (and the data flushed).
with open(results_path, 'w') as results_file:
    json.dump(results, results_file)

Running configuration = {'topics': 14} (1/10)
Running fold = 0
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/187117 [00:00<?, ?it/s]

Model generation over, evaluating...
Running fold = 1
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/187117 [00:00<?, ?it/s]

Model generation over, evaluating...
Running fold = 2
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/187118 [00:00<?, ?it/s]

Model generation over, evaluating...
Running fold = 3
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/187118 [00:00<?, ?it/s]

Model generation over, evaluating...
Running fold = 4
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/187118 [00:00<?, ?it/s]

Model generation over, evaluating...


In [None]:
# TODO: fix this cell.
# NOTE(review): `plt`, `sns`, `num_topics` and `hyper_para_coherence` are not
# defined in any of the cells visible here — confirm where they come from.
# Reference: https://www.kaggle.com/code/vijaylokithrr/topic-modelling#7)-Hyperparameter-Tuning
# Line chart of topic coherence against the number of topics.
fig, ax = plt.subplots(figsize=(20, 7))
sns.lineplot(x=num_topics, y=hyper_para_coherence, label="topic coherences", ax=ax)
ax.set_title('Coherence per Number of Topics', fontsize=25)
ax.set_xlabel('Number of Topics', fontsize=20)
ax.set_ylabel('Coherence', fontsize=20)
plt.show()