### What hyperparameters to tune:
Take a look here:
> https://stats.stackexchange.com/questions/349761/reasonable-hyperparameter-range-for-latent-dirichlet-allocation

So our go-to hyperparameters are:
- Number of topics
- alpha: Document-Topic Density
- beta: Word-Topic Density


### Parameters definition

In [None]:
from main.hp_tuning import UniqueParametersConfigFsGenerator, RandomTunableOffsetParameter

# Seed handed to the parameter sampler below.
seed = 1408

# NOTE(review): the positional arguments are defined by UniqueParametersConfigFsGenerator;
# presumably a budget of 100, an output directory and the file tracking already-seen
# configurations — confirm against main.hp_tuning.
config_generator = UniqueParametersConfigFsGenerator(
    100,
    "./output/config",
    "seen_configurations.noun_only.json",
)

# Number of topics to look for: by far the most relevant parameter to tune,
# so it is the only one registered for this search.
topics_parameter = RandomTunableOffsetParameter(value_range=(7, 72), step=5, seed=seed)
config_generator.add_parameter('topics', topics_parameter)

# alpha and beta may be tuned later; kept disabled for now.
# (RandomTunableDiscreteParameter and `beta` are not imported/defined in this cell.)
# config_generator.add_parameter('alpha', RandomTunableOffsetParameter(value_range=(0.005, 1.0), step=0.5, seed=seed))
# config_generator.add_parameter('beta', RandomTunableDiscreteParameter(values_list=beta, seed=seed))

In [None]:
from hp_tuning import LDATuningProcedure
import pandas as pd

# Inputs and outputs of this tuning run (noun-only corpus).
corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv"
results_path = "./output/config/hp_tuning_results.noun_only.json"
n_configurations = 10  # Try 10 different configurations.

# Extra stop words passed to the tuning procedure together with the corpus.
stop_words = ['game', 'play', '<game_name>']
corpus = pd.read_csv(corpus_path)

# The `top` values match the keys ('3', '10', '25') read back from the stored
# coherence results further down the notebook.
procedure = LDATuningProcedure(generator=config_generator, top=[3, 10, 25], folds=5)
procedure.run(corpus, n_configurations, stop_words)
procedure.store_results(results_path)

In [None]:
from main.hp_tuning import UniqueParametersConfigFsGenerator, RandomTunableOffsetParameter

# Seed handed to the parameter sampler below.
seed = 1408

# NOTE(review): the positional arguments are defined by UniqueParametersConfigFsGenerator;
# presumably a budget of 100, an output directory and the file tracking already-seen
# configurations — confirm against main.hp_tuning.
seen_configurations_file = "seen_configurations.noun_only_sentence.json"
config_generator = UniqueParametersConfigFsGenerator(100, "./output/config", seen_configurations_file)

# Number of topics to look for: by far the most relevant parameter to tune,
# so it is the only one registered for this search.
topics_parameter = RandomTunableOffsetParameter(value_range=(7, 72), step=5, seed=seed)
config_generator.add_parameter('topics', topics_parameter)

# alpha and beta may be tuned later; kept disabled for now.
# (RandomTunableDiscreteParameter and `beta` are not imported/defined in this cell.)
# config_generator.add_parameter('alpha', RandomTunableOffsetParameter(value_range=(0.005, 1.0), step=0.5, seed=seed))
# config_generator.add_parameter('beta', RandomTunableDiscreteParameter(values_list=beta, seed=seed))

In [None]:
from hp_tuning import LDATuningProcedure
import pandas as pd

# Inputs and outputs of this tuning run (noun-only, sentence-level corpus).
corpus_path = "../dataset/output/pos_tagged_sentence_level/pre_processed.310k.noun_only.csv"
results_path = "./output/config/hp_tuning_results.noun_only_sent.json"
n_configurations = 10  # Try 10 different configurations.

# Extra stop words passed to the tuning procedure together with the corpus.
stop_words = ['game', 'play', '<game_name>']
corpus = pd.read_csv(corpus_path)

# The `top` values match the keys ('3', '10', '25') read back from the stored
# coherence results further down the notebook.
procedure = LDATuningProcedure(generator=config_generator, top=[3, 10, 25], folds=5)
procedure.run(corpus, n_configurations, stop_words)
procedure.store_results(results_path)

In [None]:
from main.hp_tuning import UniqueParametersConfigFsGenerator, RandomTunableOffsetParameter

# Seed handed to the parameter sampler below.
seed = 1408

# NOTE(review): the positional arguments are defined by UniqueParametersConfigFsGenerator;
# presumably a budget of 100, an output directory and the file tracking already-seen
# configurations — confirm against main.hp_tuning.
seen_configurations_file = "seen_configurations.default_sentence.json"
config_generator = UniqueParametersConfigFsGenerator(100, "./output/config", seen_configurations_file)

# Number of topics to look for: by far the most relevant parameter to tune,
# so it is the only one registered for this search.
topics_parameter = RandomTunableOffsetParameter(value_range=(7, 72), step=5, seed=seed)
config_generator.add_parameter('topics', topics_parameter)

# alpha and beta may be tuned later; kept disabled for now.
# (RandomTunableDiscreteParameter and `beta` are not imported/defined in this cell.)
# config_generator.add_parameter('alpha', RandomTunableOffsetParameter(value_range=(0.005, 1.0), step=0.5, seed=seed))
# config_generator.add_parameter('beta', RandomTunableDiscreteParameter(values_list=beta, seed=seed))

In [None]:
from hp_tuning import LDATuningProcedure
import pandas as pd

# Inputs and outputs of this tuning run (default sentence-level corpus).
corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
results_path = "./output/config/hp_tuning_results.default_sentence.json"
n_configurations = 10  # Try 10 different configurations.

# Extra stop words passed to the tuning procedure together with the corpus.
stop_words = ['game', 'play', '<game_name>']
corpus = pd.read_csv(corpus_path)

# The `top` values match the keys ('3', '10', '25') read back from the stored
# coherence results further down the notebook.
procedure = LDATuningProcedure(generator=config_generator, top=[3, 10, 25], folds=5)
results = procedure.run(corpus, n_configurations, stop_words)
procedure.store_results(results_path)

Visualize the results:

In [None]:
import json

import numpy as np  # needed by np.mean below; was missing, so a fresh kernel raised NameError
import pandas as pd

# Load the stored tuning results for the default-sentence corpus.
# The context manager closes the file handle (json.load(open(...)) left it open).
with open("./output/config/hp_tuning_results.default_sentence.json") as results_file:
    data = pd.DataFrame(json.load(results_file))

# Flatten each record: pull the topic count out of the config and replace the
# stored collections by their mean.
# NOTE(review): presumably one value per cross-validation fold — confirm against
# LDATuningProcedure.store_results.
data['topics'] = data['config'].map(lambda o: o['topics'])
data['perplexity'] = data['perplexity'].map(lambda x: np.mean(x))
for i in [3, 10, 25]:
    # Coherence entries are keyed by the top-N value as a string.
    data[f'{i}_npmi_coh'] = data['npmi_coh'].map(lambda x: np.mean(x[str(i)]))
    data[f'{i}_cv_coh'] = data['cv_coh'].map(lambda x: np.mean(x[str(i)]))

# Refined data: keep only the flattened columns.
data = data.drop(columns=['config', 'npmi_coh', 'cv_coh'])

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
data = data.sort_values(by="topics")

# (column, legend name) pairs, listed in the order the traces are added.
cv_series = [('3_cv_coh', 'top-3'), ('10_cv_coh', 'top-10'), ('25_cv_coh', 'top-25')]
npmi_series = [('3_npmi_coh', 'top-3'), ('10_npmi_coh', 'top-10'), ('25_npmi_coh', 'top-25')]

# Solid lines: CV coherence.
for column, label in cv_series:
    fig.add_trace(go.Scatter(x=data['topics'], y=data[column], mode='lines', name=label))

# Dashed lines: NPMI coherence; legend names repeat the top-N labels of the solid traces.
for column, label in npmi_series:
    fig.add_trace(go.Scatter(x=data['topics'], y=data[column], mode='lines', name=label, line=dict(dash='dash')))

# Markers are switched on for every trace at once.
fig.update_traces(mode='lines+markers')

# NOTE(review): title and y-axis mention only CV although NPMI traces are drawn too.
layout = dict(
    title=dict(text='Average CV coherence over k-folds per model (K different) - Default sentences'),
    xaxis=dict(title=dict(text='Model topics K')),
    yaxis=dict(title=dict(text='CV coherence')),
)
fig.update_layout(**layout)
fig.show()
print(f"Average perplexity for the current dataset LDA approach is of: {data[['perplexity', 'topics']]}")

In [None]:
import numpy as np  # needed by np.mean below; not imported by any earlier cell in this notebook section

# Load the stored tuning results for the noun-only corpus.
# The context manager closes the file handle (json.load(open(...)) left it open).
with open("./output/config/hp_tuning_results.noun_only.json") as results_file:
    data = pd.DataFrame(json.load(results_file))

# Flatten each record: pull the topic count out of the config and replace the
# stored collections by their mean.
# NOTE(review): presumably one value per cross-validation fold — confirm against
# LDATuningProcedure.store_results.
data['topics'] = data['config'].map(lambda o: o['topics'])
data['perplexity'] = data['perplexity'].map(lambda x: np.mean(x))
for i in [3, 10, 25]:
    # Coherence entries are keyed by the top-N value as a string.
    data[f'{i}_npmi_coh'] = data['npmi_coh'].map(lambda x: np.mean(x[str(i)]))
    data[f'{i}_cv_coh'] = data['cv_coh'].map(lambda x: np.mean(x[str(i)]))

# Refined data: keep only the flattened columns.
data = data.drop(columns=['config', 'npmi_coh', 'cv_coh'])

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
data = data.sort_values(by="topics")

# (column, legend name) pairs, listed in the order the traces are added.
cv_series = [('3_cv_coh', 'top-3'), ('10_cv_coh', 'top-10'), ('25_cv_coh', 'top-25')]
npmi_series = [('3_npmi_coh', 'top-3'), ('10_npmi_coh', 'top-10'), ('25_npmi_coh', 'top-25')]

# Solid lines: CV coherence.
for column, label in cv_series:
    fig.add_trace(go.Scatter(x=data['topics'], y=data[column], mode='lines', name=label))

# Dashed lines: NPMI coherence; legend names repeat the top-N labels of the solid traces.
for column, label in npmi_series:
    fig.add_trace(go.Scatter(x=data['topics'], y=data[column], mode='lines', name=label, line=dict(dash='dash')))

# Markers are switched on for every trace at once.
fig.update_traces(mode='lines+markers')

# NOTE(review): title and y-axis mention only CV although NPMI traces are drawn too.
layout = dict(
    title=dict(text='Average CV coherence over k-folds per model (K different) - Noun Only'),
    xaxis=dict(title=dict(text='Model topics K')),
    yaxis=dict(title=dict(text='CV coherence')),
)
fig.update_layout(**layout)
fig.show()
print(f"Average perplexity for the current dataset LDA approach is of: {data[['perplexity', 'topics']]}")

This tuning shows coherence degrading as the number of topics increases. <br>
Let's try narrowing the search to the range [7, 20].

In [None]:
import numpy as np  # needed by np.mean below; not imported by any earlier cell in this notebook section

# Load the stored tuning results for the noun-only, sentence-level corpus.
# The context manager closes the file handle (json.load(open(...)) left it open).
with open("./output/config/hp_tuning_results.noun_only_sent.json") as results_file:
    data = pd.DataFrame(json.load(results_file))

# Flatten each record: pull the topic count out of the config and replace the
# stored collections by their mean.
# NOTE(review): presumably one value per cross-validation fold — confirm against
# LDATuningProcedure.store_results.
data['topics'] = data['config'].map(lambda o: o['topics'])
data['perplexity'] = data['perplexity'].map(lambda x: np.mean(x))
for i in [3, 10, 25]:
    # Coherence entries are keyed by the top-N value as a string.
    data[f'{i}_npmi_coh'] = data['npmi_coh'].map(lambda x: np.mean(x[str(i)]))
    data[f'{i}_cv_coh'] = data['cv_coh'].map(lambda x: np.mean(x[str(i)]))

# Refined data: keep only the flattened columns.
data = data.drop(columns=['config', 'npmi_coh', 'cv_coh'])

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
data = data.sort_values(by="topics")

# (column, legend name) pairs, listed in the order the traces are added.
cv_series = [('3_cv_coh', 'top-3'), ('10_cv_coh', 'top-10'), ('25_cv_coh', 'top-25')]
npmi_series = [('3_npmi_coh', 'top-3'), ('10_npmi_coh', 'top-10'), ('25_npmi_coh', 'top-25')]

# Solid lines: CV coherence.
for column, label in cv_series:
    fig.add_trace(go.Scatter(x=data['topics'], y=data[column], mode='lines', name=label))

# Dashed lines: NPMI coherence; legend names repeat the top-N labels of the solid traces.
for column, label in npmi_series:
    fig.add_trace(go.Scatter(x=data['topics'], y=data[column], mode='lines', name=label, line=dict(dash='dash')))

# Markers are switched on for every trace at once.
fig.update_traces(mode='lines+markers')

# NOTE(review): title and y-axis mention only CV although NPMI traces are drawn too.
layout = dict(
    title=dict(text='Average CV coherence over k-folds per model (K different) - Noun sentences'),
    xaxis=dict(title=dict(text='Model topics K')),
    yaxis=dict(title=dict(text='CV coherence')),
)
fig.update_layout(**layout)
fig.show()
print(f"Average perplexity for the current dataset LDA approach is of: {data[['perplexity', 'topics']]}")

Noun-only sentences perform worst on our metrics and are rather inconsistent.

In [None]:
# Of the three datasets seen, we dig deeper into only these two:
# - hp_tuning_results.noun_only.json (most robust on every metric)
# - hp_tuning_results.default_sentence.json (even though this one seems less promising)

### Parameter redefinition

In [None]:
from main.hp_tuning import UniqueParametersConfigFsGenerator, RandomTunableOffsetParameter

# Seed handed to the parameter sampler below.
seed = 1408

# NOTE(review): the positional arguments are defined by UniqueParametersConfigFsGenerator;
# presumably a budget of 100, an output directory and the file tracking already-seen
# configurations — confirm against main.hp_tuning.
config_generator = UniqueParametersConfigFsGenerator(
    100,
    "./output/config",
    "seen_configurations.noun_only.json",
)

# Refined search over the number of topics: range (7, 20) with step 1.
# Topics remain the only parameter registered for this search.
topics_parameter = RandomTunableOffsetParameter(value_range=(7, 20), step=1, seed=seed)
config_generator.add_parameter('topics', topics_parameter)

# alpha and beta may be tuned later; kept disabled for now.
# (RandomTunableDiscreteParameter and `beta` are not imported/defined in this cell.)
# config_generator.add_parameter('alpha', RandomTunableOffsetParameter(value_range=(0.005, 1.0), step=0.5, seed=seed))
# config_generator.add_parameter('beta', RandomTunableDiscreteParameter(values_list=beta, seed=seed))

In [None]:
from hp_tuning import LDATuningProcedure
import pandas as pd

# Inputs and outputs of this tuning run (noun-only corpus, refined topic range).
corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv"
# Same results file as the first noun-only run, so store_results targets that path again.
results_path = "./output/config/hp_tuning_results.noun_only.json"
n_configurations = 10  # Try 10 different configurations.

# Extra stop words passed to the tuning procedure together with the corpus.
stop_words = ['game', 'play', '<game_name>']
corpus = pd.read_csv(corpus_path)

# The `top` values match the keys ('3', '10', '25') read back from the stored
# coherence results further down the notebook.
procedure = LDATuningProcedure(generator=config_generator, top=[3, 10, 25], folds=5)
results = procedure.run(corpus, n_configurations, stop_words)
procedure.store_results(results_path)

In [None]:
# Display `results`, i.e. whatever the most recently executed procedure.run(...) call returned.
results

In [None]:
from main.hp_tuning import UniqueParametersConfigFsGenerator, RandomTunableOffsetParameter

# Seed handed to the parameter sampler below.
seed = 1408

# NOTE(review): the positional arguments are defined by UniqueParametersConfigFsGenerator;
# presumably a budget of 100, an output directory and the file tracking already-seen
# configurations — confirm against main.hp_tuning.
seen_configurations_file = "seen_configurations.default_sentence.json"
config_generator = UniqueParametersConfigFsGenerator(100, "./output/config", seen_configurations_file)

# Refined search over the number of topics: range (7, 20) with step 1.
# Topics remain the only parameter registered for this search.
topics_parameter = RandomTunableOffsetParameter(value_range=(7, 20), step=1, seed=seed)
config_generator.add_parameter('topics', topics_parameter)

# alpha and beta may be tuned later; kept disabled for now.
# (RandomTunableDiscreteParameter and `beta` are not imported/defined in this cell.)
# config_generator.add_parameter('alpha', RandomTunableOffsetParameter(value_range=(0.005, 1.0), step=0.5, seed=seed))
# config_generator.add_parameter('beta', RandomTunableDiscreteParameter(values_list=beta, seed=seed))

In [None]:
from hp_tuning import LDATuningProcedure
import pandas as pd

# Inputs and outputs of this tuning run (default sentence-level corpus, refined topic range).
corpus_path = "../dataset/output/default_sentences/pre_processed.310k.csv"
# Same results file as the first default-sentence run, so store_results targets that path again.
results_path = "./output/config/hp_tuning_results.default_sentence.json"
n_configurations = 10  # Try 10 different configurations.

# Extra stop words passed to the tuning procedure together with the corpus.
stop_words = ['game', 'play', '<game_name>']
corpus = pd.read_csv(corpus_path)

# The `top` values match the keys ('3', '10', '25') read back from the stored
# coherence results further down the notebook.
procedure = LDATuningProcedure(generator=config_generator, top=[3, 10, 25], folds=5)
results = procedure.run(corpus, n_configurations, stop_words)
procedure.store_results(results_path)

Results graphs:

In [None]:
import json

import numpy as np  # needed by np.mean below; was only imported by a later cell

# Load the stored tuning results for the noun-only corpus (refined topic range).
# The context manager closes the file handle (json.load(open(...)) left it open).
with open("./output/config/hp_tuning_results.noun_only.json") as results_file:
    data = pd.DataFrame(json.load(results_file))

# Flatten each record: pull the topic count out of the config and replace the
# stored collections by their mean.
# NOTE(review): presumably one value per cross-validation fold — confirm against
# LDATuningProcedure.store_results.
data['topics'] = data['config'].map(lambda o: o['topics'])
data['perplexity'] = data['perplexity'].map(lambda x: np.mean(x))
for i in [3, 10, 25]:
    # Coherence entries are keyed by the top-N value as a string.
    data[f'{i}_npmi_coh'] = data['npmi_coh'].map(lambda x: np.mean(x[str(i)]))
    data[f'{i}_cv_coh'] = data['cv_coh'].map(lambda x: np.mean(x[str(i)]))

# Refined data: keep only the flattened columns.
data = data.drop(columns=['config', 'npmi_coh', 'cv_coh'])

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
data = data.sort_values(by="topics")

# (column, legend name) pairs, listed in the order the traces are added.
cv_series = [('3_cv_coh', 'top-3'), ('10_cv_coh', 'top-10'), ('25_cv_coh', 'top-25')]
npmi_series = [('3_npmi_coh', 'top-3'), ('10_npmi_coh', 'top-10'), ('25_npmi_coh', 'top-25')]

# Solid lines: CV coherence.
for column, label in cv_series:
    fig.add_trace(go.Scatter(x=data['topics'], y=data[column], mode='lines', name=label))

# Dashed lines: NPMI coherence; legend names repeat the top-N labels of the solid traces.
for column, label in npmi_series:
    fig.add_trace(go.Scatter(x=data['topics'], y=data[column], mode='lines', name=label, line=dict(dash='dash')))

# Markers are switched on for every trace at once.
fig.update_traces(mode='lines+markers')

# NOTE(review): title and y-axis mention only CV although NPMI traces are drawn too,
# and the title does not say which dataset is plotted.
layout = dict(
    title=dict(text='Average CV coherence over k-folds per model (K different)'),
    xaxis=dict(title=dict(text='Model topics K')),
    yaxis=dict(title=dict(text='CV coherence')),
)
fig.update_layout(**layout)
fig.show()
print(f"Average perplexity for the current dataset LDA approach is of: {data[['perplexity', 'topics']]}")

In [None]:
import json
import numpy as np

# Load the stored tuning results for the default-sentence corpus (refined topic range).
# The context manager closes the file handle (json.load(open(...)) left it open).
with open("./output/config/hp_tuning_results.default_sentence.json") as results_file:
    data = pd.DataFrame(json.load(results_file))

# Flatten each record: pull the topic count out of the config and replace the
# stored collections by their mean.
# NOTE(review): presumably one value per cross-validation fold — confirm against
# LDATuningProcedure.store_results.
data['topics'] = data['config'].map(lambda o: o['topics'])
data['perplexity'] = data['perplexity'].map(lambda x: np.mean(x))
for i in [3, 10, 25]:
    # Coherence entries are keyed by the top-N value as a string.
    data[f'{i}_npmi_coh'] = data['npmi_coh'].map(lambda x: np.mean(x[str(i)]))
    data[f'{i}_cv_coh'] = data['cv_coh'].map(lambda x: np.mean(x[str(i)]))

# Refined data: keep only the flattened columns.
data = data.drop(columns=['config', 'npmi_coh', 'cv_coh'])

For the configurations searched, the best setting seems to be K=11.

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
data = data.sort_values(by="topics")

# (column, legend name) pairs, listed in the order the traces are added.
cv_series = [('3_cv_coh', 'top-3'), ('10_cv_coh', 'top-10'), ('25_cv_coh', 'top-25')]
npmi_series = [('3_npmi_coh', 'top-3'), ('10_npmi_coh', 'top-10'), ('25_npmi_coh', 'top-25')]

# Solid lines: CV coherence.
for column, label in cv_series:
    fig.add_trace(go.Scatter(x=data['topics'], y=data[column], mode='lines', name=label))

# Dashed lines: NPMI coherence; legend names repeat the top-N labels of the solid traces.
for column, label in npmi_series:
    fig.add_trace(go.Scatter(x=data['topics'], y=data[column], mode='lines', name=label, line=dict(dash='dash')))

# Markers are switched on for every trace at once.
fig.update_traces(mode='lines+markers')

# NOTE(review): title and y-axis mention only CV although NPMI traces are drawn too,
# and the title does not say which dataset is plotted.
layout = dict(
    title=dict(text='Average CV coherence over k-folds per model (K different)'),
    xaxis=dict(title=dict(text='Model topics K')),
    yaxis=dict(title=dict(text='CV coherence')),
)
fig.update_layout(**layout)
fig.show()
print(f"Average perplexity for the current dataset LDA approach is of: {data[['perplexity', 'topics']]}")

Let's try to tune alpha and beta

In [None]:
from model import LdaModelGenerator, LdaGeneratorConfig

# Extra stop words handed to the model generator.
stop_words = ['game', 'play', '<game_name>']

# Noun-only corpus used to build the model.
corpus = pd.read_csv("../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv")

# Configuration named "k_tuned" with 11 topics; other settings are left to
# LdaGeneratorConfig's own defaults.
k_tuned_config = LdaGeneratorConfig("k_tuned", topics=11)
model_generator = LdaModelGenerator(k_tuned_config, stop_words)
model, dictionary = model_generator.make_model(corpus)

In [None]:
from gensim.models import CoherenceModel
import pandas as pd

# Test split of the noun-only corpus; each comment is tokenised by splitting on single spaces.
test_df = pd.read_csv("../dataset/output/pos_tagged/pre_processed.310k.noun_only.test.csv")
texts = test_df['comments'].apply(lambda x: x.split(' '))

# (result key, gensim coherence measure) pairs evaluated for every top-N value.
coherence_measures = (('cv_coh', 'c_v'), ('npmi_coh', 'c_npmi'))

i_results = dict(cv_coh={}, npmi_coh={})
for top in [3, 5, 10, 15, 20, 25, 50]:
    # Build both coherence models first, then evaluate them in the same order.
    scorers = [
        (result_key, CoherenceModel(model, texts=texts, coherence=measure, topn=top))
        for result_key, measure in coherence_measures
    ]
    for result_key, scorer in scorers:
        i_results[result_key][top] = scorer.get_coherence()

In [None]:
# Display the coherence scores collected per measure ('cv_coh', 'npmi_coh') and top-N value.
i_results

In [23]:
# Show the model's topics, asking for 30 words per topic.
model.print_topics(num_words=30)

[(0,
  '0.049*"action" + 0.033*"point" + 0.028*"worker" + 0.027*"placement" + 0.024*"card" + 0.024*"resource" + 0.023*"turn" + 0.021*"building" + 0.015*"round" + 0.013*"scoring" + 0.012*"board" + 0.012*"way" + 0.011*"end" + 0.010*"order" + 0.010*"victory" + 0.010*"lot" + 0.010*"mechanism" + 0.009*"thing" + 0.009*"bonus" + 0.009*"tile" + 0.009*"mechanic" + 0.009*"area" + 0.009*"space" + 0.008*"money" + 0.008*"engine" + 0.008*"opponent" + 0.008*"player" + 0.008*"cube" + 0.007*"management" + 0.007*"selection"'),
 (1,
  '0.205*"card" + 0.036*"luck" + 0.022*"character" + 0.020*"player" + 0.018*"hand" + 0.016*"minute" + 0.014*"filler" + 0.014*"decision" + 0.013*"ability" + 0.012*"set" + 0.012*"draw" + 0.012*"turn" + 0.011*"time" + 0.010*"deck" + 0.009*"round" + 0.009*"opponent" + 0.009*"way" + 0.008*"randomness" + 0.007*"mission" + 0.007*"event" + 0.007*"effect" + 0.006*"power" + 0.006*"number" + 0.006*"choice" + 0.006*"hero" + 0.006*"drafting" + 0.005*"end" + 0.005*"mechanic" + 0.005*"skill