In [None]:
import itertools

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

In [None]:
# Load the candidate word lists for the 12-letter and 8-letter puzzles.
# Each file is read without a header row and squeezed into a Series named
# 'Word' (assumes the files hold one word per line — TODO confirm).
twaalfletterwoorden = pd.read_csv('tweevoortwaalf/Data/suitable_12_letter_words.txt', header=None).squeeze().rename('Word')
achtletterwoorden = pd.read_csv('tweevoortwaalf/Data/suitable_8_letter_words.txt', header=None).squeeze().rename('Word')

# Completeness of the word lists
A lot of effort was put into creating word lists of as high a quality as possible. We want to know both the recall and the precision of these word lists: do they capture all words that are actually used in the game show (recall), and could every word on the lists actually be used in the show (precision)?

If both of these are ok, we can actually use the lists for training and analyzing

If the precision is too low, meaning there are many words on the word lists that wouldn't actually occur in Twee Voor Twaalf, practice would be inefficient: we might be puzzling for nonexistent or very rare words - and even training for patterns that do not exist. It can be demotivating too, if the puzzle is undoable because you have to guess a nonexistent word. There is no easy way to automatically check this, since we cannot look into the heads of the editors of the show. However, based on actually playing this game, this does not seem to be a big problem.
If the recall is too low, meaning there are many words used in the show that are not on the word lists, we may become overconfident, not learning enough patterns, and/or memorizing words where other options are available. If the recall is very high, it could even suffice to memorize the entire word list.



In [None]:
# Hand-collected sample of 12-letter words, gathered from memory and Google,
# used below as the reference set for estimating recall of the word list.
# NOTE(review): the original comment called this an "unbased" sample —
# presumably "unbiased" was meant; a sample recalled from memory may well be
# biased towards memorable words. Confirm with the author.
true_12_letter_words = [
    'koekoeksklok',
    'kalenderjaar',
    'zeekomkommer',
    'zonnestelsel',
    'biljartlaken',
    'kroonprinses',
    'ballingschap',
]


def standard_error_binomial(p, n):
    """Return the standard error of a binomial proportion.

    Computes ``sqrt(p * (1 - p) / n)`` for a proportion ``p`` observed over
    ``n`` trials.
    """
    variance_of_proportion = (p * (1 - p)) / n
    return variance_of_proportion ** 0.5


def bootstrap_sample(sample, alpha=0.95, n_bootstrap=10_000, random_state=None):
    """Estimate a percentile bootstrap confidence interval for the sample mean.

    The sample is resampled with replacement ``n_bootstrap`` times; the mean
    of every replication is computed and the requested percentiles of those
    means are returned.

    Args:
        sample: One-dimensional sequence of observations (e.g. booleans
            marking whether a word was found).
        alpha: Confidence level of the interval. Despite the name this is the
            coverage (0.95 gives the 2.5th and 97.5th percentiles), not the
            significance level.
        n_bootstrap: Number of bootstrap replications.
        random_state: Optional seed (or any value accepted by
            ``numpy.random.default_rng``) to make the result reproducible.

    Returns:
        Tuple ``(lower, median, upper)`` of the bootstrap distribution of the
        mean.

    Raises:
        ValueError: If ``sample`` is empty.
    """
    values = np.asarray(sample)
    if values.size == 0:
        raise ValueError("Cannot bootstrap an empty sample")

    rng = np.random.default_rng(random_state)
    # Draw all replications in a single call: one row per replication.
    replications = rng.choice(values, size=(n_bootstrap, len(values)), replace=True)
    recalls = replications.mean(axis=1)

    # Percentage of the distribution cut off in each tail.
    tail = ((1.0 - alpha) / 2.0) * 100
    lower, median, upper = np.percentile(recalls, [tail, 50, 100 - tail])

    return lower, median, upper

# Build the lookup set once; constructing it inside the comprehension would
# rebuild it from the full word list for every reference word.
known_twaalfletterwoorden = set(twaalfletterwoorden)
# One boolean per reference word: is it present in the word list?
occ_twaalfletter_woorden = [true_word in known_twaalfletterwoorden for true_word in true_12_letter_words]
recall_twaalfletterwoorden = sum(occ_twaalfletter_woorden) / len(occ_twaalfletter_woorden)
se_binomial = standard_error_binomial(recall_twaalfletterwoorden, len(occ_twaalfletter_woorden))
lower, median, upper = bootstrap_sample(occ_twaalfletter_woorden)
print('Recall of 12 letter words')
# 1.96 standard errors is the half-width of a 95% normal-approximation interval.
print(f'Based on a parametric test: {recall_twaalfletterwoorden:.1%} +- {1.96 * se_binomial:.1%}')
print(f'Based on bootstrap sampling, estimate is {median:.1%}, with a confidence interval of [{lower:.1%}, {upper:.1%}]' )

# I do not have any 8 letter words from memory, and only one 9 letter word, which
# is not enough to calculate the estimated recall
# NOTE(review): 'fauteuil' has 8 letters, not 9, so the comment above and the
# variable name below do not match the data — confirm which length is meant.
true_9_letter_words = [
    'fauteuil',
]

# Structure of words
By analyzing the common structure of 12-letter words, we can improve the guessing and buying process

## Occurrence and placement of letters
We can improve our strategy of buying letters by knowing which letters occur at which places. The first letter often helps most. Some letters we may not have to buy, because we can figure out where they will go. We will calculate for each letter how often it occurs on each position

In [None]:
def get_occurence_ngrams(ngram_length=1, wordlist=twaalfletterwoorden):
    """Count how often each character n-gram occurs across a word list.

    Args:
        ngram_length: Length of the character n-grams to count.
        wordlist: Iterable of words to analyse. Defaults to the 12-letter
            word list.

    Returns:
        A Series indexed by n-gram with the total number of occurrences over
        all words. Spaces in the n-gram labels (produced by the ``char_wb``
        analyzer) are replaced by ``'_'``.
    """
    cv = CountVectorizer(analyzer='char_wb', ngram_range=(ngram_length, ngram_length))
    # Fit on the list that was passed in; previously the argument was ignored
    # and the 12-letter list was always used.
    occurences = cv.fit_transform(wordlist)
    ngram_labels = [name.replace(' ', '_') for name in cv.get_feature_names_out()]
    # Sum the sparse count matrix per n-gram without converting it to a dense array.
    totals = np.asarray(occurences.sum(axis=0)).ravel()
    return pd.Series(totals, index=ngram_labels)

In [None]:
# Total occurrences of every single character (1-gram) in the 12-letter list,
# displayed from most to least frequent.
letter_occurences = get_occurence_ngrams()
letter_occurences.sort_values(ascending=False).to_frame('# occurrences in word list')

In [None]:
# Long-format table with one row per (word, character): every word is turned
# into a list of (position, character) tuples with 1-based positions, exploded
# to one tuple per row, and the tuple is split into 'Location' and 'Letter'.
# The 'Word' column keeps the original tuple.
letter_position = (twaalfletterwoorden
            .apply(lambda x: list(enumerate(x, start=1)))
            .explode()
            .to_frame()
            .assign(Location = lambda df: df['Word'].apply(lambda tup: tup[0]),
                    Letter = lambda df: df['Word'].apply(lambda tup: tup[1]),
            )
)
# Per letter, the fraction of its occurrences found at each position:
# rows are letters, columns are positions, and combinations that never occur
# are filled with 0.
letter_position_frequency = (letter_position.groupby('Letter')['Location']
                            .value_counts(normalize=True)
                            .unstack(fill_value=0)
                            )
# For each letter, show its most common position and the share of the letter's
# occurrences at that position, sorted by that share in descending order.
(letter_position_frequency.agg(['idxmax', 'max'], axis='columns')
 .sort_values('max', ascending=False)
 .rename(columns={'idxmax': 'MostCommonPosition', 'max': 'Percentage'})
 .astype({'MostCommonPosition': int})
 .style.format({"Percentage": '{:.1%}'})
)

Because a table of 27 letters * 12 positions is impossible to learn, we try to cluster letters that behave alike. We use t-SNE to visualise this and find there are 5 distinct groups.

In [None]:
# Embed each letter's position-frequency profile in two dimensions with t-SNE;
# the fixed random_state keeps the embedding the same between runs.
tsne = TSNE(perplexity=5, random_state=42)
transformed = tsne.fit_transform(letter_position_frequency)
transformed = pd.DataFrame(transformed, index=letter_position_frequency.index, columns=['FirstComponent', 'SecondComponent'])
ax = transformed.plot(kind='scatter', x='FirstComponent', y='SecondComponent')
# Label every point with its letter (the row index), slightly offset from the marker.
for k, v in transformed.iterrows():
    ax.annotate(k, v,
                xytext=(10,-5), textcoords='offset points',
                family='sans-serif', fontsize=18, color='darkslategrey')


In [None]:
# Letter clusters chosen by hand; each string is split into its individual
# characters (note that 'ĳ' is a single character).
letter_groups = [
    list(letters)
    for letters in ('bzvw', 'mpskh', 'fdtrlg', 'uaoiyc', 'jĳne')
]
# Show the position profile of every cluster as a percentage table.
for members in letter_groups:
    group_profile = letter_position_frequency.loc[members]
    display(group_profile.style.format('{:.1%}'))

In terms of letter placement, we have a small number of groups:
1. b, z, v, w: Very often the starting letter. Basically never the last letter(s).
2. m, p, s, k (h): Often at the beginning or, a bit more rarely, at the end.
3. f, d, t, r, l, g: Often the last letter. G stands out as very often the last letter, or at the beginning. The rest are seldom at the beginning.
4. u, a, o, i, y, c: Often the second or third letter.
5. e, n, j, ij: Often the second-to-last letter (for "n", this is often "ing").


## Occurrence of letter combinations
We can improve even more, and buy much fewer letters, by knowing which letters come together, so we have to buy only one of them.

In [None]:
# Occurrence counts of all two-character n-grams. The two-character labels are
# split into a (FirstLetter, SecondLetter) MultiIndex; '_' stands for the
# space that get_occurence_ngrams replaces in the labels.
twograms = get_occurence_ngrams(2)
twograms.index = pd.MultiIndex.from_arrays([twograms.index.str[0], twograms.index.str[1]], names=['FirstLetter', 'SecondLetter'])
twograms.nlargest(15).to_frame('N Occurences')

Remember: 
- "je", "ch", "er", "rij", "ng" 

In [None]:
# Share of each two-gram among all two-grams that start with the same first
# letter. Despite the "odds" in the name this is a proportion, not an odds ratio.
# NOTE(review): the division relies on pandas aligning the per-FirstLetter
# totals with the 'FirstLetter' level of the MultiIndex — confirm; a
# groupby(...).transform('sum') would make that explicit.
odds_secondletter_given_first_letter = twograms.to_frame('Occurrences').assign(Percentage = lambda df: df['Occurrences'] / df.groupby('FirstLetter')['Occurrences'].sum())
display(odds_secondletter_given_first_letter.nlargest(15, 'Percentage'))
odds_secondletter_given_first_letter.plot(kind='scatter', x='Occurrences', y='Percentage')

In [None]:
# Share of each two-gram among all two-grams that end with the same second
# letter. Despite the "odds" in the name this is a proportion, not an odds ratio.
# NOTE(review): the division relies on pandas aligning the per-SecondLetter
# totals with the 'SecondLetter' level of the MultiIndex — confirm; a
# groupby(...).transform('sum') would make that explicit.
odds_firstletter_given_second_letter = twograms.to_frame('Occurrences').assign(Percentage = lambda df: df['Occurrences'] / df.groupby('SecondLetter')['Occurrences'].sum())
display(odds_firstletter_given_second_letter.nlargest(15, 'Percentage'))
odds_firstletter_given_second_letter.plot(kind='scatter', x='Occurrences', y='Percentage')

### Longer letter combinations

Words often end in "ing" or "(a)tie". Don't forget "teit" and "schap", the endings "ter" and "der", and "meester".

In [None]:
# The 15 most frequent three-character n-grams.
threegrams = get_occurence_ngrams(3)
threegrams.nlargest(15)

In [None]:
# The 15 most frequent four-character n-grams.
# NOTE(review): the variable is still called `threegrams` although it now
# holds 4-grams.
threegrams = get_occurence_ngrams(4)
threegrams.nlargest(15)

In [None]:
# The 15 most frequent five-character n-grams.
# NOTE(review): the variable is still called `threegrams` although it now
# holds 5-grams.
threegrams = get_occurence_ngrams(5)
threegrams.nlargest(15)

In [None]:
# The 15 most frequent six-character n-grams.
# NOTE(review): the variable is still called `threegrams` although it now
# holds 6-grams.
threegrams = get_occurence_ngrams(6)
threegrams.nlargest(15)

# Special words
We look for words that are immediately recognizable because they have one letter very often, or a specific combination of two letters that is unique, so we don't have to buy any letters

## One letter occurs many times 

In [None]:
# Per-word character counts: one row per word in the 12-letter list and one
# column per single character, with spaces in the column labels replaced by '_'.
cv = CountVectorizer(analyzer='char_wb', ngram_range=(1, 1))
occurences = cv.fit_transform(twaalfletterwoorden)
letter_occs = pd.DataFrame(occurences.toarray(), columns = cv.get_feature_names_out()).rename(columns=lambda s: s.replace(' ', '_'))

In [None]:
def return_all_most_occuring_words(letter, letter_occs=letter_occs):
    """Return the counts of ``letter`` for the rows where that count is maximal.

    The result is the ``letter`` column of ``letter_occs`` restricted to the
    rows that tie for the highest count, keeping their original index.
    """
    counts = letter_occs[letter]
    is_maximal = counts == counts.max()
    return counts[is_maximal]

def print_special_words(letter):
    """Print the words containing ``letter`` most often, if there are few of them.

    Nothing is printed when more than three words tie for the highest count
    of ``letter``.
    """
    top_counts = return_all_most_occuring_words(letter)
    if len(top_counts) > 3:
        return

    # Look up the words by the row labels of the maximal counts.
    words = twaalfletterwoorden.loc[top_counts.index].tolist()
    print(f"The letter {letter!r} occurs at most {top_counts.max()} times")
    print(', '.join(words))
    print()

# Report the special words for every character column of the count table.
for letter in letter_occs.columns:
    print_special_words(letter)

## Special combinations

In [None]:
# All character columns except those whose label contains '_'.
all_letters = letter_occs.loc[:, lambda df: ~df.columns.str.contains('_')].columns
# Compute once, for all letters, which words contain each letter at least
# once, instead of recomputing the comparison for every pair in the loop.
letter_present = letter_occs[all_letters].gt(0)

# Report every unordered pair of letters that occurs together in one to three words.
for first_letter, second_letter in itertools.combinations(all_letters, 2):
    has_both = letter_present[first_letter] & letter_present[second_letter]
    words_with_combination_index = letter_present.index[has_both]
    if 0 < len(words_with_combination_index) <= 3:
        most_occuring_words = twaalfletterwoorden.loc[words_with_combination_index].tolist()
        print(f'The combination of {first_letter!r} and {second_letter!r}  is rare')
        print(', '.join(most_occuring_words))
        print()
