# Chapter 7 - Training Flair Embeddings

This Jupyter notebook is a companion resource that makes it easier to follow the code examples from the book. It covers all the practical code snippets and exercises found in Chapter 7 - Training Flair Embeddings.

## Training Flair embeddings on the world’s smallest language

### Preparing the dictionary

In [None]:
from flair.data import Dictionary

# Load the dictionary that Flair exposes under the name 'chars'
# (presumably its stock character-level dictionary -- confirm in the Flair
# docs). NOTE: the following cell rebinds `dictionary` to a hand-built
# alphabet, so this value is replaced if that cell is run afterwards.
dictionary = Dictionary.load('chars')

In [None]:
from flair.data import Dictionary

# Hand-build a character dictionary limited to the symbols listed here:
# the 14 lowercase letters, their uppercase forms, and '?', '.', ' '.
lowercase_letters = 'ptksmnljwaeiou'
toki_pona_symbols = lowercase_letters + lowercase_letters.upper()
alphabet = toki_pona_symbols + '?. '

dictionary = Dictionary()
for symbol in alphabet:
    dictionary.add_item(symbol)

### Preparing the corpus

In [None]:
import requests

# Download the raw corpus text. Fail loudly on a hung connection or an HTTP
# error status so that an error page is never mistaken for corpus text and
# silently split into "sentences" below.
response = requests.get("https://git.io/J1dgd", timeout=30)
response.raise_for_status()

# Every line of the downloaded text is treated as one sentence.
sentences = response.text.splitlines()

# Size of one tenth of the corpus (integer division, rounds down).
one_tenth_corp_len = len(sentences) // 10

# First tenth -> test, second tenth -> validation, remainder -> training.
test, valid, train = (
    sentences[:one_tenth_corp_len],
    sentences[one_tenth_corp_len:one_tenth_corp_len * 2],
    sentences[one_tenth_corp_len * 2:],
)

In [None]:
from tempfile import TemporaryDirectory
from os.path import join
from os import mkdir

# Keep a reference to the TemporaryDirectory object: the directory is deleted
# once that object is cleaned up, and later cells still read these files.
dataset_dir_obj = TemporaryDirectory()
dataset_dir = dataset_dir_obj.name
train_dir = join(dataset_dir, 'train')
mkdir(train_dir)

# Resulting layout:
#   <dataset_dir>/test.txt
#   <dataset_dir>/valid.txt
#   <dataset_dir>/train/train_split_1
split_files = (
    (join(dataset_dir, "test.txt"), test),
    (join(dataset_dir, "valid.txt"), valid),
    (join(train_dir, "train_split_1"), train),
)

# Each split is written as a single space-joined string. The encoding is
# pinned to UTF-8 so the files do not depend on the platform's default
# encoding (the text comes from a download and may contain non-ASCII
# characters).
for split_path, split_sentences in split_files:
    with open(split_path, "w", encoding="utf-8") as file:
        file.write(' '.join(split_sentences))

In [None]:
from flair.trainers.language_model_trainer import TextCorpus

# Build the language-model corpus from the files written under dataset_dir,
# using `dictionary` as the symbol inventory.
# forward=True matches the forward language model created in the training
# section (is_forward_lm=True).
# character_level=True -- presumably makes the corpus operate on characters
# rather than words; confirm against the TextCorpus documentation.
corpus = TextCorpus(dataset_dir,
                    dictionary,
                    forward=True,
                    character_level=True)

### Training the language model

In [None]:
from flair.models import LanguageModel
from flair.trainers.language_model_trainer import (
    LanguageModelTrainer)

# Create a deliberately small forward language model over `dictionary`:
# a single layer (nlayers=1) with a hidden size of 64.
language_model = LanguageModel(dictionary,
                               is_forward_lm=True,
                               hidden_size=64,
                               nlayers=1)

# Train the model on `corpus`. 'forward_model_directory' is the output
# location passed to train(); later cells load
# 'forward_model_directory/best-lm.pt' from it.
# sequence_length, mini_batch_size and max_epochs are passed through as
# training hyperparameters (see the LanguageModelTrainer docs for their
# exact semantics).
trainer = LanguageModelTrainer(language_model, corpus)
trainer.train('forward_model_directory',
              sequence_length=25,
              mini_batch_size=10,
              max_epochs=100)

In [None]:
# Quick sanity check: ask the trained model for 40 generated characters.
# The result is indexed with [0] -- presumably the generated string is the
# first element of the returned value; confirm against LanguageModel docs.
t = language_model.generate_text(number_of_characters=40)[0]
print(t)

### Using custom embeddings on downstream tasks

In [None]:
from flair.embeddings import FlairEmbeddings

# Load the model file saved under the training output directory as Flair
# embeddings ('best-lm.pt' is presumably the best checkpoint written by
# trainer.train(); confirm against the trainer documentation).
fw = FlairEmbeddings('forward_model_directory/best-lm.pt')

### Performing intrinsic evaluation on custom Flair embeddings

In [None]:
from flair.embeddings import FlairEmbeddings
from flair.data import Sentence

# Two words used as a synonym pair plus one unrelated control word, each
# wrapped in its own single-word Sentence.
synonym_1, synonym_2, rand_word = (
    Sentence(word) for word in ('lukin', 'oko', 'jan'))

# Embed all three sentences with the custom forward embeddings.
fw = FlairEmbeddings('forward_model_directory/best-lm.pt')
for sentence in (synonym_1, synonym_2, rand_word):
    fw.embed(sentence)

# Take the embedding of the first token of each sentence as a plain list.
embedding_syn_1, embedding_syn_2, embedding_rnd_wrd = (
    sentence[0].embedding.tolist()
    for sentence in (synonym_1, synonym_2, rand_word))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity as sim

# Pairs to compare: synonym vs. synonym, then each synonym vs. the control
# word.
embedding_pairs = (
    (embedding_syn_1, embedding_syn_2),
    (embedding_syn_1, embedding_rnd_wrd),
    (embedding_syn_2, embedding_rnd_wrd),
)

# sim() is called with one vector on each side; [0][0] picks the single
# score out of the returned matrix.
s_synonym, s_rand_1, s_rand_2 = (
    sim([left], [right])[0][0] for left, right in embedding_pairs)