Summary:\
Experimentation to check "cache misses" (words missing from a vectorizer's vocabulary). In essence, rudimentary methods encode only a little over half of our unique words (roughly 60–70% in the runs recorded below); however, many of the misses are due to OCR errors.

In [None]:
import numpy as np
import pandas as pd
from DataLoaders import *
from Vectorizers import *
import matplotlib.pyplot as plt

In [None]:
# Seed NumPy's global RNG so the np.random sampling done later in the notebook is repeatable.
np.random.seed(17)

# Load and preprocess the corpus from the test XML directory. Later cells read
# data['good'] and data['bad'] and iterate over the words of each excerpt.
dataloader: AbstractDataLoader = OriginalDataLoader(data_path='../william_data/test_xml/')
data: ProcessedData = dataloader.load_and_preprocess_data()

In [None]:
# Build the vectorizer from the 'fr_w2v_web_w5' binary and keep a handle on its
# underlying model; later cells test vocabulary membership via model.key_to_index.
# NOTE(review): presumably a pretrained French word2vec model — confirm in Vectorizers.
vectorizer: AbstractVectorizer = PreW2V(path_to_bin='fr_w2v_web_w5')
model: KeyedVectors = vectorizer.model

In [None]:
# Split the corpus' unique words into those the model knows (in_vocab) and
# those it does not (out_vocab), i.e. whether the word is a key of
# model.key_to_index. Both results stay plain lists, ordered by first
# appearance in the corpus, so later cells can slice and sample them.
#
# A set tracks the words already classified: checking membership in the two
# growing result lists rescanned them for every token, which is quadratic in
# the vocabulary size.
seen_words = set()
in_vocab = []
out_vocab = []

for excerpt in data['good'] + data['bad']:
    for word in excerpt:
        if word in seen_words:
            continue
        seen_words.add(word)
        if word in model.key_to_index:
            in_vocab.append(word)
        else:
            out_vocab.append(word)

print(f'Number of words in vocab: {len(in_vocab)}')
print(f'Number of words not in vocab: {len(out_vocab)}')


public: \
Number of words in vocab: 30078 \
Number of words not in vocab: 20640 \
Saved as "in_vocab_public.pkl"

private: \
Number of words in vocab: 34505 \
Number of words not in vocab: 16213 \
Saved using the same file-naming pattern as the public run.

In [None]:
# Fraction of the unique corpus words that are missing from the model vocabulary.
len(out_vocab) / (len(in_vocab) + len(out_vocab))

In [None]:
# Load the previously pickled out-of-vocabulary word list back into memory.
# The import is local to this cell because no earlier cell imports pickle
# explicitly, and the `with` block closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
import pickle

with open('out_vocab_public.pkl', 'rb') as pkl_file:
    out_vocab_public = pickle.load(pkl_file)

In [None]:
# Collect the words that are in out_vocab_public but not in out_vocab,
# keeping out_vocab_public's order (and any repeats it contains).
# The set gives constant-time lookups instead of scanning the out_vocab list
# once per word.
# NOTE(review): assumes the pickled entries are hashable (plain strings) — confirm.
out_vocab_lookup = set(out_vocab)
out_vocab_public_not_in_vocab = [
    word for word in out_vocab_public if word not in out_vocab_lookup
]

print(f'Number of words in out_vocab_public but not in out_vocab: {len(out_vocab_public_not_in_vocab)}')

In [None]:
# Spot check: ask the model for the entries most similar to the single-character token "d".
model.most_similar("d")

In [None]:
# Preview the first 1000 words that are present in out_vocab_public but absent from out_vocab.
out_vocab_public_not_in_vocab[0:1000]

In [None]:
# Randomly sample 1000 words from the out_vocab list for manual inspection.
# np.random.choice draws with replacement by default, so the sample may contain repeats.
sampled_out_vocab = np.random.choice(out_vocab, 1000)

sampled_out_vocab


The sampled out-of-vocabulary words look like almost entirely OCR errors...

Moving on to checking stopwords:

In [None]:
from nltk.corpus import stopwords

# NLTK's French stopword list extended with additional hand-picked words.
# The result is a set, so words that appear in both sources are stored once.
sw = set(stopwords.words('french')) | {
    'ici', 'là', 'elles', 'trop', 'tous', 'selon', 'presque', 'tant',
    'fois', 'quant', 'ainsi', 'cette', 'doit', 'tout', 'bien', 'toute',
    'si', 'autre', 'sans', 'comment', 'rien', 'peu', 'mêmes',
    'plutôt', 'ceux', 'faire', 'moins', 'être', 'faudra',
    'deux', 'a', 'paris', 'plus', 'où', 'saint',
}
sw

In [None]:
# Split the stopwords into those that are keys of model.key_to_index and
# those that are not.
# `sw` is a set, so every word is already unique and no "already seen" check
# is needed. Iterating in sorted order makes both lists come out identically
# on every run; raw set iteration order for strings can change between
# interpreter sessions.
sw_sorted = sorted(sw)
sw_in_vocab = [word for word in sw_sorted if word in model.key_to_index]
sw_out_vocab = [word for word in sw_sorted if word not in model.key_to_index]

print(f'Number of stopwords in vocab: {len(sw_in_vocab)}')
print(f'Number of stopwords not in vocab: {len(sw_out_vocab)}')

In [None]:
# Show which stopwords are missing from model.key_to_index.
sw_out_vocab

From the public run, the stopwords not in the vocabulary were: ['eûtes',
 'fusses',
 'ayantes',
 'étante',
 'ayante',
 'étantes',
 'étées',
 'fussiez',
 'eusses',
 'fussions']

From the private run, the stopwords not in the vocabulary were: ['étantes', 'étées', 'ayante', 'étante', 'ayantes']

In [None]:
import fasttext

# Load a fastText binary model from the working directory for the comparison below.
# NOTE(review): 'cc.fr.300.bin' looks like the pretrained French Common Crawl vectors — confirm.
ft = fasttext.load_model('cc.fr.300.bin')

In [None]:
# Repeat the in/out-of-vocabulary split from above, this time against the
# fastText model's word list.
from tqdm import tqdm

words = ft.words
# ft.words is a list in the fastText Python bindings, so `word in words` would
# scan the entire vocabulary for every corpus token. Build a set once instead.
ft_vocab = set(words)

# Collect straight into sets: only the unique words are reported, so there is
# no need to accumulate one list entry per token first.
in_vocab_ft = set()
out_vocab_ft = set()

for excerpt in tqdm(data['good'] + data['bad']):
    for word in excerpt:
        if word in ft_vocab:
            in_vocab_ft.add(word)
        else:
            out_vocab_ft.add(word)

print(f'Number of words in vocab: {len(in_vocab_ft)}')
print(f'Number of words not in vocab: {len(out_vocab_ft)}')

Cleaning experiments: back-of-the-envelope math on snippet size, token count, and price

In [None]:
from DataLoaders import MatchLoader
# Load the same test XML directory through MatchLoader. Later cells read
# matches['bad'] / matches['good'] and the 'snippet' entry of each match.
ml = MatchLoader(data_path='../william_data/test_xml/')
matches = ml.load_and_preprocess_data()

In [None]:
# Total number of characters across every match snippet (bad and good combined).
total = sum(len(match['snippet']) for match in matches['bad'] + matches['good'])

total

In [None]:
# Rough token estimate: two tokens for every three characters of snippet text.
# NOTE(review): the 2/3 tokens-per-character ratio is a heuristic with no source given — confirm.
tokens = (total * 2) / 3
tokens

In [None]:
# Estimated cost at a rate of .002 per 1000 tokens.
# NOTE(review): presumably an API price in USD per 1K tokens — confirm the rate and currency.
price = (tokens / 1000) * .002
price

demo of vector vocabulary misses on old vs new ocr

In [None]:
from typing import List
import re

In [None]:
# Read the two OCR outputs fully into memory: res.txt is treated as the new
# OCR result and the Balzac_1841 file as the old one.
# The encoding is set explicitly because open() otherwise falls back to a
# platform-dependent default, which can mangle accented French characters.
# NOTE(review): assumes both files are UTF-8 encoded — confirm.
with open('../william_data/ocr_output/res.txt', encoding='utf-8') as f:
    new_ocr = f.read()
with open('../william_data/ocr_output/Balzac_1841_bpt6k1133819_CL.txt', encoding='utf-8') as f:
    old_ocr = f.read()

In [None]:
def _tokenize_text(text: str) -> List[str]:
    clean_text = re.sub(r'[^\s0123456789abcdefghijklmnopqrstuvwxyzàâäæçèéêëîïñôùûüÿœ̀œ]',
                    ' ',
                    text.lower())
    clean_text = re.sub(r'\s+', ' ', clean_text)

    return clean_text.split()

# Tokenize both OCR texts with the same tokenizer so their vocabularies are comparable.
new_ocr_tokens, old_ocr_tokens = (_tokenize_text(ocr_text) for ocr_text in (new_ocr, old_ocr))

In [None]:
# Inspect the tokens produced from the new OCR output.
new_ocr_tokens

In [None]:
# Re-create the PreW2V vectorizer from the 'fr_w2v_web_w5' binary and grab its model.
# NOTE(review): this repeats the vectorizer load from earlier in the notebook; it is
# redundant if `model` is still in memory — confirm whether the reload is intentional.
from Vectorizers import *
vectorizer: AbstractVectorizer = PreW2V(path_to_bin='fr_w2v_web_w5')
model: KeyedVectors = vectorizer.model

In [None]:
# Split the unique tokens of the new OCR output into those that are keys of
# model.key_to_index (in_vocab) and those that are not (out_vocab). Both
# results are lists ordered by first appearance in new_ocr_tokens.
# Note that this overwrites the in_vocab / out_vocab lists built from the
# corpus earlier in the notebook.
#
# A set tracks the tokens already classified: checking membership in the two
# growing result lists rescanned them for every token.
seen_tokens = set()
in_vocab = []
out_vocab = []

for word in new_ocr_tokens:
    if word in seen_tokens:
        continue
    seen_tokens.add(word)
    if word in model.key_to_index:
        in_vocab.append(word)
    else:
        out_vocab.append(word)

print(f'Number of words in vocab: {len(in_vocab)}')
print(f'Number of words not in vocab: {len(out_vocab)}')


In [None]:
# List the new-OCR tokens that are missing from model.key_to_index.
out_vocab

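Results (unique words in vocab, unique words not in vocab):

new OCR: 3876 in vocab, 171 not in vocab \
old OCR: 3902 in vocab, 403 not in vocab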