Summary:\
Experimentation to check "cache misses" (out-of-vocabulary words) for the vectorizer vocabularies. In essence, rudimentary methods encode only roughly 60–70% of our distinct words (see the counts below); however, many of the misses are due to OCR errors.

In [1]:
import pickle
import numpy as np
import pandas as pd
from DataLoaders import *
from Vectorizers import *
import matplotlib.pyplot as plt

In [2]:
# Seed NumPy's global RNG so the random sampling further down is repeatable.
np.random.seed(17)

# Load and preprocess the corpus found under the test XML directory.
# Later cells index `data` with 'good' and 'bad' and iterate each excerpt word by word.
dataloader: AbstractDataLoader = OriginalDataLoader(data_path='../william_data/test_xml/')
data: ProcessedData = dataloader.load_and_preprocess_data()

In [3]:
# Build the pretrained word2vec vectorizer from 'fr_w2v_web_w5' and keep a handle on its
# underlying model; later cells use model.key_to_index and model.most_similar.
# NOTE(review): KeyedVectors is presumably provided by the star import from Vectorizers — confirm.
vectorizer: AbstractVectorizer = PreW2V(path_to_bin='fr_w2v_web_w5')
model: KeyedVectors = vectorizer.model

KeyboardInterrupt: 

In [None]:
# Split the distinct words of the corpus into those the embedding model knows
# (present in model.key_to_index) and those it does not.
# Both lists keep first-occurrence order, exactly as the original loop produced them.
# TODO: plot in-vocab vs. out-of-vocab counts with matplotlib (planned, not done here).

# dict.fromkeys de-duplicates in constant time per word while preserving order.
# The previous `word not in in_vocab and word not in out_vocab` test scanned two
# growing lists for every token, which made the whole pass quadratic.
unique_words = dict.fromkeys(
    word
    for excerpt in data['good'] + data['bad']
    for word in excerpt
)

in_vocab = [word for word in unique_words if word in model.key_to_index]
out_vocab = [word for word in unique_words if word not in model.key_to_index]

print(f'Number of words in vocab: {len(in_vocab)}')
print(f'Number of words not in vocab: {len(out_vocab)}')


Number of words in vocab: 34505
Number of words not in vocab: 16213


public: \
Number of words in vocab: 30078 \
Number of words not in vocab: 20640 \
Saved as "in_vocab_public.pkl"

private: \
Number of words in vocab: 34505\
Number of words not in vocab: 16213\
saved as similar pattern

In [None]:
# Fraction of the distinct words checked above that are missing from the model vocabulary.
len(out_vocab) / (len(in_vocab) + len(out_vocab))

0.3196695453290745

In [None]:
# Load the previously pickled out-of-vocab word list (the "public" run, going by the file name).
# The context manager closes the file handle, which the bare open() call never did.
# NOTE: pickle.load can execute arbitrary code — only load pickle files you created yourself.
with open('out_vocab_public.pkl', 'rb') as pkl_file:
    out_vocab_public = pickle.load(pkl_file)

In [None]:
# Words that are in out_vocab_public but not in out_vocab, in out_vocab_public order.
# A set gives constant-time membership tests; checking against the list directly
# rescanned out_vocab for every word.
out_vocab_lookup = set(out_vocab)
out_vocab_public_not_in_vocab = [
    word for word in out_vocab_public if word not in out_vocab_lookup
]

print(f'Number of words in out_vocab_public but not in out_vocab: {len(out_vocab_public_not_in_vocab)}')

Number of words in out_vocab_public but not in out_vocab: 4489


In [None]:
# Look at the model's nearest neighbours for the single-letter token "d".
model.most_similar("d")

[('autod', 0.47202757000923157),
 ('dã', 0.45885589718818665),
 ('l', 0.4341951608657837),
 ('co-d', 0.4299396872520447),
 ('interd', 0.4105587899684906),
 ('non-d', 0.4102543890476227),
 ('re-d', 0.3999244272708893),
 ('auto-d', 0.3988338112831116),
 ('sous-d', 0.39867985248565674),
 ('cyberd', 0.3930314779281616)]

In [None]:
# Show the first 1000 words that are out of vocab in the public list only.
out_vocab_public_not_in_vocab[0:1000]

['brouillonnes',
 'ieur',
 'décapitalisation',
 'rapi',
 'pondit',
 'voula',
 'chœur',
 'cueil',
 'souvins',
 'superga',
 'funt',
 'cœurs',
 'rons',
 'conq',
 'cœur',
 'épargnons',
 'finé',
 'lant',
 'tée',
 'efet',
 'suspendit',
 'œil',
 'geur',
 'écriai',
 'compli',
 'fimes',
 'arra',
 'sésostris',
 'retinrent',
 'flanquent',
 'spece',
 'entassée',
 'craintives',
 'sépulcral',
 'versai',
 'exer',
 'œuvre',
 'violaient',
 'xle',
 'dentelures',
 'grimacent',
 'tecte',
 'topazes',
 'exi',
 'cienne',
 'lippe',
 'monu',
 'velles',
 'récep',
 'guste',
 'conjecturé',
 'deur',
 'décora',
 'estun',
 'quoiqu',
 'œuvres',
 'lutterait',
 'rité',
 'assem',
 'etau',
 'fléchier',
 'entou',
 'toire',
 'semblée',
 'nese',
 'exé',
 'pital',
 'mière',
 'architec',
 'assombris',
 'dépla',
 'cée',
 'enlévement',
 'desbrosses',
 'chevins',
 'vaient',
 'mémes',
 'estune',
 'dimen',
 'sions',
 'marroniers',
 'reusement',
 'relle',
 'enceindre',
 'gues',
 'hiers',
 'distribu',
 'truire',
 'maronniers',
 'cor

In [None]:
# Randomly sample up to 1000 distinct words from out_vocab for manual inspection.
# np.random.choice samples WITH replacement by default, which can show the same word
# several times; replace=False prevents that. The size is capped at len(out_vocab)
# because sampling without replacement raises if more items are requested than exist.
sample_size = min(1000, len(out_vocab))
sampled_out_vocab = np.random.choice(out_vocab, sample_size, replace=False)

sampled_out_vocab


array(['evéque', 'diquées', 'stivants', 'eucens', 'plapart', 'gnrex',
       'ébritméghe', 'eroyez', 'humphles', 'empestât', 'nalionaux',
       'exagéra', 'priété', 'volelter', 'daleuse', 'êtré', 'cartousel',
       'aticindre', 'puradis', 'brocheuse', 'fairedisparaître', 'tinées',
       'toutest', 'queique', 'ocrave', 'arcur', 'strasbonre', 'couserr',
       'irascihle', 'jeao', 'oupes', 'oùvert', 'pénétrât',
       'aboutissantes', 'ffacer', 'dimanché', 'uikaléidoscope', 'leû',
       'maueonseil', 'léchier', 'emmenai', 'populeusé', 'faub', 'curait',
       'enthou', 'demicercle', '2h', 'manans', 'mépri', 'fastueusement',
       'rambuneau', 'révüe', 'notye', 'coësre', 'faireune',
       'funamgigantesques', 'rfois', 'touteslesadministrations', 'férou',
       'successi', 'vanche', 'üté', 'coesre', 'morskaia', '59p',
       'pierréries', 'fhôpital', 'extrèémement', 'brétagne',
       'cherchaïent', 'solpice', 'hanaps', 'légen', 'simoné', 'pelits',
       'lithurgie', 'unifor', 'del

Looks like almost entirely OCR errors...

Moving on to checking stopwords:

In [None]:
from nltk.corpus import stopwords

# Start from NLTK's French stop-word list, then add the extra words chosen for this corpus.
# A few extras are listed twice; that is harmless because `sw` is a set.
sw = set(stopwords.words('french'))
sw.update([
    'ici', 'là', 'elles', 'trop', 'tous', 'selon', 'presque', 'tant',
    'fois', 'quant', 'ainsi', 'cette', 'doit', 'tout', 'bien', 'toute',
    'si', 'autre', 'sans', 'comment', 'rien', 'là', 'peu', 'mêmes', 'si',
    'plutôt', 'ceux', 'faire', 'moins', 'être', 'faudra',
    'deux', 'a', 'paris', 'plus', 'où', 'saint', 'cette',
])
sw

{'a',
 'ai',
 'aie',
 'aient',
 'aies',
 'ainsi',
 'ait',
 'as',
 'au',
 'aura',
 'aurai',
 'auraient',
 'aurais',
 'aurait',
 'auras',
 'aurez',
 'auriez',
 'aurions',
 'aurons',
 'auront',
 'autre',
 'aux',
 'avaient',
 'avais',
 'avait',
 'avec',
 'avez',
 'aviez',
 'avions',
 'avons',
 'ayant',
 'ayante',
 'ayantes',
 'ayants',
 'ayez',
 'ayons',
 'bien',
 'c',
 'ce',
 'ces',
 'cette',
 'ceux',
 'comment',
 'd',
 'dans',
 'de',
 'des',
 'deux',
 'doit',
 'du',
 'elle',
 'elles',
 'en',
 'es',
 'est',
 'et',
 'eu',
 'eue',
 'eues',
 'eurent',
 'eus',
 'eusse',
 'eussent',
 'eusses',
 'eussiez',
 'eussions',
 'eut',
 'eux',
 'eûmes',
 'eût',
 'eûtes',
 'faire',
 'faudra',
 'fois',
 'furent',
 'fus',
 'fusse',
 'fussent',
 'fusses',
 'fussiez',
 'fussions',
 'fut',
 'fûmes',
 'fût',
 'fûtes',
 'ici',
 'il',
 'ils',
 'j',
 'je',
 'l',
 'la',
 'le',
 'les',
 'leur',
 'lui',
 'là',
 'm',
 'ma',
 'mais',
 'me',
 'mes',
 'moi',
 'moins',
 'mon',
 'même',
 'mêmes',
 'n',
 'ne',
 'nos',
 'no

In [None]:
# Split the stop words into those present in model.key_to_index and those missing from it.
# dict.fromkeys yields each word once in iteration order, replacing the
# `word not in list` scans that re-checked both result lists for every word.
unique_stopwords = dict.fromkeys(sw)

sw_in_vocab = [word for word in unique_stopwords if word in model.key_to_index]
sw_out_vocab = [word for word in unique_stopwords if word not in model.key_to_index]

print(f'Number of stopwords in vocab: {len(sw_in_vocab)}')
print(f'Number of stopwords not in vocab: {len(sw_out_vocab)}')

Number of stopwords in vocab: 187
Number of stopwords not in vocab: 5


In [None]:
# List the stop words that the model vocabulary does not contain.
sw_out_vocab

['étantes', 'étées', 'ayante', 'étante', 'ayantes']

From public: ['eûtes', 'fusses', 'ayantes', 'étante', 'ayante', 'étantes', 'étées', 'fussiez', 'eusses', 'fussions'] \
From private: ['étantes', 'étées', 'ayante', 'étante', 'ayantes']

In [None]:
import fasttext

# Load the fastText model stored in 'cc.fr.300.bin'; the next cell reads its word list via ft.words.
ft = fasttext.load_model('cc.fr.300.bin')



In [None]:
# Repeat the in-vocab / out-of-vocab split from above, this time against the fastText vocabulary.
from tqdm import tqdm

words = ft.words
# ft.words is a list, so `word in words` rescans the whole vocabulary for every token
# (the original run was interrupted at 8% with ~16 minutes remaining).
# Building a set once makes each lookup constant time.
ft_vocab = set(words)

# Collect straight into sets: only the distinct words matter for the counts below.
in_vocab_ft = set()
out_vocab_ft = set()

for excerpt in tqdm(data['good'] + data['bad']):
    for word in excerpt:
        if word in ft_vocab:
            in_vocab_ft.add(word)
        else:
            out_vocab_ft.add(word)

print(f'Number of words in vocab: {len(in_vocab_ft)}')
print(f'Number of words not in vocab: {len(out_vocab_ft)}')

  8%|▊         | 199/2618 [01:20<16:19,  2.47it/s]


KeyboardInterrupt: 

cleaning experiments / math

In [None]:
from DataLoaders import MatchLoader
# Load the same test XML directory through MatchLoader.
# Later cells read matches['bad'], matches['good'] and each match's 'snippet' entry.
ml = MatchLoader(data_path='../william_data/test_xml/')
matches = ml.load_and_preprocess_data()


# Sizes noted by the author: 328 'good' matches
# and 2290 'bad' matches


In [None]:
# Add up the length of every match's snippet, bad and good matches alike.
total = sum(len(match['snippet']) for match in matches['bad'] + matches['good'])

total

7793545

In [None]:
# Rough token estimate: two thirds of the total snippet length.
# NOTE(review): the 2/3 ratio is a heuristic that is not derived in this notebook — confirm.
tokens = (total * 2) / 3
tokens

5195696.666666667

In [None]:
# Estimated cost at a rate of 0.002 per 1000 tokens.
# NOTE(review): the provider and currency behind this rate are not stated here — confirm.
price = (tokens / 1000) * .002
price

10.391393333333333

Demo of vectorizer vocabulary misses on old vs. new OCR

In [None]:
from typing import List
import re

In [None]:
# Read both OCR outputs of the same book: the new OCR result and the older one.
# The encoding is given explicitly because open() otherwise falls back to the platform
# locale, which can garble accented French characters on non-UTF-8 systems.
# NOTE(review): assumes both files are UTF-8 encoded — confirm.
with open('../william_data/ocr_output/res.txt', encoding='utf-8') as f:
    new_ocr = f.read()
with open('../william_data/ocr_output/Balzac_1841_bpt6k1133819_CL.txt', encoding='utf-8') as f:
    old_ocr = f.read()

In [None]:
def _tokenize_text(text: str) -> List[str]:
    clean_text = re.sub(r'[^\s0123456789abcdefghijklmnopqrstuvwxyzàâäæçèéêëîïñôùûüÿœ̀œ]',
                    ' ',
                    text.lower())
    clean_text = re.sub(r'\s+', ' ', clean_text)

    return clean_text.split()

# Tokenize both OCR versions with the same function so their vocabulary coverage can be compared.
new_ocr_tokens = _tokenize_text(new_ocr)
old_ocr_tokens = _tokenize_text(old_ocr)

In [None]:
# Display the tokens produced from the new OCR text.
new_ocr_tokens

['bnf',
 'gallica',
 'physiologie',
 'du',
 'rentier',
 'de',
 'paris',
 'et',
 'de',
 'province',
 'par',
 'mm',
 'de',
 'balzac',
 'et',
 'arnould',
 'frémy',
 'dessins',
 'par',
 'gavarni',
 'henri',
 'source',
 'gallica',
 'bnf',
 'fr',
 'bibliothèque',
 'nationale',
 'de',
 'france',
 'bnf',
 'gallica',
 'balzac',
 'honoré',
 'de',
 '1799',
 '1850',
 'auteur',
 'du',
 'texte',
 'physiologie',
 'du',
 'rentier',
 'de',
 'paris',
 'et',
 'de',
 'province',
 'par',
 'mm',
 'de',
 'balzac',
 'et',
 'arnould',
 'frémy',
 'dessins',
 'par',
 'gavarni',
 'henri',
 'monnier',
 'daumier',
 'et',
 'meissonier',
 '1841',
 '1',
 'les',
 'contenus',
 'accessibles',
 'sur',
 'le',
 'site',
 'gallica',
 'sont',
 'pour',
 'la',
 'plupart',
 'des',
 'reproductions',
 'numériques',
 'd',
 'oeuvres',
 'tombées',
 'dans',
 'le',
 'domaine',
 'public',
 'provenant',
 'des',
 'collections',
 'de',
 'la',
 'bnf',
 'leur',
 'réutilisation',
 's',
 'inscrit',
 'dans',
 'le',
 'cadre',
 'de',
 'la',
 'loi'

In [None]:
# NOTE(review): this repeats the import and model load from the top of the notebook,
# presumably so this section can be run without the earlier cells — confirm.
from Vectorizers import *
vectorizer: AbstractVectorizer = PreW2V(path_to_bin='fr_w2v_web_w5')
model: KeyedVectors = vectorizer.model

In [20]:
# Split the distinct tokens of the new OCR text into those the embedding model knows
# (present in model.key_to_index) and those it does not.
# Both lists keep first-occurrence order, exactly as the original loop produced them.
# To get the figures for the old OCR text, run the same cell over old_ocr_tokens.

# dict.fromkeys de-duplicates in constant time per token while preserving order.
# The previous `word not in in_vocab and word not in out_vocab` test scanned two
# growing lists for every token.
unique_tokens = dict.fromkeys(new_ocr_tokens)

in_vocab = [word for word in unique_tokens if word in model.key_to_index]
out_vocab = [word for word in unique_tokens if word not in model.key_to_index]

print(f'Number of words in vocab: {len(in_vocab)}')
print(f'Number of words not in vocab: {len(out_vocab)}')


Number of words in vocab: 3876
Number of words not in vocab: 171


In [21]:
# List the new-OCR tokens that are missing from the model vocabulary.
out_vocab

['nrousseau',
 '318134hozë',
 'ÿ2',
 'ÿÿÿÿ',
 'ÿÿ',
 'tlqlobalizarx',
 '1808000',
 'beferezobobotes',
 'kardi',
 '222829q',
 '120000980221',
 'droon',
 'ngphysiologie',
 'dauhier',
 'honer',
 '4841to',
 'nthropomorphe',
 'actionnai',
 'linnéres',
 'marquable',
 'micographes',
 'roussâtres',
 'ombelliformes',
 'farcissent',
 'septivalve',
 'hébétants',
 'desanimaux',
 'dutrochet',
 'autresindividus',
 'bonnasserie',
 'phrénologues',
 'loppe',
 'rienologie',
 'grandslivres',
 'loupscerviers',
 'retrancherce',
 'instinet',
 'physionomic',
 'wher',
 'intrépidement',
 'reils',
 'chaur',
 'transnonain',
 'enraient',
 'superlativement',
 'cratique',
 'beefsteaks',
 'demandet',
 'roâ',
 'avouezle',
 'unsoin',
 'wape',
 'grandlivre',
 'rognait',
 'entin',
 'dameret',
 'tromperiez',
 'vi55',
 'goureusement',
 'arrièreplans',
 'barbifie',
 'hebêtement',
 'môsieu',
 'affùt',
 'plusieursannées',
 'henres',
 'gagniet',
 'safemme',
 'messede',
 'danstrimestre',
 'narcotisés',
 'vaet',
 'mitouflet',
 

New OCR: 3876 distinct words in vocab, 171 not in vocab.\
Old OCR: 3902 distinct words in vocab, 403 not in vocab.