## Grab a list of XML files to run through the classifier

In [1]:
import glob

# List every XML file in the textbase directory in sorted order, then drop any
# path that contains '_clean'.
all_xml_paths = sorted(glob.glob('/home/seth/Notes/HDW/textbase/xml/*.xml'))
files_to_classify = [path for path in all_xml_paths if '_clean' not in path]

print(len(files_to_classify))

200


## Are any of the XML files not well-formed?

In [2]:
from lxml import etree

# Split the candidate files into those lxml parses without complaint and those
# for which it raises XMLSyntaxError.
valid_input_files, broken_xml_files = [], []

for p in files_to_classify:
    try:
        tree = etree.parse(p)
    except etree.XMLSyntaxError:
        # The parser rejected the file: remember it so it can be reported below.
        broken_xml_files.append(p)
    else:
        valid_input_files.append(p)

print(
    'len(valid_input_files)', len(valid_input_files),
    'len(broken_xml_files)', len(broken_xml_files),
)

# Blank separator line, then one line per file that failed to parse.
print()
for p in broken_xml_files:
    print('BROKEN?', p)

len(valid_input_files) 200 len(broken_xml_files) 0



## Load the classifier and dictionary

Both come from `12_make_svc_classifier_SG.ipynb`.

In [3]:
import pickle, re
from gensim import corpora, matutils
from sklearn.naive_bayes import *
from sklearn.ensemble import *

# Load the pickled classifier and the gensim dictionary from the working directory.
filename = 'SVC.saved_classifier.sav'

# NOTE(review): unpickling can execute arbitrary code, so only load a file that
# comes from a trusted source. The `with` block closes the file handle promptly.
with open(filename, 'rb') as classifier_file:
    classifier = pickle.load(classifier_file)

dictionary = corpora.Dictionary.load('SVC.classifier_dictionary.dict')

# The dictionary's tokens as a set, for fast membership tests.
words_in_classifier = set(dictionary.values())

print('len(words_in_classifier)', len(words_in_classifier))
print(words_in_classifier)  #just checking

len(words_in_classifier) 50714
{'raretés', 'neve', 'vigilant', 'brocanteurs', 'complétaient', 'rougissante', 'enfa', 'palu', 'habilleuse', 'aéronaute', 'deläcombe', 'run', 'rayonné', 'osaient', 'sale', 'présider', 'ciselure', 'mobpital', 'accueillies', 'sangle', 'anxieux', 'treilles', 'entrainé', 'ésprit', 'rôder', 'religieusement', 'jaigoantà', 'approxima', 'cuapirre', 'leursavénements', 'effacent', 'jugeämes', 'cosmaceti', 'méconnaitre', 'absorbée', 'vanves', 'religeuse', 'aristocratie', 'conunise', 'maisonnêtte', 'ærvers', 'fouéde', 'boufflers', 'enfanta', 'flandre', 'local', 'ignoble', 'miasmes', 'nee', 'attaqués', 'assemblée', 'bâtissait', 'cilait', 'presqué', 'cirées', 'hostilités', 'souv', 'bliquel', 'égarer', 'narration', 'anneau', 'x1le', 'tonneau', 'possédaient', 'pollalion', '4re', 'bornerait', 'priseurs', 'abrêgé', 'princesse', 'türe', 'invendues', 'ency', 'enfants', 'tenaien', 'détfuire', 'nile', 'doré', 'fondit', 'estocard', 'slatues', 'parquée', 'muraille', 'institution'

## Process the XML files

In [4]:
# For every well-formed input file: classify the first snippet found under each
# <match>, record the verdict on that snippet as a 'classifier_result'
# attribute, and write all matches to a new XML file of the same name under
# SVC.test_results_xml/. Per-file and grand totals are printed along the way.
grand_total_snippets = 0
grand_total_good_snippets = 0

# Compiled once, outside the loops, rather than for every snippet. Both patterns
# are raw strings so that '\s' is never read as an invalid string escape.
non_word_chars = re.compile(r'[^\s0123456789abcdefghijklmnopqrstuvwxyzàâäæçèéêëîïñôùûüÿœ̀]')
whitespace_runs = re.compile(r'\s+')

for p in valid_input_files:

    tree = etree.parse(p)

    # Counted before any <match> is moved out of the parsed tree below.
    n_total_snippets = len(tree.xpath('//snippet'))
    n_good_snippets = 0

    matches_and_snippets = etree.Element('matches_and_snippets')

    for match in tree.xpath('//match'):

        snippets = match.xpath('descendant::snippet')
        if not snippets:
            # Nothing to classify: keep the match in the output, unlabelled,
            # instead of failing with an IndexError.
            matches_and_snippets.append(match)
            continue
        snippet = snippets[0]

        # lxml reports the text of an element with no leading text as None.
        raw_text = snippet.text or ''

        # Replace every character outside the accepted alphabet with a space ...
        clean_text = non_word_chars.sub(' ', raw_text.lower())
        # ... then collapse the runs of whitespace (extra-space artefacts from OCR).
        clean_text = whitespace_runs.sub(' ', clean_text)

        # Tokenize, but using only words previously determined to be meaningful.
        tokens = [t for t in clean_text.split(' ') if t and t in words_in_classifier]

        # One-document corpus -> dense matrix, transposed so the single
        # document is a row, which is what classifier.predict is given.
        corpus = [dictionary.doc2bow(tokens)]
        matrix = matutils.corpus2dense(corpus, len(dictionary)).T

        result = classifier.predict(matrix)[0]

        if result == 'good':
            n_good_snippets += 1

        # XML attribute values must be strings; str() is a no-op for string labels.
        snippet.set('classifier_result', str(result))
        # Appending moves the <match> element out of the parsed tree into the new root.
        matches_and_snippets.append(match)

    base_xml_name = p.split('/')[-1]

    print(base_xml_name.ljust(45),
          'n_total_snippets', n_total_snippets,
          '     n_good_snippets', n_good_snippets)

    # Written for every input file, whether or not any snippet was classified 'good'.
    # NOTE(review): presumably the SVC.test_results_xml/ directory must already exist.
    new_tree = etree.ElementTree(matches_and_snippets)
    new_tree.write('SVC.test_results_xml/' + base_xml_name, encoding='utf-8')

    grand_total_snippets += n_total_snippets
    grand_total_good_snippets += n_good_snippets

print()
print('grand_total_snippets', grand_total_snippets,
     '    grand_total_good_snippets', grand_total_good_snippets)

About_1861_bpt6k64392756.xml                  n_total_snippets 3      n_good_snippets 0
Abrantès_1844_bpt6k6472523f.xml              n_total_snippets 16      n_good_snippets 0
Achard_1860_bpt6k113880g.xml                  n_total_snippets 35      n_good_snippets 0
Allix_1830_bpt6k6471452h.xml                  n_total_snippets 15      n_good_snippets 0
Amigues_1871_bpt6k54697084.xml                n_total_snippets 2      n_good_snippets 0
André_1874_bpt6k6112273x.xml                 n_total_snippets 4      n_good_snippets 0
Arènes_de_Paris_1870_bpt6k1413153t.xml       n_total_snippets 26      n_good_snippets 0
Asti_1843_bpt6k6471672z.xml                   n_total_snippets 28      n_good_snippets 8
Auberive_1860_bpt6k6394066d.xml               n_total_snippets 175      n_good_snippets 57
Avenir_de_Paris_1871_bpt6k5459219x.xml        n_total_snippets 2      n_good_snippets 0
Balleydier_1849_bpt6k105490w.xml              n_total_snippets 136      n_good_snippets 33
Balzac_1841_bpt6k1133

Hugo_1831_1_bpt6k6497134z.xml                 n_total_snippets 149      n_good_snippets 66
Hugo_1831_2_bpt6k6497803t.xml                 n_total_snippets 123      n_good_snippets 2
Hugo_1858_bpt6k406195r.xml                    n_total_snippets 170      n_good_snippets 59
Hugo_1867_bpt6k6439743z.xml                   n_total_snippets 43      n_good_snippets 0
Husson_1873_bpt6k5819785h.xml                 n_total_snippets 1      n_good_snippets 0
Indicateur_officiel_1858_bpt6k5481667q.xml    n_total_snippets 61      n_good_snippets 0
Janin_1843_bpt6k102791c.xml                   n_total_snippets 163      n_good_snippets 0
Jeandel_1854_bpt6k63756274.xml                n_total_snippets 687      n_good_snippets 0
Joakim-Isa_1867_bpt6k6489017f.xml             n_total_snippets 2      n_good_snippets 0
Joanne_1857_bpt6k64659860.xml                 n_total_snippets 35      n_good_snippets 0
Kock_1842_bpt6k206782q.xml                    n_total_snippets 65      n_good_snippets 0
Kock_1844_bpt6k1

Tanski_1869_bpt6k6366019j.xml                 n_total_snippets 6      n_good_snippets 0
Tissot_1830_bpt6k6464072j.xml                 n_total_snippets 34      n_good_snippets 0
Troche_1837_bpt6k58469052.xml                 n_total_snippets 28      n_good_snippets 0
TurpindeCrissé_1835_bpt6k9765736r.xml        n_total_snippets 87      n_good_snippets 0
Ulbach_1871_bpt6k5802400x.xml                 n_total_snippets 59      n_good_snippets 0
Van_Tenac_1845_bpt6k6447540p.xml              n_total_snippets 271      n_good_snippets 6
Vidocq_1830_bpt6k1050540s.xml                 n_total_snippets 17      n_good_snippets 0
Villemessant_1860_vol1_bpt6k30443585.xml      n_total_snippets 30      n_good_snippets 0
Villemessant_1860_vol2_bpt6k3044359k.xml      n_total_snippets 23      n_good_snippets 0
Virmaître_1871_bpt6k5448441v.xml             n_total_snippets 123      n_good_snippets 0
Virmaître_1868_bpt6k6436639s.xml              n_total_snippets 64      n_good_snippets 0
Véron_1866_bpt6k6481