## Grab a list of XML files to run through the classifier

In [1]:
import glob

# Collect every XML file in the textbase directory in sorted order, then
# drop any whose full path contains the substring '_clean'.
all_xml_paths = sorted(glob.glob('/home/seth/Notes/HDW/textbase/xml/*.xml'))

clean_files = []
for xml_path in all_xml_paths:
    if '_clean' in xml_path:
        continue
    clean_files.append(xml_path)

print(len(clean_files))

200


## Are any of the XML files not well-formed?

In [2]:
from lxml import etree

# Attempt to parse each candidate file and sort its path into one of two
# lists, depending on whether lxml raised XMLSyntaxError.
valid_input_files, broken_xml_files = [], []

for xml_path in clean_files:
    try:
        # The parsed tree is not used here; only success/failure matters.
        tree = etree.parse(xml_path)
    except etree.XMLSyntaxError:
        broken_xml_files.append(xml_path)
    else:
        valid_input_files.append(xml_path)

# One summary line with both counts, then a blank line, then one line per
# file that failed to parse.
print('len(valid_input_files)', len(valid_input_files),
      'len(broken_xml_files)', len(broken_xml_files))

print()
for broken_path in broken_xml_files:
    print('BROKEN?', broken_path)

len(valid_input_files) 200 len(broken_xml_files) 0



## Load the classifier and dictionary

The classifier and dictionary come from `14_make_SGD_classifier.ipynb`.

In [3]:
import pickle, re
from gensim import corpora, matutils
from sklearn.naive_bayes import *
from sklearn.ensemble import *

# Pickled classifier file, read from the current working directory.
filename = 'SGD.saved_classifier.sav'

# SECURITY: pickle.load can execute arbitrary code, so only load a file that
# you produced yourself or otherwise trust.
# NOTE(review): the recorded run of this cell failed with
# "ModuleNotFoundError: No module named
# 'sklearn.linear_model._stochastic_gradient'".  Presumably the pickle was
# written with a different scikit-learn version than the one installed in
# this kernel -- TODO confirm, then re-save the model or pin the version.
# The `with` block closes the file handle even if unpickling fails.
with open(filename, 'rb') as classifier_file:
    classifier = pickle.load(classifier_file)

# gensim dictionary stored alongside the classifier.
dictionary = corpora.Dictionary.load('SGD.classifier_dictionary.dict')

# Set of the dictionary's values, used for fast membership tests when
# filtering tokens.
words_in_classifier = set(dictionary.values())

print('len(words_in_classifier)', len(words_in_classifier))

ModuleNotFoundError: No module named 'sklearn.linear_model._stochastic_gradient'

## Process the XML files

In [4]:
# Classify the first <snippet> of every <match> in each well-formed input
# file, record the prediction on the snippet as a `classifier_result`
# attribute, and write the annotated matches to SGD.test_results_xml/ under
# the input file's base name.  The SGD.test_results_xml/ directory must
# already exist.

# Loop-invariant patterns, compiled once.
# Anything that is not whitespace, a digit, an unaccented lowercase letter or
# one of the accented characters listed in the class is replaced by a space.
non_word_chars = re.compile(r'[^\s0123456789abcdefghijklmnopqrstuvwxyzàâäæçèéêëîïñôùûüÿœ̀]')
# Raw string: a plain '\s+' literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
whitespace_run = re.compile(r'\s+')

# Width of the bag-of-words vector passed to the classifier.
n_terms = len(dictionary)

grand_total_snippets = 0
grand_total_good_snippets = 0

for p in valid_input_files:

    tree = etree.parse(p)

    # Counts every <snippet> in the document, whereas the loop below only
    # classifies the first <snippet> found under each <match>.
    n_total_snippets = len(tree.xpath('//snippet'))
    n_good_snippets = 0

    # Root element of the output document for this input file.
    matches_and_snippets = etree.Element('matches_and_snippets')

    for match in tree.xpath('//match'):

        snippet = match.xpath('descendant::snippet')[0]

        # snippet.text is None when the element has no leading text; treat
        # that as an empty string instead of crashing on None.lower().
        raw_text = snippet.text or ''

        clean_text = non_word_chars.sub(' ', raw_text.lower())
        clean_text = whitespace_run.sub(' ', clean_text)

        # Keep only non-empty tokens that the dictionary knows about.
        tokens = [t for t in clean_text.split(' ') if t and t in words_in_classifier]

        # A one-document corpus; the dense matrix is transposed before being
        # handed to predict().
        corpus = [dictionary.doc2bow(tokens)]
        matrix = matutils.corpus2dense(corpus, n_terms).T

        result = classifier.predict(matrix)[0]

        if result == 'good':
            n_good_snippets += 1

        snippet.set('classifier_result', result)
        # NOTE: appending an lxml element that already has a parent moves it,
        # so each <match> is taken out of `tree` and placed under the new root.
        matches_and_snippets.append(match)

    base_xml_name = p.split('/')[-1]

    print(base_xml_name.ljust(45),
          'n_total_snippets', n_total_snippets,
          '     n_good_snippets', n_good_snippets)

    # An output file is written for every input, including those with no
    # snippet classified as 'good'.
    new_tree = etree.ElementTree(matches_and_snippets)
    new_tree.write('SGD.test_results_xml/' + base_xml_name, encoding='utf-8')

    grand_total_snippets += n_total_snippets
    grand_total_good_snippets += n_good_snippets

print()
print('grand_total_snippets', grand_total_snippets,
      '    grand_total_good_snippets', grand_total_good_snippets)

About_1861_bpt6k64392756.xml                  n_total_snippets 3      n_good_snippets 0
Achard_1860_bpt6k113880g.xml                  n_total_snippets 35      n_good_snippets 0
Allix_1830_bpt6k6471452h.xml                  n_total_snippets 15      n_good_snippets 0
Amigues_1871_bpt6k54697084.xml                n_total_snippets 2      n_good_snippets 1
Asti_1843_bpt6k6471672z.xml                   n_total_snippets 28      n_good_snippets 9
Auberive_1860_bpt6k6394066d.xml               n_total_snippets 174      n_good_snippets 97
Avenir_de_Paris_1871_bpt6k5459219x.xml        n_total_snippets 2      n_good_snippets 0
Balleydier_1849_bpt6k105490w.xml              n_total_snippets 136      n_good_snippets 43
Balzac_1841_bpt6k1133819.xml                  n_total_snippets 11      n_good_snippets 0
Bamboches_amoureuses_1840_bpt6k1164416j.xml   n_total_snippets 13      n_good_snippets 0
Banville_1857_bpt6k6447544b.xml               n_total_snippets 4      n_good_snippets 0
Banville_1866_bpt6k20

Langlois_1871_bpt6k5426891z.xml               n_total_snippets 5      n_good_snippets 0
Lazare(2)_1870_bpt6k6465697r.xml              n_total_snippets 158      n_good_snippets 3
Lazare_1870_bpt6k6489749t.xml                 n_total_snippets 65      n_good_snippets 0
Lazare_1872_bpt6k6439523h.xml                 n_total_snippets 105      n_good_snippets 0
LeHir_1855_bpt6k5828882k.xml                  n_total_snippets 48      n_good_snippets 0
LeVerdier_1871_bpt6k6497565w.xml              n_total_snippets 36      n_good_snippets 1
Lecour_1876_bpt6k6452320z.xml                 n_total_snippets 78      n_good_snippets 0
Lecouturier_1848_bpt6k94648f.xml              n_total_snippets 7      n_good_snippets 0
Lefeuve_1854_bpt6k6438988x.xml                n_total_snippets 52      n_good_snippets 2
Lemer_1855_bpt6k64417126.xml                  n_total_snippets 61      n_good_snippets 7
Lemoine_1844_bpt6k6474411h.xml                n_total_snippets 13      n_good_snippets 0
Lespès_1863_bpt6k2153