## Grab a list of XML files to run through the classifier

In [1]:
import glob

# Collect the path of every XML file in the corpus directory and put the
# paths in sorted order before handing them to the cells below.
xml_pattern = '/home/spenteco/Downloads/Paris-Project-master/xml/*.xml'
clean_files = glob.glob(xml_pattern)
clean_files.sort()

# Report how many candidate input files were found.
print(len(clean_files))

215


## Are any of the XML files not well-formed?

In [2]:
from lxml import etree

# Attempt to parse every candidate file and split the paths into two lists:
# files lxml can parse, and files that raise an XML syntax error.
valid_input_files = []
broken_xml_files = []

for p in clean_files:
    try:
        # Only well-formedness matters here, so the parsed tree is discarded
        # instead of being kept alive in a variable nobody reads.
        etree.parse(p)
    except etree.XMLSyntaxError:
        broken_xml_files.append(p)
    else:
        # Reached only when parsing raised no syntax error.
        valid_input_files.append(p)

print('len(valid_input_files)', len(valid_input_files),
     'len(broken_xml_files)', len(broken_xml_files))

# List the rejected files so they can be inspected by hand.
print()
for p in broken_xml_files:
    print('BROKEN?', p)

len(valid_input_files) 201 len(broken_xml_files) 14

BROKEN? /home/spenteco/Downloads/Paris-Project-master/xml/Abrantès_1844_bpt6k6472523f_clean.xml
BROKEN? /home/spenteco/Downloads/Paris-Project-master/xml/Amigues_1871_bpt6k54697084_clean.xml
BROKEN? /home/spenteco/Downloads/Paris-Project-master/xml/Asti_1843_bpt6k6471672z_clean.xml
BROKEN? /home/spenteco/Downloads/Paris-Project-master/xml/Balleydier_1849_bpt6k105490w_clean.xml
BROKEN? /home/spenteco/Downloads/Paris-Project-master/xml/Bamboches_amoureuses_1840_bpt6k1164416j_clean.xml
BROKEN? /home/spenteco/Downloads/Paris-Project-master/xml/Banville_1866_bpt6k205836j_clean.xml
BROKEN? /home/spenteco/Downloads/Paris-Project-master/xml/Bellet_1857_bpt6k6456840p_clean.xml
BROKEN? /home/spenteco/Downloads/Paris-Project-master/xml/Bonneville_1830_bpt6k5530903c_clean.xml
BROKEN? /home/spenteco/Downloads/Paris-Project-master/xml/Edme_1871_bpt6k6549886k_clean.xml
BROKEN? /home/spenteco/Downloads/Paris-Project-master/xml/Fournier_1860_bpt6k641

## Load the classifier and dictionary

Both were saved by 04_create_final_classifier.ipynb.

In [3]:
import pickle, re
from gensim import corpora, matutils
from sklearn.naive_bayes import *
from sklearn.ensemble import *

# Load the pickled classifier and the gensim dictionary, then build the set of
# tokens the dictionary knows about.
filename = 'saved_classifier.sav'

# NOTE(security): unpickling executes code stored in the file, so only load a
# classifier file you produced yourself.
# The context manager closes the file handle as soon as loading finishes.
with open(filename, 'rb') as classifier_file:
    classifier = pickle.load(classifier_file)

dictionary = corpora.Dictionary.load('classifier_dictionary.dict')

# Every token held by the dictionary, as a set for fast membership tests.
words_in_classifier = set(dictionary.values())

print('len(words_in_classifier)', len(words_in_classifier))

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


len(words_in_classifier) 32322


## Process the XML files

In [4]:
# Classify the text of every <snippet> element in each parseable input file.
# Snippets predicted as 'good' get a classifier_result attribute, each tree is
# written to test_results_xml/ (the directory must already exist), and
# per-file and overall counts are printed.
grand_total_snippets = 0
grand_total_good_snippets = 0

# Patterns are compiled once, outside the loops. The first matches every
# character that is neither whitespace, a digit, nor one of the listed
# lower-case letters; the second matches a run of whitespace.
non_word_chars = re.compile(r'[^\s0123456789abcdefghijklmnopqrstuvwxyzàâäæçèéêëîïñôùûüÿœ̀]')
whitespace_run = re.compile(r'\s+')

for p in valid_input_files:

    tree = etree.parse(p)

    # Evaluate the XPath query once and reuse the result for both the count
    # and the loop.
    snippets = tree.xpath('//snippet')
    n_total_snippets = len(snippets)
    n_good_snippets = 0

    for snippet in snippets:

        # snippet.text is None when the element is empty or begins with a
        # child element; treat that as empty text instead of crashing.
        raw_text = snippet.text or ''

        clean_text = non_word_chars.sub(' ', raw_text.lower())
        clean_text = whitespace_run.sub(' ', clean_text)

        # Keep only non-empty tokens that the dictionary knows about.
        tokens = [t for t in clean_text.split(' ') if t and t in words_in_classifier]

        corpus = [dictionary.doc2bow(tokens)]

        # corpus2dense output is transposed before prediction; presumably this
        # yields one row per document, as the classifier expects -- confirm
        # against the training notebook.
        matrix = matutils.corpus2dense(corpus, len(dictionary))
        matrix = matrix.T

        result = classifier.predict(matrix)[0]

        # Only 'good' snippets are annotated; all others are left unchanged.
        if result == 'good':
            snippet.set('classifier_result', result)
            n_good_snippets += 1

    base_xml_name = p.split('/')[-1]

    print(base_xml_name.ljust(45),
          'n_total_snippets', n_total_snippets,
          '     n_good_snippets', n_good_snippets)

    tree.write('test_results_xml/' + base_xml_name, encoding='utf-8')

    grand_total_snippets += n_total_snippets
    grand_total_good_snippets += n_good_snippets

print()
print('grand_total_snippets', grand_total_snippets,
     '    grand_total_good_snippets', grand_total_good_snippets)

About_1861_bpt6k64392756.xml                  n_total_snippets 3      n_good_snippets 0
Abrantès_1844_bpt6k6472523f.xml              n_total_snippets 16      n_good_snippets 0
Achard_1860_bpt6k113880g.xml                  n_total_snippets 35      n_good_snippets 3
Allix_1830_bpt6k6471452h.xml                  n_total_snippets 15      n_good_snippets 3
Amigues_1871_bpt6k54697084.xml                n_total_snippets 2      n_good_snippets 1
André_1874_bpt6k6112273x.xml                 n_total_snippets 4      n_good_snippets 2
Arènes_de_Paris_1870_bpt6k1413153t.xml       n_total_snippets 26      n_good_snippets 11
Asti_1843_bpt6k6471672z.xml                   n_total_snippets 28      n_good_snippets 16
Auberive_1860_bpt6k6394066d.xml               n_total_snippets 174      n_good_snippets 159
Avenir_de_Paris_1871_bpt6k5459219x.xml        n_total_snippets 2      n_good_snippets 0
Balleydier_1849_bpt6k105490w.xml              n_total_snippets 136      n_good_snippets 67
Balzac_1841_bpt6k1