In [53]:
%load_ext autoreload
%autoreload 2
from collections import Counter
from itertools import combinations
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
import xgboost as xgb


import fitz
from par_seg import get_par_text
from helper import get_all_acts, has_act

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
# Open a single sample PDF and split it with the project helper; the cell's
# displayed value is the number of items returned by get_par_text.
pdf_path = 'pdf/2020_04_30.pdf'
doc = fitz.open(pdf_path)
pars = get_par_text(doc)
len(pars)

355

In [28]:
%%time
df = pd.DataFrame({
    'X': pars,
    'y': map(has_act, pars)
})
df.y.value_counts()

CPU times: user 967 ms, sys: 0 ns, total: 967 ms
Wall time: 971 ms


False    334
True      21
Name: y, dtype: int64

In [37]:
%%time
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('model', xgb.XGBClassifier(
        objective='multi:softprob',
        random_state=42,
        num_class=2
    ))
])
pipe.fit(df.X, df.y);

CPU times: user 1.37 s, sys: 1.62 ms, total: 1.37 s
Wall time: 241 ms


Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_class=2, num_parallel_tree=1,
                               objective='multi:softprob', random_state=42,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=None))])

In [39]:
%%time
metrics = [
    'f1_macro',
    'f1_micro',
    'f1_weighted',
    'accuracy',
    'balanced_accuracy'
]
cv = cross_validate(pipe, df.X, df.y, scoring=metrics)

CPU times: user 6.75 s, sys: 12.7 ms, total: 6.76 s
Wall time: 1.1 s


In [40]:
# Display the raw cross_validate result: fit/score timings plus one array of
# per-fold test scores for each requested metric.
cv

{'fit_time': array([0.21181226, 0.1694994 , 0.18326235, 0.19520426, 0.18693781]),
 'score_time': array([0.02371311, 0.02539015, 0.02356195, 0.02424622, 0.02419662]),
 'test_f1_macro': array([0.82598039, 0.68189964, 0.92486772, 0.82598039, 0.86753731]),
 'test_f1_micro': array([0.97183099, 0.85915493, 0.98591549, 0.97183099, 0.97183099]),
 'test_f1_weighted': array([0.96734328, 0.89259932, 0.98496162, 0.96734328, 0.96852008]),
 'test_accuracy': array([0.97183099, 0.85915493, 0.98591549, 0.97183099, 0.97183099]),
 'test_balanced_accuracy': array([0.75      , 0.92537313, 0.875     , 0.75      , 0.8       ])}

In [81]:
%%time
file_data = {
    'pdf/' '2020_04_03.pdf':None,
    'pdf/' '2020_04_30.pdf':None,
    'pdf/' '2018_12_03.pdf':None,
    'pdf/' '2_jan_2019.pdf':None,
    'pdf/' '22_jan_2019.pdf':None,
    'pdf/' '2001_11_01.pdf':None,
    'pdf/' '2002_11_04.pdf':None,
}
for fname in file_data:
    doc = fitz.open(fname)
    pars = get_par_text(doc)
    file_data[fname] = pd.DataFrame({
        'X': pars,
        'y': map(has_act, pars)
    })

CPU times: user 7.49 s, sys: 48 ms, total: 7.54 s
Wall time: 7.54 s


In [82]:
%%time
combs = combinations(file_data, len(file_data) - 1)
cvs = {}
all_keys = set(file_data)
for c in combs:
    df = pd.concat(map(lambda x: file_data[x], c))
#     continue
    diff = list(all_keys - set(c))
    cvs[diff[0]] = cross_validate(
        pipe, df.X, df.y, scoring=metrics
    )
        

CPU times: user 4min 58s, sys: 175 ms, total: 4min 59s
Wall time: 49.3 s


In [83]:
# Report the per-fold test scores for each dropped file, rounded to 2 decimals.
# Timing entries (fit_time / score_time) are skipped.
for held_out, scores in cvs.items():
    print("leave-one-out:", held_out)
    # The values printed below are the individual fold scores, not their mean.
    print("metrics (per fold):")
    for metric, per_fold in scores.items():
        if 'test' in metric:
            nums = [round(i, 2) for i in per_fold]
            print('\t', metric, '\t:', nums)
    print('\n')

leave-one-out: pdf/2002_11_04.pdf
metrics (mean):
	 test_f1_macro 	: [0.9, 0.94, 0.92, 0.91, 0.99]
	 test_f1_micro 	: [0.97, 0.98, 0.97, 0.97, 1.0]
	 test_f1_weighted 	: [0.97, 0.98, 0.97, 0.97, 1.0]
	 test_accuracy 	: [0.97, 0.98, 0.97, 0.97, 1.0]
	 test_balanced_accuracy 	: [0.95, 0.92, 0.96, 0.86, 0.97]


leave-one-out: pdf/2001_11_01.pdf
metrics (mean):
	 test_f1_macro 	: [0.91, 0.94, 0.89, 0.91, 0.99]
	 test_f1_micro 	: [0.96, 0.98, 0.95, 0.97, 0.99]
	 test_f1_weighted 	: [0.97, 0.98, 0.96, 0.97, 0.99]
	 test_accuracy 	: [0.96, 0.98, 0.95, 0.97, 0.99]
	 test_balanced_accuracy 	: [0.95, 0.9, 0.96, 0.86, 0.97]


leave-one-out: pdf/22_jan_2019.pdf
metrics (mean):
	 test_f1_macro 	: [0.9, 0.97, 0.85, 0.96, 0.9]
	 test_f1_micro 	: [0.97, 0.99, 0.96, 0.99, 0.98]
	 test_f1_weighted 	: [0.97, 0.99, 0.96, 0.99, 0.97]
	 test_accuracy 	: [0.97, 0.99, 0.96, 0.99, 0.98]
	 test_balanced_accuracy 	: [0.94, 0.97, 0.86, 0.94, 0.84]


leave-one-out: pdf/2_jan_2019.pdf
metrics (mean):
	 test_f1_macr