In [407]:
import mongoengine
from mongoengine import *
from mongoengine.queryset.visitor import Q
from dbmodels import *
from hashfunction import *
from readref import *

import numpy as np
import pandas as pd
from bson.objectid import ObjectId
import configparser
from multiprocessing import Pool
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

In [408]:
# Read the confidentials.
credentials = configparser.ConfigParser()
credentials.read('credentials.ini')

# Connect to the database.
connect(
    db = credentials.get('lb', 'db'),
    username = credentials.get('lb', 'username'),
    password = credentials.get('lb', 'password'),
    host = credentials.get('lb', 'host'), 
    port = int(credentials.get('lb', 'port')), 
)

MongoClient(host=['128.178.60.49:27017'], document_class=dict, tz_aware=False, connect=True, read_preference=Primary())

---

## Ground truth

In [409]:
from local_ground_truth.adjudication import clean_bid, process_bid

In [410]:
file_path = "./local_ground_truth/"

df_full_1 = pd.read_csv(file_path+"secondary_full_23052017_1.csv")
df_full_2 = pd.read_csv(file_path+"secondary_full_10052017_2.csv")

In [411]:
df_full_1 = df_full_1[['article_id', 'article_title', 'image_number', 'reference', 'BID_SBN']]
df_full_2 = df_full_2[['article_id', 'article_title', 'image_number', 'reference', 'BID_SBN']]
df_full_1.dropna(inplace=True)
df_full_2.dropna(inplace=True)

In [412]:
df_full_1['BID_SBN'] = df_full_1['BID_SBN'].apply(process_bid)
df_full_2['BID_SBN'] = df_full_2['BID_SBN'].apply(process_bid)

In [413]:
df_full_merge = pd.merge(df_full_1, df_full_2, on=['article_id', 'reference'], how='outer', suffixes=['_1','_2'])

In [414]:
def nan_list(a, b):
    if a!=a:
        return b
    if b!=b:
        return a
    return list(set(a+b))

def pd_nan_list(f):
    a = f['BID_SBN_1']
    b = f['BID_SBN_2']
    return nan_list(a, b)

def non_nan_article(f):
    a = f['article_title_1']
    b = f['article_title_2']
    return a if a==a else b

def non_nan_image(f):
    a = f['image_number_1']
    b = f['image_number_2']
    return a if a==a else b

In [415]:
df_full = df_full_merge[['article_id', 'reference']]
df_full['BID_SBN'] = df_full_merge.apply(pd_nan_list, axis=1)
df_full['article_title'] = df_full_merge.apply(non_nan_article, axis=1)
df_full['image_number'] = df_full_merge.apply(non_nan_image, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [416]:
df_full.head()

Unnamed: 0,article_id,reference,BID_SBN,article_title,image_number
0,LO10016101:2013_67:17,"M. Milani, Antiche pratiche di medicina popola...",[CFI0018701],Una guaritrice processata dal Tribunale del Sa...,448
1,LO10016101:2013_67:17,"S. Malavasi, Storie di Streghe a Venezia nel C...",[PUV1168228],Una guaritrice processata dal Tribunale del Sa...,449
2,LO10016101:2013_67:17,I. Cacciavillani in Storia dell'Avvocatura Ve...,[MIL0557687],Una guaritrice processata dal Tribunale del Sa...,449
3,LO10016101:2013_67:17,"G. Lorenzetti, Venezia e il suo estuario, Roma...","[SBL0254799, CUB0380318, CSA0050321, VEA0049893]",Una guaritrice processata dal Tribunale del Sa...,450
4,LO10016101:2013_67:17,"G. Boerio, Dizionario del Dialetto Veneto,Vene...","[SBL0380246, PUV0913148]",Una guaritrice processata dal Tribunale del Sa...,450


In [417]:
df_partial = pd.read_csv(file_path+"secondary_partial_10052017.csv")
df_partial = df_partial[['article_id', 'article_title', 'image_number', 'reference', 'BID_SBN']]
df_partial.dropna(inplace=True)
df_partial['BID_SBN'] = df_partial['BID_SBN'].apply(process_bid)

In [418]:
article = list(set(df_partial.article_id).intersection(set(df_full.article_id)))

53

In [516]:
from fuzzywuzzy import fuzz



In [532]:
def find_similar(r, match):
    for k in match.keys():
        if fuzz.ratio(r,k)>=98:
            return k
    return None

In [540]:
def find_match(a):
    full_ref = df_full[df_full.article_id==a][['reference','image_number','BID_SBN']]
    part_ref = df_partial[df_partial.article_id==a][['reference','image_number','BID_SBN']]
    match = dict()
    for _, row_f in full_ref.iterrows():
        ref_f = row_f.reference
        im_f = row_f.image_number
        sbn_f = row_f.BID_SBN
        for _, row_p in part_ref.iterrows():
            ref_p = row_p.reference
            sbn_p = row_p.BID_SBN
            if len(set(sbn_f)&set(sbn_p))>0:
#                 ref_f = remove_nt(ref_f)
#                 ref_p = remove_nt(ref_p)
                if ref_f!=ref_p:
                    k = find_similar(ref_f, match)
                    if k!=None:
                        if ref_p not in match[k]:
                            match[k].append(ref_p)
                    else:
                        match[ref_f] = [ref_p]
    return match

Test

In [561]:
a = article[5]
for k, v in find_match(a).items():
    print('*'*20)
    print(k)
    print('-'*10)
    for i, vv in enumerate(v):
        print(i, vv)

********************
A. Mazzacane, Lo stato e il dominio nei giuristi veneti durante il «secolo della terraferma», in Storia della Cultura Veneta, III, 1, Vicenza 1980, pp. 639 s.
----------
0 G. Arnaldi - L. Capo, I cronisti di Venezia e della Marca Trevigiana, in  Storia della Cultura Veneta, II, Vicenza 1976, pp. 287 ss.
1 Cozzi, Ambiente veneziano, ambiente veneto. Governanti e governati nel dominio di qua dal Mincio nei secoli XV-XVIII, in Storia della Cultura Veneta, IV, 2, Vicenza 1984, pp. 495 ss.
********************
V. F. Gaeta, L'idea di Venena, in Storia della Cultura Veneta, III, 3,Vicenza 1981, pp. 565 ss.
----------
0 G. Arnaldi - L. Capo, I cronisti di Venezia e della Marca Trevigiana, in  Storia della Cultura Veneta, II, Vicenza 1976, pp. 287 ss.
1 Cozzi, Ambiente veneziano, ambiente veneto. Governanti e governati nel dominio di qua dal Mincio nei secoli XV-XVIII, in Storia della Cultura Veneta, IV, 2, Vicenza 1984, pp. 495 ss.
********************
F. Gilbert, Le idee 

Match information summary

In [542]:
full = []
part = []
for a in article:
    d = find_match(a) 
    n_full = len(d.keys())
    n_part = sum([len(v) for v in d.values()])
    full.append(n_full)
    part.append(n_part)

In [543]:
len(article), sum(full), sum(part)

(53, 1806, 5249)

---

## Get all of the refs of those article

In [None]:
article_ref = dict()
for a_id in article:
    a = Article.objects(internal_id=a_id).first()
    article_ref[a_id] = {
        "title": a.title,
        "start": a.start_img_number,
        "end": a.end_img_number,
        "doc_id": a.document_id.id,
    }

In [373]:
for a in article_ref:
    doc_id = article_ref[a]['doc_id']
    start = article_ref[a]['start']
    end = article_ref[a]['end']
    refs = [r.id for r in Reference.objects(document_id=doc_id, start_img_number__gte=start, end_img_number__lte=end)]
    article_ref[a]['refs'] = read_refs_publication(refs)  



In [374]:
pd.Series(article_ref).to_pickle('local_article_ref.pkl')

In [422]:
article_ref = pd.read_pickle('local_article_ref.pkl')

## Apply local clustering algorithm

In [264]:
from nltk.tokenize import RegexpTokenizer
from unidecode import unidecode

def normalize(s_list):
    tokenizer = RegexpTokenizer(r'\w+')
    return [unidecode(i.lower()) for s in s_list for i in tokenizer.tokenize(s)]

In [229]:
def list_contains(abbre_parse, abbre):
    for a in abbre:
        if a in abbre_parse:
            return True
    return False

In [329]:
import string
table = str.maketrans({key: None for key in string.punctuation+'”“'})

In [553]:
a = article[5]
d = article_ref[a]['refs']

In [565]:
key = np.array(list(d.keys()))
for i, (k, v) in enumerate(d.items()):
    d[k]['match'] = k
    if 'abbre' in v:
        pos = v['abbre_pos'][0]
        abbre = normalize(v['abbre'])
        # Change author
        # idem / id / eadem / ead
        if list_contains(abbre, ['idem','id.','eadem', 'ead.']):
            # Find the nearest one with author
            for j in range(i-1,-1,-1):
                previous_k = key[j]
                previous_v = d[previous_k]
                if 'author' in previous_v:
                    d[k]['author'] = previous_v['author']
                    break
            print('-'*10)
            print(f'Change author: {k}_{abbre} to {previous_k}_{previous_v["author"]}')
        # if abbre contains one of following
        if list_contains(abbre, ['op','cit','ctt']):
            print('-'*10)
            print(f"{k} has abbreviation")
            # if there is only author information in the partitial
            if 'author' in v and 'title' not in v:
                author = v['author']
                for j in range(i-1, -1, -1):
                    find = 0
                    previous_k = key[j]
                    previous_v = d[previous_k]
                    previous_s = previous_v['surface']
                    # remove punctations and extra spaces
                    previous_s = previous_s.translate(table)
                    previous_s = " ".join(previous_s.split())
                    for a in author:
                        # remove punctations and extra spaces
                        a = a.translate(table)
                        a = " ".join(a.split())
                        if a in previous_s:
                            find = 1
                            d[k]['match'] = previous_v['match']
                            break
                    if find: break
                print('-'*10)
                print(f'Only Author: {k} to {previous_k}, {find}')
                        
            # if there is only title information in the partitial        
            elif 'title' in v and 'author' not in v:
                title = v['title']
                for j in range(i-1, -1, -1):
                    find = 0
                    previous_k = key[j]
                    previous_v = d[previous_k]
                    previous_s = previous_v['surface']
                    # remove punctations and extra spaces
                    previous_s = previous_s.translate(table)
                    previous_s = " ".join(previous_s.split())
                    for t in title:
                        # remove punctations and extra spaces
                        t = t.translate(table)
                        t = " ".join(t.split())
                        # only check the first 10 characters
                        if t[:10] in previous_s:
                            find = 1
                            d[k]['match'] = previous_v['match']
                            break
                    if find: break
                print('-'*10)
                print(f'Only Title: {k} to {previous_k}, {find}')
                
            # if there are both author and title information               
            elif 'author' in v and 'title' in v:
                author = v['author']
                title = v['title']
                for j in range(i-1, -1, -1):
                    find = 0 
                    previous_k = key[j]
                    previous_v = d[previous_k]
                    previous_s = previous_v['surface']
                    # remove punctations and extra spaces
                    previous_s = previous_s.translate(table)
                    previous_s = " ".join(previous_s.split())
                    # first check if the author is in
                    for a in author:
                        a = a.translate(table)
                        a = " ".join(a.split())
                        if a in previous_s:
                            # second if the beginning part of the title is in
                            for t in title:
                                t = t.translate(table)
                                t = " ".join(t.split())
                                if t[:10] in previous_s:
                                    find = 1
                                    d[k]['match'] = previous_v['match']
                                    break
                    if find: break  
                print('-'*10)
                print(f'Author&Title: {k} to {previous_k}, {find}')
            
            # with nothing
            else:
                # match it with the one right before
                previous_k = key[i-1]
                previous_v = d[previous_k]
                d[k]['match'] = previous_v['match'] 
                print('-'*10)
                print(f'Nothing: {k} to {previous_k}')
   
    elif 'title' in v:
        title = v['title']
        if '...' in title:
            for j in range(i-1, -1, -1):
                find = 0
                previous_k = key[j]
                previous_v = d[previous_k]
                previous_s = previous_v['surface']
                # remove punctations and extra spaces
                previous_s = previous_s.translate(table)
                previous_s = " ".join(previous_s.split())
                for t in title:
                    # remove punctations and extra spaces
                    t = t.translate(table)
                    t = " ".join(t.split())
                    # only check the first 10 characters
                    if t[:10] in previous_s:
                        find = 1
                        d[k]['match'] = previous_v['match']
                        break
                if find: break
            print('-'*10)
            print(f'...: {k} to {previous_k}, {find}')

----------
9004 has abbreviation
----------
Author&Title: 9004 to 6001, 0
----------
14007 has abbreviation
----------
Only Author: 14007 to 14001, 1
----------
15005 has abbreviation
----------
Nothing: 15005 to 15004
----------
16007 has abbreviation
----------
Nothing: 16007 to 16006
----------
17003 has abbreviation
----------
Nothing: 17003 to 17002
----------
20007 has abbreviation
----------
Author&Title: 20007 to 6001, 0
----------
24003 has abbreviation
----------
Only Author: 24003 to 6002, 1
----------
24007 has abbreviation
----------
Only Author: 24007 to 15003, 1
----------
25004 has abbreviation
----------
Only Author: 25004 to 24007, 1
----------
25005 has abbreviation
----------
Only Author: 25005 to 24003, 1
----------
27007 has abbreviation
----------
Only Author: 27007 to 22005, 1
----------
29004 has abbreviation
----------
Only Author: 29004 to 29002, 1
----------
29006 has abbreviation
----------
Only Author: 29006 to 25005, 1
----------
29007 has abbreviation
--

In [569]:
for i, (k, v) in enumerate(d.items()):
    if 'abbre' in v:
        pos = v['abbre_pos']
        abbre = normalize(v['abbre'])
        if 1 in pos:
            if list_contains(abbre, ['ibid', 'ibidem', 'ivi', 'supra']):
                previous_k = key[i-1]
                previous_v = d[previous_k]
                d[k]['match'] = previous_v['match']   
            print('-'*10)
            print(f'{abbre} {k} to {previous_k}')

----------
['ibid'] 14003 to 14002
----------
['ibid', 'loe', 'cit'] 15005 to 15004
----------
['lbid', 'loe', 'cit'] 16007 to 15004
----------
['ibid', 'loe', 'cit'] 17003 to 17002
----------
['ibid'] 24004 to 24003
----------
['ibid'] 24005 to 24004
----------
['ibid'] 24006 to 24005
----------
['ibid', 'loc', 'cit'] 29007 to 29006
----------
['ibid'] 30002 to 30001
----------
['ibid'] 30003 to 30002
----------
['cit'] 30007 to 30002
----------
['ibid'] 30009 to 30008
----------
['ibid'] 30010 to 30009


In [556]:
# # compare everything with everything

# df_d = pd.DataFrame(d).T
# d_left = dict()
# key_left = df_d.groupby('match').groups.keys()
# for k in key_left:
#     d_left[k] = d[k]
# for k in key_left:
    

In [557]:
df_d = pd.DataFrame(d).T
groups = df_d.groupby('match').groups
match_index = dict()
for g in groups:
    if len(groups[g]) > 1:
        match_index[g] = [gg for gg in groups[g] if gg != g]
match = dict()
for g in match_index:
    surf_g = df_d.loc[g].surface
    match[surf_g] = [df_d.loc[gg].surface for gg in match_index[g]]

In [558]:
match_index 

{6002: [24003, 24004, 24005, 24006, 25005, 29006, 29007, 30008, 30009, 30010],
 14001: [14007],
 14002: [14003],
 15003: [24007, 25004],
 15004: [15005],
 17002: [17003],
 22005: [27007],
 29002: [29004],
 30001: [30002, 30003]}

In [559]:
match

{'A. Mazzacane, Lo stato e il dominio nei giuristi veneti durante il «secolo della terraferma», in Storia della Cultura Veneta, III, 1, Vi- cenza 1980, pp. 639 s.),': ['Mazzacane, op. cit., p. 579.',
  'Ibid., p. 582.',
  'Ibid., p. 581.',
  'Ibid., p. 583.',
  'Mazzacane, op. cit., p. 583.',
  'Mazzacane, op. cit., p. 592.',
  'Ibid., loc. cit.',
  'Mazzacane, op. cit., p. 590.',
  'Ibid., p. 597.',
  'Ibid., p. 644.'],
 'B. Paradisi, Deditio, in AA.W., Studi in onore di Arrigo Solmi, I, Milano 1940, p, 288.': ['Paradisi, op. cit., pp. 676 ss.'],
 'C. Pasero, Il dominio veneto fino all’incendio della Loggia (1426-1575), in Storia di Brescia, II, Brescia 1963,': ['Pasero, op. cit., pp. 16 s.'],
 'Chittolini, I capitoli cit., pp. 676 s.': ['Ibid., loe. cit.'],
 'Così V. Arangio Ruiz, Storia del diritto romano, Napoli 1940, p. 111.': ['Ibid., p. 112;'],
 'F. Ercole, Comuni e signori nel Veneto (Scaligeri Caminesi Carraresi), in Id., Dal comune al principato. Saggi sulla storia del diritt

## Evaluate

In [560]:
a = article[5]
find_match(a)

{' A. Ventura, Scrittori politici e scritture di governo, in Storia della\nCultura Veneta, III, 3, cit., p. 522.': ['G. Arnaldi - L. Capo, I cronisti di Venezia e della Marca Trevigiana, in  Storia della Cultura Veneta, II, Vicenza 1976, pp. 287 ss.',
  'Cozzi, Ambiente veneziano, ambiente veneto. Governanti e governati nel dominio di qua dal Mincio nei secoli XV-XVIII, in Storia della Cultura Veneta, IV, 2, Vicenza 1984, pp. 495 ss.'],
 ' B. Belotti, Storia di Bergamo\ne dei bergamaschi, II, Bergamo 1959, p. 364.': ['B. Belotti, Storia di Bergamo e dei bergamaschi, II, Bergamo 1959, p. 364.'],
 ' II significato economico della guerra\ne della protezione, in Id., I mercanti di Venezia, Torino 1982, pp. 169-185': ['I mercanti di Venezia, Torino 1982, pp. 169-185'],
 ' J. Burckhardt (La civiltà del Rinascimento in Italia, Roma 1974, p. 30).': ['Burckhardt, op. cit., p. 23.',
  'ibid. p. 31',
  ' J. Burckhardt (La civiltà del Rinascimento in Italia, Roma 1974, p. 30).'],
 ' La dedizione d