In [1]:
import configparser
import pickle

import mongoengine
# The wildcard imports and the ObjectId import keep their original relative
# order: a later import shadows any same-named symbol from an earlier one.
from mongoengine import *
from dbmodels import *
from bson.objectid import ObjectId

In [2]:
# Read the credentials.
credentials = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file is missing.
if not credentials.read('credentials.ini'):
    raise FileNotFoundError("Could not read 'credentials.ini'")

# Connect to the database described by the [lb] section.
# The trailing ';' keeps the notebook from echoing the connection object.
connect(
    db=credentials.get('lb', 'db'),
    username=credentials.get('lb', 'username'),
    password=credentials.get('lb', 'password'),
    host=credentials.get('lb', 'host'),
    port=credentials.getint('lb', 'port'),
);

In [4]:
# Store the references of each article/book in 2 dictionaries keyed by ObjectId
# {article1: [ref1, ref2, ...], article2: [ref1, ref2, ...], ...}
# {book1: [ref1, ref2, ...], book2: [ref1, ref2, ...], ...}
article = dict()
book = dict()
num_p = 0  # the number of collected publications (always num_a + num_b)
num_a = 0  # the number of collected articles
num_b = 0  # the number of collected books
limit_a = 50
limit_b = 0  # don't collect books now
limit = limit_a + limit_b

# Assuming the references from the same source are stored together
# e.g. A1,A2,A3,B1,B2,C1... NOT: A1,B1,A2,A3,B2,C1...
for r in Reference.objects:
    p = r.get_containing_publication()
    if type(p) is Article:
        if p.id in article:
            # Another reference of an article we already collect.
            article[p.id].append(r.id)
        elif num_a < limit_a:
            # New article and still room for it: only now do the counters
            # move, so a skipped publication never consumes the quota.
            article[p.id] = [r.id]
            num_a += 1
            num_p += 1
        elif num_p >= limit:
            break  # every quota is full and a new publication has started
        # else: too many articles, but books may still be collected
    elif type(p) is Book:
        if p.id in book:
            # Another reference of a book we already collect.
            book[p.id].append(r.id)
        elif num_b < limit_b:
            book[p.id] = [r.id]
            num_b += 1
            num_p += 1
        elif num_p >= limit:
            break  # every quota is full and a new publication has started
        # else: too many books, but articles may still be collected

In [5]:
def read_contents(refs_id):
    '''
    Collect the tagged contents of all references of one publication.

    Every id in ``refs_id`` is fetched with ``Reference.objects(_id=...)``.
    The returned dict maps ``start_img_number * 1000 + order_in_page`` to a
    dict of lists: surfaces tagged 'title', 'author' or 'year' are stored
    under those keys; surfaces tagged 'abbreviation' are stored under
    'abbre', with the matching keys of ``contents`` under 'abbre_pos'.
    Contents with any other tag are ignored. When two references produce
    the same key, the later one replaces the earlier one.
    '''
    plain_tags = ('title', 'author', 'year')
    by_order = {}
    for ref_id in refs_id:
        reference = Reference.objects(_id=ref_id).first()
        # Sort key: page number first, then the position inside the page.
        position = reference.start_img_number * 1000 + reference.order_in_page
        fields = {}
        by_order[position] = fields
        for content_key, content in reference.contents.items():
            tag = content['tag']
            if tag in plain_tags:
                # These tags are stored under a key equal to the tag itself.
                surface = content['surface']
                fields.setdefault(tag, []).append(surface)
            elif tag == 'abbreviation':
                # Abbreviations also remember where they occurred.
                surface = content['surface']
                fields.setdefault('abbre', []).append(surface)
                fields.setdefault('abbre_pos', []).append(content_key)
    return by_order

In [6]:
# Read the contents of every collected article: {article id: read_contents(its reference ids)}
ref_of_article = {
    article_id: read_contents(ref_ids)
    for article_id, ref_ids in article.items()
}

In [7]:
# Show the references of the first three articles, sorted by their order key
first_three = list(ref_of_article.items())[:3]
for article_id, refs in first_three:
    print("*****{}*****".format(article_id))
    for order in sorted(refs):
        print("{}: {}".format(order, refs[order]))
    print("\n")

*****595f9e11fe7683316b2dd3f5*****
68001: {'author': ['ATTI DELL’ATENEO necrologio GIOVANNI MANETTI'], 'title': ['Fra i soci scomparsi, Γ Ateneo deve rimpiangere il Comm. Dr. Giovanni Manetti, cara e degnissima figura di vero galantuomo. A riposo, dopo avere a lungo coperto a Venezia Γ Ufficio di Consigliere di Prefettura, fu, durante la grande guerra, Commissario per i profughi Veneti rico- verati lungo i lidi Anconitani, profondendo in quest’ opera i tesori della sua filantropia illuminata e squisita. Fu poi Commissario per la fondazione Querini Stampalia e attivissimo Vice- presidente dell’ opera « Nave Asilo Scilla per gli orfani di pescatori » ; presiedette infine per lunghi anni, con costante e fervido amore, la « Cassa del piccolo Credito » e la «S. A. Cooperativa Bagni di Mare fra impiegati e professionisti» al Lido.']}
68002: {}


*****595f9e37fe7683316b2dd4b4*****
48001: {'title': ['un disegno a carboncino della Albertina di Vienna, attribuito a Tiziano e raffigurante due Pad

In [8]:
# Save ref_of_article to disk (pickle is imported with the other imports at the top)
with open('ref_article.pickle', 'wb') as handle:
    pickle.dump(ref_of_article, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# Load
# Only unpickle files this notebook produced itself: unpickling untrusted
# data can execute arbitrary code.
with open('ref_article.pickle', 'rb') as pickle_file:
    b = pickle.loads(pickle_file.read())

In [10]:
# Round-trip check: the in-memory dict compared with the one loaded into `b`
ref_of_article == b

True