In [1]:
import pandas as pd
import fitz
import re

In [2]:
# Open the paper's PDF with PyMuPDF (`fitz`); the cells below read `doc`.
doc=fitz.open('../papers/Designing of A Based Route Recommendation Service for Multimodal Transportation System in Smart Cities.pdf')

### Finding paper title and authors

In [101]:
def get_header(page):
    """Collect the bold text spans that appear before the abstract.

    Parameters
    ----------
    page : dict
        Page dictionary with a ``'blocks'`` list, in the shape returned by
        ``page.get_text('dict')``.

    Returns
    -------
    dict
        Maps each bold span's text (longer than 5 characters, non-breaking
        spaces replaced by regular spaces) to its font size as a float, in
        reading order. Scanning stops at the first span whose text contains
        "abstract" (case-insensitive). If no such span exists, everything
        collected from the page is returned instead of ``None``.
    """
    header = {}
    for block in page['blocks']:
        # Blocks without a 'lines' entry (PyMuPDF image blocks) hold no text;
        # skip them explicitly rather than swallowing every exception.
        for line in block.get('lines', ()):
            for span in line['spans']:
                line_text = span['text'].replace('\xa0', ' ')
                if 'abstract' in line_text.lower():
                    return header
                if len(line_text) > 5 and 'bold' in span['font'].lower():
                    header[line_text] = float(span['size'])
    # No abstract marker on this page: still hand back what was found so the
    # callers receive a dict rather than None.
    return header
        
def clean_header(title, header):
    """Remove, in place, every header entry up to and including the last title line.

    Parameters
    ----------
    title : list of str
        Title lines; only the last element is used, and it must be a key of
        ``header``.
    header : dict
        Insertion-ordered mapping of span text to font size. It is mutated:
        all keys from the first one through ``title[-1]`` are deleted.
    """
    ordered_keys = list(header)
    cutoff = ordered_keys.index(title[-1]) + 1
    for key in ordered_keys[:cutoff]:
        header.pop(key)
        
def get_title(header):
    """Build the paper title from the largest-font entries of ``header``.

    The four entries with the biggest font size are considered, and those
    sharing the single largest size are kept as title lines. ``header`` is
    then trimmed in place via ``clean_header`` so that it starts right after
    the title.

    Parameters
    ----------
    header : dict
        Mapping of span text to font size; mutated by this call.

    Returns
    -------
    str
        The title lines concatenated without a separator, stripped and
        title-cased.
    """
    by_size_desc = sorted(header, key=header.get, reverse=True)
    candidates = by_size_desc[:4]
    largest_size = header[candidates[0]]
    title_lines = [line for line in candidates if header[line] == largest_size]
    # Drop the title (and anything before it) so the next lookup starts at the authors.
    clean_header(title_lines, header)
    return ''.join(title_lines).strip().title()

def get_authors(header):
    """Return the author names found at the start of ``header``.

    The font size of the first entry is taken as the authors' font size, and
    every entry with that size is treated as an author name.

    Parameters
    ----------
    header : dict
        Mapping of span text to font size, expected to begin at the author
        lines (i.e. after the title entries have been removed).

    Returns
    -------
    list of str
        Cleaned, title-cased names. Empty when ``header`` has no entries.
    """
    if not header:
        # Nothing left after the title (e.g. authors not set in bold).
        return []
    authors_font_size = next(iter(header.values()))

    def clean_author(author):
        # Keep letters of any alphabet plus whitespace, so affiliation
        # markers (digits, '*', superscripts) are dropped while accented
        # names such as "José" stay intact.
        kept = ''.join(ch for ch in author if ch.isalpha() or ch.isspace())
        return kept.strip().title()

    return [clean_author(text) for text, size in header.items()
            if size == authors_font_size]

def get_info(doc):
    """Extract the title and the author list from the first page of ``doc``.

    Parameters
    ----------
    doc
        Document whose first page (``doc[0]``) supports ``get_text('dict')``.

    Returns
    -------
    tuple
        ``(title, authors)`` as produced by ``get_title`` and ``get_authors``.
    """
    page_dict = doc[0].get_text('dict')
    bold_spans = get_header(page_dict)
    # Order matters: get_title trims the title entries out of the mapping,
    # and get_authors reads whatever comes first afterwards.
    paper_title = get_title(bold_spans)
    paper_authors = get_authors(bold_spans)
    return paper_title, paper_authors

In [102]:
# Extract the title and authors from the first page of `doc` and print both.
title, authors = get_info(doc)
print('Title:', title)
print('Authors:', authors)

Title: Designing Of A* Based Route Recommendation Service For Multimodal Transportation System In Smart Cities
Authors: ['Md Ashifuddin Mondal', 'Zeenat Rehena']


### Find references

In [97]:
def correct_name(match, text):
    """Clean up the slice of ``text`` covered by ``match``.

    Periods are removed, surrounding whitespace is stripped, non-breaking
    spaces are normalised, and the result is capitalised (first character
    upper-case, the rest lower-case).

    Parameters
    ----------
    match : re.Match
        Match whose span selects the raw reference title inside ``text``.
    text : str
        The string the span refers to.

    Returns
    -------
    str
        The cleaned reference title; may be empty.
    """
    start, end = match.span()
    cleaned = text[start:end].replace('.', '').strip()
    # A non-breaking space followed by a space collapses to one space; any
    # remaining non-breaking space is deleted outright.
    # NOTE(review): deleting a lone '\xa0' joins the words around it - confirm intended.
    for old, new in (('\xa0 ', ' '), ('\xa0', '')):
        cleaned = cleaned.replace(old, new)
    return cleaned.capitalize()

def get_references(doc):
    """Extract cited titles from the reference list of ``doc``.

    Pages are scanned in order. Extraction starts right after the first
    ``"References\\n"`` marker and continues on every later page. A title is
    the text following a four-digit year in parentheses, e.g. ``(2019)``, up
    to the next period or newline, cleaned with ``correct_name``.

    Parameters
    ----------
    doc
        Iterable of pages, each supporting ``get_text()``.

    Returns
    -------
    list of str
        Non-empty cleaned titles in document order; empty when no
        ``"References\\n"`` marker is found.
    """
    marker = "References\n"
    pattern = re.compile(r'(?<=\(\d\d\d\d\)).*?(?=\.|\n)')
    references = []
    found_references = False
    for page in doc:
        # Re-join wrapped lines and words hyphenated across line breaks.
        text = page.get_text().replace(' \n', ' ').replace('-\n', '')
        marker_pos = text.find(marker)
        if marker_pos != -1:
            # Keep only what follows the section heading.
            text = text[marker_pos + len(marker):]
            found_references = True
        if not found_references:
            continue
        for match in pattern.finditer(text):
            # Clean each match once and skip the ones that end up empty.
            ref_title = correct_name(match, text)
            if ref_title:
                references.append(ref_title)
    return references

In [98]:
# Collect the cited titles found from the "References" heading onward.
references = get_references(doc)
# Bare last expression so the notebook displays the list.
references

['An algorithmic framework for computing shortest routes in urban multimodal networks with different criteria',
 'The project of intellectual multimodal transport system',
 'Intelligent traffic congestion classification system using artificial neural network',
 'Transfer graph approach for multimodal transport problems',
 'Fast routing in very large public transportation networks using transfer patterns',
 'Multiple criteria decision analysis: an integrated approach',
 'Managing smart-city transportation planning of ‘park-and-ride’ system: case of moscow metropolitan',
 'Accelerating multi-modal route planning by access-nodes',
 'A multimodal transport network model and efficient algorithms for building advanced traveler information systems',
 'User-constrained multimodal route planning',
 'Algorithm 97: shortest path',
 'Resilience in intelligent transportation systems (its)',
 'A distributed approach for shortest path algorithm in dynamic multimodal transportation networks',
 'Unders