In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

from collections import Counter, defaultdict
import logging
import os

from cipy import parsers

In [2]:
logger = logging.getLogger('cipy')
logger.setLevel(logging.DEBUG)

In [3]:
data_path = '/Users/burtondewilde/Desktop/datakind/ci/conservation-intl/data/raw/citation_files/dedupe_tests/'

In [4]:
# RIS FORMAT

ris_key_counts = defaultdict(lambda: defaultdict(int))
unique_ris_keys = Counter()

for fname in os.listdir(data_path):
    if fname.endswith('.ris') or fname.endswith('.txt'):
        print(fname)
        ris = parsers.RisFile(os.path.join(data_path, fname))
        unique_ris_keys.update(key
                               for record in ris.parse()
                               for key in record.keys())
        for record in ris.parse():
            for key, value in record.items():
                try:
                    ris_key_counts[key][value] += 1
                except TypeError:
                    print(key, value)
                    break

scopus0.ris
scopus1.ris

ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Perth, Aust
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Washington, DC, USA
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Nagoya, Jpn
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Pittsburgh, PA, USA
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Madison, WI, USA
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Boston, MA, USA
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Perth, Aust
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Washington, DC, USA
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Nagoya, Jpn
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Pittsburgh, PA, USA
ERROR:cipy.parsers.ris:duplicate key error: key=place_published, value=Madison, WI, USA
ERROR:cipy.parsers.ris:duplicate key erro


scopus2.ris
scopus3.ris
WoS0.txt
Wos1.txt
WoS2.txt
WoS3.txt
WoS4.txt
WoS5.txt
WoS6.txt
WoS7.txt
WoS8.txt

DEBUG:cipy.parsers.ris:unknown tag: tag=D2, line=9604 "D2 10.1007/978-3-540-73349-2"
DEBUG:cipy.parsers.ris:unknown tag: tag=D2, line=9604 "D2 10.1007/978-3-540-73349-2"





In [5]:
# BIBTEX FORMAT

bib_key_counts = defaultdict(lambda: defaultdict(int))
unique_bib_keys = Counter()
for fname in os.listdir(data_path):
    if fname.endswith('.bib'):
        print(fname)
        
        bib = parsers.BibTexFile(os.path.join(data_path, fname))
        unique_bib_keys.update(key
                               for record in bib.parse()
                               for key in record.keys())
        for record in bib.parse():
            for key, value in record.items():
                try:
                    bib_key_counts[key][value] += 1
                except TypeError:
                    print(key, value)
                    break

scopus0.bib
scopus1.bib

DEBUG:cipy.parsers.bibtex:unusual "pages" field value: i-ii+S1-S266
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: b-141-64
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: 066133-1-066133-5
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: 181-190,193-199,203-241
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: i-ii+S1-S266
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: b-141-64
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: 066133-1-066133-5
DEBUG:cipy.parsers.bibtex:unusual "pages" field value: 181-190,193-199,203-241



scopus2.bib
scopus3.bib
WoS0.bib
WoS1.bib
WoS2.bib
WoS3.bib
WoS4.bib
WoS5.bib
WoS6.bib
WoS7.bib
WoS8.bib


In [6]:
print(sorted(ris_key_counts.keys()))

['D2', 'abstract', 'access_date', 'article_number', 'author_addresses', 'author_keywords', 'authors', 'caption', 'conference_host', 'conference_location', 'custom_1', 'custom_2', 'custom_3', 'custom_7', 'document_delivery_number', 'document_type', 'doi', 'electronic_intl_issn', 'email_address', 'end_page', 'funding_agency_and_grants', 'funding_text', 'isbn', 'issn', 'issue_number', 'journal_name', 'journal_name_user_abbr_2', 'keywords', 'language', 'name_of_database', 'notes', 'num_cited_references', 'num_times_cited', 'open_researcher_contributor_id', 'page_count', 'pages', 'part_number', 'place_published', 'publication_month', 'publication_type', 'publication_year', 'publisher', 'publisher_address', 'publisher_city', 'pubmed_id', 'reference_id', 'reprint_status', 'reviewed_item', 'secondary_authors', 'secondary_title', 'section', 'short_title', 'source_abbr_29char', 'source_abbr_iso', 'source_name', 'special_issue', 'start_page', 'subject_categories', 'subject_categories_alt', 'subsi

In [7]:
print(sorted(bib_key_counts.keys()))

['ENTRYTYPE', 'abbrev_source_title', 'abstract', 'affiliation', 'art_number', 'article-number', 'author_keywords', 'authors', 'book-group-author', 'booktitle', 'chemicals_cas', 'coden', 'correspondence_address1', 'doi', 'editor', 'eissn', 'funding_details', 'isbn', 'issn', 'issue_number', 'journal_name', 'keywords', 'language', 'link', 'manufacturers', 'molecular_seqnumbers', 'notes', 'orcid-numbers', 'organization', 'page_count', 'pages', 'publication_month', 'publication_year', 'publisher', 'publisher_address', 'pubmed_id', 'reference_id', 'references', 'researcherid-numbers', 'series', 'source', 'sponsors', 'title', 'tradenames', 'type_of_work', 'unique-id', 'volume']


In [8]:
for key, value in sorted(ris_key_counts.items()):
    try:
        print('{0:.<30} {1}'.format(key, max(len(str(val)) for val in value.keys())))
    except TypeError:
        print('{0:.<30} {1}'.format(key, max(len(str(v)) for val in value.keys() for v in val)))

D2............................ 25
abstract...................... 11689
access_date................... 19
article_number................ 29
author_addresses.............. 31568
author_keywords............... 534
authors....................... 88763
caption....................... 180
conference_host............... 45
conference_location........... 62
custom_1...................... 66902
custom_2...................... 10
custom_3...................... 166
custom_7...................... 15
document_delivery_number...... 5
document_type................. 26
doi........................... 68
electronic_intl_issn.......... 9
email_address................. 113
end_page...................... 19
funding_agency_and_grants..... 1673
funding_text.................. 7031
isbn.......................... 17
issn.......................... 9
issue_number.................. 15
journal_name.................. 105
journal_name_user_abbr_2...... 105
keywords...................... 4330
language...................

In [9]:
for key, value in sorted(bib_key_counts.items()):
    try:
        print('{0:.<30} {1}'.format(key, max(len(str(val)) for val in value.keys())))
    except TypeError:
        print('{0:.<30} {1}'.format(key, max(len(str(v)) for val in value.keys() for v in val)))
    except ValueError:
        print('{0:.<30} ValueError'.format(key))

ENTRYTYPE..................... 13
abbrev_source_title........... 105
abstract...................... 11691
affiliation................... 30922
art_number.................... 15
article-number................ 25
author_keywords............... 526
authors....................... 46917
book-group-author............. 13
booktitle..................... 107
chemicals_cas................. 1551
coden......................... 5
correspondence_address1....... 345
doi........................... 82
editor........................ 68
eissn......................... 9
funding_details............... 1557
isbn.......................... 40
issn.......................... 9
issue_number.................. 15
journal_name.................. 166
keywords...................... 4286
language...................... 19
link.......................... 113
manufacturers................. 350
molecular_seqnumbers.......... 5497
notes......................... 320
orcid-numbers................. 4356
organization............

In [10]:
for key, count in unique_ris_keys.most_common():
    print('{0:.<25} {1:>9}'.format(key, count))

title....................     12010
publication_year.........     12008
issn.....................     12007
authors..................     11986
volume...................     11744
abstract.................     11504
end_page.................     11119
doi......................     10602
issue_number.............      8976
language.................      8491
journal_name.............      8391
type_of_reference........      8000
notes....................      8000
name_of_database.........      8000
secondary_title..........      7999
type_of_work.............      7965
journal_name_user_abbr_2.      7891
author_addresses.........      7796
url......................      7713
keywords.................      7441
pages....................      7304
publisher................      4025
unique_identifier........      4010
publication_type.........      4010
source_name..............      4010
start_page...............      3962
publication_month........      3719
custom_2.................   

In [11]:
for key, count in unique_bib_keys.most_common():
    print('{0:.<25} {1:>9}'.format(key, count))

reference_id.............     12010
ENTRYTYPE................     12010
title....................     12010
publication_year.........     12008
journal_name.............     12000
authors..................     11985
volume...................     11742
issn.....................     11723
abstract.................     11505
pages....................     11141
doi......................     10588
issue_number.............      9040
notes....................      8205
link.....................      8000
source...................      8000
language.................      7991
type_of_work.............      7965
abbrev_source_title......      7891
affiliation..............      7796
correspondence_address1..      7569
references...............      7501
keywords.................      5894
coden....................      5286
author_keywords..........      4654
unique-id................      4010
publication_month........      3733
publisher................      3525
pubmed_id................   

In [24]:
sorted(ris_key_counts['pages'].items(), key=lambda x: x[1], reverse=True)

[('1', 265),
 ('19', 25),
 ('28', 25),
 ('3', 24),
 ('54', 24),
 ('9', 23),
 ('21', 23),
 ('11', 22),
 ('47', 21),
 ('10', 21),
 ('30', 21),
 ('33', 21),
 ('29', 21),
 ('37', 20),
 ('143', 20),
 ('35', 20),
 ('45', 19),
 ('57', 19),
 ('199', 19),
 ('36', 18),
 ('185', 18),
 ('31', 18),
 ('61', 18),
 ('53', 18),
 ('69', 18),
 ('101', 18),
 ('71', 18),
 ('24', 18),
 ('12', 18),
 ('7', 18),
 ('81', 17),
 ('129', 17),
 ('141', 17),
 ('93', 17),
 ('51', 17),
 ('116', 17),
 ('17', 17),
 ('65', 17),
 ('26', 17),
 ('77', 17),
 ('44', 17),
 ('137', 17),
 ('64', 17),
 ('58', 17),
 ('23', 17),
 ('32', 16),
 ('63', 16),
 ('38', 16),
 ('43', 16),
 ('55', 16),
 ('59', 16),
 ('225', 16),
 ('62', 16),
 ('169', 16),
 ('79', 15),
 ('13', 15),
 ('73', 15),
 ('221', 15),
 ('223', 15),
 ('213', 15),
 ('102', 15),
 ('41', 15),
 ('15', 15),
 ('87', 15),
 ('231', 15),
 ('16', 15),
 ('117', 15),
 ('42', 14),
 ('115', 14),
 ('40', 14),
 ('103', 14),
 ('179', 14),
 ('145', 14),
 ('215', 14),
 ('109', 14),
 ('155

In [20]:
sorted(bib_key_counts['pages'].items(), key=lambda x: x[1], reverse=True)

[('1--9', 30),
 ('1--12', 21),
 ('1--10', 20),
 ('1--11', 16),
 ('1--8', 16),
 ('1--17', 15),
 ('1--13', 14),
 ('1--6', 14),
 ('1--21', 11),
 ('1--15', 11),
 ('1--7', 10),
 ('1--25', 9),
 ('1--19', 8),
 ('1--20', 8),
 ('1--14', 8),
 ('11--22', 8),
 ('37--44', 7),
 ('1069--1075', 6),
 ('1--29', 6),
 ('7--18', 6),
 ('1--16', 6),
 ('35--43', 6),
 ('1--4', 6),
 ('377--384', 6),
 ('11--20', 6),
 ('54--61', 6),
 ('9--16', 6),
 ('29--39', 6),
 ('31--38', 5),
 ('26--34', 5),
 ('3--14', 5),
 ('76--80', 5),
 ('242--255', 5),
 ('457--460', 5),
 ('1--18', 5),
 ('36--43', 5),
 ('30--39', 5),
 ('27--36', 5),
 ('92--101', 5),
 ('73--83', 5),
 ('13--22', 5),
 ('62--66', 5),
 ('51--57', 5),
 ('43--54', 5),
 ('39--48', 4),
 ('322--332', 4),
 ('3--20', 4),
 ('94--102', 4),
 ('66--78', 4),
 ('31--43', 4),
 ('89--103', 4),
 ('83--86', 4),
 ('45--53', 4),
 ('229--241', 4),
 ('52--60', 4),
 ('16--25', 4),
 ('257--266', 4),
 ('243--253', 4),
 ('47--57', 4),
 ('9--17', 4),
 ('1--27', 4),
 ('35--41', 4),
 ('79-

In [43]:
import re

for val in (' 00431354 (ISSN)', '0377-2217', '0036-8075', '9781466648531 (ISBN); 146664852X (ISBN); 9781466648524 (ISBN)'):
    print(val, '=>', re.search(r'^[\w-]+$|(?<=\b)(\w+)(?=\s\(ISSN\))', val, flags=re.IGNORECASE).group(0))

 00431354 (ISSN) => 00431354
0377-2217 => 0377-2217
0036-8075 => 0036-8075


AttributeError: 'NoneType' object has no attribute 'group'

In [31]:
from schematics.models import Model
from schematics import types

In [35]:
from datetime import datetime

datetime.now()

datetime.datetime(2016, 6, 7, 23, 17, 33, 110901)

In [37]:
import arrow

arrow.utcnow().datetime

datetime.datetime(2016, 6, 8, 3, 17, 55, 924200, tzinfo=tzutc())

In [32]:
class Citation(Model):
    record_id = types.IntType(required=True,
                              min_value=0, max_value=9223372036854775807)
    project_id = types.IntType(required=True,
                               min_value=0, max_value=2147483647)
    user_id = types.IntType(required=True,
                            min_value=0, max_value=2147483647)
    insert_ts = types.UTCDateTimeType(required=True, default=arrow.utcnow().datetime,
                                      convert_tz=True, drop_tzinfo=True)
    type_of_work = types.StringType(max_length=25)
    title = types.StringType(max_length=250)
    secondary_title = types.StringType(max_length=250)
    publication_year = types.IntType(min_value=0, max_value=32767)
    publication_month = types.IntType(min_value=0, max_value=32767)
    authors = types.ListType(types.StringType(max_length=100))
    abstract = types.StringType()
    keywords = types.ListType(types.StringType(max_length=100))
    type_of_reference = types.StringType(max_length=50)
    journal_name = types.StringType(max_length=100)
    volume = types.StringType(max_length=20)
    issue_number = types.StringType(max_length=20)
    doi = types.StringType(max_length=100)
    issn = types.StringType(max_length=20)
    publisher = types.StringType(max_length=100)
    language = types.StringType(max_length=50)
    other_fields = types.DictType(types.StringType)

---

## RIS Format

In [66]:
"""
Parse .RIS files from Scopus or Mendeley, as well as plaintext exports from
Web of Science; return as a list of dictionaries, where each citation record
is a dictionary whose keys are field names and values are field values.
"""
import io
import re

from dateutil.parser import parse as parse_date


TAG_KEY_MAPPING = {
    'A1': 'primary_authors',  # special: Lastname, Firstname, Suffix
    'A2': 'secondary_authors',  # special: Lastname, Firstname, Suffix
    'A3': 'tertiary_authors',  # special: Lastname, Firstname, Suffix
    'A4': 'subsidiary_authors',  # special: Lastname, Firstname, Suffix
    'AB': 'abstract',
    'AD': 'author_address',
    'AN': 'accession_number',
    'AU': 'authors',  # special
    'AV': 'location_in_archives',
    'BN': 'isbn',
    'BP': 'start_page',
    'BT': 'bt',
    'C1': 'custom_1',
    'C2': 'custom_2',
    'C3': 'custom_3',
    'C4': 'custom_4',
    'C5': 'custom_5',
    'C6': 'custom_6',
    'C7': 'custom_7',
    'C8': 'custom_8',
    'CA': 'caption',
    'CN': 'call_number',
    'CP': 'cp',
    'CT': 'title_of_unpublished_ref',
    'CY': 'place_published',
    'DA': 'date',  # special: YYYY, YYYY/MM, YYYY/MM/DD/, or YYYY/MM/DD/other info
    'DB': 'name_of_database',
    'DE': 'author_keywords',
    'DI': 'doi',
    'DO': 'doi',
    'DP': 'database_provider',
    'DT': 'document_type',
    'ED': 'editor',
    'EF': 'end_file',  # ignore!
    'EM': 'email_address',
    'EP': 'end_page',
    'ER': 'end_of_reference',  # special: must be empty and last tag of record
    'ET': 'edition',
    'FN': 'file_name',  # ignore!
    'ID': 'reference_id',
    'IS': 'issue_number',
    'J1': 'journal_name_user_abbr_1',
    'J2': 'journal_name_user_abbr_2',
    'JA': 'journal_name_abbr',
    'JF': 'journal_name',
    'JO': 'journal_name',
    'KW': 'keywords',  # special
    'L1': 'link_to_pdf',
    'L2': 'link_to_fulltext',
    'L3': 'related_records',
    'L4': 'figure',
    'LA': 'language',
    'LB': 'label',
    'LK': 'link_to_website',
    'M1': 'number',
    'M2': 'miscellaneous_2',
    'M3': 'type_of_work',
    'N1': 'notes',
    'N2': 'abstract',
    'NV': 'number_of_volumes',
    'OP': 'original_publication',
    'PB': 'publisher',
    'PD': 'publication_date',
    'PP': 'publishing_place',
    'PT': 'publication_type',
    'PY': 'publication_year',  # special: YYYY
    'RI': 'reviewed_item',
    'RN': 'research_notes',
    'RP': 'reprint_status',  # special: 'IN FILE', 'NOT IN FILE', or 'ON REQUEST (MM/DD/YY)'
    'SE': 'section',
    'SN': 'issn',
    'SO': 'source_name',
    'SP': 'start_page',
    'ST': 'short_title',
    'SU': 'supplement',
    'T1': 'primary_title',
    'T2': 'secondary_title',  # note: journal_title, if applicable
    'T3': 'tertiary_title',
    'TA': 'translated_author',
    'TC': 'times_cited',
    'TI': 'title',
    'TT': 'translated_title',
    'TY': 'type_of_reference',  # special: must be key in REFERENCE_TYPES and first tag of record
    'U1': 'user_defined_1',
    'U2': 'user_defined_2',
    'U3': 'user_defined_3',
    'U4': 'user_defined_4',
    'U5': 'user_defined_5',
    'UR': 'url',
    'UT': 'unique_identifier',
    'VL': 'volume',
    'VO': 'published_standard_number',
    'VR': 'version',  # ignore!
    'Y1': 'primary_date',  # special: YYYY/
    'Y2': 'access_date',
}

REFERENCE_TYPES_MAPPING = {
    'ABST': 'abstract',
    'ADVS': 'audiovisual material',
    'AGGR': 'aggregated database',
    'ANCIENT': 'ancient text',
    'ART': 'art work',
    'BILL': 'bill/resolution',
    'BLOG': 'blog',
    'BOOK': 'book',
    'CASE': 'case',
    'CHAP': 'book chapter',
    'CHART': 'chart',
    'CLSWK': 'classical cork',
    'COMP': 'computer program',
    'CONF': 'conference proceeding',
    'CPAPER': 'conference paper',
    'CTLG': 'catalog',
    'DATA': 'data file',
    'DBASE': 'online database',
    'DICT': 'dictionary',
    'EBOOK': 'electronic book',
    'ECHAP': 'electronic book chapter',
    'EDBOOK': 'edited book',
    'EJOUR': 'electronic article',
    'ELEC': 'web page',
    'ENCYC': 'encyclopedia',
    'EQUA': 'equation',
    'FIGURE': 'figure',
    'GEN': 'generic',
    'GOVDOC': 'government document',
    'GRANT': 'grant',
    'HEAR': 'hearing',
    'ICOMM': 'internet communication',
    'INPR': 'in press',
    'JFULL': 'journal (full)',
    'JOUR': 'journal',
    'LEGAL': 'legal rule or regulation',
    'MANSCPT': 'manuscript',
    'MAP': 'map',
    'MGZN': 'magazine article',
    'MPCT': 'motion picture',
    'MULTI': 'online multimedia',
    'MUSIC': 'music score',
    'NEWS': 'newspaper',
    'PAMP': 'pamphlet',
    'PAT': 'patent',
    'PCOMM': 'personal communication',
    'RPRT': 'report',
    'SER': 'serial publication',
    'SLIDE': 'slide',
    'SOUND': 'sound recording',
    'STAND': 'standard',
    'STAT': 'statute',
    'THES': 'thesis/dissertation',
    'UNBILL': 'unenacted bill/resolution',
    'UNPB': 'unpublished work',
    'VIDEO': 'video recording',
}

MULTI_TAGS = {'A1', 'A2', 'A3', 'A4', 'AD', 'AU', 'KW', 'N1'}
IGNORE_TAGS = {'FN', 'VR', 'EF'}
START_TAGS = {'TY', 'PT'}
END_TAG = 'ER'

# TAG_RE = re.compile(r'^([A-Z][A-Z0-9])(  - | )|^(E[FR])(\s?$|  - | )')
TAGv1_RE = re.compile(r'^(?P<tag>[A-Z][A-Z0-9])(  - )')
TAGv2_RE = re.compile(r'^(?P<tag>[A-Z][A-Z0-9])( )|^(?P<endtag>E[FR])(\s?$)')


VALUE_SANITIZERS = {
    'DA': lambda x: parse_date(x).strftime('%Y-%m-%d'),
    'PY': lambda x: int(x),
    'TC': lambda x: int(x),
    'TY': lambda x: REFERENCE_TYPES_MAPPING.get(x, x),
    'Y1': lambda x: parse_date('-'.join(item if item else '01' for item in x[:-1].split('/'))),
    'Y2': lambda x: min(parse_date(val) for val in x.split(' through ')),
    }


def _add_tag_line(tag, line, start_idx, record):
    """
    Args:
        tag (str)
        line (str)
        start_idx (int)
        record (dict)
    """
    key = TAG_KEY_MAPPING[tag]
    value = line[start_idx:].strip()
    # try to sanitize value, but don't sweat failure
    try:
        value = VALUE_SANITIZERS[tag](value)
    except KeyError:
        pass
    except Exception:
        print('value sanitization error: key={}, value={}'.format(key, value))
    # for multi-value tags, append to a list
    if tag in MULTI_TAGS:
        try:
            record[key].append(value)
        except KeyError:
            record[key] = [value]
    # otherwise, add key:value to record
    else:
        if key in record:
            print('duplicate key error: key={}, value={}'.format(key, value))
        record[key] = value


def parse_ris_file(path):
    with io.open(path, mode='r') as f:

        in_record = False
        tag_re = None
        prev_tag = None
        record = {}
        records = []

        for i, line in enumerate(f):

            if not line.strip():
                continue

            # automatically detect regex needed for this RIS file
            if tag_re is None:
                tag_re = (TAGv1_RE if TAGv1_RE.match(line)
                          else TAGv2_RE if TAGv2_RE.match(line)
                          else None)
                if tag_re is None:
                    raise IOError('file {} is not formatted as expected!'.format(path))

            tag_match = tag_re.match(line)
            if tag_match:

                tag = tag_match.group('tag') or tag_match.group('endtag')

                if tag in IGNORE_TAGS:
                    prev_tag = tag
                    continue

                elif tag == END_TAG:
                    if in_record is False:
                        msg = 'found end tag, but not in a record!\nline: {} {}'.format(i, line.strip())
                        raise IOError(msg)
                    records.append(record)
                    in_record = False
                    record = {}
                    prev_tag = tag
                    continue

                elif tag in START_TAGS:
                    if in_record is True:
                        msg = 'found start tag, but already in a record!\nline: {} {}'.format(i, line.strip())
                        raise IOError(msg)
                    in_record = True
                    _add_tag_line(tag, line, tag_match.end(), record)
                    prev_tag = tag
                    continue

                if in_record is False:
                    raise IOError('start/end tag mismatch!\nline: {} {}'.format(i, line.strip()))

                if tag in TAG_KEY_MAPPING:
                    _add_tag_line(tag, line, tag_match.end(), record)
                    prev_tag = tag
                    continue
                                    
                # multi-value tag line happens to start with a tag-compliant string
                if prev_tag in MULTI_TAGS:
                    _add_tag_line(prev_tag, line, 0, record)
                    continue
                
                # no idea what this is, but might as well save it
                print('unknown tag: tag={}, line={} "{}"'.format(tag, i, line.strip()))
                record[tag] = line[tag_match.end():].strip()
                
            elif prev_tag in MULTI_TAGS:
                _add_tag_line(prev_tag, line, 0, record)
                continue
                
            # single-value tag split across multiple lines, ugh
            elif line.startswith('   '):
                key = TAG_KEY_MAPPING[prev_tag]
                record[key] += ' ' + line.strip()

            else:
                print('bad line: prev_tag={}, line={} "{}"'.format(prev_tag, i, line.strip()))

    return records

In [23]:
from __future__ import absolute_import, division, print_function, unicode_literals

import io
import re

from dateutil.parser import parse as parse_date


KEY_MAP = {
    'A1': 'primary_authors',  # special: Lastname, Firstname, Suffix
    'A2': 'secondary_authors',  # special: Lastname, Firstname, Suffix
    'A3': 'tertiary_authors',  # special: Lastname, Firstname, Suffix
    'A4': 'subsidiary_authors',  # special: Lastname, Firstname, Suffix
    'AB': 'abstract',
    'AD': 'author_addresses',
    'AN': 'accession_number',
    'AU': 'authors',  # special
    'AV': 'location_in_archives',
    'BN': 'isbn',
    'BP': 'start_page',
    'BT': 'bt',
    'C1': 'custom_1',
    'C2': 'custom_2',
    'C3': 'custom_3',
    'C4': 'custom_4',
    'C5': 'custom_5',
    'C6': 'custom_6',
    'C7': 'custom_7',
    'C8': 'custom_8',
    'CA': 'caption',
    'CN': 'call_number',
    'CP': 'cp',
    'CT': 'title_of_unpublished_ref',
    'CY': 'place_published',
    'DA': 'date',  # special: YYYY, YYYY/MM, YYYY/MM/DD/, or YYYY/MM/DD/other info
    'DB': 'name_of_database',
    'DE': 'author_keywords',
    'DI': 'doi',
    'DO': 'doi',
    'DP': 'database_provider',
    'DT': 'document_type',
    'ED': 'editor',
    'EF': 'end_file',  # ignore!
    'EM': 'email_address',
    'EP': 'end_page',
    'ER': 'end_of_reference',  # special: must be empty and last tag of record
    'ET': 'edition',
    'FN': 'file_name',  # ignore!
    'ID': 'reference_id',
    'IS': 'issue_number',
    'J1': 'journal_name_user_abbr_1',
    'J2': 'journal_name_user_abbr_2',
    'JA': 'journal_name_abbr',
    'JF': 'journal_name',
    'JO': 'journal_name',
    'KW': 'keywords',  # special
    'L1': 'link_to_pdf',
    'L2': 'link_to_fulltext',
    'L3': 'related_records',
    'L4': 'figure',
    'LA': 'language',
    'LB': 'label',
    'LK': 'link_to_website',
    'M1': 'number',
    'M2': 'miscellaneous_2',
    'M3': 'type_of_work',
    'N1': 'notes',
    'N2': 'abstract',
    'NV': 'number_of_volumes',
    'OP': 'original_publication',
    'PB': 'publisher',
    'PD': 'publication_date',
    'PP': 'publishing_place',
    'PT': 'publication_type',
    'PY': 'publication_year',  # special: YYYY
    'RI': 'reviewed_item',
    'RN': 'research_notes',
    'RP': 'reprint_status',  # special: 'IN FILE', 'NOT IN FILE', or 'ON REQUEST (MM/DD/YY)'
    'SE': 'section',
    'SN': 'issn',
    'SO': 'source_name',
    'SP': 'start_page',
    'ST': 'short_title',
    'SU': 'supplement',
    'T1': 'primary_title',
    'T2': 'secondary_title',  # note: journal_title, if applicable
    'T3': 'tertiary_title',
    'TA': 'translated_author',
    'TC': 'times_cited',
    'TI': 'title',
    'TT': 'translated_title',
    'TY': 'type_of_reference',  # special: must be key in REFERENCE_TYPES and first tag of record
    'U1': 'user_defined_1',
    'U2': 'user_defined_2',
    'U3': 'user_defined_3',
    'U4': 'user_defined_4',
    'U5': 'user_defined_5',
    'UR': 'url',
    'UT': 'unique_identifier',
    'VL': 'volume',
    'VO': 'published_standard_number',
    'VR': 'version',  # ignore!
    'Y1': 'primary_date',  # special: YYYY/
    'Y2': 'access_date',
}

REFERENCE_TYPES_MAPPING = {
    'ABST': 'abstract',
    'ADVS': 'audiovisual material',
    'AGGR': 'aggregated database',
    'ANCIENT': 'ancient text',
    'ART': 'art work',
    'BILL': 'bill/resolution',
    'BLOG': 'blog',
    'BOOK': 'book',
    'CASE': 'case',
    'CHAP': 'book chapter',
    'CHART': 'chart',
    'CLSWK': 'classical cork',
    'COMP': 'computer program',
    'CONF': 'conference proceeding',
    'CPAPER': 'conference paper',
    'CTLG': 'catalog',
    'DATA': 'data file',
    'DBASE': 'online database',
    'DICT': 'dictionary',
    'EBOOK': 'electronic book',
    'ECHAP': 'electronic book chapter',
    'EDBOOK': 'edited book',
    'EJOUR': 'electronic article',
    'ELEC': 'web page',
    'ENCYC': 'encyclopedia',
    'EQUA': 'equation',
    'FIGURE': 'figure',
    'GEN': 'generic',
    'GOVDOC': 'government document',
    'GRANT': 'grant',
    'HEAR': 'hearing',
    'ICOMM': 'internet communication',
    'INPR': 'in press',
    'JFULL': 'journal (full)',
    'JOUR': 'journal',
    'LEGAL': 'legal rule or regulation',
    'MANSCPT': 'manuscript',
    'MAP': 'map',
    'MGZN': 'magazine article',
    'MPCT': 'motion picture',
    'MULTI': 'online multimedia',
    'MUSIC': 'music score',
    'NEWS': 'newspaper',
    'PAMP': 'pamphlet',
    'PAT': 'patent',
    'PCOMM': 'personal communication',
    'RPRT': 'report',
    'SER': 'serial publication',
    'SLIDE': 'slide',
    'SOUND': 'sound recording',
    'STAND': 'standard',
    'STAT': 'statute',
    'THES': 'thesis/dissertation',
    'UNBILL': 'unenacted bill/resolution',
    'UNPB': 'unpublished work',
    'VIDEO': 'video recording',
}

MULTI_TAGS = {'A1', 'A2', 'A3', 'A4', 'AD', 'AU', 'KW', 'N1'}
IGNORE_TAGS = {'FN', 'VR', 'EF'}
START_TAGS = {'TY', 'PT'}
END_TAG = 'ER'

TAGv1_RE = re.compile(r'^(?P<tag>[A-Z][A-Z0-9])(  - )')
TAGv2_RE = re.compile(r'^(?P<tag>[A-Z][A-Z0-9])( )|^(?P<endtag>E[FR])(\s?$)')

VALUE_SANITIZERS = {
    'DA': lambda x: parse_date(x).strftime('%Y-%m-%d'),
    'PY': lambda x: int(x),
    'TC': lambda x: int(x),
    'TY': lambda x: REFERENCE_TYPES_MAPPING.get(x, x),
    'Y1': lambda x: parse_date('-'.join(item if item else '01' for item in x[:-1].split('/'))),
    'Y2': lambda x: min(parse_date(val) for val in x.split(' through ')),
    }


class RisFile(object):
    """
    Args:
        path (str): RIS file to be parsed
        key_map (dict or bool): mapping of short RIS tags to to human-readable keys;
            if None (default), default mapping is used; if False, no mapping will be done
        value_sanitizers (dict or bool): mapping of short RIS tags to functions
            that sanitize their associated values; if None (default), default
            sanitizers will be used; if False, no sanitization will be performed
    """

    def __init__(self, path,
                 key_map=None,
                 value_sanitizers=None):
        self.path = path
        self.key_map = (key_map if key_map is not None
                        else KEY_MAP)
        self.value_sanitizers = (value_sanitizers if value_sanitizers is not None
                                 else VALUE_SANITIZERS)
        if self.key_map:
            self.multi_keys = {self.key_map.get(tag, tag) for tag in MULTI_TAGS}
        else:
            self.multi_keys = MULTI_TAGS
        self.in_record = False
        self.tag_re = None
        self.prev_line_len = None
        self.prev_tag = None
        self.record = {}

    def parse(self):
        """
        Yields:
            dict: next complete citation record

        Raises:
            IOError
        """
        with io.open(self.path, mode='rt') as f:
            for i, line in enumerate(f):

                # skip empty lines
                if not line.strip():
                    continue

                # automatically detect regex needed for this RIS file
                if self.tag_re is None:
                    if TAGv1_RE.match(line):
                        self.tag_re = TAGv1_RE
                    elif TAGv2_RE.match(line):
                        self.tag_re = TAGv2_RE
                    else:
                        msg ='tags in file {} not formatted as expected!'.format(self.path)
                        raise IOError(msg)

                tag_match = self.tag_re.match(line)
                # lines starts with a tag
                if tag_match:

                    tag = tag_match.group('tag') or tag_match.group('endtag')

                    if tag in IGNORE_TAGS:
                        self._stash_prev_info(tag, len(line))
                        continue

                    elif tag == END_TAG:
                        if self.in_record is False:
                            msg = 'found end tag, but not in a record!\nline: {} {}'.format(i, line.strip())
                            raise IOError(msg)

                        self._sort_multi_values()
                        yield self.record  # record is complete! spit it out here

                        self.in_record = False
                        self.record = {}
                        self._stash_prev_info(tag, len(line))
                        continue

                    elif tag in START_TAGS:
                        if self.in_record is True:
                            msg = 'found start tag, but already in a record!\nline: {} {}'.format(i, line.strip())
                            raise IOError(msg)
                        self.in_record = True
                        self._add_tag_line(tag, line, tag_match.end())
                        self._stash_prev_info(tag, len(line))
                        continue

                    if self.in_record is False:
                        msg = 'start/end tag mismatch!\nline: {} {}'.format(i, line.strip())
                        raise IOError(msg)

                    if self.key_map and tag in self.key_map:
                        self._add_tag_line(tag, line, tag_match.end())
                        self._stash_prev_info(tag, len(line))
                        continue

                    # multi-value tag line happens to start with a tag-compliant string
                    if self.prev_tag in MULTI_TAGS:
                        self._add_tag_line(self.prev_tag, line, 0)
                        continue

                    # no idea what this is, but might as well save it
                    print('unknown tag: tag={}, line={} "{}"'.format(tag, i, line.strip()))
                    self.record[tag] = line[tag_match.end():].strip()
                    self._stash_prev_info(tag, len(line))
                    continue

                # subsequent line belonging to a multi-value tag
                elif self.prev_tag in MULTI_TAGS:
                    self._add_tag_line(self.prev_tag, line, 0)
                    continue

                # single-value tag split across multiple lines, ugh
                elif line.startswith('   ') or self.prev_line_len > 70:
                    key = (self.key_map[self.prev_tag] if self.key_map
                           else self.prev_tag)
                    self.record[key] += ' ' + line.strip()

                else:
                    print('bad line: prev_tag={}, line={} "{}"'.format(
                        self.prev_tag, i, line.strip()))

    def _add_tag_line(self, tag, line, start_idx):
        """
        Args:
            tag (str)
            line (str)
            start_idx (int)
        """
        key = (self.key_map[tag] if self.key_map
               else tag)
        value = line[start_idx:].strip()
        # try to sanitize value, but don't sweat failure
        try:
            value = self.value_sanitizers[tag](value)
        except KeyError:
            pass
        except Exception:
            print('value sanitization error: key={}, value={}'.format(key, value))
        # for multi-value tags, append to a list
        if tag in MULTI_TAGS:
            try:
                self.record[key].append(value)
            except KeyError:
                self.record[key] = [value]
        # otherwise, add key:value to record
        else:
            if key in self.record:
                print('duplicate key error: key={}, value={}'.format(key, value))
            self.record[key] = value

    def _stash_prev_info(self, tag, line_len):
        """
        Args:
            tag (str)
            line_len (int)
        """
        self.prev_tag = tag
        self.prev_line_len = line_len

    def _sort_multi_values(self):
        for key in self.multi_keys:
            try:
                self.record[key] = tuple(sorted(self.record[key]))
            except KeyError:
                pass
            except Exception:
                print('multi-value sort error: key={}, value={}'.format(key, self.record[key]))


In [28]:
fname = '../data/raw/citation_formats/scopus_to_ris.ris'
# fname = '../data/raw/citation_formats/mendeley_to_ris.ris'
# fname = '../data/raw/citation_formats/wos_to_plain_text.txt'
records = list(RisFile(fname).parse())

In [25]:
records[0]

{'abstract': 'Poly (styrene-divinylbenzene) (P (St-DVB)) foams with porosity as high as 98% were prepared by the method of high internal phase emulsions (HIPEs) in one-step process. The materials exhibited superhydrophobicity and excellent oleophilicity, with the water contact angle (WCA) even exceeding 150° and oil contact angle approaching 0°. The materials fabricated with different types of Fe3O4 particles had varied hierarchical pore structures. And the adsorption capacity of the monolithic foam towards chloroform was as high as 57 g/g. Importantly, the materials soaked with oil could be regenerated effectively by means of centrifugation with oil recovery rate reaching 90%. More importantly, the monolithic PolyHIPEs (polymers obtained by the polymerization of the HIPEs) were subjected to 20 adsorption-centrifugation cycles and superior reusability was demonstrated. These features achieved with PolyHIPEs made them ideal candidates for practical oil removal applications. © 2016 Elsev

In [67]:
fname = '../data/raw/citation_formats/scopus_to_ris.ris'
fname = '../data/raw/citation_formats/mendeley_to_ris.ris'
fname = '../data/raw/citation_formats/wos_to_plain_text.txt'
records = parse_ris_file(fname)

unknown tag: tag=ZB, line=37 "ZB 0"
unknown tag: tag=Z8, line=38 "Z8 0"
unknown tag: tag=ZR, line=39 "ZR 0"
unknown tag: tag=ZS, line=40 "ZS 0"
unknown tag: tag=Z9, line=42 "Z9 0"
unknown tag: tag=EI, line=44 "EI 1873-2119"
unknown tag: tag=ZB, line=85 "ZB 0"
unknown tag: tag=Z8, line=86 "Z8 0"
unknown tag: tag=ZR, line=87 "ZR 0"
unknown tag: tag=ZS, line=88 "ZS 0"
unknown tag: tag=Z9, line=90 "Z9 0"
unknown tag: tag=EI, line=92 "EI 1879-1026"
unknown tag: tag=PM, line=94 "PM 26971215"
unknown tag: tag=ZB, line=129 "ZB 0"
unknown tag: tag=Z8, line=130 "Z8 0"
unknown tag: tag=ZR, line=131 "ZR 0"
unknown tag: tag=ZS, line=132 "ZS 0"
unknown tag: tag=Z9, line=134 "Z9 0"
unknown tag: tag=EI, line=135 "EI 1095-8630"
unknown tag: tag=PM, line=137 "PM 27019358"
unknown tag: tag=ZB, line=166 "ZB 0"
unknown tag: tag=Z8, line=167 "Z8 0"
unknown tag: tag=ZR, line=168 "ZR 0"
unknown tag: tag=ZS, line=169 "ZS 0"
unknown tag: tag=Z9, line=171 "Z9 0"
unknown tag: tag=EI, line=173 "EI 1944-3986"
unkno

---

## BibTex

In [17]:
from __future__ import absolute_import, division, print_function, unicode_literals

import io
import re

import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import convert_to_unicode, getnames

# TODO: confirm that 'references' sanitization is correct

KEY_MAP = {
    'address': 'publisher_address',
    'author': 'authors',
    'keyword': 'keywords',
    'journal': 'journal_name',
    'month': 'publication_month',
    'note': 'notes',
    'number': 'issue_number',
    'publisher': 'publisher_name',
    'year': 'publication_year',
}

VALUE_SANITIZERS = {
    'author': lambda x: tuple(sorted(getnames([a.strip() for a in x.replace('\n', ' ').split(' and ')]))),
    'keyword': lambda x: tuple(sorted(kw.strip() for kw in re.split(r',|;', x.replace('\n', '')) if kw)),
    'author_keywords': lambda x: tuple(sorted(kw.strip() for kw in re.split(r',|;', x.replace('\n', '')) if kw)),
    'month': lambda x: int(x),
    'pages': lambda x: _sanitize_pages(x),
    'references': lambda x: tuple(sorted(ref.strip() for ref in x.split('; ') if ref)),
    'type': lambda x: x.lower(),
    'year': lambda x: int(x),
}


def _sanitize_pages(value):
    # hyphen, non-breaking hyphen, en dash, em dash, hyphen-minus, minus sign
    separators = ('‐', '‑', '–', '—', '-', '−')
    for sep in separators:
        if sep in value:
            pages = [i.strip().strip(sep)
                     for i in value.split(sep)
                     if i]
            if len(pages) > 2:
                print('unusual "pages" field value: {}', value)
            else:
                value = pages[0] + '--' + pages[-1]
                break
    return value


def _sanitize_record(record):
    record = {key: value
              for key, value in record.items()
              if value}
    record = convert_to_unicode(record)
    return record


class BibTexFile(object):
    """
    Args:
        path (str): BibTex file to be parsed
        key_map (dict or bool): mapping of default BibTex tags to to human-readable keys;
            if None (default), default mapping is used; if False, no mapping will be done
        value_sanitizers (dict or bool): mapping of default BibTex tags to functions
            that sanitize their associated values; if None (default), default sanitizers
            will be used; if False, no sanitization will be performed
    """

    def __init__(self, path, key_map=None, value_sanitizers=None):
        self.path = path
        self.parser = BibTexParser()
        self.parser.ignore_nonstandard_types = False
        self.parser.homogenize_fields = False
        self.parser.customization = _sanitize_record
        self.key_map = (key_map if key_map is not None
                        else KEY_MAP)
        self.value_sanitizers = (value_sanitizers if value_sanitizers is not None
                                 else VALUE_SANITIZERS)

    def parse(self):
        """
        Yields:
            dict: next parsed citation record
        """
        with io.open(self.path, mode='rt') as f:
            parsed_data = bibtexparser.load(f, parser=self.parser)
        for record in parsed_data.entries:
            if self.value_sanitizers:
                for key, value in record.items():
                    try:
                        record[key] = self.value_sanitizers[key](value)
                    except KeyError:
                        pass
                    except Exception:
                        print('value sanitization error: key={}, value={}'.format(key, value))
            if self.key_map:
                for key, rekey in self.key_map.items():
                    try:
                        record[rekey] = record.pop(key)
                    except KeyError:
                        pass
                
            yield record


In [18]:
fname = '../data/raw/citation_formats/mendeley_to_bibtek.bib'
fname = '../data/raw/citation_formats/scopus_to_bibtek.bib'

for record in BibTexFile(fname).parse():
    pprint(record)
    break
#     if 'references' in record:
#         pprint(record['references'].split('; '))
#         break

{'ENTRYTYPE': 'article',
 'ID': 'Zhang2016117',
 'abbrev_source_title': 'Chem. Eng. J.',
 'abstract': 'Poly (styrene-divinylbenzene) (P (St-DVB)) foams with porosity '
             'as high as 98% were prepared by the method of high internal '
             'phase emulsions (HIPEs) in one-step process. The materials '
             'exhibited superhydrophobicity and excellent oleophilicity, with '
             'the water contact angle (WCA) even exceeding 150° and oil '
             'contact angle approaching 0°. The materials fabricated with '
             'different types of Fe3O4 particles had varied hierarchical pore '
             'structures. And the adsorption capacity of the monolithic foam '
             'towards chloroform was as high as 57 g/g. Importantly, the '
             'materials soaked with oil could be regenerated effectively by '
             'means of centrifugation with oil recovery rate reaching 90%. '
             'More importantly, the monolithic PolyHIPEs (poly

In [None]:
import io

import pandas as pd
import textacy

In [None]:
fname = '../data/raw/all_fields_Combined Search_Results_Final.txt'
records = []
with io.open(fname, mode='rt', encoding='utf8') as f:
    record = {}
    for i, line in enumerate(f):
        if not line.strip():
            if record:
                records.append(record)
            record = {}
        else:
            try:
                field, value = line.split(':', 1)
            except ValueError:
                print(i, line)
            record[field.strip()] = value.strip()
            
        if i > 1000:
            break
            
df = pd.DataFrame(records)

In [None]:
df = pd.read_excel('../data/raw/Combined Search_Results_Top_3.xls')
print(df.shape)
df.head(3)