In [1]:
#imports
import os, re, glob, copy
from collections import defaultdict

try:
    import bibtexparser
except:
    ! conda install -c conda-forge bibtexparse
    
from bibtexparser.bibdatabase import BibDatabase
from bibtexparser.bparser import BibTexParser


# Parsing and fixing of general things

**Warning:** No content checks and no author-name rearrangements are performed! (Either use bibtexparser's crossref functionality for that, or do it manually.)

## RULES
More LaTeX special characters can be found here:
https://de.wikibooks.org/wiki/LaTeX/_Akzente_und_Sonderzeichen

In [2]:
from reffix.entry_rules import titelcasing_fields, journal_iso4, double_minus, capitalize_keys
from reffix.entry_rules import check_misc_fields, check_incollection_fields, check_inproceedings_fields, check_inbook_fields, check_article_fields, check_book_fields
from reffix.entry_rules import fixLatexChars
from bibtexparser.customization import type as bibtexType, homogenize_latex_encoding

[nltk_data] Downloading package wordnet to /home/bschroed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
from bibtexparser.bparser import BibTexParser
from collections import OrderedDict


# Feel free to add new rules :) or modify the existing ones in the reffix.entry_rules module.
def titelcasing_journal(record):
    """Title-case the ``journal`` field of a record and wrap it in braces.

    Parameters
    ----------
    record : dict-like
        A single bibliography record. It is modified in place.

    Returns
    -------
    The same ``record`` object. If it has a ``journal`` key, the value is
    replaced by ``"{" + titlecase(value) + "}"``; otherwise it is untouched.
    """
    if "journal" in record:
        # NOTE(review): `titlecase` is not imported in the visible cells --
        # confirm it is in scope before enabling this rule.
        # The field must be read under the same key that was just checked.
        record.update({"journal": "{" + titlecase(record["journal"]) + "}"})
    return record

def build_standardParser():
    """Create a ``BibTexParser`` with the reference-fixing rules installed.

    Returns
    -------
    BibTexParser
        A fresh parser whose ``customization`` attribute is set to a function
        that passes a record through every rule below, in this fixed order.
    """

    def rulez(record):
        """Apply all rules to one record, in order, and return the result."""
        pipeline = (
            bibtexType,
            # check entry fields
            check_article_fields,
            check_book_fields,
            check_incollection_fields,
            check_inproceedings_fields,
            check_inbook_fields,
            # general clean-up of fields and keys
            titelcasing_fields,
            capitalize_keys,
            homogenize_latex_encoding,
            # check field values
            fixLatexChars,
            double_minus,
            journal_iso4,
        )
        # Each rule takes the record and hands back the (possibly updated) record.
        for rule in pipeline:
            record = rule(record)
        return record

    parser = BibTexParser()
    parser.customization = rulez
    return parser


#  DO 

In [4]:
# PATHs
# All locations are relative to the directory the notebook is run from.
root_dir = ".."

## INPATHS
self_path = root_dir+"/1_frontback"
chapter1_path = root_dir+"/2_chapter_intro"
chapter2_path = root_dir+"/3_chapter_1"
chapter3_path = root_dir+"/4_chapter_2"
chapter4_path = root_dir+"/5_chapter_3"
chapter5_path = root_dir+"/6_chapter_4"
chapter6_path = root_dir+"/7_chapter_5"
chapter7_path = root_dir+"/8_chapter_outlook"

chapter_paths = [chapter1_path, chapter2_path, chapter3_path, chapter4_path, chapter5_path, chapter6_path, chapter7_path]

# One entry per chapter, keyed "chapter_0" ... "chapter_6" (enumerate starts at 0):
#   path      - chapter directory
#   bib_path  - the chapter's BibTeX file
#   tex_paths - files matching "*tex" in the chapter directory and one level below,
#               skipping every path that contains "fragment"
chapters = {
    "chapter_"+str(i): {
        "path": chapter_path,
        "bib_path": chapter_path+"/ref/ref.bib",
        "tex_paths": [
            tex_path
            for tex_path in glob.glob(chapter_path+"/*tex")+glob.glob(chapter_path+"/*/*tex")
            if "fragment" not in tex_path
        ],
    }
    for i, chapter_path in enumerate(chapter_paths)
}

# The "self" entry has no "tex_paths" key, so later cells skip the
# used-citation filtering for it.
chapters.update({
    "self": {
        "path": self_path,
        "bib_path": self_path+"/publications.bib",
    }
})

## OUTPaths
out_dir = "./fixed_ref"
# exist_ok avoids the check-then-create race and makes re-running the cell safe.
os.makedirs(out_dir, exist_ok=True)
out_all_ref = out_dir+"/mergedReferences.bib"


## Load Data & Clean Entries

In [5]:
from reffix.database_functions import remove_duplicates, get_used_citations, filter_database_for_required_citations

# For every configured chapter: parse its .bib file with the rule-applying parser,
# drop duplicates, keep only the entries cited in the chapter's tex files (when the
# chapter lists any), and store the resulting database under chapters[chapter]["bib"].
for chapter in chapters:
    print("Chapter: ", chapter, )
    chapter_data = chapters[chapter]

    # The context manager closes the file even if parsing raises.
    with open(chapter_data['bib_path'], "r", encoding="utf-8") as bib_file:
        db = bibtexparser.load(bib_file, parser=build_standardParser())

    db = remove_duplicates(db)

    # Entries without "tex_paths" (the "self" entry) keep all their references.
    if "tex_paths" in chapter_data:
        used_cit = []
        for tex_path in chapter_data['tex_paths']:
            # De-duplicated per file only; the same key may still appear again
            # when several tex files cite it.
            used_cit.extend(set(get_used_citations(tex_path)))
        chapter_data["tex_cit"] = used_cit
        db = filter_database_for_required_citations(db, used_cit)

    chapter_data["bib"] = db
    print()

        
        

Chapter:  chapter_0
Filter DB for duplicates
	Before Elements in DB:  142
	After Elements in DB:  142
Filter DB for used in TEX
	Before Elements in DB:  142
	After Elements in DB:  140

Chapter:  chapter_1
Filter DB for duplicates
	Before Elements in DB:  60
	After Elements in DB:  60
Filter DB for used in TEX
	Before Elements in DB:  60
	After Elements in DB:  60

Chapter:  chapter_2
Filter DB for duplicates
	Before Elements in DB:  89
	After Elements in DB:  89
Filter DB for used in TEX
	Before Elements in DB:  89
	After Elements in DB:  89

Chapter:  chapter_3
Filter DB for duplicates
	Before Elements in DB:  55
	After Elements in DB:  55
Filter DB for used in TEX
	Before Elements in DB:  55
	After Elements in DB:  55

Chapter:  chapter_4
Filter DB for duplicates
	Before Elements in DB:  76
	After Elements in DB:  76
Filter DB for used in TEX
	Before Elements in DB:  76
	After Elements in DB:  76

Chapter:  chapter_5
Filter DB for duplicates
	Before Elements in DB:  78
	After Elemen

In [6]:
# Summary: number of entries kept for each chapter after cleaning.
print("Number of citations")
for name in chapters:
    kept_entries = chapters[name]['bib'].entries_dict
    print("\t", name, "\t\t", len(kept_entries))

Number of citations
	 chapter_0 		 140
	 chapter_1 		 60
	 chapter_2 		 89
	 chapter_3 		 55
	 chapter_4 		 76
	 chapter_5 		 78
	 chapter_6 		 43
	 self 		 10


## Merge & Write out BibTex

In [7]:
# Merge Lib:
# Collect the cleaned entries of all chapters into one database while writing
# each chapter's own cleaned ref.bib to <out_dir>/<chapter>/ref.bib.
combined_db = BibDatabase()

## Write out and merge all entries
for chapter, chapter_data in chapters.items():
    print(chapter, chapter_data['path'])
    chapter_out = out_dir+"/"+chapter
    # makedirs also creates out_dir itself if it is missing, and is a no-op
    # when the chapter directory already exists.
    os.makedirs(chapter_out, exist_ok=True)

    with open(chapter_out+"/"+"ref.bib", 'w', encoding="utf-8") as bibtex_file:
        bibtexparser.dump(chapter_data['bib'], bibtex_file)

    combined_db.entries.extend(chapter_data['bib'].entries)

## Clean Merged lib
print("\nClean Merged Citations: ")
combined_db = remove_duplicates(combined_db, verbose=False)
# Rebuild the entry list from the ID-keyed dict view of the database.
combined_db.entries = list(combined_db.entries_dict.values())

## Write out merged lib
# out_all_ref is the merged-file path defined with the other output paths.
with open(out_all_ref, 'w', encoding="utf-8") as bibtex_file:
    bibtexparser.dump(combined_db, bibtex_file)


chapter_0 ../2_chapter_intro
chapter_1 ../3_chapter_1
chapter_2 ../4_chapter_2
chapter_3 ../5_chapter_3
chapter_4 ../6_chapter_4
chapter_5 ../7_chapter_5
chapter_6 ../8_chapter_outlook
self ../1_frontback

Clean Merged Citations: 
Filter DB for duplicates
	Before Elements in DB:  551
	After Elements in DB:  402
