In [1]:
#imports
import os, re, glob, copy
from collections import defaultdict

try:
    import bibtexparser
except:
    ! conda install -c conda-forge bibtexparse
    
    
import nltk
nltk.download('wordnet')

import iso4
from titlecase import titlecase


from bibtexparser.bibdatabase import BibDatabase
from bibtexparser.bparser import BibTexParser


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\benja\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Parsing and general clean-up of the bibliography entries

## RULES

In [2]:
from reffix.entry_rules import titelcasing_fields, journal_iso4, double_minus, capitalize_keys
from reffix.entry_rules import check_misc_fields, check_incollection_fields, check_inproceedings_fields, check_inbook_fields, check_article_fields, check_book_fields
from reffix.entry_rules import homogenize_latex_encoding
from bibtexparser.customization import type as bibtexType

In [3]:
from bibtexparser.bparser import BibTexParser

def build_Parser():
    """Create a ``BibTexParser`` that runs every record through the entry rules.

    Returns:
        A new ``BibTexParser`` whose ``customization`` attribute is set to a
        function applying the rules below, in the listed order, to a record.
    """

    def rulez(record):
        """Pass ``record`` through each rule in turn and return the final result."""
        # Rules are chained: each one receives the previous rule's return value.
        # NOTE(review): check_misc_fields is imported next to the other check_*
        # rules but is not part of this chain -- confirm that is intentional.
        pipeline = (
            bibtexType,
            capitalize_keys,
            homogenize_latex_encoding,
            check_article_fields,
            check_book_fields,
            check_incollection_fields,
            check_inproceedings_fields,
            check_inbook_fields,
            double_minus,
            titelcasing_fields,
            journal_iso4,
        )
        for rule in pipeline:
            record = rule(record)
        return record

    parser = BibTexParser()
    parser.customization = rulez
    return parser


# Run: load, filter, and write the bibliographies

In [6]:
#PATHs
# NOTE(review): absolute, machine-specific locations -- adjust before running
# this notebook on another machine.
root_dir = "C:/Users/benja/Downloads/thesis"

#INPATHS
# One directory per thesis chapter.
chapter1_path = root_dir+"/2_chapter_intro"
chapter2_path = root_dir+"/3_chapter_1"
chapter3_path = root_dir+"/4_chapter_2"
chapter4_path = root_dir+"/5_chapter_3"
chapter5_path = root_dir+"/6_chapter_4"
chapter6_path = root_dir+"/7_chapter_5"
chapter7_path = root_dir+"/8_chapter_outlook"

chapter_paths = [chapter1_path, chapter2_path, chapter3_path, chapter4_path, chapter5_path, chapter6_path, chapter7_path]

# Per-chapter record, keyed "chapter_<index in chapter_paths>":
#   path      - the chapter directory
#   bib_path  - the bibliography, expected at <chapter>/ref/ref.bib
#   tex_paths - everything matching "*tex" in the chapter directory and one
#               level below it, skipping any path that contains "fragment"
# NOTE(review): the pattern "*tex" has no dot, so it also matches names that
# merely end in "tex" -- confirm whether "*.tex" was intended.
chapters = {
    "chapter_"+str(i):{
        "path": chapter_path,
        "bib_path": chapter_path+"/ref/ref.bib",
        "tex_paths": [
            tex_path
            for tex_path in glob.glob(chapter_path+"/*tex")+glob.glob(chapter_path+"/*/*tex")
            if "fragment" not in tex_path
        ]
    }
    for i, chapter_path in enumerate(chapter_paths) }


#OUTPaths
out_dir = "C:/Users/benja/Desktop/fixed_ref"
# makedirs(exist_ok=True) creates missing parent directories too and does
# nothing when the directory already exists, so the cell can be re-run safely.
os.makedirs(out_dir, exist_ok=True)
out_all_ref = out_dir+"/mergedReferences.bib"


In [10]:
from reffix.database_functions import remove_duplicates, get_used_citations, filter_database_for_required_citations

#LOAD DATA & Modify
# For every chapter: gather what get_used_citations returns for each of its TeX
# files (presumably the cited keys), parse the chapter's .bib file with the
# rule-applying parser, then drop duplicates and entries not in that list.
# Results are stored back on the chapter record as "tex_cit" and "bib".
for chapter, chapter_data in chapters.items():
    print("Chapter: ", chapter, )

    used_cit = []
    for tex_path in chapter_data['tex_paths']:
        # De-duplicated per file only; the same item can still appear once for
        # every TeX file that yields it.
        used_cit.extend(set(get_used_citations(tex_path)))
    chapter_data["tex_cit"] = used_cit

    # The context manager closes the file even when parsing raises.
    with open(chapter_data['bib_path'], "r", encoding="utf-8") as bib_file:
        db = bibtexparser.load(bib_file, parser=build_Parser())
    db = remove_duplicates(db)
    db = filter_database_for_required_citations(db, used_cit)
    chapter_data["bib"] = db
    print()

        
        

Chapter:  chapter_0
Filter DB for duplicates
	Before Elements in DB:  146
	After Elements in DB:  146
Filter DB for used in TEX
	Before Elements in DB:  146
	After Elements in DB:  133

Chapter:  chapter_1
Filter DB for duplicates
	Before Elements in DB:  59
	After Elements in DB:  59
Filter DB for used in TEX
	Before Elements in DB:  59
	After Elements in DB:  59

Chapter:  chapter_2
Filter DB for duplicates
	Before Elements in DB:  90
	After Elements in DB:  90
Filter DB for used in TEX
	Before Elements in DB:  90
	After Elements in DB:  89

Chapter:  chapter_3
Filter DB for duplicates
	Before Elements in DB:  119
	After Elements in DB:  119
Filter DB for used in TEX
	Before Elements in DB:  119
	After Elements in DB:  55

Chapter:  chapter_4
Filter DB for duplicates
	Before Elements in DB:  79
Found 1 Duplicates!
Keys: 	Ries2021B
found duplicate for: Ries2021B
	After Elements in DB:  78
Filter DB for used in TEX
	Before Elements in DB:  78
	After Elements in DB:  75

Chapter:  chapt

In [11]:
# Report how many entries each chapter's filtered database holds.
print("Number of citations")
for name in chapters:
    kept_entries = chapters[name]['bib'].entries_dict
    print("\t", name, "\t\t", len(kept_entries))

Number of citations
	 chapter_0 		 133
	 chapter_1 		 59
	 chapter_2 		 89
	 chapter_3 		 55
	 chapter_4 		 75
	 chapter_5 		 75
	 chapter_6 		 43


In [12]:
#write out:
# Write one ref.bib per chapter under out_dir/<chapter>/ and a single merged
# bibliography of all chapters to out_all_ref.
combined_db = BibDatabase()

for chapter, chapter_data in chapters.items():
    print(chapter, chapter_data['path'])
    chapter_out = out_dir+"/"+chapter
    # exist_ok makes re-running the cell safe; missing parents are created too.
    os.makedirs(chapter_out, exist_ok=True)

    with open(chapter_out+"/"+"ref.bib", 'w', encoding="utf-8") as bibtex_file:
        bibtexparser.dump(chapter_data['bib'], bibtex_file)

    # Collect every chapter's entries; duplicates across chapters are removed below.
    combined_db.entries.extend(chapter_data['bib'].entries)

print("\n merge Citations: ")
combined_db = remove_duplicates(combined_db, verbose=False)
# Rebuild the entry list from entries_dict.
# NOTE(review): presumably keyed by entry ID, which would collapse any entries
# still sharing a key -- confirm against bibtexparser's BibDatabase.
combined_db.entries = list(combined_db.entries_dict.values())

# out_all_ref is the merged-output path defined with the other output paths.
with open(out_all_ref, 'w', encoding="utf-8") as bibtex_file:
    bibtexparser.dump(combined_db, bibtex_file)


chapter_0 C:/Users/benja/Downloads/thesis/2_chapter_intro
chapter_1 C:/Users/benja/Downloads/thesis/3_chapter_1
chapter_2 C:/Users/benja/Downloads/thesis/4_chapter_2
chapter_3 C:/Users/benja/Downloads/thesis/5_chapter_3
chapter_4 C:/Users/benja/Downloads/thesis/6_chapter_4
chapter_5 C:/Users/benja/Downloads/thesis/7_chapter_5
chapter_6 C:/Users/benja/Downloads/thesis/8_chapter_outlook

 merge Citations: 
Filter DB for duplicates
	Before Elements in DB:  529
	After Elements in DB:  390
