In [1]:
# import Levenshtein
import pymupdf
import json
from tqdm import tqdm
from openai import OpenAI
import copy

import io
import os
import sys
import time
import re
# import fitzs
from pdf2image import convert_from_path
from PIL import Image

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
from openai import OpenAI

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment,
# so the key does not have to be hardcoded in the notebook.
load_dotenv()
# os.getenv returns None when API_KEY is not set; nothing in this cell validates that.
api_key = os.getenv("API_KEY")

In [None]:
# Build the module-level OpenAI client that get_api_response uses for every request.
# OPENAI_API_KEY is just an alias of the api_key value read from the environment.
OPENAI_API_KEY = api_key
client = OpenAI(api_key = OPENAI_API_KEY)

In [11]:
def get_api_response(prompt, text, model = "gpt-4o-mini"):
    """
    Send an instruction prompt plus a body of text to the chat-completions API.

    The request contains a fixed developer message and a single user message
    that embeds `prompt` followed by `text`. Uses the module-level `client`.

    Parameters:
        prompt (str): Instructions for the model.
        text (str): The text the instructions should be applied to.
        model (str): Model name passed to the API (default: 'gpt-4o-mini').

    Returns:
        The `content` of the first choice's message in the completion.
    """
    user_message = f" {prompt}. Here is the text: {text}"
    conversation = [
        {"role": "developer", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_message},
    ]
    response = client.chat.completions.create(model=model, messages=conversation)
    first_choice = response.choices[0]
    return first_choice.message.content

In [12]:
# Relative path of the PDF whose extracted text is sent to the model for reference extraction.
DS_BIB_PDF_PATH = 'pdf2latex/data-science-bib.pdf'

In [13]:
# Open the bibliography PDF with PyMuPDF; its pages are iterated in the next cell.
# NOTE(review): the document is not closed in any of the visible cells.
pdf_document = pymupdf.open(DS_BIB_PDF_PATH)

In [14]:
# Concatenate the extracted text of every page, in page order and with no
# separator added between pages, into one string.
bibliography = "".join(page.get_text() for page in pdf_document)
    

In [15]:
# Instruction prompt for the reference-extraction request.
# It is a raw string so the "\n" sequences in the JSON example reach the model as
# literal backslash-n escapes (valid inside a JSON string).
# The worked example mirrors its source reference exactly (year, edition, publisher):
# an example that invents a different year/publisher contradicts the
# "do not make any assumptions" instruction below, and a "\&" inside the JSON example
# would demonstrate an escape that json.load rejects.
prompt = r""" 
    I am going to pass the extracted text from a PDF file of a textbook. It contains a bibliography section. 
    I want you to extract the references mentioned in the bibliography section. This should be a dictionary and the dictionary's keys 
    should be the key given as [something] for the references in the text. The values should be the full references in bibtex format. You can return the bibtex. 
    Create the bibtex format yourself. Understand what the bibtex format should be for each reference and return that as a string. For example, the
    bibtex format for a reference "Andrew Abela. Advanced Presentations by Design: Creating Communication that Drives Action. Pfeiffer, 2nd edition, 2013."
    should be:
    @book{abela2013advanced,
        title={Advanced Presentations by Design: Creating Communication that Drives Action},
        author={Abela, Andrew},
        year={2013},
        edition={2},
        publisher={Pfeiffer}
    }
    So if the text exists like this in the book:
    [Abe13]\nAndrew Abela. Advanced Presentations by Design: Creating Communi-\ncation that Drives Action. Pfeiﬀer, 2nd edition, 2013.
    Then the output should be:
    {
        "Abe13": "@book{abela2013advanced,\n    title={Advanced Presentations by Design: Creating Communication that Drives Action},\n    author={Abela, Andrew},\n    year={2013},\n    edition={2},\n    publisher={Pfeiffer}\n}"
    }
    Only and only extract the information from the text I provide. If you are not sure about the information,
    do not make any assumptions. Just return the information as it is.
    What is most important to me is that the citation key is created for each reference. Return ONLY and ONLY the dictionary, so that I can use it
    in my code. Do not return any other text.
"""

In [16]:
# Send the extraction prompt together with the full extracted PDF text to the model,
# overriding the helper's default model with "gpt-4o".
api_response = get_api_response(prompt, bibliography, model="gpt-4o")

In [20]:
# Extract the JSON payload from the model reply.
# When the reply wraps the dictionary in a ```json fenced block, keep only the text
# between the opening fence and the next closing fence. When there is no such fence
# (the prompt asks for ONLY the dictionary, so the model may omit it), fall back to the
# whole reply. Either way json_content is (re)assigned, so the following cell cannot hit
# a NameError or silently write a stale value left over from an earlier run.
text_split = api_response.split("```json\n")
if len(text_split) > 1:
    json_content = text_split[1].split("\n```")[0]
else:
    json_content = api_response.strip()

In [21]:
# Store the JSON text extracted from the API response.
# The encoding is pinned to UTF-8 (as save_bibtex does) because bibliography text can
# contain non-ASCII characters and the platform default encoding may not be UTF-8.
with open("ds_bib.json", "w", encoding="utf-8") as f:
    f.write(json_content)

In [3]:
# Load the stored reference dictionary (reference label -> BibTeX entry string).
# Read explicitly as UTF-8 so decoding does not depend on the platform default encoding.
# NOTE(review): the earlier cell writes "ds_bib.json" to the working directory, while this
# cell reads it from ../../files/data-science_book/ — presumably the file was moved by
# hand between runs; confirm the intended location.
with open("../../files/data-science_book/ds_bib.json", "r", encoding="utf-8") as f:
    ds_bib_dict = json.load(f)

In [27]:
def save_bibtex(bib_dict, filename="references.bib"):
    """
    Write every BibTeX entry held in a dictionary to a .bib file.

    Only the dictionary values are written (the keys are ignored), in the
    dictionary's iteration order, each entry followed by a blank line. The
    file is created or overwritten using UTF-8, and a confirmation message
    is printed afterwards.

    Parameters:
        bib_dict (dict): Maps citation keys to BibTeX-formatted strings.
        filename (str): Path of the .bib file to write (default: 'references.bib').

    Returns:
        None
    """
    separator = "\n\n"  # terminates each entry and leaves one blank line after it
    with open(filename, "w", encoding="utf-8") as bib_file:
        bib_file.writelines(entry + separator for entry in bib_dict.values())

    print(f"BibTeX file saved as {filename}")

In [28]:
# Write all BibTeX entries from the loaded dictionary to ds_bib.bib in the working directory.
save_bibtex(ds_bib_dict, filename="ds_bib.bib")

BibTeX file saved as ds_bib.bib


In [4]:
# Map each in-text reference label (the ds_bib_dict key) to the citation key of its
# BibTeX entry, i.e. the text between "@<type>{" and the first comma.
citation_dict = {}
for key, value in ds_bib_dict.items():
    key_match = re.search(r'@[\w]+\{([^,]+),', value)
    if key_match is None:
        # Name the offending entry instead of failing with an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(f"No BibTeX citation key found in the entry for {key!r}: {value!r}")
    citation_dict[key] = key_match.group(1)

In [5]:
# Relative path of the input LaTeX file whose citation labels are rewritten by replace_citations.
TEX_PATH = "../../files/data-science_book/outputs/data-science_pg_sep.tex"

In [6]:
import re

def replace_citations(tex_filename, bib_dict, output_filename="updated.tex", count=1):
    r"""
    Replaces citation keys in a LaTeX file with \cite{...} using a dictionary.

    Each key of `bib_dict` is matched as a whole word (regex \b boundaries), so a
    key that is only a prefix of a longer word is not touched. Only the key itself
    is replaced: surrounding characters such as square brackets are left in place.
    Keys are processed one after another in dictionary order on the progressively
    updated text.

    Parameters:
        tex_filename (str): The name of the input .tex file (read as UTF-8).
        bib_dict (dict): Dictionary where keys are the citation keys in text (e.g., 'Abe13'),
                         and values are the BibTeX citation keys (e.g., 'abela2013advanced').
        output_filename (str): The name of the output file with updated citations (default: 'updated.tex').
        count (int): Maximum number of occurrences replaced per key, passed to re.sub
                     (default: 1, i.e. only the first occurrence; 0 replaces every occurrence).

    Returns:
        None
    """
    # Read the LaTeX file
    with open(tex_filename, "r", encoding="utf-8") as f:
        tex_content = f.read()

    # Replace each citation key with \cite{bibtex_key}
    for key, bibtex_key in bib_dict.items():
        citation = f"\\cite{{{bibtex_key}}}"
        # A callable replacement is inserted verbatim. A template string would make
        # re.sub interpret backslashes / group references inside bibtex_key and raise
        # re.error (or corrupt the output) for such keys.
        tex_content = re.sub(
            rf'\b{re.escape(key)}\b',
            lambda _match, citation=citation: citation,
            tex_content,
            count=count,
        )

    # Save the updated content to a new file
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(tex_content)

    print(f"Updated LaTeX file saved as {output_filename}")


In [7]:
# Rewrite the reference labels in the source .tex file as \cite{...} commands and save the
# result next to it under a "_bib" suffix; the input file itself is not overwritten.
replace_citations(TEX_PATH, citation_dict, output_filename="../../files/data-science_book/outputs/data-science_pg_sep_bib.tex")

Updated LaTeX file saved as ../../files/data-science_book/outputs/data-science_pg_sep_bib.tex
