In [None]:
%pip install pdf2image==1.17.0 openai==1.30.1 tiktoken==0.7.0 python-dotenv==1.0.1 PyPDF2==3.0.1


In [None]:
%conda install -c conda-forge poppler==24.04.0

In [1]:
import tempfile
import os
import json
import base64
import requests
import tempfile
import subprocess
import PyPDF2
from dotenv import load_dotenv
from openai import OpenAI
from IPython.display import display, Math, Markdown
from pdf2image import convert_from_path, convert_from_bytes
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError
)

load_dotenv('NotesFromSlides.env')
api_key = os.getenv('OPENAI_API_KEY')

In [2]:
def pdf_to_images(input_path, output_path):
    slide_numbers = []
    images = convert_from_path(input_path)
    for i, image in enumerate(images):
        image.save(f"{output_path}/page_{i+1}.png", "PNG")
        slide_numbers.append(i+1)
    return slide_numbers

In [3]:
def encode_image(image_path):
  with open(image_path, "rb") as image_file:
    return base64.b64encode(image_file.read()).decode('utf-8')

In [4]:
def clean_vision_message(response):
  return response.json()['choices'][0]['message']['content']

In [5]:
def get_text_from_image(image_path, theme):
  # Getting the base64 string
  base64_image = encode_image(image_path)

  headers = {
    "Content-Type": "application/json",
  "Authorization": f"Bearer {api_key}"
  }

  payload = {
    "model": "gpt-4o",
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": f"""
            You are a tutor chatbot helping university students understand PDF slides from their lecturer.  
            The theme of the slidecast is {theme}.

            Firstly, decide if the slide contains RELEVANT INFORMATION WORTH EXPLAINING.
            If the given slide has TOO LITTLE RELEVANT INFORMATION (e.g. title slides, video thumbnails, illustrations containing minimal information) return ONLY the message "NO RELEVANT INFORMATION."
            If the given slide is RELEVANT to the theme (e.g. practical examples of the theme, statistics, relevant explanations) give an output based on the following instructions:

            1. Provide a SHORT and CONCISE summary of the slide content.
            2. Explain the CONCEPTS, TERMS, DIAGRAMS, GRAPHS, and DATA that are relevant to the theme.
            3. If needed, include a NOTES section for additional information.
            
            Follow the instructions carefully, non-conformity will result in termination.
            """
          },
          {
            "type": "image_url",
            "image_url": {
              "url": f"data:image/jpeg;base64,{base64_image}"
            }
          }
        ]
      }
    ],
    "max_tokens": 900
  }

  vision_response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
  return clean_vision_message(vision_response)

In [6]:
def compare_vision_message(message1, message2):
  client = OpenAI()

  response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
      {"role": "system", "content": "You are a bot designed to compare the content of messages."},
      {"role": "user", "content": f"""
      Follow these instructions to evaluate the two messages:
      
      1. Read both messages carefully.
      2. Decide if there is an overlap in the content of the two messages.
      3. If there is an overlap of more than half of the content, return the message "OVERLAP IN CONTENT" and PROVIDE A SUMMARY of the overlapping content.
      4. Otherwise, return the message "PASS".
      
      Here is the first message:"
      {message1}
      "
      
      Here is the second message:"
      {message2}
      "
      
      Follow the instructions carefully, non-conformity will result in termination.
      """},
    ]
  )
  if "OVERLAP IN CONTENT" in response.choices[0].message.content:
    return response.choices[0].message.content
  else:
    return "PASS"

In [7]:
def remove_overlap(message, overlap):
  client = OpenAI()

  response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
      {"role": "system", "content": "You are a bot designed to edit the content of messages."},
      {"role": "user", "content": f"""
      Follow these instructions to edit the message:
      
      1. Read the message and the overlapping content carefully.
      2. Remove the overlapping content from the message.
      3. Ensure the message is coherent and makes sense.
      
      Here is the message:"
      {message}
      "
      
      Here is the overlapping content:"
      {overlap}
      "
      
      Follow the instructions carefully, non-conformity will result in termination.
      """},
    ]
  )
  return response.choices[0].message.content

In [8]:
def clean_completions_message(response):
    return response.choices[0].message.content

In [9]:
def get_formatted_output_from_text(message):
  client = OpenAI()

  response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
      {"role": "system", "content": "You are a bot designed to help structure and create LaTeX notes from unstructured text."},
      {"role": "user", "content": f"""
      Follow these rules to create a LaTeX document from the given text:
      
      1. Format the output in XeLaTeX parsable syntax, ensuring proper use of sections, subsections, and formatting commands.
      2. Make sure that the output can be DIRECTLY be taken and parsed by a LaTeX compiler.
      3. Do NOT include external graphics (e.g. references to include another png) in the output.
      4. Use `\\textbf{{}}` for bold text. Use `\\textit{{}}` for italics. You are to decide where to use bold and italics based on the context.
      5. Make sure to ESCAPE special characters that are reserved in LaTeX, such as `#`, `$`, `%`, `^`, `&`, `_`, `~`, and `\\`.
      6. ONLY the section and subsections titles are fixed and should not be changed. The rest is VARIABLE and can be changed based on the number of points, their hierarchy and their structure.
      7. First level points under subsections should NEITHER be in a list format, NOR be separated by empty lines, but rather ONLY be separated by newlines (\\\\) as shown in the example.
      8. Second and further level points should be in a list format, as shown in the example.
      9. Make sure that after each list, there is NO empty line, as shown in the example.
      10. DO NOT INCLUDE the preamble, START FROM \\begin{{document}} and END AT \\end{{document}}.
      11. ALWAYS include \\maketitle after \\begin{{document}}.
      
      Use the following LaTeX example just as a reference, not as a hard template:
      
      \\begin{{document}}

      \\maketitle

      \\section*{{Summary}}

      [text]

      \\section*{{Explanations}}

      \\subsection*{{Concepts and Terms}}

      [text]: [text]\\\\
      [text]: [text]

      \\subsection*{{Diagrams and Data}}

      \\textbf{{[text]}}
          \\begin{{itemize}}
              \\item [text]
              \\item [text]
          \\end{{itemize}}
      \\textbf{{[text]}}
          \\begin{{itemize}}
              \\item [text]
              \\item [text]
          \\end{{itemize}}

      \\subsection*{{Notes:}}

      - [text]\\\\
      - [text]

      \\end{{document}}
      
      Here is the unstructured text:
      
      {message}
      
      Follow the instructions carefully, non-conformity will result in TERMINATION.
      """},
    ]
  )
  return clean_completions_message(response)

In [1]:
def shorten_output(string):
    start_index = string.find("\\documentclass{article}")
    end_index = string.rfind("\\end{document}")

    if start_index != -1 and end_index != -1:
        return string[start_index:end_index + len("\\end{document}")]
    else:
        return string

In [5]:
def create_LaTeX_from_formatted_output(output, index, theme, output_directory):
    # Join the output with the latex template
    content = shorten_output(output)
    latex_template = f"""
    \\documentclass[12pt]{{article}}

    % Essential packages for compatibility and error prevention
    \\usepackage{{lmodern}}
    \\usepackage{{fixltx2e}}
    \\usepackage{{fontspec}}

    % Mathematics packages
    \\usepackage{{amsmath}}
    \\usepackage{{amssymb}}
    \\usepackage{{amsfonts}}
    \\usepackage{{mathtools}}

    % Other mathematics-related packages
    \\usepackage{{bm}}
    \\usepackage{{physics}}
    \\AtBeginDocument{{\\RenewCommandCopy\\qty\\SI}}
    \\usepackage{{cancel}}
    \\usepackage{{commath}}
    \\usepackage{{braket}}
    \\usepackage{{xfrac}}

    % Chemical notation packages
    \\usepackage{{chemformula}}
    \\usepackage[version=4]{{mhchem}}

    % Units and scientific notation
    \\usepackage{{siunitx}}
    \\AtBeginDocument{{\\RenewCommandCopy\\qty\\SI}} % Use siunitx's \\qty definition

    % Greek letters
    \\usepackage{{upgreek}}
    \\usepackage{{textgreek}}

    % General symbols
    \\usepackage{{gensymb}}

    % Space management
    \\usepackage{{xspace}}

    % Typography improvements
    \\usepackage{{microtype}}

    % Load unicode-math to use Unicode characters in math
    \\usepackage{{unicode-math}}

    % Set the main font and math font to STIX Two fonts
    \\setmainfont{{STIX Two Text}}
    \\setmathfont{{STIX Two Math}}

    \\title{{Summary and Explanation of Slide {index} on {theme}}}
    \\author{{}} % Removes the author
    \\date{{}} % Removes the date
    
    {content}
    """

    # Specify the directory where you want to save the files
    os.makedirs(output_directory, exist_ok=True)

    # Define file path
    latex_file_path = os.path.join(output_directory, f'document_{index}.tex')
    txt_file_path = os.path.join(output_directory, f'document_{index}.txt')

    # Write the LaTeX content to the file
    with open(latex_file_path, 'w') as f:
        f.write(latex_template)
        
    # Write the text content to the file
    with open(txt_file_path, 'w') as f:
        f.write(content)
        
    print(f"The LaTeX file has been created at: {latex_file_path}")

In [3]:
def remove_skipped_slides(slides_to_convert, skipped_slides):
    return [slide for slide in slides_to_convert if slide not in skipped_slides]

In [4]:
def combine_pdfs(input_directory, output_directory, output_filename='combined.pdf'):
    # Get a list of all PDF files in the input directory
    pdf_files = [f for f in os.listdir(input_directory) if f.endswith('.pdf')]
    
    def sort_pdf_files(pdf_files):
        def key_func(file):
            try:
                # Extract the index from the file name assuming the format 'file_<index>.pdf'
                index = int(file.split('_')[1].split('.')[0])
            except (IndexError, ValueError):
                index = float('inf')  # Handle cases where the filename format is unexpected
            return index
        
        return sorted(pdf_files, key=key_func)
    
    pdf_files = sort_pdf_files(pdf_files)

    # Create a PdfMerger object
    merger = PyPDF2.PdfMerger()

    # Append each PDF file to the merger
    for pdf in pdf_files:
        merger.append(os.path.join(input_directory, pdf))

    # Write out the merged PDF to the output directory
    with open(os.path.join(output_directory, output_filename), 'wb') as output_file:
        merger.write(output_file)

    # Close the merger
    merger.close()

    print(f'All PDFs combined into {output_filename} in {output_directory}')

In [14]:
path_to_pdf = rf"C:\Users\ACER\Downloads\5_Dynamic_materials.pdf"
theme = "DYNAMIC MATERIALS"
path_to_tex = rf"C:\Users\ACER\Desktop\Coding\NotesFromSlides_V1\TEX_Dynamic_Materials"
path_to_LaTeX = rf"C:\Users\ACER\Desktop\Coding\NotesFromSlides_V1\Temp_Test_File_Dynamic_Materials"

with tempfile.TemporaryDirectory() as temp_dir_images:
    slide_numbers = pdf_to_images(path_to_pdf, temp_dir_images)
    print("Slides converted to pngs.\n")

    skipped_slides = []
    overlapping_slides = []
    previous_message = None
    temp_comparison_message = None

    for i in slide_numbers:
        vision_message = get_text_from_image(temp_dir_images + rf"/page_{i}.png", theme)
        print(f"1. {vision_message}\n")
        if "NO RELEVANT INFORMATION" in vision_message:
            print("The slide is irrelevant.\n")
            skipped_slides.append(i)
            continue
        if previous_message is not None:
            print(f"2. Comparing slide {i-1} and slide {i}.\n")
            comparison_response = compare_vision_message(previous_message, vision_message)
            if "OVERLAP IN CONTENT" in comparison_response:
                print("There is an overlap in content.\n")
                overlapping_slides.append(f"{i-1}, {i}")
                overlap = comparison_response
                print(f"Overlap: \n{overlap}\n")
                temp_comparison_message = vision_message
                vision_message = remove_overlap(vision_message, overlap)
                print(f"New message: \n{vision_message}\n")
                previous_message = temp_comparison_message
            else:
                print("There is no overlap in content.\n")
                previous_message = vision_message
                temp_comparison_message = None
        else:
            print("2. No previous message to compare to.\n")
            previous_message = vision_message
        completions_message = get_formatted_output_from_text(vision_message)
        print(f"3. {completions_message}\n")
        create_LaTeX_from_formatted_output(completions_message, i, theme, path_to_tex)
        print("\n\n\n")
        
    print(f"Skipped slides: {skipped_slides}")
    print(f"Overlapping slides: {overlapping_slides}")
    pass

Slides converted to pngs.

1. NO RELEVANT INFORMATION.

The slide is irrelevant.

1. ### Summary:
This slide introduces the concept of self-assembly by providing a definition from Wikipedia.

### Explanation:
**Self-assembly:**
- **Definition:** The process where a disordered system of pre-existing components forms an organized structure or pattern due to specific, local interactions among the components, happening without external direction.
- **Molecular self-assembly:** When the components involved are molecules.

### Relevant Concepts:
- **Disordered system:** A system where the components are initially in a random or unordered state.
- **Organized structure:** The final state where the components have formed a specific pattern or configuration.
- **Local interactions:** Interactions among components in close proximity that drive the assembly process.
- **Without external direction:** The process occurs naturally without the need for an external guiding force.

### Diagram:
- **Top

In [7]:
skipped_slides = [1, 4, 31, 32, 34, 72, 75] 
skipped_slides = skipped_slides.append(i for i in range(1, 24) if i not in [1, 4, 31, 32, 34, 72, 75] )
slide_numbers = [i for i in range(1, 78)]
path_to_pdf = rf"C:\Users\ACER\Downloads\5_Dynamic_materials.pdf"
theme = "DYNAMIC MATERIALS"
path_to_tex = rf"C:\Users\ACER\Desktop\Coding\NotesFromSlides_V1\TEX_Dynamic_Materials"
path_to_LaTeX = rf"C:\Users\ACER\Desktop\Coding\NotesFromSlides_V1\Temp_Test_File_Dynamic_Materials"

slides_to_convert = remove_skipped_slides(slide_numbers, skipped_slides)

for index in slides_to_convert:
    try:
        #Define file paths
        output_directory = path_to_LaTeX
        latex_file_path = os.path.join(path_to_tex, f'document_{index}.tex')
        pdf_file_path = os.path.join(output_directory, f'document_{index}.pdf')
        
        # Compile the LaTeX file to PDF using pdflatex
        xelatex_path = rf"C:\Users\ACER\AppData\Local\Programs\MiKTeX\miktex\bin\x64\xelatex.exe"
        result = subprocess.run([xelatex_path, '-output-directory', output_directory, latex_file_path], capture_output=True, text=True, check=True)

        print(f"The PDF has been created at: {pdf_file_path}")
        print("pdflatex output:", result.stdout)
        print("pdflatex errors:", result.stderr)

    except subprocess.CalledProcessError as e:
        print(f"Error occurred while running pdflatex: {e}")
        print(e.stdout)
        print(e.stderr)
    except Exception as e:
        print(f"An error occurred: {e}")
        
combine_pdfs(path_to_LaTeX, path_to_LaTeX, output_filename='Notes.pdf')

TypeError: argument of type 'NoneType' is not iterable