In [7]:
import os
import time
import requests

In [8]:
def read_markdown_chunks(file_path, chunk_size = 50, overlap = 50):
  """Split a UTF-8 text file into line-based chunks.

  NOTE: despite its name, `overlap` is used as the stride (in lines) between
  the start of one chunk and the start of the next. Neighbouring chunks share
  lines only when `overlap < chunk_size`; with the defaults (50, 50) chunks
  are back-to-back, and with `overlap > chunk_size` the lines in between are
  not part of any chunk.

  Args:
    file_path: Path of the file to read.
    chunk_size: Maximum number of lines per chunk. Must be positive.
    overlap: Number of lines to advance between chunk starts. Must be positive.

  Returns:
    A list of `(start_line_index, chunk_text)` tuples, where the index is
    0-based and `chunk_text` is the concatenation of the chunk's lines.
    An empty file yields an empty list.

  Raises:
    ValueError: If `chunk_size` or `overlap` is not positive.
  """
  # Fail early with a clear message instead of a cryptic range() error
  # (overlap == 0) or silently empty / malformed output (negative values).
  if chunk_size <= 0:
    raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
  if overlap <= 0:
    raise ValueError(f"overlap must be a positive integer, got {overlap}")

  with open(file_path, "r", encoding = "utf-8") as file:
    lines = file.readlines()

  return [
    (start, "".join(lines[start : start + chunk_size]))
    for start in range(0, len(lines), overlap)
  ]

In [9]:
def preprocess(api_key, chunks, model = "gpt-4o", temperature = 0.4, max_retries = 3, timeout = 60):
  """Refine each chunk by sending it to the OpenAI chat completions endpoint.

  For every chunk, up to three preceding chunks are included in the prompt as
  read-only context. If all attempts for a chunk fail, the error is printed
  and the original, unrefined text is kept so the output stays complete.

  Args:
    api_key: Bearer token placed in the Authorization header.
    chunks: Sequence of `(start_line_index, text)` tuples.
    model: Model name sent in the request body.
    temperature: Sampling temperature sent in the request body.
    max_retries: Total number of attempts per chunk (not additional retries).
    timeout: Seconds passed to `requests.post` as its `timeout` argument, so
      a stalled connection cannot block the run indefinitely.

  Returns:
    A list of `(start_line_index, refined_text)` tuples, one per input chunk
    and in the same order.
  """
  refined_chunks = []
  headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
  }
  # Failures that fall back to the original chunk: transport/HTTP errors plus
  # a successful response whose body is not the expected JSON shape
  # (missing keys, empty `choices`, `content` of None, undecodable JSON).
  recoverable_errors = (
    requests.exceptions.RequestException,
    KeyError,
    IndexError,
    TypeError,
    AttributeError,
    ValueError,
  ) if chunks else ()

  for i in range(len(chunks)):
    start_idx = max(0, i - 3)
    context = ''.join([chunk[1] for chunk in chunks[start_idx:i]])
    current_chunk = chunks[i][1]

    messages = [
      {"role": "system", "content": "You are a helpful assistant that refines and cleans markdown content for use in a Retrieval-Augmented Generation (RAG) system."},
      {"role": "user", "content": f"""Refine and clean the following markdown content for use in a Retrieval-Augmented Generation (RAG) system. 
      Maintain important information and context. Remove any unnecessary markdown syntax.
      
      Context (for reference, do not include in output):
      {context}
      
      Current content to refine:
      {current_chunk}
      
      Provide only the refined content in your response."""}
    ]

    data = {
      "model": model,
      "messages": messages,
      "temperature": temperature
    }

    refined_content = None
    last_error = None
    for attempt in range(max_retries):
      try:
        response = requests.post(
          "https://api.openai.com/v1/chat/completions",
          headers = headers,
          json = data,
          timeout = timeout,
        )
        response.raise_for_status()
        refined_content = response.json()['choices'][0]['message']['content'].strip()
        break
      except recoverable_errors as e:
        last_error = e
        # Exponential backoff (1s, 2s, ...) between attempts; no sleep after the last one.
        if attempt < max_retries - 1:
          time.sleep(2 ** attempt)

    if refined_content is None:
      # Every attempt failed: report it and keep the unrefined text.
      print(f"Error processing chunk {i}: {last_error}")
      refined_content = current_chunk

    refined_chunks.append((chunks[i][0], refined_content))

  return refined_chunks

In [10]:
def save_refined_chunks(refined_chunks, output_file):
  """Write the chunks to `output_file` as UTF-8 text, overwriting any existing file.

  Each chunk is written as a header line
  `--- Chunk starting at line <index> ---`, followed by the chunk text and a
  blank line.

  Args:
    refined_chunks: Iterable of `(start_line_index, text)` items.
    output_file: Destination path. Missing parent directories are created.
  """
  # Create the target directory up front so results are not lost to a
  # FileNotFoundError when the folder does not exist yet.
  parent_dir = os.path.dirname(output_file)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok = True)

  with open(output_file, "w", encoding = "utf-8") as file:
    for chunk in refined_chunks:
      file.write(f"--- Chunk starting at line {chunk[0]} ---\n")
      file.write(chunk[1])
      file.write("\n\n")


In [11]:
def main(input_file, output_file, api_key):
  """Run the pipeline end to end: chunk `input_file`, refine the chunks, save to `output_file`.

  Chunking and refinement use the default settings of `read_markdown_chunks`
  and `preprocess`; `api_key` is forwarded to `preprocess`.
  """
  raw_chunks = read_markdown_chunks(input_file)
  save_refined_chunks(preprocess(api_key, raw_chunks), output_file)

In [13]:
# Source markdown and destination text file (relative paths).
input_file = "../markdowns/grading_doc.md"
output_file = "../plaintexts/refined_rag_data.txt"

# The key comes from the environment; stop before doing any work if it is
# missing or empty.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
  raise ValueError("OPENAI_API_KEY env var not found")

main(input_file, output_file, api_key)

KeyboardInterrupt: 