In [None]:
import getpass
import os

import ipywidgets as widgets
import openai
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv, set_key
from PyPDF2 import PdfReader
# Load settings from the local .env file into the process environment.
load_dotenv(".env")

openai.api_key = os.environ.get("OPENAI_API_KEY")

if not openai.api_key:
    # Prompt with getpass rather than input() so the typed key is not echoed
    # back into the cell output, then persist it to .env so later runs skip
    # the prompt. An empty answer is not written to the file.
    openai.api_key = getpass.getpass("Enter OPENAI_API_KEY API key").strip()
    if openai.api_key:
        set_key(".env", "OPENAI_API_KEY", openai.api_key)

# Mirror the key back into the environment for any code that reads it there.
os.environ["OPENAI_API_KEY"] = openai.api_key

# Working directories: downloaded PDFs go to input/, extracted text to
# output/text, and the combined model output to output/done.
input_dir = "input/"
output_dir = "output/"
text_dir = os.path.join(output_dir, "text")
done_dir = os.path.join(output_dir, "done")
for directory in [input_dir, output_dir, text_dir, done_dir]:
    os.makedirs(directory, exist_ok=True)

In [None]:
# Scrape the report index page for data-breach report links, download each
# linked PDF into input_dir, and write its extracted text to a .txt file of
# the same base name in text_dir.
url = "https://www.mass.gov/lists/data-breach-reports"
response = requests.get(url, timeout=60)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Collect matching links once each, in page order.
pdf_links = []
for a_tag in soup.find_all("a", href=True):
    href = a_tag["href"]
    if href.startswith("https://www.mass.gov/doc/data-breach-report-20") and href not in pdf_links:
        pdf_links.append(href)

for pdf_url in pdf_links:
    response = requests.get(pdf_url, timeout=60)
    response.raise_for_status()

    # NOTE(review): the local name is the last path segment of the URL;
    # confirm that segment is unique per report, otherwise files overwrite
    # each other.
    filename = os.path.basename(pdf_url)
    pdf_path = os.path.join(input_dir, filename)
    with open(pdf_path, "wb") as pdf_file:
        pdf_file.write(response.content)

    text_filename = os.path.splitext(filename)[0] + ".txt"
    text_path = os.path.join(text_dir, text_filename)

    # Extract every page first, then write the text file once in "w" mode so
    # re-running the cell replaces the text instead of appending a duplicate.
    with open(pdf_path, "rb") as pdf_file:
        page_texts = [page.extract_text() or "" for page in PdfReader(pdf_file).pages]
    with open(text_path, "w") as text_file:
        text_file.writelines(page_texts)

In [None]:
# Models offered in the picker; every label maps to the identical model id.
model_options = {
    model_name: model_name
    for model_name in (
        'gpt-3.5-turbo',
        'gpt-3.5-turbo-0301',
        'gpt-4',
        'gpt-4-0314',
        'text-davinci-003',
        'text-davinci-002',
        'text-davinci-edit-001',
        'code-davinci-edit-001',
    )
}

# Render the picker in the cell output.
dropdown = widgets.Dropdown(options=model_options)
display(dropdown)

# This is read immediately after the widget is displayed, so it reports the
# dropdown's value at the moment the cell executes.
selected_model = dropdown.value
print(f'Selected model: {selected_model}')

In [None]:
# Send every extracted text file to the OpenAI completions endpoint in
# fixed-size character chunks, print each completion as it arrives, and save
# the concatenated completions to output.txt in done_dir.
headers = {'Authorization': f'Bearer {openai.api_key}'}
# NOTE(review): this engine-scoped URL is built for whichever model the
# dropdown holds; confirm each selectable model is served by this endpoint.
openai_endpoint = f'https://api.openai.com/v1/engines/{dropdown.value}/completions'
responses = []
chunk_size = 2500  # characters of source text sent per request

# os.listdir returns entries in arbitrary order; sort so the combined output
# is assembled in the same order on every run.
for filename in sorted(os.listdir(text_dir)):
    if not filename.endswith(".txt"):
        continue

    text_path = os.path.join(text_dir, filename)
    # Read the whole file up front so the handle is closed before any
    # network calls are made.
    with open(text_path, "r") as text_file:
        text = text_file.read()

    for i in range(0, len(text), chunk_size):
        chunk = text[i:i+chunk_size]
        payload = {
            "prompt": chunk,
            "max_tokens": 2048,
            "temperature": 0.5,
            "n": 1,
            "stop": "\n"
        }
        response = requests.post(openai_endpoint, headers=headers, json=payload)
        response.raise_for_status()
        # Decode the body once; the Response object itself is not
        # subscriptable, so index into the parsed JSON instead.
        result = response.json()
        responses.append(result)
        print(result['choices'][0]['text'])

# Combine responses and save to "done" directory
output_text = "".join(result['choices'][0]['text'] for result in responses)

done_path = os.path.join(done_dir, "output.txt")
with open(done_path, "w") as done_file:
    done_file.write(output_text)
