In [None]:
# Root page of the site to crawl; the same value is assigned again in the crawl cell below.
url = "https://ispo.ucsd.edu/"

In [None]:
%%capture
# Refresh the apt package index; %%capture hides the command's output.
# NOTE(review): presumably a prerequisite for installing Firefox/geckodriver — that install step is not shown in this notebook.
!apt-get update

In [None]:
import os
import jsonlines
import json
import openai
from webpage_scraper import extract_data_from_website
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Firefox options for the Selenium driver: run the browser without a visible window.
opts = FirefoxOptions()
opts.add_argument("--headless")


In [None]:
# Module-level Selenium driver used by visit_links below; it is quit at the end of this cell.
driver = webdriver.Firefox(options=opts)

def visit_links(url, visited = set()):
    """Crawl the pages reachable from `url` whose address contains 'ispo'.

    Pages are loaded with the module-level Selenium `driver`. Every page that
    loads is added to `visited` and the hrefs of its anchors are queued for a
    depth-first traversal.

    Args:
        url: Address to start crawling from.
        visited: Set of URLs that have already been loaded; it is updated in
            place. The default set is created once and shared by every call
            that omits the argument, so calling again with a URL that is
            already in it returns the accumulated results without loading any
            page. A later cell calls `visit_links(url)` after the driver has
            been quit and depends on that. Pass a fresh ``set()`` to force a
            new crawl.

    Returns:
        The `visited` set.
    """
    failed = set()   # URLs that raised, so they are not retried from every page
    pending = [url]  # explicit stack instead of recursion (no recursion-depth limit)
    while pending:
        current = pending.pop()
        if current in visited or current in failed or 'ispo' not in current:
            continue
        try:
            driver.get(current)
            visited.add(current)
            print(f"Visited {current}")
            # Read every href BEFORE navigating anywhere else: once the driver
            # loads another page these WebElements go stale and reading them
            # raises, which used to abort the rest of the page's links.
            hrefs = [anchor.get_attribute('href')
                     for anchor in driver.find_elements(By.TAG_NAME, 'a')]
        except Exception as e:
            print(f"Error visiting {current}: {e}")
            failed.add(current)
            continue
        # Reversed so that popping from the stack follows links in page order.
        pending.extend(
            href for href in reversed(hrefs)
            if href and 'ispo' in href and href not in visited
        )
    return visited

url = "https://ispo.ucsd.edu/"
try:
    # Crawl the whole site starting from the root page.
    visited_links = visit_links(url)
finally:
    # Always shut the browser down, even when the crawl is interrupted,
    # so no headless Firefox process is left running.
    driver.quit()


In [None]:
def get_website_text(visited_links, output_dir="/content"):
    """Extract the text of every URL and save each page to its own .txt file.

    Args:
        visited_links: Iterable of URL strings.
        output_dir: Directory that receives one ``<url-without-scheme>.txt``
            file per page ('/' in the URL is replaced by '_'). Defaults to
            "/content", the location that was previously hard-coded.

    Returns:
        List with the extracted text of every page that was extracted and
        written successfully, in iteration order. Pages that fail are reported
        on stdout and left out (no file is kept for them).
    """
    final_data = []
    for url in visited_links:
        # Drop the scheme whatever it is (the fixed url[8:] slice was only
        # right for "https://").
        page_name = url.split("://", 1)[-1].replace('/', '_')
        out_path = os.path.join(output_dir, f"{page_name}.txt")
        try:
            data = extract_data_from_website(url)
            # The context manager closes the file even if the write fails.
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(data)
        except Exception:
            # Remove a partially written file, if one was created at all.
            if os.path.exists(out_path):
                os.remove(out_path)
            print(f"{url} empty")
            continue
        # Only keep text that was actually written, so a bad result cannot
        # end up in the returned list while being reported as empty.
        final_data.append(data)
    return final_data

# Extract the text of every crawled URL; pages whose extraction fails are skipped.
final_data = get_website_text(visited_links)

In [None]:
def count_statistics(passages):
    """Return corpus-level size statistics for a collection of passages.

    Args:
        passages: Iterable of strings.

    Returns:
        Tuple ``(total_characters, total_words, total_sentences)`` where words
        are whitespace-separated tokens and sentences are the non-blank
        fragments obtained by splitting each passage on '.'.
    """
    total_characters = 0
    total_words = 0
    total_sentences = 0

    for passage in passages:
        total_characters += len(passage)
        total_words += len(passage.split())
        # Ignore blank fragments: splitting on '.' yields an empty piece after
        # a trailing period (and for an empty passage), which is not a sentence.
        total_sentences += sum(
            1 for fragment in passage.split('.') if fragment.strip()
        )

    return total_characters, total_words, total_sentences

# Display the (characters, words, sentences) totals for the scraped pages.
count_statistics(final_data)

In [None]:
def split_string(string, length):
    """Split `string` into chunks made of whole lines.

    Lines are accumulated into a chunk while the chunk's current size plus the
    next line's size stays within `length`; every line is stored with a
    trailing newline.

    Args:
        string: Text to split.
        length: Size budget per chunk. The newline appended to each line is
            not counted when testing the budget, so a chunk can be one
            character longer than `length`. A single line longer than `length`
            is kept whole as its own oversized chunk.

    Returns:
        List of non-empty chunk strings.
    """
    chunks = []
    current_chunk = ""

    for line in string.split("\n"):
        if len(current_chunk) + len(line) <= length:
            current_chunk += line + "\n"
        else:
            # Nothing has been accumulated yet when the very first line is
            # already over budget; do not emit an empty chunk in that case.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = line + "\n"

    # Add the last remaining chunk, if any
    if current_chunk:
        chunks.append(current_chunk)

    return chunks


In [None]:
def website_chunked(final_text_data, max_chars=2000):
    """Break page texts into pieces suitable for one request each.

    Args:
        final_text_data: Iterable of page texts (strings).
        max_chars: Pages shorter than this are kept whole; longer pages are
            split on line boundaries with `split_string` using a budget of
            ``max_chars - 1``. Defaults to 2000, the value that was previously
            hard-coded.

    Returns:
        Flat list of text chunks, in page order.
    """
    input_data = []
    for page in final_text_data:
        if len(page) < max_chars:
            input_data.append(page)
        else:
            input_data.extend(split_string(page, max_chars - 1))
    return input_data

In [None]:
# Never store a real key in the notebook: reuse OPENAI_API_KEY when the
# environment already provides one, otherwise ask for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

def _parse_completion_pairs(sample):
    """Return the well-formed prompt/completion dicts found in one model reply.

    Each line that mentions both "prompt" and "completion" is trimmed to its
    outermost ``{...}`` and parsed as JSON. Lines that are not valid JSON, or
    whose "prompt"/"completion" values are not strings, are skipped instead of
    raising. The fine-tuning separators are appended to the kept pairs.
    """
    pairs = []
    if not sample:
        return pairs
    for line in sample.split('\n'):
        if "prompt" not in line or "completion" not in line:
            continue
        start_index = line.find("{")
        end_index = line.rfind("}")
        if start_index != -1 and end_index != -1:
            line = line[start_index:end_index+1]
        try:
            p = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(p, dict):
            continue
        if not isinstance(p.get('prompt'), str) or not isinstance(p.get('completion'), str):
            continue
        p['prompt'] = p['prompt'] + "\n\n###\n\n"
        p['completion'] = p['completion'] + "###"
        pairs.append(p)
    return pairs


def get_web_data_chunked(url):
    """Crawl `url`, scrape the pages and generate prompt/completion pairs.

    The crawled pages are converted to text, cut into chunks, and each chunk
    is sent to the chat model with a request for up to 10 prompt/completion
    pairs. Every pair that parses is appended to
    /content/finetuning_data_websites_chatgpt.jsonl. The file is opened in
    append mode, so running this again adds to the existing content.

    Args:
        url: Root address to crawl.

    Returns:
        None. Progress counts and corpus statistics are printed.
    """
    jsonl_file = "/content/finetuning_data_websites_chatgpt.jsonl"
    # When `url` was already crawled by the earlier cell (whose driver has
    # since been quit), this returns the set kept in visit_links' shared
    # default `visited` without loading any page.
    visited_urls = visit_links(url)
    print(len(visited_urls))
    # Use the URLs returned above, not the notebook-global `visited_links`.
    final_text_data = get_website_text(visited_urls)
    print(len(final_text_data))
    website_chunked_data = website_chunked(final_text_data)
    print(len(website_chunked_data))
    print(count_statistics(website_chunked_data))
    with jsonlines.open(jsonl_file, mode='a') as writer:
        for text in tqdm(website_chunked_data, desc="Processing Texts"):
            # NOTE(review): openai.ChatCompletion looks like the pre-1.0 SDK
            # interface — confirm the installed openai version supports it.
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                temperature=0.1,
                max_tokens=2000,
                messages=[
                    {"role": "system", "content": "You are a language expert who needs to help create Question Answer pairs from a given piece of text."},
                    {"role": "user", "content": f'Text: \n{text}\n\nTask: Generate up to 10 logical prompt completion pairs from this text. The output should not be numbered and strictly use the following format for your response ' + '{"prompt": <text for prompt>, "completion": <text for completion>}'}
                ]
            )
            sample = completion.choices[0].message.content

            # A malformed line from the model is skipped rather than aborting
            # the whole run part-way through the chunks.
            for p in _parse_completion_pairs(sample):
                writer.write(p)
# Run the full pipeline for the site root; generated pairs are appended to the JSONL file.
get_web_data_chunked(url)