# Testing

```source: this key identifies which source (from the table in Sources) an entry comes from. Here is the set of values it can take, one per source:```
```
'https://aipulse.org'
'ebook'
'https://qualiacomputing.com'
'alignment forum'
'lesswrong'
'manual'
'arxiv'
'https://deepmindsafetyresearch.medium.com/'
'waitbutwhy.com'
'GitHub'
'https://aiimpacts.org'
'arbital.com'
'carado.moe'
'nonarxiv_papers'
'https://vkrakovna.wordpress.com'
'https://jsteinhardt.wordpress.com'
'audio-transcripts'
'https://intelligence.org'
'youtube'
'reports'
'https://aisafety.camp'
'curriculum'
'https://www.yudkowsky.net'
'distill'
```

```...and this is what an arxiv entry looks like:```

```
{
    "source": "arxiv", # where the dataset comes from
    "source_type": "latex", # the type of file the data was originally in
    "converted_with": "pandoc", # which tool we used to convert the data to .md format
    "paper_version": paper_id,
    "title": title,
    "authors": [str(x) for x in authors], # list of authors
    "date_published": date_published,
    "data_last_modified": data_last_modified,
    "url": url,
    "abstract": abstract,
    "author_comment": author_comment,
    "journal_ref": journal_ref,
    "doi": doi,
    "primary_category": primary_category,
    "categories": categories,
    "citation_level": citation_level, # (0 = curated alignment papers, 1 = citation of curated papers, 2 = citation of citation, etc.)
    "alignment_text": is_alignment_text, # 'pos' if manually labeled as an alignment paper, 'unlabeled' if unlabeled
    "confidence_score": confidence_scores, # this is a confidence score obtained by using the SPECTER model to classify papers to add to the dataset
    "main_tex_filename": "main.tex", # the main latex file needed to convert the paper
    "text": "lots of text", # this is where you will grab the text contents of each entry in the dataset (in .md format)
    "bibliography_bbl": "string of bbl",
    "bibliography_bib": "string of bib", # more common to have bib than bbl
}
```

Useful links:

- https://github.com/openai/openai-cookbook/blob/main/examples/Semantic_text_search_using_embeddings.ipynb

- https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb

## Imports

In [1]:
import jsonlines
import numpy as np
from typing import List, Dict, Tuple
import re
import matplotlib.pyplot as plt
import openai

import config

## Constants

In [2]:
# Length of one embedding vector -- presumably the output size of EMBEDDING_MODEL
# (TODO confirm); it is not referenced anywhere else in the visible code.
LEN_EMBEDDINGS = 1536
# Machine-specific absolute path to the .jsonl dataset (raw string so the
# backslashes are kept literally).
PATH_TO_DATA = r"C:\Users\Henri\Documents\GitHub\AlignmentSearch\data\alignment_texts.jsonl"

# Model names passed as `model=` to the OpenAI completion and embedding calls.
COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

# The API key is read from the local `config` module instead of being hard-coded.
openai.api_key = config.OPENAI_API_KEY

# Maximum number of characters of joined context kept when a prompt is built.
MAX_LEN_PROMPT = 5000

## Helpers

In [8]:
# FROM https://stackoverflow.com/a/31505798/16185542

# -*- coding: utf-8 -*-
# Regex fragments used by split_into_sentences. They are raw strings so that
# "\s" reaches the regex engine unchanged instead of being an invalid string
# escape (which newer Python versions warn about).
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov|edu|me)"
digits = r"([0-9])"


def split_into_sentences(text):
    """Split ``text`` into a list of stripped sentences.

    Periods that do not end a sentence (titles such as "Dr.", decimals,
    initials, acronyms, website suffixes, company suffixes, ellipses) are
    temporarily replaced by the ``<prd>`` placeholder. Every remaining
    ".", "?" or "!" is then followed by a ``<stop>`` marker, the
    placeholders are turned back into periods, and the text is split on
    the markers.

    Args:
        text: The text to split. Newlines are treated as spaces.

    Returns:
        The sentences in order, each stripped of surrounding whitespace.
        A trailing fragment without a terminator is kept as its own item.
        If no sentence is found at all (e.g. empty input) the result is a
        one-element list holding the stripped text.
    """
    # Padding lets the patterns that expect a surrounding space match at the
    # very start and end of the text.
    text = " " + text + "  "
    text = text.replace("\n", " ")
    text = text.replace("?!", "?")

    # Protect periods that are not sentence boundaries.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
    text = text.replace("...", "<prd><prd><prd>")
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym followed by a typical sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move a terminator that sits inside a closing quote to just after it, so
    # the quote stays attached to the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark the real sentence boundaries, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [piece.strip() for piece in text.split("<stop>")]
    # Text that ends with a terminator leaves an empty last piece (only the
    # padding). Drop just that piece, so an unterminated trailing fragment is
    # no longer thrown away.
    if sentences and not sentences[-1]:
        sentences.pop()

    if not sentences:
        sentences = [text.strip()]
    return sentences

In [9]:
def split_article(text: str, max_words: int = 1000, max_chars: int = 7000) -> List[str]:
    """Group the paragraphs of ``text`` into sections of bounded size.

    Paragraphs are the non-empty lines of ``text``. They are appended to the
    current section, each followed by a newline, until adding the next one
    would push the section past ``max_words`` words or ``max_chars``
    characters; the section is then closed and a new one started. The
    defaults are meant to keep sections under the embedding model's input
    limit.

    Args:
        text: The article text, paragraphs separated by newlines.
        max_words: Maximum number of whitespace-separated words per section.
        max_chars: Maximum number of characters per section.

    Returns:
        The sections in order. No empty section is ever returned, so an
        empty ``text`` gives ``[]``. A single paragraph that is larger than
        the limits on its own is not split; it becomes its own section.
    """
    sections: List[str] = []
    section = ""
    section_words = 0  # running word count of `section`, avoids re-splitting it
    for paragraph in text.split('\n'):
        if paragraph == "":
            continue
        paragraph_words = len(paragraph.split())
        over_limit = (
            section_words + paragraph_words > max_words
            or len(section) + len(paragraph) > max_chars
        )
        # Only close a section that has content: an oversized first paragraph
        # must not produce an empty section in front of it.
        if over_limit and section:
            sections.append(section)
            section = ""
            section_words = 0
        section += f"{paragraph}\n"
        section_words += paragraph_words
    if section:
        sections.append(section)
    return sections

## Dataset Class

In [51]:
class Dataset:
    """Articles read from a .jsonl file, their embeddable sections and statistics.

    ``get_alignment_texts`` fills ``data``, ``embed_split`` and the counters;
    ``get_embeddings`` / ``load_embeddings`` set ``embeddings``, one row per
    item of ``embed_split``.
    """

    def __init__(self,
            path: str,  # Path to the dataset .jsonl file.
            sources: List[str] = None,  # List of sources to include. If None, include all sources.
            max_paragraph_length: Tuple[int, int] = None  # (max number of words in a paragraph, max number of characters in a paragraph)
        ):
        self.path = path
        self.sources = sources
        # Stored only; no method of this class reads it.
        self.max_paragraph_length = max_paragraph_length

        self.data: List[Tuple[str, str, str]] = []  # One (title, url, text) tuple per stored article.
        self.embed_split: List[str] = []  # Sections returned by split_article, for all articles, in article order.

        # Number of stored articles per source plus a 'total' key.
        # E.g.: {'source1': 10, 'source2': 20, 'total': 30}
        self.num_articles: Dict[str, int] = {source: 0 for source in (sources or [])}
        self.num_articles['total'] = 0
        # Entries that could not be stored (not a JSON object, no source, no text).
        self.num_skipped = 0

        self.total_char_count = 0
        self.total_word_count = 0
        self.total_sentence_count = 0
        self.total_paragraph_count = 0

    def get_alignment_texts(self):
        """Read the .jsonl file and populate the articles, sections and counters.

        Entries whose source is not in ``self.sources`` (when given) are
        ignored. An entry is stored when it has a source and a string
        ``text``; a missing title or URL is stored as an empty string rather
        than dropping the article. Entries that cannot be stored are counted
        in ``num_skipped`` and never in ``num_articles``.
        """
        with jsonlines.open(self.path, "r") as reader:
            for entry in reader:
                if not isinstance(entry, dict) or entry.get('source') is None:
                    self.num_skipped += 1
                    continue
                source = entry['source']
                if self.sources is not None and source not in self.sources:
                    continue

                text = entry.get('text')
                if not isinstance(text, str):
                    self.num_skipped += 1
                    continue

                # Count only what is actually stored, so num_articles and
                # data always agree.
                self.num_articles[source] = self.num_articles.get(source, 0) + 1
                self.num_articles['total'] += 1

                title = entry.get('title') or ""
                url = entry.get('url') or ""
                self.data.append((title, url, text))
                paragraphs = split_article(text)
                self.embed_split.extend(paragraphs)

                self.total_char_count += len(text)
                self.total_word_count += len(text.split())
                self.total_sentence_count += len(split_into_sentences(text))
                self.total_paragraph_count += len(paragraphs)

    def get_embedding(self, text: str) -> List[float]:
        """Return the embedding of ``text`` from the OpenAI embedding endpoint."""
        result = openai.Embedding.create(model=EMBEDDING_MODEL, input=text)
        return result["data"][0]["embedding"]

    def get_embeddings(self):
        """Embed every section of ``embed_split`` and store the result as an array.

        Makes one API request per section.
        """
        self.embeddings = np.array([self.get_embedding(text) for text in self.embed_split])

    def save_embeddings(self, path: str):
        """Save ``self.embeddings`` with ``np.save``.

        Note that ``np.save`` appends ".npy" to ``path`` when it does not
        already end with it, so pass the same ".npy" path to
        ``load_embeddings``.
        """
        np.save(path, self.embeddings)

    def load_embeddings(self, path: str):
        """Load previously saved embeddings from ``path`` into ``self.embeddings``."""
        self.embeddings = np.load(path)

In [52]:
class SearchAndAnswer:
    """Semantic search over a Dataset's sections, and question answering on the hits."""

    def __init__(self,
            dataset: "Dataset",  # Dataset object containing the data.
        ):
        self.dataset = dataset
        # For every section in dataset.embed_split, the index of its article in
        # dataset.data. Built lazily by _section_to_article.
        self._section_owner = None

    def get_embedding(self, text: str) -> List[float]:
        """Return the embedding of ``text`` from the OpenAI embedding endpoint."""
        result = openai.Embedding.create(model=EMBEDDING_MODEL, input=text)
        return result["data"][0]["embedding"]

    def _section_to_article(self) -> List[int]:
        # Maps each section index to the index of the article it came from.
        # dataset.data has one item per article, while dataset.embed_split
        # (and thus dataset.embeddings) has one item per section, so the two
        # cannot be indexed with the same number. The map is rebuilt by
        # splitting each stored article with split_article, the same call
        # that produced embed_split.
        owners = getattr(self, "_section_owner", None)
        if owners is None or len(owners) != len(self.dataset.embed_split):
            owners = []
            for article_index, article in enumerate(self.dataset.data):
                owners.extend([article_index] * len(split_article(article[2])))
            self._section_owner = owners
        return owners

    def get_top_k(self, query: str, k: int=10) -> List[Tuple[str, str, str]]:
        """Return the ``k`` sections most similar to ``query``, best first.

        Similarity is the dot product between the query embedding and each
        row of ``dataset.embeddings``.

        Returns:
            One tuple per hit: the title and URL of the article the section
            belongs to, and the text of the matching section.

        Raises:
            ValueError: If the embeddings do not line up with the sections of
                the loaded articles.
        """
        query_embedding = self.get_embedding(query)
        similarities = np.dot(self.dataset.embeddings, query_embedding)
        owners = self._section_to_article()
        if len(owners) != len(similarities) or len(owners) != len(self.dataset.embed_split):
            raise ValueError(
                "dataset.embeddings does not match the sections of the loaded articles"
            )
        top_k_indices = np.argsort(similarities)[::-1][:k]
        top_k = []
        for section_index in top_k_indices:
            title, url, _ = self.dataset.data[owners[section_index]]
            top_k.append((title, url, self.dataset.embed_split[section_index]))
        return top_k

    def construct_prompt(self, question: str, texts: List[Tuple[str, str, str]], max_context_chars: int = None) -> str:
        """Build the completion prompt for ``question`` from retrieved ``texts``.

        Args:
            question: The question to answer.
            texts: (title, url, text) tuples; only the text part is used.
            max_context_chars: Character limit applied to the joined context.
                Defaults to MAX_LEN_PROMPT.

        Returns:
            The instruction header, the context, then the question and an
            answer cue.
        """
        limit = MAX_LEN_PROMPT if max_context_chars is None else max_context_chars
        # Join the text component of each tuple; the tuples themselves cannot
        # be joined as strings.
        context = "\n".join(item[2] for item in texts)[:limit]
        header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
        return header + context + "\n\n Q: " + question + "\n A:"

    def answer_question(self, question: str, texts: List[Tuple[str, str, str]]) -> str:
        """Ask the completion model to answer ``question`` using ``texts`` as context."""
        prompt = self.construct_prompt(question, texts)
        COMPLETIONS_API_PARAMS = {
            "temperature": 0.0,
            "max_tokens": 500,
            "model": COMPLETIONS_MODEL,
        }
        answer = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)["choices"][0]["text"].strip(" \n")
        return answer

    def search_and_answer(self, question: str, k: int=10) -> str:
        """Retrieve the top ``k`` sections for ``question`` and return the model's answer."""
        top_k = self.get_top_k(question, k)
        answer = self.answer_question(question, top_k)
        return answer

    def summarize(self, article: str) -> str:
        """Not implemented yet; always raises NotImplementedError."""
        # Completion parameters prepared for the future implementation; unused for now.
        COMPLETIONS_API_PARAMS = {
            "temperature": 0.0,
            "max_tokens": 300,
            "model": COMPLETIONS_MODEL,
        }
        raise NotImplementedError

In [54]:
# Build a Dataset restricted to a single source and load its articles.
dataset = Dataset(path=PATH_TO_DATA, sources=['https://www.yudkowsky.net'])
dataset.get_alignment_texts()
# The embedding and interactive question-answering steps below are disabled.
# NOTE(review): EMBEDDINGS_PATH is not defined among the visible constants --
# define it before re-enabling these lines.
# dataset.get_embeddings()
# dataset.save_embeddings(EMBEDDINGS_PATH)
# # dataset.load_embeddings(EMBEDDINGS_PATH)

# search_and_answer = SearchAndAnswer(dataset)

# while True:
#     question = input("Enter a question: ")
#     if question == "quit":
#         break
#     top_k = search_and_answer.get_top_k(question)
#     answer = search_and_answer.answer_question(question, top_k)
#     print(answer)

In [59]:
# Display the per-source article counts gathered by get_alignment_texts().
dataset.num_articles

{'https://www.yudkowsky.net': 23, 'total': 23}

In [1]:
# self.path = path
# self.sources = sources
# self.max_data_length = max_data_length
# self.len_embeddings = len_embeddings


# self.data: List[Tuple[str, str, str]] = []  # List of tuples, each containing the title of an article, its URL, and text. E.g.: [('title', 'url', 'text'), ...]
# self.embed_split: List[str] = []  # List of strings, each being a few paragraphs from a single article (not exceeding 1000 words).

# self.num_articles: Dict[str, int] = {}  # Dict of number of articles from each source, with total number of articles. Initialize num_articles to 0 for each source.
# for source in sources: self.num_articles[source] = 0
# self.num_articles['total'] = 0

# self.total_char_count = 0
# self.total_word_count = 0
# self.total_sentence_count = 0
# self.total_paragraph_count = 0

# Expected number of articles per source; "?" marks sources whose expected
# count is not known.
# NOTE(review): the 'total' expression contains terms (111, 17, 7, 420, 323,
# 132) that match no per-source value above -- presumably the counts of the
# "?" sources; confirm.
num_articles_truth = {
    'https://aipulse.org': 23,
    'ebook': 23,
    'https://qualiacomputing.com': 278,
    'alignment forum': 2138,
    'lesswrong': 28252 + 227,
    'manual': "?",
    'arxiv': 707 + 1679 + 1000 + 4621,
    'https://deepmindsafetyresearch.medium.com/': 10,
    'waitbutwhy.com': 2,
    'GitHub': "?",
    'https://aiimpacts.org': 227,
    'arbital.com': 223,
    'carado.moe': 59,
    'nonarxiv_papers': "?",
    'https://vkrakovna.wordpress.com': 43,
    'https://jsteinhardt.wordpress.com': 39,
    'audio-transcripts': 25 + 12,
    'https://intelligence.org': 479,
    'youtube': 457,
    'reports': "?",
    'https://aisafety.camp': 8,
    'curriculum': "?",
    'https://www.yudkowsky.net': 23,
    'distill': 49,
    'total': 2138+28252+707+1679+1000+4621+23+227+23+8+59+111+10+17+7+479+39+278+43+2+23+420+323+49+457+25+12+223+227+132
}
word_count_truth = 53_550_146
char_count_truth = 351_767_163

# Print table. First row has Truth and Empirical findings.
print(f"{'Source':<20} {'Truth':<10} {'Empirical':<10} {'Difference':<10}")
for source in dataset.num_articles:
    # A source missing from the truth table is shown as unknown instead of
    # raising KeyError.
    truth = num_articles_truth.get(source, "?")
    empirical = dataset.num_articles[source]
    # The difference only exists when the expected count is a number.
    difference = truth - empirical if isinstance(truth, int) else 'UNKNOWN'
    print(f"{source[:20]:<20} {truth:<10} {empirical:<10} {difference:<10}")

# Compare true and empirical word counts and character counts
print(f"\n{'':<20} {'Truth':<10} {'Empirical':<10} {'Difference':<10}")
print(f"{'Word Count':<20} {word_count_truth:<10} {dataset.total_word_count:<10} {word_count_truth - dataset.total_word_count:<10}")
print(f"{'Character Count':<20} {char_count_truth:<10} {dataset.total_char_count:<10} {char_count_truth - dataset.total_char_count:<10}")

Source               Truth      Empirical  Difference


NameError: name 'dataset' is not defined

In [150]:

# Back-of-the-envelope estimate of the embedding workload and cost, derived
# from the total number of characters in the dataset.
# The Dataset class exposes the character total as `total_char_count`; it has
# no `data_length` attribute.
total_chars = dataset.total_char_count
num_words = total_chars / 5  # rule of thumb used here: 5 characters per word
num_tokens = num_words * 1.5  # rule of thumb used here: 1.5 tokens per word
num_paragraphs = num_words // 200
num_embeds_method_1 = num_tokens // 8096
num_embeds_method_2 = num_words // 600
cost_per_embed = 1/(3000*500/8096)
cost_per_page = 1/3000
num_pages = num_words // 500
# NOTE(review): cost_1 is derived from the page count, not from
# num_embeds_method_1 -- confirm the "method 1" label is what is intended.
cost_1 = num_pages * cost_per_page
cost_2 = num_embeds_method_2 * cost_per_embed

print(f"{total_chars} characters")
print(f"~{num_tokens:.0f} tokens")
print(f"~{num_words:.0f} words")
print(f"~{num_paragraphs:.0f} paragraphs")
print(f"~{num_embeds_method_1:.0f} embeddings using method 1")
print(f"~{num_embeds_method_2:.0f} embeddings using method 2")
print(f"~{num_pages:.0f} pages")
print(f"~{cost_1:.0f} cost using method 1")
print(f"~{cost_2:.0f} cost using method 2")

348457759 characters
~104537328 tokens
~69691552 words
~348457 paragraphs
~12912 embeddings using method 1
~116152 embeddings using method 2
~139383 pages
~46 cost using method 1
~627 cost using method 2


## Random tests

In [63]:
import json

In [72]:
# Write every entry whose source is https://aipulse.org to aipulse.txt:
# title, text and URL in that order, with a placeholder line for any field the
# entry lacks, followed by two blank lines.
with jsonlines.open(PATH_TO_DATA, "r") as reader, open("aipulse.txt", "w", encoding="utf-8") as writer:
    for entry in reader:
        # Guard clause: skip anything that is not an aipulse.org entry.
        if 'source' not in entry or entry['source'] != 'https://aipulse.org':
            continue
        for key, prefix, placeholder in (
            ('title', "Title: ", "NO TITLE\n"),
            ('text', "Text: ", "NO TEXT\n"),
            ('url', "URL: ", "NO URL\n"),
        ):
            if key in entry:
                writer.write(f"{prefix}{entry[key]}\n")
            else:
                writer.write(placeholder)
        writer.write("\n\n")