# Testing

```source: this key separates the various keys found in the table in Sources. Here's the set of sources with their corresponding value name:```
```
'https://aipulse.org'
'ebook'
'https://qualiacomputing.com'
'alignment forum'
'lesswrong'
'manual'
'arxiv'
'https://deepmindsafetyresearch.medium.com/'
'waitbutwhy.com'
'GitHub'
'https://aiimpacts.org'
'arbital.com'
'carado.moe'
'nonarxiv_papers'
'https://vkrakovna.wordpress.com'
'https://jsteinhardt.wordpress.com'
'audio-transcripts'
'https://intelligence.org'
'youtube'
'reports'
'https://aisafety.camp'
'curriculum'
'https://www.yudkowsky.net'
'distill'
```

```...and this is how the arxiv papers look like:```

```
{
    "source": "arxiv", # where the dataset comes from
    "source_type": "latex", # the type of file the data was original in
    "converted_with": "pandoc", # which tool we used to convert the data in .md format
    "paper_version": paper_id,
    "title": title,
    "authors": [str(x) for x in authors], # list of authors
    "date_published": date_published,
    "data_last_modified": data_last_modified,
    "url": url,
    "abstract": abstract,
    "author_comment": author_comment,
    "journal_ref": journal_ref,
    "doi": doi,
    "primary_category": primary_category,
    "categories": categories,
    "citation_level": citation_level, # (0 = curated alignment papers, 1 = citation of curated papers, 2 = citation of citation, etc.)
    "alignment_text": is_alignment_text, # 'pos' is maunally labeled as an alignment paper, 'unlabeled' if unlabeled
    "confidence_score": confidence_scores, # this is a confidence score obtained by using the SPECTER model to classify papers to add to the dataset
    "main_tex_filename": "main.tex", # the main latex file needed to convert the paper
    "text": "lots of text", # this is where you will grab the text contents of each entry in the dataset (in .md format)
    "bibliography_bbl": "string of bbl",
    "bibliography_bib": "string of bib", # more common to have bib than bbl
}
```

https://aipulse.org: title links link authors author text (tags)

None: title url text

ebook: title book_title authors text (publication_date)

https://qualiacomputing.com: title link authors author text (tags)

alignment forum: title url authors text (tags)

lesswrong: title authors url text (tags score date_published)

manual: title authors text (date_published)

arxiv: title authors url text (citation_level alignment_text confidence_score date_published)

https://deepmindsafetyresearch.medium.com/: title url text

waitbutwhy.com: title authors text (date_published)

GitHub: book_title authors author text

https://aiimpacts.org: title link authors author text (tags)

arbital.com: title authors url text (date_published)

carado.moe: title authors text (date_published)

nonarxiv_papers: title authors doi text (date_published)

https://vkrakovna.wordpress.com: title link authors author text (tags)

https://jsteinhardt.wordpress.com: title link authors author text (tags)

audio-transcripts: title authors text (date_published)

https://intelligence.org: title link authors author text (tags)

youtube: title authors url text (date_published)

reports: title authors doi text (date_published)

https://aisafety.camp: title link authors author text (tags)

curriculum: title authors text (date_published)

https://www.yudkowsky.net: title link authors author text (tags)

distill: title authors doi text (date_published)

Useful links:

- https://github.com/openai/openai-cookbook/blob/main/examples/Semantic_text_search_using_embeddings.ipynb

- https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb

## Imports

In [143]:
import jsonlines
import numpy as np
from typing import List, Dict, Tuple
import re
import time
import random
import pickle
import openai
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

import config

## Constants

In [119]:
LEN_EMBEDDINGS = 1536
PATH_TO_DATA = r"C:\Users\Henri\Documents\GitHub\AlignmentSearch\src\Embeddings Search\data\alignment_texts.jsonl"
PATH_TO_EMBEDDINGS = r"C:\Users\Henri\Documents\GitHub\AlignmentSearch\src\Embeddings Search\data\embeddings.npy"
PATH_TO_DATASET = r"C:\Users\Henri\Documents\GitHub\AlignmentSearch\src\Embeddings Search\data\dataset.pkl"

COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

openai.api_key = config.OPENAI_API_KEY

MAX_LEN_PROMPT = 5000

## Helpers

In [9]:
# FROM https://stackoverflow.com/a/31505798/16185542

# -*- coding: utf-8 -*-
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov|edu|me)"
digits = "([0-9])"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = text.replace("?!", "?")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    if "..." in text: text = text.replace("...","<prd><prd><prd>")
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")

    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    
    if sentences == []:
        sentences = [text.strip()]
    return sentences

In [13]:
class MissingDataException(Exception):
    pass

In [14]:
class TextSplitter:
    def __init__(self, block_maxsize: int = 800, block_minsize: int = 500):
        self.block_maxsize = block_maxsize
        self.block_minsize = block_minsize
        self.blocks = []
        self.current_block = []
        self.current_block_len = 0


    def add_sentence_to_blocks(self, sentence):
        sentence_len = len(sentence)
        sentence_fits_in_current_block = self.current_block_len + sentence_len <= self.block_maxsize
        current_block_is_big_enough = self.current_block_len >= self.block_minsize
        sentence_fits_in_standalone_block = sentence_len <= self.block_maxsize

        if sentence_fits_in_current_block:
            self.current_block.append(sentence)
            self.current_block_len += sentence_len + 1 # +1 for the space
            return
        
        if current_block_is_big_enough and sentence_fits_in_standalone_block:
            self.blocks.append(" ".join(self.current_block))
            self.current_block = [sentence]
            self.current_block_len = sentence_len + 1 # +1 for the space
            return
        
        #special cases:TODO refactor
        #case 1: current_block_len < block_minsize and current_block_len + sentence_len > block_maxsize
        #case 2: current_block_len > block_minsize but sentence_len > block_maxsize
        shorter_sentence = sentence[self.block_maxsize - self.current_block_len]
        self.current_block.append(shorter_sentence)
        self.blocks.append(" ".join(self.current_block))
        self.current_block = []
        self.current_block_len = 0      
        

    def add_paragraph_to_blocks(self, paragraph):
        paragraph_len = len(paragraph)
        if self.current_block_len + paragraph_len > self.block_maxsize:
            sentences = split_into_sentences(paragraph)
            for sentence in sentences:
                self.add_sentence_to_blocks(sentence)
            return
        
        if self.block_minsize <= self.current_block_len + paragraph_len <= self.block_maxsize:
            self.current_block.append(paragraph)
            self.blocks.append("\n\n".join(self.current_block))
            self.current_block = []
            self.current_block_len = 0
            return
        
        if self.current_block_len + paragraph_len < self.block_minsize:
            self.current_block.append(paragraph)
            self.current_block_len += paragraph_len + 2 # +2 for the \n\n
            return
        
    def add_text_to_blocks(self, text):
        paragraphs = text.split("\n\n")
        for paragraph in paragraphs:
            self.add_paragraph_to_blocks(paragraph)
        if self.current_block != []:
            self.blocks.append("\n\n".join(self.current_block))


    def split(self, text: str, signature: str) -> List[str]:
        """Split text into multiple blocks and add signature to each block."""
        # signature has the format : "link, title, author"
        self.add_text_to_blocks(text)
        
        return [f"{block}\n - {signature}" for block in self.blocks]


## Dataset Class

In [15]:
error_count_dict = {
    "Entry has no source.": 0,
    "Entry has no title.": 0,
    "Entry has no text.": 0,
    "Entry has no URL.": 0,
    "Entry has wrong citation level.": 0
}

In [138]:
class Dataset:
    def __init__(self,
            path: str,  # Path to the dataset .jsonl file.
            sources: List[str] = None,  # List of sources to include. If None, include all sources.
            rate_limit_per_minute: int = 60,  # Rate limit for the OpenAI API.
            block_min_max_size: Tuple[int, int] = None,  # Tuple of (min_block_size, max_block_size), used for the text splitter. If None, use default values.
        ):
        self.path = path
        self.sources = sources
        self.rate_limit_per_minute = rate_limit_per_minute
        self.delay_in_seconds = 60.0 / self.rate_limit_per_minute
        
        # Set up text splitter
        if block_min_max_size is None: self.block_min_max_size = (400, 600)
        else: self.block_min_max_size = block_min_max_size
        self.text_splitter = TextSplitter(block_maxsize=self.block_min_max_size[1], block_minsize=self.block_min_max_size[0])
        
        self.data: List[Tuple[str]] = []  # List of tuples, each containing the title of an article, its URL, and text. E.g.: [('title', 'url', 'text'), ...]
        self.embed_split: List[str] = []  # List of strings, each being a few paragraphs from a single article (not exceeding 1000 words).
        
        self.num_articles: Dict[str, int] = {}  # Number of articles per source. E.g.: {'source1': 10, 'source2': 20, 'total': 30}
        if sources is None:
            self.num_articles['total'] = 0
        else:
            for source in sources: 
                self.num_articles[source] = 0
            self.num_articles['total'] = 0
        
        self.total_char_count = 0
        self.total_word_count = 0
        self.total_sentence_count = 0
        self.total_block_count = 0
        
    def get_info_tmp(self):
        self.sources_so_far = []
        self.info_types: Dict[str, List[str]] = {}
        with jsonlines.open(self.path, "r") as reader:
            for entry in reader:
                if 'source' not in entry: entry['source'] = 'None'
                
                if entry['source'] not in self.sources_so_far:
                    self.sources_so_far.append(entry['source'])
                    self.info_types[entry['source']] = entry.keys()
                
                if 'tags' in entry:
                    print(entry['tags'])
                    
                """
                {
                'text', 
                'title', 'book_title', # If there is both, take title, otherwise take book_title
                'author', 'authors', # If there is both, take author, otherwise take authors, otherwise take author
                'citation_level', # must be 0 or 1
                'date_published', 'published', # take first 10 chars of date_published, if it exists; else take first 16 chars of published, if it exists
                'doi', 'link', 'links', 'url', # if link, take link; elif url, take url; elif doi, take doi
                'tags'
                }
                """
    
    def get_alignment_texts(self):
        with jsonlines.open(self.path, "r") as reader:
            for entry in reader:
                # Only get one in a thousand articles
                if random.randint(0, 3000) != 19: continue
                try:
                    if 'source' not in entry: raise MissingDataException("Entry has no source.")
                    
                    if self.sources is None:
                        if entry['source'] not in self.num_articles: self.num_articles[entry['source']] = 1
                        else: self.num_articles[entry['source']] += 1
                        self.num_articles['total'] += 1
                    else:
                        if entry['source'] in self.sources:
                            self.num_articles[entry['source']] += 1
                            self.num_articles['total'] += 1
                        else: continue
                    
                    text=title=author=citation_level=date_published=url=tags=None
                    
                    # Get text
                    if 'text' in entry and entry['text'] != '': text = entry['text']
                    else: raise MissingDataException(f"Entry has no text.")
                    
                    # Get title
                    if 'title' in entry and 'book_title' in entry and entry['title'] != '': title = entry['title']
                    elif 'book_title' in entry and entry['book_title'] != '': title = entry['book_title']
                    else: title = None
                        
                    # Get author
                    if 'author' in entry and 'authors' in entry and entry['author'] != '': author = entry['author']
                    elif 'authors' in entry and entry['authors'] != '': author = entry['authors']
                    elif 'author' in entry and entry['author'] != '': author = entry['author']
                    else: author = None
                        
                    # Get citation level
                    if 'citation_level' in entry:
                        if entry['citation_level'] != 0: raise MissingDataException(f"Entry has citation_level {entry['citation_level']}.")
                    
                    # Get date published
                    if 'date_published' in entry and entry['date_published'] != '': date_published = entry['date_published'][:10]
                    elif 'published' in entry and entry['published'] != '': date_published = entry['published'][:16]
                    else: date_published = None
                        
                    # Get URL
                    if 'link' in entry and entry['link'] != '': url = entry['link']
                    elif 'url' in entry and entry['url'] != '': url = entry['url']
                    elif 'doi' in entry and entry['doi'] != '': url = entry['doi']
                    else: url = None
                        
                    # Get tags
                    if 'tags' in entry and entry['tags'] != '':
                        if type(entry['tags']) == list: tags = ', '.join([val['term'] for val in entry['tags']])
                        elif type(entry['tags']) == str: tags = entry['tags']
                        else: tags = None
                    
                    signature = ""
                    if title: signature += f"Title: {title}, "
                    if author: signature += f"Author: {author}, "
                    if date_published: signature += f"Date published: {date_published}, "
                    if url: signature += f"URL: {url}, "
                    if tags: signature += f"Tags: {tags}, "
                    signature = signature[:-2]

                    self.data.append((title, author, date_published, url, tags, text))
                    
                    blocks = self.text_splitter.split(text, signature)
                    self.embed_split.extend(blocks)
                    
                    self.total_char_count += len(entry['text'])
                    self.total_word_count += len(entry['text'].split())
                    self.total_sentence_count += len(split_into_sentences(entry['text']))
                    self.total_block_count += len(blocks)
                
                except MissingDataException as e:
                    if str(e) not in error_count_dict:
                        error_count_dict[str(e)] = 0
                    error_count_dict[str(e)] += 1

    @retry(wait=wait_random_exponential(min=1, max=100), stop=stop_after_attempt(10))
    def get_embedding(self, text: str, delay_in_seconds: float = 0) -> np.ndarray:
        time.sleep(delay_in_seconds)
        result = openai.Embedding.create(model=EMBEDDING_MODEL, input=text)
        return result["data"][0]["embedding"]

    def get_embeddings(self):
        self.embeddings = np.array([self.get_embedding(text, delay_in_seconds=self.delay_in_seconds) for text in self.embed_split])
    
    def save_embeddings(self, path: str):
        np.save(path, self.embeddings)
        
    def load_embeddings(self, path: str):
        self.embeddings = np.load(path)
        
    def save_class(self, path: str):
        with open(path, 'wb') as f:
            pickle.dump(self, f)

In [139]:
dataset = Dataset(path=PATH_TO_DATA, sources=None)
dataset.get_alignment_texts()
dataset.get_embeddings()
dataset.save_embeddings(PATH_TO_EMBEDDINGS)
dataset.save_class(PATH_TO_DATASET)

1416


In [None]:
# with open(PATH_TO_DATASET, 'rb') as f:
#     dataset = pickle.load(f)

In [157]:
class SearchAndAnswer:
    def __init__(self,
            dataset: Dataset,  # Dataset object containing the data.
        ):
        self.dataset = dataset
    
    @retry(wait=wait_random_exponential(min=1, max=100), stop=stop_after_attempt(10))
    def get_embedding(self, text: str) -> np.ndarray:
        result = openai.Embedding.create(model=EMBEDDING_MODEL, input=text)
        return result["data"][0]["embedding"]
    
    def get_top_k(self, query: str, k: int=10) -> List[Tuple[str, str, str]]:
        # Receives a query (str) and returns the top k articles (List[Tuple[str, str, str]]) that are most similar to the query.
        # Each tuple contains the title of an article, its URL, and text.
        query_embedding = self.get_embedding(query)
        similarities = np.dot(self.dataset.embeddings, query_embedding)
        print(similarities.shape)
        top_k_indices = np.argsort(similarities)[::-1][:k]
        print(top_k_indices)
        top_k = [self.dataset.embed_split[i] for i in top_k_indices]
        return top_k
    
    def construct_prompt(self, question: str, texts: List[Tuple[str]]) -> str:
        # Receives a question (str) and a list of articles (List[Tuple[str, str, str]]) and returns a prompt (str) to be used for text generation.
        context = "\n".join(texts)[:MAX_LEN_PROMPT]
        header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
        return header + "".join(context) + "\n\n Q: " + question + "\n A:"
    
    def answer_question(self, question: str, texts: List[Tuple[str, str, str]]) -> str:
        # Receives a question (str) and a list of articles (List[Tuple[str, str, str]]) and returns an answer (str) to the question.
        prompt = self.construct_prompt(question, texts)
        COMPLETIONS_API_PARAMS = {
            "temperature": 0.0,
            "max_tokens": 500,
            "model": COMPLETIONS_MODEL,
        }
        answer = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)["choices"][0]["text"].strip(" \n")
        return answer
    
    def search_and_answer(self, question: str, k: int=10, HyDE: bool=False) -> str:
        # Receives a question (str) and returns an answer (str) to the question.
        if HyDE:
            raise NotImplementedError
        else:
            top_k = self.get_top_k(question, k)
        answer = self.answer_question(question, top_k)
        return answer


In [158]:
SA = SearchAndAnswer(dataset=dataset)
prompt = "Name a problem in AI Alignment."
answer = SA.search_and_answer(prompt, 3, HyDE=False)
print(answer)

(1416,)
[  82  916 1171]
The need for a high quality alignment dataset for very capable models.


In [147]:
# self.path = path
# self.sources = sources
# self.max_data_length = max_data_length
# self.len_embeddings = len_embeddings


# self.data: List[Tuple[str, str, str]] = []  # List of tuples, each containing the title of an article, its URL, and text. E.g.: [('title', 'url', 'text'), ...]
# self.embed_split: List[str] = []  # List of strings, each being a few paragraphs from a single article (not exceeding 1000 words).

# self.num_articles: Dict[str, int] = {}  # Dict of number of articles from each source, with total number of articles. Initialize num_articles to 0 for each source.
# for source in sources: self.num_articles[source] = 0
# self.num_articles['total'] = 0

# self.total_char_count = 0
# self.total_word_count = 0
# self.total_sentence_count = 0
# self.total_paragraph_count = 0

num_articles_truth = {
    'https://aipulse.org': 23,
    'ebook': 23,
    'https://qualiacomputing.com': 278,
    'alignment forum': 2138,
    'lesswrong': 28252 + 227,
    'manual': "?",
    'arxiv': 707 + 1679 + 1000 + 4621,
    'https://deepmindsafetyresearch.medium.com/': 10,
    'waitbutwhy.com': 2,
    'GitHub': "?",
    'https://aiimpacts.org': 227,
    'arbital.com': 223,
    'carado.moe': 59,
    'nonarxiv_papers': "?",
    'https://vkrakovna.wordpress.com': 43,
    'https://jsteinhardt.wordpress.com': 39,
    'audio-transcripts': 25 + 12,
    'https://intelligence.org': 479,
    'youtube': 457,
    'reports': "?",
    'https://aisafety.camp': 8,
    'curriculum': "?",
    'https://www.yudkowsky.net': 23,
    'distill': 49,
    'total': 2138+28252+707+1679+1000+4621+23+227+23+8+59+111+10+17+7+479+39+278+43+2+23+420+323+49+457+25+12+223+227+132    
}
word_count_truth = 53_550_146
char_count_truth = 351_767_163

# Print table. First row has Truth and Empirical findings.
print(f"{'Source':<20} {'Truth':<10} {'Empirical':<10} {'Difference':<10}")
for source in dataset.num_articles:
    try:
        print(f"{source[:20]:<20} {num_articles_truth[source]:<10} {dataset.num_articles[source]:<10} {num_articles_truth[source] - dataset.num_articles[source]:<10}")
    except TypeError:
        print(f"{source[:20]:<20} {num_articles_truth[source]:<10} {dataset.num_articles[source]:<10} {'UNKNOWN':<10}")

# Compare true and empirical word counts and character counts
print(f"\n{'':<20} {'Truth':<10} {'Empirical':<10} {'Difference':<10}")
print(f"{'Word Count':<20} {word_count_truth:<10} {dataset.total_word_count:<10} {word_count_truth - dataset.total_word_count:<10}")
print(f"{'Character Count':<20} {char_count_truth:<10} {dataset.total_char_count:<10} {char_count_truth - dataset.total_char_count:<10}")

Source               Truth      Empirical  Difference
total                41614      14         41600     
lesswrong            28479      11         28468     
alignment forum      2138       1          2137      
arxiv                8007       2          8005      

                     Truth      Empirical  Difference
Word Count           53550146   23344      53526802  
Character Count      351767163  143048     351624115 
