# Testing

```source: this key separates the various keys found in the table in Sources. Here's the set of sources with their corresponding value name:```
```
'https://aipulse.org'
'ebook'
'https://qualiacomputing.com'
'alignment forum'
'lesswrong'
'manual'
'arxiv'
'https://deepmindsafetyresearch.medium.com/'
'waitbutwhy.com'
'GitHub'
'https://aiimpacts.org'
'arbital.com'
'carado.moe'
'nonarxiv_papers'
'https://vkrakovna.wordpress.com'
'https://jsteinhardt.wordpress.com'
'audio-transcripts'
'https://intelligence.org'
'youtube'
'reports'
'https://aisafety.camp'
'curriculum'
'https://www.yudkowsky.net'
'distill'
```

```...and this is how the arxiv papers look like:```

```
{
    "source": "arxiv", # where the dataset comes from
    "source_type": "latex", # the type of file the data was original in
    "converted_with": "pandoc", # which tool we used to convert the data in .md format
    "paper_version": paper_id,
    "title": title,
    "authors": [str(x) for x in authors], # list of authors
    "date_published": date_published,
    "data_last_modified": data_last_modified,
    "url": url,
    "abstract": abstract,
    "author_comment": author_comment,
    "journal_ref": journal_ref,
    "doi": doi,
    "primary_category": primary_category,
    "categories": categories,
    "citation_level": citation_level, # (0 = curated alignment papers, 1 = citation of curated papers, 2 = citation of citation, etc.)
    "alignment_text": is_alignment_text, # 'pos' is maunally labeled as an alignment paper, 'unlabeled' if unlabeled
    "confidence_score": confidence_scores, # this is a confidence score obtained by using the SPECTER model to classify papers to add to the dataset
    "main_tex_filename": "main.tex", # the main latex file needed to convert the paper
    "text": "lots of text", # this is where you will grab the text contents of each entry in the dataset (in .md format)
    "bibliography_bbl": "string of bbl",
    "bibliography_bib": "string of bib", # more common to have bib than bbl
}
```

https://aipulse.org: title links link authors author text (tags)

None: title url text

ebook: title book_title authors text (publication_date)

https://qualiacomputing.com: title link authors author text (tags)

alignment forum: title url authors text (tags)

lesswrong: title authors url text (tags score date_published)

manual: title authors text (date_published)

arxiv: title authors url text (citation_level alignment_text confidence_score date_published)

https://deepmindsafetyresearch.medium.com/: title url text

waitbutwhy.com: title authors text (date_published)

GitHub: book_title authors author text

https://aiimpacts.org: title link authors author text (tags)

arbital.com: title authors url text (date_published)

carado.moe: title authors text (date_published)

nonarxiv_papers: title authors doi text (date_published)

https://vkrakovna.wordpress.com: title link authors author text (tags)

https://jsteinhardt.wordpress.com: title link authors author text (tags)

audio-transcripts: title authors text (date_published)

https://intelligence.org: title link authors author text (tags)

youtube: title authors url text (date_published)

reports: title authors doi text (date_published)

https://aisafety.camp: title link authors author text (tags)

curriculum: title authors text (date_published)

https://www.yudkowsky.net: title link authors author text (tags)

distill: title authors doi text (date_published)

Useful links:

- https://github.com/openai/openai-cookbook/blob/main/examples/Semantic_text_search_using_embeddings.ipynb

- https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb

## Imports

In [9]:
import jsonlines
import numpy as np
from typing import List, Dict, Tuple
import re
import time
import random
import pickle
import openai
import concurrent.futures
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

import config
from pathlib import Path

## Constants

In [10]:
LEN_EMBEDDINGS = 1536

project_path = Path(__file__).parent.parent.parent
PATH_TO_DATA = project_path / "data" / "alignment_texts.jsonl" # Path to the dataset .jsonl file.
PATH_TO_EMBEDDINGS = project_path / "src" / "Embeddings Search" / "data" / "embeddings.npy" # Path to the saved embeddings (.npy) file.
PATH_TO_DATASET = project_path / "src" / "Embeddings Search" / "data" / "dataset.pkl" # Path to the saved dataset (.pkl) file.

COMPLETIONS_MODEL = "gpt-3.5-turbo"
EMBEDDING_MODEL = "text-embedding-ada-002"

openai.api_key = config.OPENAI_API_KEY

MAX_LEN_PROMPT = 5000

## Helpers

In [11]:
# FROM https://stackoverflow.com/a/31505798/16185542

# -*- coding: utf-8 -*-
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov|edu|me)"
digits = "([0-9])"

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = text.replace("?!", "?")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
    if "..." in text: text = text.replace("...","<prd><prd><prd>")
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")

    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    
    if sentences == []:
        sentences = [text.strip()]
    return sentences

In [12]:
class MissingDataException(Exception):
    pass

In [13]:
class TextSplitter:
    def __init__(self, block_maxsize: int = 800, block_minsize: int = 500):
        self.block_maxsize = block_maxsize
        self.block_minsize = block_minsize
        self.blocks = []
        self.current_block = []
        self.current_block_len = 0


    def add_sentence_to_blocks(self, sentence):
        sentence_len = len(sentence)
        sentence_fits_in_current_block = self.current_block_len + sentence_len <= self.block_maxsize
        current_block_is_big_enough = self.current_block_len >= self.block_minsize
        sentence_fits_in_standalone_block = sentence_len <= self.block_maxsize

        if sentence_fits_in_current_block:
            self.current_block.append(sentence)
            self.current_block_len += sentence_len + 1 # +1 for the space
            return
        
        if current_block_is_big_enough and sentence_fits_in_standalone_block:
            self.blocks.append(" ".join(self.current_block))
            self.current_block = [sentence]
            self.current_block_len = sentence_len + 1 # +1 for the space
            return
        
        #special cases:TODO refactor
        #case 1: current_block_len < block_minsize and current_block_len + sentence_len > block_maxsize
        #case 2: current_block_len > block_minsize but sentence_len > block_maxsize
        shorter_sentence = sentence[self.block_maxsize - self.current_block_len]
        self.current_block.append(shorter_sentence)
        self.blocks.append(" ".join(self.current_block))
        self.current_block = []
        self.current_block_len = 0      
        

    def add_paragraph_to_blocks(self, paragraph):
        paragraph_len = len(paragraph)
        if self.current_block_len + paragraph_len > self.block_maxsize:
            sentences = split_into_sentences(paragraph)
            for sentence in sentences:
                self.add_sentence_to_blocks(sentence)
            return
        
        if self.block_minsize <= self.current_block_len + paragraph_len <= self.block_maxsize:
            self.current_block.append(paragraph)
            self.blocks.append("\n\n".join(self.current_block))
            self.current_block = []
            self.current_block_len = 0
            return
        
        if self.current_block_len + paragraph_len < self.block_minsize:
            self.current_block.append(paragraph)
            self.current_block_len += paragraph_len + 2 # +2 for the \n\n
            return
        
    def add_text_to_blocks(self, text):
        paragraphs = text.split("\n\n")
        for paragraph in paragraphs:
            self.add_paragraph_to_blocks(paragraph)
        if self.current_block != []:
            self.blocks.append("\n\n".join(self.current_block))


    def split(self, text: str, signature: str) -> List[str]:
        """Split text into multiple blocks and add signature to each block."""
        # signature has the format : "link, title, author"
        self.add_text_to_blocks(text)
        blocks = self.blocks
        self.blocks = []
        self.current_block = []
        self.current_block_len = 0
        if blocks == []:
            raise MissingDataException("No blocks were created")
        return [f"{block}\n - {signature}" for block in blocks]


## Dataset Class

In [14]:
error_count_dict = {
    "Entry has no source.": 0,
    "Entry has no title.": 0,
    "Entry has no text.": 0,
    "Entry has no URL.": 0,
    "Entry has wrong citation level.": 0
}

In [15]:
class Dataset:
    def __init__(self,
            path: str,  # Path to the dataset .jsonl file.
            sources: List[str] = None,  # List of sources to include. If None, include all sources.
            rate_limit_per_minute: int = 3_500,  # Rate limit for the OpenAI API.
            block_min_max_size: Tuple[int, int] = None,  # Tuple of (min_block_size, max_block_size), used for the text splitter. If None, use default values.
            fraction_of_articles_to_use: float = 1.0,  # Fraction of articles to use. If 1.0, use all articles.
        ):
        self.path = path
        self.sources = sources
        self.rate_limit_per_minute = rate_limit_per_minute
        self.delay_in_seconds = 60.0 / self.rate_limit_per_minute
        self.fraction_of_articles_to_use = fraction_of_articles_to_use
        
        # Set up text splitter
        if block_min_max_size is None: self.block_min_max_size = (400, 600)
        else: self.block_min_max_size = block_min_max_size
        self.text_splitter = TextSplitter(block_maxsize=self.block_min_max_size[1], block_minsize=self.block_min_max_size[0])
        
        self.data: List[Tuple[str]] = []  # List of tuples, each containing the title of an article, its URL, and text. E.g.: [('title', 'url', 'text'), ...]
        self.embed_split: List[str] = []  # List of strings, each being a few paragraphs from a single article (not exceeding 1000 words).
        
        self.num_articles: Dict[str, int] = {}  # Number of articles per source. E.g.: {'source1': 10, 'source2': 20, 'total': 30}
        if sources is None:
            self.num_articles['total'] = 0
        else:
            for source in sources: 
                self.num_articles[source] = 0
            self.num_articles['total'] = 0
        
        self.total_char_count = 0
        self.total_word_count = 0
        self.total_sentence_count = 0
        self.total_block_count = 0
        
    def get_info_tmp(self):
        self.sources_so_far = []
        self.info_types: Dict[str, List[str]] = {}
        with jsonlines.open(self.path, "r") as reader:
            for entry in reader:
                if 'source' not in entry: entry['source'] = 'None'
                
                if entry['source'] not in self.sources_so_far:
                    self.sources_so_far.append(entry['source'])
                    self.info_types[entry['source']] = entry.keys()
                
                if 'tags' in entry:
                    print(entry['tags'])
                    
                """
                {
                'text', 
                'title', 'book_title', # If there is both, take title, otherwise take book_title
                'author', 'authors', # If there is both, take author, otherwise take authors, otherwise take author
                'citation_level', # must be 0 or 1
                'date_published', 'published', # take first 10 chars of date_published, if it exists; else take first 16 chars of published, if it exists
                'doi', 'link', 'links', 'url', # if link, take link; elif url, take url; elif doi, take doi
                'tags'
                }
                """
    
    def get_alignment_texts(self):
        with jsonlines.open(self.path, "r") as reader:
            for entry in reader:
                try:
                    if 'source' not in entry: raise MissingDataException("Entry has no source.")
                    random_number = random.random()
                    if random_number > self.fraction_of_articles_to_use:
                        continue
                    
                    if self.sources is None:
                        if entry['source'] not in self.num_articles: self.num_articles[entry['source']] = 1
                        else: self.num_articles[entry['source']] += 1
                        self.num_articles['total'] += 1
                    else:
                        if entry['source'] in self.sources:
                            self.num_articles[entry['source']] += 1
                            self.num_articles['total'] += 1
                        else: continue
                    
                    text=title=author=citation_level=date_published=url=tags=None
                    
                    # Get text
                    if 'text' in entry and entry['text']: text = entry['text']
                    else:
                        raise MissingDataException(f"Entry has no text.")
                    
                    # Get title
                    if 'title' in entry and 'book_title' in entry and entry['title']: title = entry['title']
                    elif 'book_title' in entry and 'title' not in entry and entry['book_title']: 
                        title = entry['book_title']
                        if title[-1] == '\n': title = title[:-1]
                    elif 'title' in entry and entry['title']: 
                        title = entry['title']
                        if title[-1] == '\n': title = title[:-1]
                    else: title = None
                        
                    # Get author
                    if 'author' in entry and 'authors' in entry and entry['author']: author = entry['author']
                    elif 'authors' in entry and entry['authors']: author = entry['authors']
                    elif 'author' in entry and entry['author']: author = entry['author']
                    else: author = None
                        
                    # Get citation level
                    if 'citation_level' in entry:
                        if entry['citation_level'] != 0: raise MissingDataException(f"Entry has citation_level {entry['citation_level']}.")
                    
                    # Get date published
                    if 'date_published' in entry and entry['date_published'] and len(entry['date_published']) >= 10: date_published = entry['date_published'][:10]
                    elif 'published' in entry and entry['published'] and len(entry['published']) >= 16: date_published = entry['published'][:16]
                    else: date_published = None
                        
                    # Get URL
                    if 'link' in entry and entry['link']: url = entry['link']
                    elif 'url' in entry and entry['url']: url = entry['url']
                    elif 'doi' in entry and entry['doi']: url = entry['doi']
                    else: url = None
                        
                    # Get tags
                    if 'tags' in entry and entry['tags']:
                        if type(entry['tags']) == list: tags = ', '.join([val['term'] for val in entry['tags']])
                        elif type(entry['tags']) == str: tags = entry['tags']
                        else: tags = None
                    
                    signature = ""
                    if title: signature += f"Title: {title}, "
                    if author: signature += f"Author: {author}, "
                    if date_published: signature += f"Date published: {date_published}, "
                    if url: signature += f"URL: {url}, "
                    # if tags: signature += f"Tags: {tags}, "
                    if signature: signature = signature[:-2]

                    self.data.append((title, author, date_published, url, tags, text))
                    
                    blocks = self.text_splitter.split(text, signature)
                    self.embed_split.extend(blocks)
                    
                    self.total_char_count += len(entry['text'])
                    self.total_word_count += len(entry['text'].split())
                    self.total_sentence_count += len(split_into_sentences(entry['text']))
                    self.total_block_count += len(blocks)
                
                except MissingDataException as e:
                    if str(e) not in error_count_dict:
                        error_count_dict[str(e)] = 0
                    error_count_dict[str(e)] += 1

    def get_embeddings(self):
        # Get an embedding for each text, with retries if necessary
        # @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(10))
        def get_embedding(text: str, delay_in_seconds: float = 0) -> np.ndarray:
            time.sleep(delay_in_seconds)
            result = openai.Embedding.create(model=EMBEDDING_MODEL, input=text)
            return result["data"][0]["embedding"]
        
        embeddings = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(get_embedding, text) for text in self.embed_split]
            num_completed = 0
            for future in concurrent.futures.as_completed(futures):
                embeddings.append(future.result())
                num_completed += 1
                if num_completed % 50 == 0:
                    print(f"Completed {num_completed}/{len(self.embed_split)}")
        self.embeddings = np.vstack(embeddings)
    
    def save_embeddings(self, path: str):
        np.save(path, self.embeddings)
        
    def load_embeddings(self, path: str):
        self.embeddings = np.load(path)
        
    def save_class(self, path: str):
        with open(path, 'wb') as f:
            pickle.dump(self, f)

In [38]:
# List of possible sources:
all_sources = ["https://aipulse.org", "ebook", "https://qualiacomputing.com", "alignment forum", "lesswrong", "manual", "arxiv", "https://deepmindsafetyresearch.medium.com", "waitbutwhy.com", "GitHub", "https://aiimpacts.org", "arbital.com", "carado.moe", "nonarxiv_papers", "https://vkrakovna.wordpress.com", "https://jsteinhardt.wordpress.com", "audio-transcripts", "https://intelligence.org", "youtube", "reports", "https://aisafety.camp", "curriculum", "https://www.yudkowsky.net", "distill"]

sources = ["alignment forum", #"lesswrong", 
           "manual", "waitbutwhy.com", "https://aiimpacts.org", "arbital.com", "https://intelligence.org", "reports", "https://aisafety.camp", "curriculum", "https://www.yudkowsky.net", "distill"]


dataset = Dataset(path=PATH_TO_DATA, sources=sources, rate_limit_per_minute=3500, block_min_max_size = [1200, 1500], fraction_of_articles_to_use=1/20)
dataset.get_alignment_texts()

In [31]:
article_num = 0
print(f"Title: {dataset.data[article_num][0]}")
print(f"Author: {dataset.data[article_num][1]}")
print(f"Date published: {dataset.data[article_num][2]}")
print(f"URL: {dataset.data[article_num][3]}")
print(f"Tags: {dataset.data[article_num][4]}")
print(f"Text: {dataset.data[article_num][5]}")

Title: AXRP Episode 13 - First Principles of AGI Safety with Richard Ngo
Author: DanielFilan
Date published: 2022-03-31
URL: https://www.lesswrong.com/posts/tEf8fEFCkFtPyg9pm/axrp-episode-13-first-principles-of-agi-safety-with-richard
Tags: AI/Audio/AXRP/Interviews/AI Risk/Existential Risk
Text: Link post
Contents
 - The nature of intelligence and AGI 
 - The nature of intelligence 
 - AGI: what and how 
 - Single vs collective AI minds 
 - AGI in practice 
 - Impact 
 - Timing 
 - Creation 
 - Risks and benefits 
 - Making AGI safe 
 - Robustness of the agency abstraction 
 - Pivotal acts 
 - AGI safety concepts 
 - Alignment 
 - Transparency 
 - Cooperation 
 - Optima and selection pressures 
 - The AI alignment research community 
 - Updates from Yudkowsky conversation 
 - Corrections to the community 
 - Why others don’t join 
 - Richard Ngo as a researcher 
 - The world approaching AGI 
 - Following Richard’s work 
Google Podcasts link

This podcast is called AXRP, pronounced axe-

In [40]:
print(f"Articles count: {len(dataset.data)}")
print(f"Num of each source: {dataset.num_articles}")
print(f"Num chars: {dataset.total_char_count}")
print(f"Num words: {dataset.total_word_count}")
print(f"Num sentences: {dataset.total_sentence_count}")
print(f"Num blocks: {dataset.total_block_count}")

Articles count: 150
Num of each source: {'alignment forum': 105, 'manual': 0, 'waitbutwhy.com': 0, 'https://aiimpacts.org': 11, 'arbital.com': 10, 'https://intelligence.org': 22, 'reports': 2, 'https://aisafety.camp': 0, 'curriculum': 0, 'https://www.yudkowsky.net': 2, 'distill': 0, 'total': 152}
Num chars: 1893590
Num words: 306879
Num sentences: 13750
Num blocks: 1434


In [19]:
dataset.get_embeddings()
dataset.save_embeddings(PATH_TO_EMBEDDINGS)
dataset.save_class(PATH_TO_DATASET)

Completed 50/197
Completed 100/197
Completed 150/197


In [20]:
for embed in dataset.embed_split:
    print(embed)
    print()

Glossary of Qualia Research Institute Terms This is a glossary of key terms and concept handles that are part of the memetic ecosystem of the Qualia Research Institute. Reading this glossary is itself a great way to become acquainted with this emerging memeplex. If you do not know what a memeplex is… you can find its definition in this glossary. Basics Consciousness (standard psychology, neuroscience, and philosophy term): There are over a dozen common uses for the word consciousness, and all of them are interesting. Common senses include: self-awareness, linguistic cognition, and the ability to navigate one’s environment. With that said, the sense of the word in the context of QRI is more often than not: the very fact of experience, that experience exists and there is something that it feels like to be. Talking loosely and evocatively- rather than formally and precisely- consciousness refers to “what experience is made of”. Of course formalizing that statement requires a lot of unpack

In [21]:
# with open(PATH_TO_DATASET, 'rb') as f:
#     dataset2 = pickle.load(f)

In [26]:
"""
TODO:
Add a moderation call to not be prompt-hacked: https://platform.openai.com/docs/guides/moderation/quickstart

"""

class AlignmentSearch:
    def __init__(self,
            dataset: Dataset,  # Dataset object containing the data.
        ):
        self.dataset = dataset
    
    # @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(10))
    def get_embedding(self, text: str) -> np.ndarray:
        try:
            result = openai.Embedding.create(model=EMBEDDING_MODEL, input=text)
            return result["data"][0]["embedding"]
        except openai.RateLimitError as e:
            print("Rate limit exceeded. Retrying in 30 seconds.")
            time.sleep(30)
            return self.get_embedding(text)
    
    def get_top_k(self, query: str, k: int=10) -> List[str]:
        # Receives a query (str) and returns the top k blocks that are most semantically similar to the query.
        # Each tuple contains the title of an article, its URL, and text.
        query_embedding = self.get_embedding(query)
        similarities = np.dot(self.dataset.embeddings, query_embedding)
        top_k_indices = np.argsort(similarities)[::-1][:k]
        top_k = [self.dataset.embed_split[i] for i in top_k_indices]
        return top_k
    
    def construct_messages(self, question: str, blocks: List[str] = None, mode: str = "balanced") -> str:
        # Receives a question (str) and a list of blocks and returns a prompt (str) to be used for text generation.
        if blocks:
            context = ""
            for i, block in enumerate(blocks):
                context += f"Context #{i+1}: {block}\n\n"
            context = context[:MAX_LEN_PROMPT * 3] + "..." if len(context) > MAX_LEN_PROMPT * 3 else context[:-2]
        
        if mode == "balanced":
            assistant_prompt = "You are a helpful assistant, and you help users by answering questions and providing information about AI Alignment and AI Safety. You are extremely knowledgeable, yet you know the limits of your own knowledge. Answer the user's questions as truthfully as possible using the provided context, and if the answer is not contained within it, say \"I don't know.\", or \"I'm not sure I know the answer to your question. However, I can try.\" followed by an attempt to answer as best you can. You can also ask the user questions to clarify their question."
            messages = [
                {"role": "system", "content": assistant_prompt},
                {"role": "system", "content": context},
                {"role": "user", "content": question},
            ]
        elif mode == "precise":
            raise NotImplementedError
        elif mode == "creative":
            raise NotImplementedError
        elif mode == "HyDE":
            assistant_prompt = "You are a helpful assistant, and you help users by answering questions and providing information about AI Alignment and AI Safety, on which you are extremely knowledgeable. Answer the user's question even if you are not certain of the answer; it is supremely important that you do attempt to offer an answer related to the user's query."
            messages = [
                {"role": "system", "content": assistant_prompt},
                {"role": "user", "content": question},
            ]
        else:
            raise ValueError("Mode must be one of 'balanced', 'precise', 'creative', or 'HyDE'.")
        return messages
    
    def answer_question(self, question: str, blocks: List[str]) -> str:
        # Receives a question (str) and a list of blocks and returns an answer (str) to the question.
        messages = self.construct_messages(question, blocks, mode="balanced")
        answer = openai.ChatCompletion.create(
            model=COMPLETIONS_MODEL, 
            messages=messages
        )
        return answer["choices"][0]["message"]["content"]
    
    def search_and_answer(self, question: str, k: int=10, HyDE: bool=False) -> str:
        # Receives a question (str) and returns an answer (str) to the question.
        if HyDE:
            messages = self.construct_messages(question, mode="HyDE")
            hyde_completion = openai.ChatCompletion.create(
                model=COMPLETIONS_MODEL, 
                messages=messages
            )
            top_k = self.get_top_k(f"{question}\n{hyde_completion}", k)
            # print(top_k)
            # raise NotImplementedError
        else:
            top_k = self.get_top_k(question, k)
        answer = self.answer_question(question, top_k)
        return answer, top_k, sources

In [27]:
SA = AlignmentSearch(dataset=dataset)
query = "Claim: AI Alignment is unnecessary because smarter AI will also be more moral."
answer = SA.search_and_answer(query, 10)#, HyDE=True)
print(answer)

This claim is controversial and not widely accepted in the field of AI Alignment. While it is possible that smarter AI will have a better understanding of morality, it is not guaranteed, and there are reasons to believe that AI may not have human-like values or morals. One of the major challenges in AI Alignment is ensuring that AI systems pursue objectives that align with human values and goals, which may not be straightforward for AI systems to understand or infer. Therefore, research in AI Alignment is important for ensuring that AI is developed and used in ways that are safe, ethical, and beneficial for humanity.


In [41]:
num_articles_truth = {
    'https://aipulse.org': 23,
    'ebook': 23,
    'https://qualiacomputing.com': 278,
    'alignment forum': 2138,
    'lesswrong': 28252 + 227,
    'manual': "?",
    'arxiv': 707 + 1679 + 1000 + 4621,
    'https://deepmindsafetyresearch.medium.com/': 10,
    'waitbutwhy.com': 2,
    'GitHub': "?",
    'https://aiimpacts.org': 227,
    'arbital.com': 223,
    'carado.moe': 59,
    'nonarxiv_papers': "?",
    'https://vkrakovna.wordpress.com': 43,
    'https://jsteinhardt.wordpress.com': 39,
    'audio-transcripts': 25 + 12,
    'https://intelligence.org': 479,
    'youtube': 457,
    'reports': "?",
    'https://aisafety.camp': 8,
    'curriculum': "?",
    'https://www.yudkowsky.net': 23,
    'distill': 49,
    'total': 2138+28252+707+1679+1000+4621+23+227+23+8+59+111+10+17+7+479+39+278+43+2+23+420+323+49+457+25+12+223+227+132    
}
word_count_truth = 53_550_146
char_count_truth = 351_767_163

# Print table. First row has Truth and Empirical findings.
print(f"{'Source':<20} {'Truth':<10} {'Empirical':<10} {'Difference':<10}")
for source in dataset.num_articles:
    try:
        print(f"{source[:20]:<20} {num_articles_truth[source]:<10} {dataset.num_articles[source]:<10} {num_articles_truth[source] - dataset.num_articles[source]:<10}")
    except TypeError:
        print(f"{source[:20]:<20} {num_articles_truth[source]:<10} {dataset.num_articles[source]:<10} {'UNKNOWN':<10}")

# Compare true and empirical word counts and character counts
print(f"\n{'':<20} {'Truth':<10} {'Empirical':<10} {'Difference':<10}")
print(f"{'Word Count':<20} {word_count_truth:<10} {dataset.total_word_count:<10} {word_count_truth - dataset.total_word_count:<10}")
print(f"{'Character Count':<20} {char_count_truth:<10} {dataset.total_char_count:<10} {char_count_truth - dataset.total_char_count:<10}")

Source               Truth      Empirical  Difference
alignment forum      2138       105        2033      
manual               ?          0          UNKNOWN   
waitbutwhy.com       2          0          2         
https://aiimpacts.or 227        11         216       
arbital.com          223        10         213       
https://intelligence 479        22         457       
reports              ?          2          UNKNOWN   
https://aisafety.cam 8          0          8         
curriculum           ?          0          UNKNOWN   
https://www.yudkowsk 23         2          21        
distill              49         0          49        
total                41614      152        41462     

                     Truth      Empirical  Difference
Word Count           53550146   306879     53243267  
Character Count      351767163  1893590    349873573 


# Tests

In [5]:
# Define a helper function that takes in a single string and outputs a single d-dimensional vector
def get_embedding(text):
  # Use the embeddings OpenAI API endpoint to get an embedding for the text
  result = openai.Embedding.create(model=EMBEDDING_MODEL, input=text)
  # Convert the response to a numpy array and return it
  return result["data"][0]["embedding"]

# Define a function that takes in a list of strings and outputs a numpy matrix of embeddings
def get_embeddings(texts):
  embeddings = []
  with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(get_embedding, text) for text in texts]
    for future in concurrent.futures.as_completed(futures):
      embeddings.append(future.result())
  return np.vstack(embeddings)

def get_embeddings_not_parallel(texts):
    embeddings = np.array([get_embedding(text) for text in texts])
    return embeddings

In [6]:
# Define a list of texts to be embedded
texts = ["Hello world!"] * 100

# Regular method
start = time.time()
embeddings_1 = get_embeddings_not_parallel(texts)
end = time.time()
print(f"Regular method: {end - start}")

# Parallel method
start = time.time()
embeddings_2 = get_embeddings(texts)
end = time.time()
print(f"Parallel method: {end - start}")

print(embeddings_1.shape)
print(embeddings_2.shape)

RateLimitError: The server is currently overloaded with other requests. Sorry about that! You can retry your request, or contact us through our help center at help.openai.com if the error persists.

In [14]:
import random
from timeit import timeit
import string
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")

#we want to compare the speed of len(string) vs len(encoding.encode(string))
#we will use a random string of length 1000 to do this

def len_string(string):
    return len(string)

def len_tiktoken(string):
    return len(encoding.encode(string))

random_str = ''.join(random.choice(string.ascii_letters) for i in range(1000))

#we will run each function 1000 times and compare the average time
print("Average time for len(string):", timeit(lambda: len_string(random_str), number=3000))
print("Average time for len(tiktoken):", timeit(lambda: len_tiktoken(random_str), number=3000))


Average time for len(string): 0.0008568000048398972
Average time for len(tiktoken): 15.112917799997376
