In [None]:
# Load the environment variables from the .env file
# pip install python-dotenv

# Add the following to the .env file
# OPENAI_ORG_KEY=org-..
# OPENAI_API_KEY=sk-..
# TOKENIZERS_PARALLELISM=false


from dotenv import load_dotenv

# Load the variables from the .env file now, so the cells below that require
# 'OPENAI_API_KEY' to be set can find it.
load_dotenv()

In [None]:
# Using langchain API
# pip install langchain

from langchain.llms import OpenAI

# Build the LangChain completion model; it requires the 'OPENAI_API_KEY' env var to be set.
model = OpenAI()

# Send a single prompt and keep the raw text reply.
prompt = "What is the capital of france?"
response = model(prompt)

# Remove every newline from the reply; as the last expression of the cell,
# the cleaned text is what gets displayed.
"".join(response.split('\n'))

In [None]:
# Using the OpenAI Python client (a wrapper around the REST API)
# pip install openai

from openai import OpenAI

# Create the client; it requires the 'OPENAI_API_KEY' env var to be set.
client = OpenAI()

# A single-turn conversation consisting of one user message.
user_message = {
    "role": "user",
    "content": "What is the capital of France?",
}

chat_completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[user_message],
)

# Display the text of the first returned choice as the cell output.
first_choice = chat_completion.choices[0]
first_choice.message.content

In [None]:
# RAG - Retrieval Augmented Generation
# pip install requests beautifulsoup4 sentence-transformers faiss-cpu openai

import requests
import numpy as np
import re

import faiss
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from openai import OpenAI

def _fetch_page_text(url, timeout):
    """Download ``url`` and return the page text with the HTML markup removed."""
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of embedding an error page as "context".
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser').get_text()


def _split_sentences(text):
    """Split ``text`` on '.' into whitespace-normalized, non-empty sentences."""
    normalized = (re.sub(r'\s+', ' ', part).strip() for part in text.split('.'))
    # Splitting on '.' leaves empty / whitespace-only fragments; embedding them
    # only adds noise that can be returned as "nearest" context.
    return [sentence for sentence in normalized if sentence]


def _find_nearest_sentences(model, sentences, query, top_k):
    """Return up to ``top_k`` entries of ``sentences`` closest to ``query`` in embedding space."""
    # One embedding per sentence.
    embeddings = np.asarray(model.encode(sentences), dtype='float32')

    # Storing and indexing content in a vector database
    # (using FAISS for now, can use any vector db).
    dimension = embeddings.shape[1]  # Dimension of the embeddings
    base_index = faiss.IndexFlatL2(dimension)
    index = faiss.IndexIDMap(base_index)  # So that we can save an id in the index

    # Todo: save the sentences somewhere permanent and then use the permanent ids instead.

    # FAISS ids must be 64-bit integers; the default integer width of
    # np.arange depends on the platform, so request int64 explicitly.
    ids = np.arange(len(sentences), dtype=np.int64)
    index.add_with_ids(embeddings, ids)

    # Never ask for more neighbours than there are indexed sentences.
    k = min(top_k, len(sentences))
    query_embedding = np.asarray(model.encode([query]), dtype='float32')
    _distances, indices = index.search(query_embedding, k=k)

    # Only one query was submitted, so row 0 holds its results. FAISS pads
    # missing neighbours with -1, which would otherwise silently select
    # sentences[-1].
    return [sentences[i] for i in indices[0] if i >= 0]


def get_answer(query, url, top_k=3, debug=False, timeout=30):
    """Answer ``query`` with an LLM, using text retrieved from ``url`` as context.

    The page at ``url`` is downloaded, split into sentences, and embedded with
    the 'all-MiniLM-L6-v2' sentence-transformer. The ``top_k`` sentences
    closest to ``query`` are sent with the question to the
    'gpt-3.5-turbo' chat model.

    Args:
        query: The question to answer.
        url: Address of the web page to use as the knowledge source.
        top_k: Maximum number of retrieved sentences to pass as context.
        debug: When true, print the retrieved sentences before querying the LLM.
        timeout: Seconds to wait for the web page before giving up.

    Returns:
        The text content of the first choice returned by the chat model.

    Raises:
        ValueError: If ``top_k`` is less than 1, or no text could be extracted
            from the page.
        requests.exceptions.RequestException: If the page cannot be downloaded
            (timeout, connection failure, or an HTTP error status).
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # Download contents of the web page and break it into sentences.
    sentences = _split_sentences(_fetch_page_text(url, timeout))
    if not sentences:
        raise ValueError(f"No text could be extracted from {url}")

    # Generate text embeddings and retrieve the sentences closest to the query.
    model = SentenceTransformer('all-MiniLM-L6-v2')  # Downloads the embedding model from the internet
    matches = _find_nearest_sentences(model, sentences, query, top_k)

    if debug:
        # Print the retrieved sentences
        for sentence in matches:
            print(sentence)
            print('-'*50)

    # Sentences were stripped above, so re-insert the space after each period.
    context = ". ".join(matches)

    # Do an LLM query with the context
    client = OpenAI()  # requires 'OPENAI_API_KEY' env var to be set

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"Question: {query} \n Use the following context if it is helpful: {context}"
            }
        ],
        model="gpt-3.5-turbo",
    )

    return chat_completion.choices[0].message.content

# Use the contents of the URL to get an answer to the question.
# Each entry is a (query, url) pair; the last entry is the one that is run.
EXAMPLES = [
    (
        "What is work study position",
        "https://studentlife.utoronto.ca/news/work-study-is-back-for-september-2022/",
    ),
    (
        "Why are stripped stars difficult to find?",
        "https://www.utoronto.ca/news/u-t-astronomers-discover-first-population-binary-stripped-stars",
    ),
]
query, url = EXAMPLES[-1]

get_answer(query, url)