Part 0: Installing packages and loading them

In [None]:
#install required packages not available in colab by default
!pip install pdfplumber
!pip install python-dotenv
!pip install openai
!pip install langchain_openai
!pip install -U langchain-cli
!pip install -qU pinecone-client==3.1.0 pandas==2.0.3
!pip install langchain

In [None]:
#loading libraries
import os
import time
from typing import List

import nltk
import pdfplumber
import requests
import tiktoken
from dotenv import load_dotenv
from google.colab import userdata
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain_openai import ChatOpenAI
from openai import OpenAI
from pinecone import Pinecone

# nltk.sent_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK
# releases look it up under the name "punkt_tab", older ones under "punkt",
# so both are downloaded to work with either.
nltk.download('punkt')
nltk.download('punkt_tab')

Part 1: Obtaining data from Old Farmer's Almanac

In order to obtain data from the Old Farmer's Almanac, we first need a PDF copy of the Almanac, which can be obtained here:

https://store.almanac.com/online-edition-2024

The next step is to convert this PDF into a .txt file which can be ingested.

In [None]:
# Convert the Farmer's Almanac PDF into a plain-text file. Replace the PDF
# file name with whatever file is relevant.
with pdfplumber.open("2024 Old Farmers Almanac.pdf") as pdf:
    # Depending on the pdfplumber version, extract_text() may give None or ""
    # for a page without extractable text; `or ""` keeps the join safe.
    page_texts = [page.extract_text() or "" for page in pdf.pages]

# Join pages with a newline so the last word of one page is not fused with
# the first word of the next page.
text = "\n".join(page_texts)

# Write with an explicit encoding so the result does not depend on the
# platform's default text encoding.
with open("almanac.txt", "w", encoding="utf-8") as txt:
    txt.write(text)

Part 1.1 Alternative Data: We could also get data from another source, such as the New York Times. Here we use the New York Times API to retrieve some information from articles. This could be used instead of the Almanac, though far less successfully, or it could be used to supplement the information from the Almanac.

In [None]:
# Endpoint of the NYT Article Search API, and the string that the fetch loop
# below appends article text to.
API_ENDPOINT = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
agriculturedata = ""

# An NYT API key is required. It is read here from Colab's "secrets" store;
# in a different environment (like VS Code) the key has to be obtained some
# other way. Pasting the key in directly also works.
NYT_API_KEY = userdata.get("NYT_API_KEY")

# Function to fetch articles
def fetch_articles(page, timeout=30):
    """Fetch one page of NYT Article Search results for the query 'agriculture'.

    Args:
        page: Value sent as the API's `page` parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON response, or None when the request fails (network
        error, timeout, or a status code other than 200). Failures are
        printed rather than raised so a caller looping over many pages can
        carry on with the next page.
    """
    params = {
        'q': 'agriculture',
        'api-key': NYT_API_KEY,
        'page': page,
        'fl': 'abstract,snippet,lead_paragraph'  # Requesting abstract, snippet, and lead_paragraph
    }
    try:
        response = requests.get(API_ENDPOINT, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport-level failures like any other failed request.
        print(f'Error fetching articles: {exc}')
        return None

    if response.status_code != 200:
        print(f'Error fetching articles: {response.status_code}')
        return None
    return response.json()

# Fetch and concatenate articles.
# 50 pages are requested. The Article Search API is documented as returning
# up to 10 results per page (TODO confirm against the current NYT docs), so
# this collects at most 500 articles using 50 requests.
for i in range(50):
    articles = fetch_articles(i)
    if articles:
        # Use .get() throughout: a payload without 'response'/'docs', or an
        # article missing one of the requested fields, is skipped instead of
        # raising KeyError part-way through a ten-minute loop.
        docs = (articles.get('response') or {}).get('docs') or []
        for article in docs:
            # Concatenate abstract, snippet, and lead_paragraph
            for field in ('abstract', 'snippet', 'lead_paragraph'):
                value = article.get(field)
                if value:
                    agriculturedata += value + '\n\n'
    time.sleep(12)  # 12 s between requests = 5 requests per minute, to respect the rate limit

print('Fetched and concatenated articles into agriculturedata.')

In [None]:
# Save the collected article text as a text document which can be ingested.
# The encoding is given explicitly so non-ASCII characters in the articles
# are written the same way regardless of the platform's default encoding.
with open('agriculturedata.txt', 'w', encoding='utf-8') as f:
    f.write(agriculturedata)

Part 2: Implementing the RAG model with our data


In this example, we went with the Old Farmer's Almanac, but any data will do. Using more data will probably yield better results, if the data is good.

For this to work, an account with pinecone.io is required, as is an OpenAI account with a credit balance.

In [None]:
# Load the text to be embedded. The encoding is given explicitly so the file
# is decoded the same way regardless of the platform's default encoding.
with open('almanac.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
# API keys are read from Colab's "secrets" store via userdata.get.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# Pinecone client and a handle to the index that holds the document vectors.
pc = Pinecone(api_key=userdata.get('PINECONE_API_KEY'))
index = pc.Index('agriculture-project')

In [None]:
# Model names: the chat model that answers questions and the embedding model
# that turns text into vectors. The functions defined below rely on these.
EMBED_MODEL = "text-embedding-3-small"
OPENAI_MODEL = "gpt-3.5-turbo"

# Upper bound on the number of tokens packed into one chunk by chunk_text.
MAX_TOKENS = 1536

# OpenAI client used for the embedding requests.
client = OpenAI(api_key=OPENAI_API_KEY)

def prep(text: str):
    """Return `text` with every newline, carriage return and tab replaced by a single space."""
    flattened = text
    for control_char in ("\n", "\r", "\t"):
        flattened = flattened.replace(control_char, " ")
    return flattened

def tokenize(text: str) -> List[int]:
    """Encode `text` into token ids using the tokenizer tiktoken maps to EMBED_MODEL.

    Args:
        text: A single string (chunk_text passes one sentence at a time).

    Returns:
        The token ids produced by the encoding's encode().
    """
    encoding = tiktoken.encoding_for_model(EMBED_MODEL)
    return encoding.encode(text)

def embed(tokens: List[int]):
    """Request an embedding for `tokens` from EMBED_MODEL and return the first result's vector."""
    result = client.embeddings.create(model=EMBED_MODEL, input=tokens)
    first_item = result.data[0]
    return first_item.embedding

def chunk_text(text:str):
    """Split `text` into sentence-aligned chunks and embed each chunk.

    Sentences from nltk.sent_tokenize are packed greedily: the running chunk
    is closed as soon as adding the next sentence would take it past
    MAX_TOKENS tokens, and that sentence then starts a new chunk.

    Args:
        text: The text to split.

    Returns:
        A tuple (paras, chunks, chunks_of_tokens) of three parallel lists
        with one entry per chunk:
          * paras            - the chunk's text (each sentence preceded by a space)
          * chunks           - the chunk's embedding as returned by embed()
          * chunks_of_tokens - the chunk's token ids
        All three are empty when `text` contains no sentences.
    """
    paras = []
    chunks = []
    chunks_of_tokens = []

    def _store_chunk(para, tokens):
        # Record one finished chunk in all three parallel output lists.
        paras.append(para)
        chunks_of_tokens.append(tokens)
        chunks.append(embed(tokens))

    current_para = ""
    current_chunk = []

    for sentence in nltk.sent_tokenize(text):
        sentence_tokens = tokenize(sentence)

        # Close the running chunk if this sentence would overflow it. The
        # `current_chunk` check matters when a single sentence is longer than
        # MAX_TOKENS and nothing has been accumulated yet: without it an
        # empty paragraph and an empty token list would be recorded and sent
        # to embed().
        if current_chunk and len(current_chunk) + len(sentence_tokens) > MAX_TOKENS:
            _store_chunk(current_para, current_chunk)
            current_para = ""
            current_chunk = []

        # NOTE: a sentence longer than MAX_TOKENS is not split; it becomes a
        # chunk of its own that exceeds MAX_TOKENS.
        current_para += " " + sentence
        current_chunk.extend(sentence_tokens)

    # Add the last chunk if it's not empty
    if current_chunk:
        _store_chunk(current_para, current_chunk)

    return paras, chunks, chunks_of_tokens

def create_embeddings(filename: str):
    """Read the file at `filename`, clean its text with prep(), and return chunk_text()'s result for it."""
    with open(filename, "r") as source:
        raw_text = source.read()
    return chunk_text(prep(raw_text))

def create_embeddings_prompt(prompt:str):
    """Clean `prompt` with prep() and return chunk_text()'s result for the cleaned text."""
    cleaned_prompt = prep(prompt)
    result = chunk_text(cleaned_prompt)
    return result

def vectorize_chunks(paras: List, chunks: List, **kwargs):
    """Build one vector record per chunk, ready to be upserted.

    Args:
        paras: Chunk texts, parallel to `chunks`.
        chunks: Embedding vectors, one per chunk.
        **kwargs: Optional `filename`; when given, it is stored in every
            record's metadata under the key "file".

    Returns:
        A list of dicts with keys "id" (the chunk's position as a string),
        "values" (the embedding) and "metadata" (the paragraph text under
        "para", plus "file" when a filename was supplied).
    """
    vectors = []
    for i, values in enumerate(chunks):
        if "filename" in kwargs:
            # Read the name from kwargs; there is no `filename` variable in
            # this function's scope.
            metadata = {"file": kwargs["filename"], "para": f"{paras[i]}"}
        else:
            metadata = {"para": f"{paras[i]}"}
        vectors.append({"id": f"{i}", "values": values, "metadata": metadata})

    return vectors

In [None]:
### Runtime is 30+ minutes - only run this cell when necessary.
# Embeds the loaded text and builds the vector records that will be stored
# on Pinecone.
#
# To prevent running it again by accident, change the condition below from
# True to False once the cell has finished.

if True:  # change truth value here
    paras, chunks, chunks_of_tokens = create_embeddings_prompt(text)
    vectors = vectorize_chunks(paras, chunks)
    print(len(vectors))

In [None]:
# Upsert the vectors into the Pinecone index in fixed-size batches.
# Sending the whole list in one request raised an error when this notebook
# was written, so the list is split up. Looping over batches (rather than
# hard-coding two slices) keeps every request bounded however many vectors
# there are, and never sends an empty batch when there are few vectors.
UPSERT_BATCH_SIZE = 200
for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(vectors=vectors[start:start + UPSERT_BATCH_SIZE])

Part 3: Implementing the Function for asking the questions and Testing

In [None]:
#Define Question Asking Function

# Raw Pinecone query responses; every ask_a_question call appends one entry.
query_responses=[]

def ask_a_question(prompt):
    """Answer `prompt` using the best-matching paragraph stored in the Pinecone index.

    The prompt is embedded, the index is searched for the nearest stored
    chunks, the paragraph text of the top match is read from its metadata,
    and the chat model is asked to answer the prompt from that paragraph.

    Args:
        prompt: The question to answer, as plain text.

    Returns:
        The "output_text" entry of the question-answering chain's result.

    Raises:
        ValueError: if `prompt` yields no text to embed (e.g. it is empty).
        IndexError: if the index returns no matches.

    Side effects:
        Appends the raw query response to the module-level `query_responses`
        list and prints progress information.
    """
    # Convert the prompt to chunks of embeddings; only the first chunk is
    # used for the search.
    _paras, chunks, _chunks_of_tokens = create_embeddings_prompt(prompt)
    if not chunks:
        raise ValueError("prompt contains no text to embed")
    prompt_embedding = chunks[0]
    # Report the size rather than dumping the full vector into the output.
    print(f"Embeddings: {len(prompt_embedding)}-dimensional vector")

    # Search the index for the best match using semantic search. Asking for
    # the metadata here avoids a second request to fetch the matched record.
    query_response = index.query(
        top_k=2,
        vector=prompt_embedding,
        include_metadata=True
    )
    query_responses.append(query_response)
    print(f"Query response: {query_response}")

    matches = query_response["matches"]
    if not matches:
        raise IndexError("Pinecone returned no matches; has the index been populated?")

    # The first match is the best one; its metadata holds the paragraph text.
    best_match = matches[0]
    best_id = best_match["id"]
    print(f"Best ID: {best_id}")
    para_of_interest = best_match["metadata"]["para"]
    print(f"Para of interest: {para_of_interest}")

    # Initialize the langchain chat model.
    llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name=OPENAI_MODEL, temperature=0.0)
    # Turn the para_of_interest into a Document.
    document = Document(page_content=para_of_interest)
    # Create the QA chain using the LLM.
    chain = load_qa_chain(llm)
    # Pass the document and the prompt (prefixed with an instruction for the
    # case where the document has no answer) to the chain.
    question = "If you can't find the answer in the provided document, say, I just don't know the answer to that, otherwise, answer the question. " + prompt
    answer = chain.invoke({"input_documents": [document], "question": question})
    return answer["output_text"]

In [None]:
# A few example questions to check that the pipeline works end to end.

# Start from an empty response log so it lines up with `questions` below.
query_responses = []

questions = [
    "will there be a solar eclipse?",
    "what will the weather be like during the summer?",
    "What are some of the newest developments in farming?",
    "Will there be a lunar eclipse?",
]

# Ask each question in turn and keep the answers in the same order.
answers = []
for question in questions:
    answers.append(ask_a_question(question))

In [None]:
# Print the top match score, the question and the answer for every recorded
# query response. Responses, questions and answers are matched by position.
for ix, query_response in enumerate(query_responses):
    top_match = query_response['matches'][0]
    print(f"Match Score: {top_match['score']}")
    print(f"Question: {questions[ix]}")
    print(f"Answer:   {answers[ix]}\n\n")