In [1]:
import numpy as np
import os
import openai
import sys

In [2]:
# Point the directory loader at the blog's `_posts` folder of Markdown files.
# NOTE(review): presumably the loader picks up every .md file under this path — confirm.
from langchain.document_loaders import NotionDirectoryLoader
loader = NotionDirectoryLoader("../../tech_non_tech_blog/_posts/")

In [3]:
# Load the posts as Document objects (each exposes `page_content` and `metadata`).
docs = loader.load()

In [4]:
# Peek at the first document's metadata; it records the path of the source file.
docs[0].metadata

{'source': '..\\..\\tech_non_tech_blog\\_posts\\2019-02-02-Speeding-up-diff-between-consecutive-rows-in-python-on-my-laptop.md'}

In [5]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
# (markdown header prefix, metadata key) pairs. Chunks produced by the splitter
# carry the enclosing header titles under these keys (e.g. 'Header 2', 'Header 3').
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)

def extract_txt(docs, separator='\n'):
    """Concatenate the text of several documents into one string.

    Documents are joined with a newline by default so that the first line of
    each document (front matter or a markdown header) always starts on its own
    line. Joining with a plain space glues that first line onto the last line
    of the previous document whenever the previous text does not end with a
    newline, and a header in the middle of a line cannot be recognised by a
    line-based markdown splitter.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
        separator: String placed between consecutive documents.

    Returns:
        The concatenated text; an empty string when ``docs`` is empty.
    """
    return separator.join(d.page_content for d in docs)

# All posts are concatenated into one string and split in a single pass, so the
# resulting chunks carry header metadata but not the per-file 'source' path.
# NOTE(review): post boundaries are not preserved here, so the `---` front matter
# of one post can end up appended to the last chunk of the preceding post —
# consider splitting each document separately.
md_header_splits = markdown_splitter.split_text(extract_txt(docs))
md_header_splits[1]

Document(page_content="I'm currently pursuing independent research on large financial data. Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level to aggregate level, it is difficult to do the opposite.  \nMy laptop configuration is decent: Ubuntu 18.04, 16 GB DDR4, Intel core i7–8750H (6 + 6 virtual core) @ 2.2 GHz and a GPU (not relevant here). But brute force is not a good idea — code run time was ~ 4 hours for processing one day's data, which is not acceptable.", metadata={'Header 2': "Speeding-up 'diff' between Consecutive Rows in Python on My Laptop", 'Header 3': 'Introduction'})

In [6]:
import pandas as pd

# Number of chunks per 'Header 2' value; chunks lacking that key are counted under ''.
header2_values = [chunk.metadata.get('Header 2', '') for chunk in md_header_splits]
pd.Series(header2_values).value_counts()

Lessons from my Internship (and Immediate Aftermath)                              8
Search for Exoplanets - Humans vs. Stars                                          8
Speeding-up 'diff' between Consecutive Rows in Python on My Laptop                6
Does python prioritize True in 'OR' and False in 'AND' condition checks?          6
Unsupervised Deep Learning in Astronomy for Exoplanet Candidate Identification    5
Running Python or R in Android OS                                                 5
                                                                                  1
Name: count, dtype: int64

To fix the chunking issue, the `---` front-matter section had to be added to each .md file. The chunks now look better.

## Using SpacyEmbeddings and retrieving using np.dot similarity without using Chroma

In [7]:
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
# One embedding object, reused below for the query, the chunks and the vector store.
embedding = SpacyEmbeddings()

In [8]:
# Embed the question and every chunk with the same embedding object so the
# resulting vectors can be compared directly.
query_text = "What's my laptop configuration?"
chunk_texts = [chunk.page_content for chunk in md_header_splits]
embedding1 = embedding.embed_query(query_text)
embedding2s = [embedding.embed_query(text) for text in chunk_texts]

In [9]:
# Rank chunks by raw dot product with the query vector and show the k best texts.
k = 5
similarities = [np.dot(chunk_vector, embedding1) for chunk_vector in embedding2s]
# Negating the scores makes the ascending argsort return the highest score first.
descending_order = np.argsort(-np.array(similarities))
top_k = descending_order[:k]
chunk_text_array = np.array([chunk.page_content for chunk in md_header_splits])
chunk_text_array[top_k]

array(["I'm currently pursuing independent research on large financial data. Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level to aggregate level, it is difficult to do the opposite.  \nMy laptop configuration is decent: Ubuntu 18.04, 16 GB DDR4, Intel core i7–8750H (6 + 6 virtual core) @ 2.2 GHz and a GPU (not relevant here). But brute force is not a good idea — code run time was ~ 4 hours for processing one day's data, which is not acceptable.",
       '---\nlayout: post\ndate: 2019-01-11 12:00:00 -0500\nimage: ../data/multiprocessing.jpg\n---',
       "I recently extended the analysis, but there are several more ideas for 'discretization'.\n---\nlayout: post\ndate: 2019-09-21 12:00:00 -0500\nimage: ../data/Transit_photometry.gif\n---",
       "1.Follow the instructions in 

## Storing in vector store

In [10]:
from langchain.vectorstores import Chroma

# Better to use a new folder here. There's a chromadb version compatibility issue
# while appending to the existing docs/chromadb folder.
persist_directory = 'docs/tech_non_tech_blog/'

# Open (or create) the persisted collection WITHOUT inserting anything, so the
# emptiness check below reflects what is already on disk and re-running this
# cell does not add duplicate entries.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,  # Reusing the spacy embedding
)

# Populate the store with the header-based chunks only on the first run.
if len(vectordb.get()['documents']) == 0:
    vectordb = Chroma.from_documents(
        documents=md_header_splits,
        embedding=embedding,  # Reusing the spacy embedding
        persist_directory=persist_directory
    )
    print(vectordb._collection.count())

## Retrieval

### Using similarity_search (same as np.dot?)

In [11]:
# Two test questions; for each one fetch the 3 most similar chunks from the store.
questions = ["What's my laptop configuration?", "How long did brute force run for?"]
ret_docs = [vectordb.similarity_search(question, k=3) for question in questions]
# Text of the top hit for the first question.
ret_docs[0][0].page_content

"I'm currently pursuing independent research on large financial data. Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level to aggregate level, it is difficult to do the opposite.  \nMy laptop configuration is decent: Ubuntu 18.04, 16 GB DDR4, Intel core i7–8750H (6 + 6 virtual core) @ 2.2 GHz and a GPU (not relevant here). But brute force is not a good idea — code run time was ~ 4 hours for processing one day's data, which is not acceptable."

In [12]:
# Text of the top hit for the second question ("How long did brute force run for?").
ret_docs[1][0].page_content

"1. Brushing, zooming and refining are time consuming tasks. They are needed because the current set of methods are not robust to 'noisy tagging' — for example: terminating the task after brushing (without zooming and refining) leads to light curve of candidate + default light curve of the host star.\n2. Can this be simplified further? Sections of the image can be automatically brushing (using unsupervised learning) and labeled as candidates. This process can be repeated for all light curves. It is alright to have few false positives, but the algorithm should produce much fewer false negatives. Manual taggers only have to click on the 'x' button on each brush or leave the light curve unaltered (approval that the automatic candidate identification is accurate). Brushing, zooming and refining will still be available to taggers, but the hope here is to reduce their usage.  \n- If a light curve has 'large' number of taggers, their approval and disapproval (+ optional refining) can be used 

### Using max_marginal_relevance_search

Maximum marginal relevance yields a different ordering of the retrieved documents compared to similarity_search

In [13]:
# Same questions, retrieved with max_marginal_relevance_search (3 chunks per question).
docs_mmr = [vectordb.max_marginal_relevance_search(question, k=3) for question in questions]
# Top MMR hit for the first question.
docs_mmr[0][0]

Document(page_content="I'm currently pursuing independent research on large financial data. Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level to aggregate level, it is difficult to do the opposite.  \nMy laptop configuration is decent: Ubuntu 18.04, 16 GB DDR4, Intel core i7–8750H (6 + 6 virtual core) @ 2.2 GHz and a GPU (not relevant here). But brute force is not a good idea — code run time was ~ 4 hours for processing one day's data, which is not acceptable.", metadata={'Header 2': "Speeding-up 'diff' between Consecutive Rows in Python on My Laptop", 'Header 3': 'Introduction'})

In [14]:
# Top MMR hit for the second question.
docs_mmr[1][0]

Document(page_content='If the transition probabilities are known, a model free approximation of the Q function can be computed based on historical data. The agent can be made to act in a greedy way (immediate reward, without exploration) to achieve its goal. However, it should be noted that a set of locally optimal paths is not guaranteed to build a globally optimal path. Therefore, this method is not guaranteed to work in environments that are drastically different from those observed in the past.  \nIf the transition probabilities are known, a model free approximation of the Q function can be computed based on historical data. The agent can be made to act in a greedy way (immediate reward, without exploration) to achieve its goal. However, it should be noted that a set of locally optimal paths is not guaranteed to build a globally optimal path. Therefore, this method is not guaranteed to work in environments that are drastically different from those observed in the past.', metadata={

#### Narrowing the search space by applying metadata filters

In [15]:
# Restrict retrieval to chunks whose 'Header 2' metadata is the 'diff' speed-up post.
diff_post_filter = {"Header 2": "Speeding-up 'diff' between Consecutive Rows in Python on My Laptop"}
ret_docs = [
    vectordb.similarity_search(query, k=3, filter=diff_post_filter)
    for query in questions
]
ret_docs[0][0]

Document(page_content="I'm currently pursuing independent research on large financial data. Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level to aggregate level, it is difficult to do the opposite.  \nMy laptop configuration is decent: Ubuntu 18.04, 16 GB DDR4, Intel core i7–8750H (6 + 6 virtual core) @ 2.2 GHz and a GPU (not relevant here). But brute force is not a good idea — code run time was ~ 4 hours for processing one day's data, which is not acceptable.", metadata={'Header 2': "Speeding-up 'diff' between Consecutive Rows in Python on My Laptop", 'Header 3': 'Introduction'})

In [16]:
# Top hit for the second question under the same 'Header 2' filter.
ret_docs[1][0]

Document(page_content='#### Problem  \nThe result is a pandas series of lists. But my desired output is a structure with timestamp + edit in each row. So I used pd.concat(result), which had an additional run time of 3.5 mins (~ clock time for consecutive diff which is a relatively complicated process). This drove me nuts!  \n#### Solution  \nI found that np.concatenate is much faster (~ 40 sec) because of homogeneity. For simplicity I converted timestamp and all col_idx_values to string. The output was written to a CSV file using np.savetxt("file.csv", concat_result, fmt="%s") or using array_name.tofile("file.csv", sep=",", format="%s").', metadata={'Header 2': "Speeding-up 'diff' between Consecutive Rows in Python on My Laptop", 'Header 3': 'Concatenation — An Additional Problem'})

In [17]:
# Same questions, but restricted to chunks whose 'Header 2' is the exoplanet post.
exoplanet_post_filter = {"Header 2": "Search for Exoplanets - Humans vs. Stars"}
ret_docs = [
    vectordb.similarity_search(query, k=3, filter=exoplanet_post_filter)
    for query in questions
]
ret_docs[0][0]  # Shouldn't be very relevant

Document(page_content="I strongly recommend people to support [Planet Hunter TESS](https://www.zooniverse.org/projects/nora-dot-eisner/planet-hunters-tess) and other [Zooniverse](https://www.zooniverse.org/projects) projects. Many projects require support from a large pool of enthusiastic individuals (like the readers who have reached this section). Crowdsourcing in scientific research will grow exponentially and I recommend the readers to be one of the early members of this growing community.  \nThe previous section of article presented the vision for my independent research — unsupervised learning and crowdsourcing for exoplanet candidate identification. I'm currently developing the idea on my own, so progress has been slow. I welcome people to join me.  \n**Note:** This project will always remain open source.\n---\nlayout: post\ndate: 2021-10-31 12:00:00 -0500\nimage: ../data/R_troll.jpg\n---", metadata={'Header 2': 'Search for Exoplanets - Humans vs. Stars', 'Header 3': 'Conclusion

In [18]:
# Top hit for the second question under the exoplanet-post filter.
ret_docs[1][0]# Shouldn't be very relevant, but there's some discussion on brute force approach to exoplanet detection using crowd-sourcing

Document(page_content="1. Brushing, zooming and refining are time consuming tasks. They are needed because the current set of methods are not robust to 'noisy tagging' — for example: terminating the task after brushing (without zooming and refining) leads to light curve of candidate + default light curve of the host star.\n2. Can this be simplified further? Sections of the image can be automatically brushing (using unsupervised learning) and labeled as candidates. This process can be repeated for all light curves. It is alright to have few false positives, but the algorithm should produce much fewer false negatives. Manual taggers only have to click on the 'x' button on each brush or leave the light curve unaltered (approval that the automatic candidate identification is accurate). Brushing, zooming and refining will still be available to taggers, but the hope here is to reduce their usage.  \n- If a light curve has 'large' number of taggers, their approval and disapproval (+ optional 

### Other types of retrievers

These retrievers use the in-memory document chunks, not the chunks and metadata stored in Chroma.

In [19]:
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever

# Both retrievers are built from the same plain chunk texts; only the SVM
# retriever is given the embedding object.
chunk_texts = [chunk.page_content for chunk in md_header_splits]
svm_retriever = SVMRetriever.from_texts(chunk_texts, embedding)
tfidf_retriever = TFIDFRetriever.from_texts(chunk_texts)

In [20]:
# Retrieve documents for each question with the SVM retriever.
docs_svm = [svm_retriever.get_relevant_documents(question) for question in questions]
# Top SVM hit for the first question.
docs_svm[0][0]

Document(page_content="I'm currently pursuing independent research on large financial data. Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level to aggregate level, it is difficult to do the opposite.  \nMy laptop configuration is decent: Ubuntu 18.04, 16 GB DDR4, Intel core i7–8750H (6 + 6 virtual core) @ 2.2 GHz and a GPU (not relevant here). But brute force is not a good idea — code run time was ~ 4 hours for processing one day's data, which is not acceptable.")

In [21]:
# Top SVM hit for the second question.
docs_svm[1][0]

Document(page_content='If the transition probabilities are known, a model free approximation of the Q function can be computed based on historical data. The agent can be made to act in a greedy way (immediate reward, without exploration) to achieve its goal. However, it should be noted that a set of locally optimal paths is not guaranteed to build a globally optimal path. Therefore, this method is not guaranteed to work in environments that are drastically different from those observed in the past.  \nIf the transition probabilities are known, a model free approximation of the Q function can be computed based on historical data. The agent can be made to act in a greedy way (immediate reward, without exploration) to achieve its goal. However, it should be noted that a set of locally optimal paths is not guaranteed to build a globally optimal path. Therefore, this method is not guaranteed to work in environments that are drastically different from those observed in the past.')

In [22]:
# Retrieve documents for each question with the TF-IDF retriever.
docs_tfidf = [tfidf_retriever.get_relevant_documents(question) for question in questions]
# Top TF-IDF hit for the first question.
docs_tfidf[0][0]

Document(page_content="I'm currently pursuing independent research on large financial data. Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level to aggregate level, it is difficult to do the opposite.  \nMy laptop configuration is decent: Ubuntu 18.04, 16 GB DDR4, Intel core i7–8750H (6 + 6 virtual core) @ 2.2 GHz and a GPU (not relevant here). But brute force is not a good idea — code run time was ~ 4 hours for processing one day's data, which is not acceptable.")

In [23]:
# Top TF-IDF hit for the second question.
docs_tfidf[1][0]

Document(page_content="Disclaimer: If one has the option, one should use rstudio.cloud and shinyapps.io to develop and deploy RShiny apps. Following this article is a waste of time.  \nWorking on R/Python in Android OS may sound like an artificial constraint, but I faced this scenario - I wanted to run a RShiny app, but I did not have the resources to run apps in the 'free tier' both locally and on the Cloud.  \nExperts claim Android OS is just a version of Linux, but I'm not an expert in this area. This short article covers how a layman (like me) can run R scripts in Android OS. Similar steps can be followed to install and run Python in Android OS. Clearly, this is not an everyday scenario; not even something worth trying for fun.")

## Question answering

Note: For now this section of code won't work.

In [24]:
import openai

# Read the key through a context manager so the file handle is closed right away,
# and strip surrounding whitespace: a trailing newline in the key file would
# otherwise become part of the key.
with open("../OPENAI_API_KEY.txt", "r") as key_file:
    openai.api_key = key_file.read().strip()

In [25]:
import datetime

# Choose the model name by date: "gpt-3.5-turbo-0301" before 2023-09-02,
# "gpt-3.5-turbo" from that date onward.
current_date = datetime.datetime.now().date()
llm_name = (
    "gpt-3.5-turbo-0301"
    if current_date < datetime.date(2023, 9, 2)
    else "gpt-3.5-turbo"
)
print(llm_name)

gpt-3.5-turbo


In [26]:
from langchain.prompts import PromptTemplate

# Build the question-answering prompt. The template text contains two
# placeholders, {context} and {question}, and is wrapped into a prompt object
# with PromptTemplate.from_template.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

### Using langsmith

In [27]:
import os

# Tracing configuration is passed through environment variables.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# Close the key file deterministically and drop surrounding whitespace so a
# trailing newline in the file does not end up inside the API key.
with open("../LANGCHAIN_API_KEY.txt", "r") as key_file:
    os.environ["LANGCHAIN_API_KEY"] = key_file.read().strip()

## Chat

Note: For now this section of code won't work.

### Memory in chat