In [1]:
import os
import openai
import sys

In [2]:
from langchain.document_loaders import NotionDirectoryLoader
# Directory holding the blog's Markdown posts (relative to this notebook).
posts_dir = "../../tech_non_tech_blog/_posts/"
loader = NotionDirectoryLoader(posts_dir)

In [3]:
# Load the posts; `docs` is indexable and each item exposes `.metadata` and
# `.page_content`, both of which are used further down.
docs = loader.load()

In [4]:
# Peek at the first document's metadata (shows the source file path).
docs[0].metadata

{'source': '..\\..\\tech_non_tech_blog\\_posts\\2019-02-02-Speeding-up-diff-between-consecutive-rows-in-python-on-my-laptop.md'}

In [5]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
# Map Markdown heading markers ("#", "##", "###") to the metadata keys
# "Header 1" .. "Header 3" that the splitter attaches to each chunk.
headers_to_split_on = [("#" * level, f"Header {level}") for level in (1, 2, 3)]
# Splitter that cuts Markdown text at the heading levels configured above.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

def extract_txt(docs, separator="\n\n"):
    """Concatenate the text of several documents into one Markdown string.

    Documents are joined with a blank line rather than a single space. With a
    space, the first line of a document is glued onto the last line of the
    previous one whenever that document does not end in a newline, so a leading
    ``# Heading`` or ``---`` line no longer starts at the beginning of a line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
        separator: Text placed between consecutive documents. Pass ``' '`` to
            reproduce the previous single-space behaviour.

    Returns:
        The joined text; an empty string when ``docs`` is empty.
    """
    return separator.join(d.page_content for d in docs)

# Merge all posts into one string, then cut it at the configured headings.
# NOTE(review): merging first means the per-file `source` metadata is not
# carried onto the chunks; only the header metadata is.
combined_markdown = extract_txt(docs)
md_header_splits = markdown_splitter.split_text(combined_markdown)

In [6]:
# Inspect one chunk: its text plus the header metadata attached by the splitter.
md_header_splits[1]

Document(page_content="I'm currently pursuing independent research on large financial data. Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level to aggregate level, it is difficult to do the opposite.  \nMy laptop configuration is decent: Ubuntu 18.04, 16 GB DDR4, Intel core i7–8750H (6 + 6 virtual core) @ 2.2 GHz and a GPU (not relevant here). But brute force is not a good idea — code run time was ~ 4 hours for processing one day's data, which is not acceptable.", metadata={'Header 2': "Speeding-up 'diff' between Consecutive Rows in Python on My Laptop", 'Header 3': 'Introduction'})

In [7]:
import pandas as pd
# Count chunks per "Header 2" title; chunks without that key are grouped under ''.
header2_titles = [chunk.metadata.get('Header 2', '') for chunk in md_header_splits]
pd.Series(header2_titles).value_counts()

Lessons from my Internship (and Immediate Aftermath)                              8
Search for Exoplanets - Humans vs. Stars                                          8
Speeding-up 'diff' between Consecutive Rows in Python on My Laptop                6
Does python prioritize True in 'OR' and False in 'AND' condition checks?          6
Unsupervised Deep Learning in Astronomy for Exoplanet Candidate Identification    5
Running Python or R in Android OS                                                 5
                                                                                  1
Name: count, dtype: int64

To fix the chunking issue, the `---` header section had to be added to each .md file. Now things look better.

## Finding similarity using SpacyEmbeddings

In [8]:
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
# Embedding model shared by the manual similarity check, the Chroma store and
# the SVM retriever further down.
embedding = SpacyEmbeddings()

In [9]:
# Embed the query once ...
embedding1 = embedding.embed_query("What's my laptop configuration?")
# ... and every chunk's text with the same model, one call per chunk.
embedding2s = [
    embedding.embed_query(chunk.page_content)
    for chunk in md_header_splits
]

In [10]:
import numpy as np
# Score each chunk against the query with a plain dot product.
# NOTE(review): no normalisation is applied here, so this is not cosine similarity.
similarities = [
    np.dot(chunk_vector, embedding1)
    for chunk_vector in embedding2s
]
# Position of the highest-scoring chunk.
np.argmax(similarities)

1

In [11]:
# Number of best-scoring chunks to display.
k = 5
# Negate the scores so argsort's ascending order yields best-first indices.
top_k = np.argsort(-np.array(similarities))[:k]
# Fancy-index the chunk texts with those indices.
np.array([chunk.page_content for chunk in md_header_splits])[top_k]

array(["I'm currently pursuing independent research on large financial data. Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level to aggregate level, it is difficult to do the opposite.  \nMy laptop configuration is decent: Ubuntu 18.04, 16 GB DDR4, Intel core i7–8750H (6 + 6 virtual core) @ 2.2 GHz and a GPU (not relevant here). But brute force is not a good idea — code run time was ~ 4 hours for processing one day's data, which is not acceptable.",
       '---\nlayout: post\ndate: 2019-01-11 12:00:00 -0500\nimage: ../data/multiprocessing.jpg\n---',
       "I recently extended the analysis, but there are several more ideas for 'discretization'.\n---\nlayout: post\ndate: 2019-09-21 12:00:00 -0500\nimage: ../data/Transit_photometry.gif\n---",
       "1.Follow the instructions in 

## Storing in vector store

In [12]:
from langchain.vectorstores import Chroma
# Better to use a new folder here. There's a chromadb version compatibility
# issue while appending to the existing docs/chromadb folder.
persist_directory = 'docs/tech_non_tech_blog/'

# Open (or create) the persisted collection WITHOUT inserting anything.
# Calling Chroma.from_documents(docs, ...) here instead would add the unsplit
# posts on every run, so the emptiness check below could never succeed and
# the header-level chunks would never be indexed in a fresh directory.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

# Populate the store with the header-level chunks only on first use, so that
# re-running this cell does not insert duplicates.
if vectordb._collection.count() == 0:
    vectordb = Chroma.from_documents(
        documents=md_header_splits,
        embedding=embedding,  # Reusing the spacy embedding
        persist_directory=persist_directory,
    )

# Always report how many chunks the store holds.
print(vectordb._collection.count())

In [13]:
# Ask the vector store for the three chunks closest to the question.
question = "What's my laptop configuration?"
ret_docs = vectordb.similarity_search(query=question, k=3)
# Text of the best hit.
ret_docs[0].page_content

"I'm currently pursuing independent research on large financial data. Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level to aggregate level, it is difficult to do the opposite.  \nMy laptop configuration is decent: Ubuntu 18.04, 16 GB DDR4, Intel core i7–8750H (6 + 6 virtual core) @ 2.2 GHz and a GPU (not relevant here). But brute force is not a good idea — code run time was ~ 4 hours for processing one day's data, which is not acceptable."

In [14]:
# Same lookup with a different question, still unfiltered.
question = "How long did brute force run for?"
ret_docs = vectordb.similarity_search(query=question, k=3)
# Text of the best hit.
ret_docs[0].page_content

"1. Brushing, zooming and refining are time consuming tasks. They are needed because the current set of methods are not robust to 'noisy tagging' — for example: terminating the task after brushing (without zooming and refining) leads to light curve of candidate + default light curve of the host star.\n2. Can this be simplified further? Sections of the image can be automatically brushing (using unsupervised learning) and labeled as candidates. This process can be repeated for all light curves. It is alright to have few false positives, but the algorithm should produce much fewer false negatives. Manual taggers only have to click on the 'x' button on each brush or leave the light curve unaltered (approval that the automatic candidate identification is accurate). Brushing, zooming and refining will still be available to taggers, but the hope here is to reduce their usage.  \n- If a light curve has 'large' number of taggers, their approval and disapproval (+ optional refining) can be used 

In [15]:
# Second-ranked hit from the most recent `similarity_search` call.
ret_docs[1].page_content

'It\'s possible to setup Ubuntu like windowing system for Ubuntu in Android OS. Once again, some people claimed to be successful using the steps in this [Reddit post](https://www.reddit.com/r/termux/comments/184kb1c/this_is_just_a_quick_rundown_of_termuxx11/). I tried this, and also tried installing x11 using `apt install`, but neither solution worked for me. So, I\'ll add another disclaimer - "Your results may vary", but I hit a dead-end.  \nInstead of setting up a windowing system like x11, I ran the RShiny app using `Rscript` command - this doesn\'t pop-up a new window with the app, but the app can be opened in any browser by entering "localhost:<port_number>". To avoid randomness in the port number it\'s  better to run `Rscript -e "shiny::runApp(host=\'127.0.0.1\', port=<any_number_of_your_choice>)"`.  \nNow we have R (or Python) running in Android OS. Tablet hardware (especially CPUs) are meant to conserve energy, so they may be extremely slow with compilation and program executio

In [16]:
# Retrieve with max-marginal-relevance search instead of plain similarity,
# reusing the `question` set in an earlier cell.
docs_mmr = vectordb.max_marginal_relevance_search(query=question, k=3)
docs_mmr[0]

Document(page_content='If the transition probabilities are known, a model free approximation of the Q function can be computed based on historical data. The agent can be made to act in a greedy way (immediate reward, without exploration) to achieve its goal. However, it should be noted that a set of locally optimal paths is not guaranteed to build a globally optimal path. Therefore, this method is not guaranteed to work in environments that are drastically different from those observed in the past.  \nIf the transition probabilities are known, a model free approximation of the Q function can be computed based on historical data. The agent can be made to act in a greedy way (immediate reward, without exploration) to achieve its goal. However, it should be noted that a set of locally optimal paths is not guaranteed to build a globally optimal path. Therefore, this method is not guaranteed to work in environments that are drastically different from those observed in the past.', metadata={

In [17]:
# Restrict the search to chunks whose "Header 2" metadata names the 'diff' post.
question = "How long did brute force run for?"
ret_docs = vectordb.similarity_search(
    query=question,
    k=3,
    filter={"Header 2": "Speeding-up 'diff' between Consecutive Rows in Python on My Laptop"},
)
ret_docs[0]

Document(page_content='#### Problem  \nThe result is a pandas series of lists. But my desired output is a structure with timestamp + edit in each row. So I used pd.concat(result), which had an additional run time of 3.5 mins (~ clock time for consecutive diff which is a relatively complicated process). This drove me nuts!  \n#### Solution  \nI found that np.concatenate is much faster (~ 40 sec) because of homogeneity. For simplicity I converted timestamp and all col_idx_values to string. The output was written to a CSV file using np.savetxt("file.csv", concat_result, fmt="%s") or using array_name.tofile("file.csv", sep=",", format="%s").', metadata={'Header 2': "Speeding-up 'diff' between Consecutive Rows in Python on My Laptop", 'Header 3': 'Concatenation — An Additional Problem'})

In [18]:
# Same question, but filtered to a different post, for comparison.
question = "How long did brute force run for?"
ret_docs = vectordb.similarity_search(
    query=question,
    k=3,
    filter={"Header 2": "Search for Exoplanets - Humans vs. Stars"},
)
ret_docs[0]

Document(page_content="1. Brushing, zooming and refining are time consuming tasks. They are needed because the current set of methods are not robust to 'noisy tagging' — for example: terminating the task after brushing (without zooming and refining) leads to light curve of candidate + default light curve of the host star.\n2. Can this be simplified further? Sections of the image can be automatically brushing (using unsupervised learning) and labeled as candidates. This process can be repeated for all light curves. It is alright to have few false positives, but the algorithm should produce much fewer false negatives. Manual taggers only have to click on the 'x' button on each brush or leave the light curve unaltered (approval that the automatic candidate identification is accurate). Brushing, zooming and refining will still be available to taggers, but the hope here is to reduce their usage.  \n- If a light curve has 'large' number of taggers, their approval and disapproval (+ optional 

### Other types of retrievers

In [19]:
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
# Build an SVM-based retriever over the raw chunk texts, reusing the same embedding model.
svm_retriever = SVMRetriever.from_texts(
    [chunk.page_content for chunk in md_header_splits],
    embedding,
)

In [20]:
# Query the SVM retriever and show its top-ranked document.
question = "How long did brute force run for?"
docs_svm = svm_retriever.get_relevant_documents(question)
docs_svm[0]

Document(page_content='If the transition probabilities are known, a model free approximation of the Q function can be computed based on historical data. The agent can be made to act in a greedy way (immediate reward, without exploration) to achieve its goal. However, it should be noted that a set of locally optimal paths is not guaranteed to build a globally optimal path. Therefore, this method is not guaranteed to work in environments that are drastically different from those observed in the past.  \nIf the transition probabilities are known, a model free approximation of the Q function can be computed based on historical data. The agent can be made to act in a greedy way (immediate reward, without exploration) to achieve its goal. However, it should be noted that a set of locally optimal paths is not guaranteed to build a globally optimal path. Therefore, this method is not guaranteed to work in environments that are drastically different from those observed in the past.')

In [21]:
# Build a TF-IDF retriever over the same chunk texts (no embedding model is passed).
tfidf_retriever = TFIDFRetriever.from_texts(
    [chunk.page_content for chunk in md_header_splits]
)

In [22]:
# Query the TF-IDF retriever with the `question` set in an earlier cell.
docs_tfidf = tfidf_retriever.get_relevant_documents(question)
docs_tfidf[0]

Document(page_content="Disclaimer: If one has the option, one should use rstudio.cloud and shinyapps.io to develop and deploy RShiny apps. Following this article is a waste of time.  \nWorking on R/Python in Android OS may sound like an artificial constraint, but I faced this scenario - I wanted to run a RShiny app, but I did not have the resources to run apps in the 'free tier' both locally and on the Cloud.  \nExperts claim Android OS is just a version of Linux, but I'm not an expert in this area. This short article covers how a layman (like me) can run R scripts in Android OS. Similar steps can be followed to install and run Python in Android OS. Clearly, this is not an everyday scenario; not even something worth trying for fun.")

## Question answering

Note: For now this section of code won't work.

In [23]:
import openai
# Read the key from a file kept outside the notebook. The context manager
# closes the handle, and strip() drops the trailing newline most editors add,
# which would otherwise become part of the credential.
with open("../OPENAI_API_KEY.txt", "r") as key_file:
    openai.api_key = key_file.read().strip()

In [24]:
import datetime
# Before the cut-over date use the pinned "0301" snapshot; from that date on
# use the unpinned model name.
model_cutover = datetime.date(2023, 9, 2)
current_date = datetime.datetime.now().date()
llm_name = "gpt-3.5-turbo-0301" if current_date < model_cutover else "gpt-3.5-turbo"
print(llm_name)

gpt-3.5-turbo


In [25]:
from langchain.prompts import PromptTemplate

# Build prompt
# The template leaves two placeholders, {context} and {question}, to be filled
# in later; everything else is fixed instruction text sent to the model.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
# Wrap the raw string in a PromptTemplate object.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

### Using langsmith

In [26]:
import os
# Configure LangSmith tracing through environment variables.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# Read the key from a file kept outside the notebook. The context manager
# closes the handle, and strip() removes the trailing newline that would
# otherwise be stored as part of the key.
with open("../LANGCHAIN_API_KEY.txt", "r") as key_file:
    os.environ["LANGCHAIN_API_KEY"] = key_file.read().strip()

## Chat

Note: For now this section of code won't work.

### We've already seen this

### Memory

## Putting everything together

In [27]:
# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
# from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader

In [28]:
def load_db(file, chain_type, k):
    """Build a conversational retrieval chain over a single PDF.

    The PDF is loaded, split into overlapping character chunks, embedded with
    ``SpacyEmbeddings`` and indexed in an in-memory ``DocArrayInMemorySearch``
    store. Chat memory is NOT kept by the chain; callers pass the history in.

    Args:
        file: Path of the PDF handed to ``PyPDFLoader``.
        chain_type: Forwarded as ``chain_type`` to
            ``ConversationalRetrievalChain.from_llm``.
        k: Number of chunks the similarity retriever returns per query.

    Returns:
        The ``ConversationalRetrievalChain``, configured to also return the
        source documents and the generated question.

    Note:
        Reads the module-level ``llm_name`` to select the chat model.
    """
    # The notebook-level ChatOpenAI import is commented out, so import it here;
    # without this the call below raises NameError.
    from langchain.chat_models import ChatOpenAI

    # load documents
    documents = PyPDFLoader(file).load()
    # split documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(documents)
    # define embedding (local spaCy model instead of OpenAIEmbeddings)
    embeddings = SpacyEmbeddings()
    # create vector database from data
    db = DocArrayInMemorySearch.from_documents(docs, embeddings)
    # define retriever
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})
    # create a chatbot chain. Memory is managed externally.
    qa = ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(model_name=llm_name, temperature=0),
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
        return_generated_question=True,
    )
    return qa


In [29]:
import panel as pn
import param

class cbfs(param.Parameterized):
    """Chatbot state holder that connects a retrieval QA chain to Panel views.

    Relies on names defined outside this class: the ``load_db`` function and
    the module-level widgets ``file_input``, ``button_load`` and ``inp``.
    """
    # (question, answer) tuples accumulated by ``convchain``.
    chat_history = param.List([])
    # Answer text from the most recent chain call.
    answer = param.String("")
    # ``generated_question`` from the most recent chain call.
    db_query = param.String("")
    # ``source_documents`` from the most recent chain call.
    db_response = param.List([])

    def __init__(self, **params):
        """Initialise the panels list and build the chain for the default PDF."""
        super().__init__(**params)
        self.panels = []
        self.loaded_file = "docs/cs229_lectures/MachineLearning-Lecture01.pdf"
        self.qa = load_db(self.loaded_file, "stuff", 4)

    def call_load_db(self, count):
        """Rebuild the chain from the uploaded file and report the loaded file.

        Args:
            count: When 0 (or when no file has been chosen in ``file_input``)
                nothing is reloaded.

        Returns:
            A Markdown pane naming the currently loaded file.
        """
        if count == 0 or file_input.value is None:  # init or no file specified
            return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")
        file_input.save("temp.pdf")  # local copy
        self.loaded_file = file_input.filename
        # Switch the button style while the (slow) reload runs, then restore it.
        button_load.button_style = "outline"
        self.qa = load_db("temp.pdf", "stuff", 4)
        button_load.button_style = "solid"
        # A new document makes the previous conversation irrelevant.
        self.clr_history()
        return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

    def convchain(self, query):
        """Send ``query`` to the chain, record the exchange and render the chat.

        Args:
            query: User question; a falsy value renders an empty placeholder
                row without calling the chain.

        Returns:
            A scrollable ``pn.WidgetBox`` holding all exchanges so far.
        """
        if not query:
            return pn.WidgetBox(pn.Row('User:', pn.pane.Markdown("", width=600)), scroll=True)
        result = self.qa({"question": query, "chat_history": self.chat_history})
        self.chat_history.append((query, result["answer"]))
        self.db_query = result["generated_question"]
        self.db_response = result["source_documents"]
        self.answer = result['answer']
        self.panels.extend([
            pn.Row('User:', pn.pane.Markdown(query, width=600)),
            # `styles=` (not `style=`) to match every other pane in this class.
            pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, styles={'background-color': '#F6F6F6'}))
        ])
        inp.value = ''  # clears loading indicator when cleared
        return pn.WidgetBox(*self.panels, scroll=True)

    # The dependency must name the parameter exactly; the previous spec
    # 'db_query ' (trailing space) does not match any parameter.
    @param.depends('db_query')
    def get_lquest(self):
        """Render the last question sent to the DB, or a placeholder if none."""
        if not self.db_query:
            return pn.Column(
                pn.Row(pn.pane.Markdown("Last question to DB:", styles={'background-color': '#F6F6F6'})),
                pn.Row(pn.pane.Str("no DB accesses so far"))
            )
        return pn.Column(
            pn.Row(pn.pane.Markdown("DB query:", styles={'background-color': '#F6F6F6'})),
            pn.pane.Str(self.db_query)
        )

    @param.depends('db_response')
    def get_sources(self):
        """Render the source documents of the last lookup; None when empty."""
        if not self.db_response:
            return
        rlist = [pn.Row(pn.pane.Markdown("Result of DB lookup:", styles={'background-color': '#F6F6F6'}))]
        rlist.extend(pn.Row(pn.pane.Str(doc)) for doc in self.db_response)
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    @param.depends('convchain', 'clr_history')
    def get_chats(self):
        """Render the raw ``chat_history`` list, or a placeholder if empty."""
        if not self.chat_history:
            return pn.WidgetBox(pn.Row(pn.pane.Str("No History Yet")), width=600, scroll=True)
        rlist = [pn.Row(pn.pane.Markdown("Current Chat History variable", styles={'background-color': '#F6F6F6'}))]
        rlist.extend(pn.Row(pn.pane.Str(exchange)) for exchange in self.chat_history)
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    def clr_history(self, count=0):
        """Reset the conversation; ``count`` is accepted but not used."""
        self.chat_history = []
        return
