In [1]:
from IPython.display import JSON
# If `import chromadb` throws some errors:
# import numpy as np
# np.float_ = np.float64

import json
from unstructured.staging.base import dict_to_elements, elements_to_json
import chromadb

# If you're planning to use OpenAI stuff:
# import os
# os.environ["OPENAI_API_KEY"] = open("../OPENAI_API_KEY.txt", "r").read()# Add your OpenAI API key as text to OPENAI_API_KEY.txt
# # But remember, each query is going to be billed to the payment method on the Open AI profile

import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))

from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings

from langchain.prompts.prompt import PromptTemplate
# from langchain_openai import OpenAI# Just kidding
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

# Use case description

Making my blog searchable using unstructured package, some vector database, etc. - probably setting things up for a RAG pipeline

In [2]:
from unstructured.partition.md import partition_md
filename = "../../tech_non_tech_blog/_posts/2019-02-02-Speeding-up-diff-between-consecutive-rows-in-python-on-my-laptop.md"
elements = partition_md(filename=filename)
element_dict = [el.to_dict() for el in elements]
example_output = json.dumps(element_dict, indent=2)
JSON(example_output)



<IPython.core.display.JSON object>

Write to `chromadb`

In [3]:
client = chromadb.PersistentClient(path="chroma_tmp", settings=chromadb.Settings(allow_reset=True))
client.reset()

True

In [4]:
collection = client.create_collection(
    name="tech_non_tech_blog",
    metadata={"hnsw:space": "cosine"}
)

In [5]:
chapters = [
    "Introduction",
    "Data (samples):",
    "Desired outputs for the examples",
    "Properties",
    "Solutions (processing 1 day)",
    "Brute force",
    "Improvement using pandas",
    "Improved ‘search’ using dictionary",
    "Slightly improved ‘search’: using only 1 loop (both id’s are sorted)",
    "Ultimatum: multi-processing",
    "Run-time Comparison",
    "Concatenation — An Additional Problem",
    "Problem",
    "Solution",
    "Learnings",
    "Summary"
]

In [6]:
chapter_ids = {}
for element in element_dict:
    for chapter in chapters:
        try:
            if element["text"] == chapter and element["type"] == "Title":
                chapter_ids[element["element_id"]] = chapter
                break
        except:
            continue

chapter_ids

{'b605350bc00209520b7cd8f546322663': 'Introduction',
 '636d541c4a0f461e28dea7d07da5b6dc': 'Data (samples):',
 '0c0b668be128a6f206694ef12e326abe': 'Desired outputs for the examples',
 'ae43692b2a310b8ed2443d2b7d2a3e4c': 'Properties',
 'e0cd2d7145059c6e745c7bb870252f48': 'Solutions (processing 1 day)',
 '710964e714f40ea09cb19e2a61d90ced': 'Brute force',
 '83df8c94937e7d4f56e1061aa4e8d275': 'Improvement using pandas',
 '90644dce098ec9604e4ed6b13daddf25': 'Ultimatum: multi-processing',
 'e3215a5d8b30d553614f0458c518b711': 'Run-time Comparison',
 '956dd4243bcd8067fc08f0b2a1b09384': 'Concatenation — An Additional Problem',
 'a1c5ae7bcb34bbf48104488eff8deede': 'Problem',
 '22927695994eab589c7601a6b15df4d9': 'Solution',
 '71cc0d3f8db84d73d55bd64224208f7c': 'Learnings',
 '8e76a94ac8320d515375e625bef18292': 'Summary'}

In [7]:
for element in element_dict:
    parent_id = element["metadata"].get("parent_id")
    chapter = chapter_ids.get(parent_id, "")
    collection.add(
        documents=[element["text"]],
        ids=[element["element_id"]],
        metadatas=[{"chapter": chapter}]
    )

In [8]:
results = collection.peek()
print(results["documents"])

["Slightly improved 'search': using only 1 loop (both id’s are sorted)", 'Algorithm matters! Gains are in algorithm level. This applies to almost every area of programming.', "Without mentioning the type of data, here are few anonymized samples (dy's can be positive or negative):", '0 < y0_id < y1_id < …. < yN_id', 'Let us iterate through 2 examples:', 'Desired outputs for the examples', "I'm currently pursuing independent research on large financial data. Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level to aggregate level, it is difficult to do the opposite.", 'Answer: Other columns (not shown above) in the data set help in identifying the difference.', "I believe this one doesn't require explanation. Sample code:", "My laptop configuration is decent: Ubuntu 18.04, 16 GB DD

In [9]:
result = collection.query(
    query_texts=["How long did brute force take to process one day's data?"],
    n_results=2,
    where={"chapter": "Introduction"},
)
print(json.dumps(result, indent=2))

{
  "ids": [
    [
      "1b6fec632d0be229a6cc22ae5d8018eb",
      "0dd507109de3540719327b782086dcd0"
    ]
  ],
  "distances": [
    [
      0.5362240327506318,
      0.8606636028930317
    ]
  ],
  "metadatas": [
    [
      {
        "chapter": "Introduction"
      },
      {
        "chapter": "Introduction"
      }
    ]
  ],
  "embeddings": null,
  "documents": [
    [
      "My laptop configuration is decent: Ubuntu 18.04, 16 GB DDR4, Intel core i7\u20138750H (6 + 6 virtual core) @ 2.2 GHz and a GPU (not relevant here). But brute force is not a good idea \u2014 code run time was ~ 4 hours for processing one day's data, which is not acceptable.",
      "I'm currently pursuing independent research on large financial data. Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level

Not very specific, but not a bad result.

## Trying Hugging Face summarization instead

In [10]:
element_dict[3]["text"]

"I'm currently pursuing independent research on large financial data. Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level to aggregate level, it is difficult to do the opposite."

Make sure to run `conda install conda-forge::sentencepiece` before running the next code block

The following code was copied from https://huggingface.co/csebuetnlp/mT5_multilingual_XLSum

In [11]:
article_text = element_dict[3]["text"]
model_name = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

input_ids = tokenizer(
    [WHITESPACE_HANDLER(article_text)],
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=512
)["input_ids"]
output_ids = model.generate(
    input_ids=input_ids,
    max_length=84,
    no_repeat_ngram_size=2,
    num_beams=4
)[0]
summary = tokenizer.decode(
    output_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)
print(summary)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


ах и а т і м о х е - мати .


Funny! Let's try again with a hardcoded text

In [12]:
article_text = """Data engineering is an integral part of the analysis as financial data is often not in the format required for analysis. Financial data may be provided in the form of transactions/updates or in the form of aggregates. While it is easy to go from transaction level to aggregate level, it is difficult to do the opposite."""
input_ids = tokenizer(
    [WHITESPACE_HANDLER(article_text)],
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=512
)["input_ids"]
output_ids = model.generate(
    input_ids=input_ids,
    max_length=84,
    no_repeat_ngram_size=2,
    num_beams=4
)[0]
summary = tokenizer.decode(
    output_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)
print(summary)

ах и т а щ і м о х е ща мич я ри - мати .


Funny! Choosing a good summarizaiton model is not easy. This will need some fine-tuning or RAG. Stay tuned!

## Trying the retrieval piece

Fortunately this is not the end of the world. There are some open source embedding models. The list along with some code can be found here: https://python.langchain.com/docs/integrations/text_embedding/

Make sure to run `pip install spacy` before running the next code block

Also run `python -m spacy download en_core_web_sm` before running the next code block

In [13]:
embeddings = SpacyEmbeddings()

In [14]:
documents = []
for element in elements:
    metadata = element.metadata.to_dict()
    del metadata["languages"]
    metadata["source"] = metadata["filename"]
    documents.append(Document(page_content=element.text, metadata=metadata))

In [15]:
vectorstore = Chroma.from_documents(documents, embeddings)

In [16]:
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6}
)

In [17]:
template = """You are an AI assistant for answering questions about the articles I wrote in the tech_non_tech_blog.
You are given the following extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
If the question is not about articles in the tech_non_tech_blog, politely inform them that you are tuned to only answer questions about tech_non_tech_blog.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""
prompt = PromptTemplate(template=template, input_variables=["question", "context"])

## OpenLLM

OpenLLM doesn't seem to work on Windows. Try again using Linux

Make sure to run `pip install openllm` before running the next code block

Still the OpenLLM model doesn't initialize

AttributeError: module 'openllm' has no attribute 'Runner'

Tried a few things using OpenLLM command line. It doesn't look like OpenLLM works on Windows. Need to switch to Linux and try. WSL doesn't work because for some reason Nvidia GPU isn't recognized.