In [2]:
import os
import textwrap

import langchain
import chromadb
import transformers
import openai
import torch
import json
import time

from transformers import AutoTokenizer
from langchain import HuggingFacePipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from mixedbread_ai.client import MixedbreadAI
from chromadb.utils import embedding_functions

ModuleNotFoundError: No module named 'mixedbread_ai'

In [82]:
# Load config data
f = open('config.json')

config = json.load(f)

In [83]:
key = config['key']
key

'emb_7def3fe8a51080e1466350a510a87a0b32d170dcda85c5fa'

In [40]:
#Set up HuggingFace Pipeline with Llama-2-7b-chat-hf model
model = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
      "text-generation", #task
      model=model,
      tokenizer=tokenizer,
      torch_dtype=torch.float16,
      trust_remote_code=True,
      device_map="auto",
      max_length=1000,
      do_sample=True,
      top_k=10,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id
)

#LLM intialized in HuggingFace Pipeline wrapper
llm = HuggingFacePipeline(pipeline = pipeline, model_kwargs = {'temperature':0})

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.90s/it]


In [115]:
loader = CSVLoader('data/atcs-final-test.csv')
docs = loader.load()
docs[0]

Document(page_content='Question: What types of fragrances do you offer?\nAnswer: We sell exotic Indian fragrances.', metadata={'source': 'data/atcs-final-test.csv', 'row': 0})

In [118]:
# Split document into text chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
print(docs[0])
docs[0] = text_splitter.split_documents(docs)

page_content='Question: What types of fragrances do you offer?\nAnswer: We sell exotic Indian fragrances.' metadata={'source': 'data/atcs-final-test.csv', 'row': 0}


[Document(page_content='Question: What types of fragrances do you offer?\nAnswer: We sell exotic Indian fragrances.', metadata={'source': 'data/atcs-final-test.csv', 'row': 0}),
 Document(page_content='Question: Do you sell anything besides perfume?\nAnswer: Yes! Incense, burning oils, etc.', metadata={'source': 'data/atcs-final-test.csv', 'row': 1})]

In [119]:
# Create embedding model
default_ef = embedding_functions.DefaultEmbeddingFunction()

In [61]:
!pip install tiktoken
!pip install mixedbread-ai

Collecting mixedbread-ai
  Downloading mixedbread_ai-2.2.0-py3-none-any.whl.metadata (5.4 kB)
Collecting types-requests<3.0.0.0,>=2.31.0.20240311 (from mixedbread-ai)
  Downloading types_requests-2.31.0.20240406-py3-none-any.whl.metadata (1.8 kB)
Downloading mixedbread_ai-2.2.0-py3-none-any.whl (39 kB)
Downloading types_requests-2.31.0.20240406-py3-none-any.whl (15 kB)
Installing collected packages: types-requests, mixedbread-ai
Successfully installed mixedbread-ai-2.2.0 types-requests-2.31.0.20240406


In [122]:
# Load it into ChromaDB
db = Chroma.from_documents(docs, default_ef)

AttributeError: 'ONNXMiniLM_L6_V2' object has no attribute 'embed_documents'

In [87]:
#Design Prompt Template
template = """You are an english teacher chatbot for an AP English language class.

{context}

Look for the phrases described in the source data. Surround these phrases with a double asterisk in the original text, and return it. If you are unsure, say "I am unsure."

Question:

Answer: """

In [88]:
#Intiliaze prompt using prompt template via langchain
prompt = PromptTemplate(template=template, input_variables=["context"])
print(
    prompt.format(
        context = "A student has an essay they would like you to review"
    )
)

You are an english teacher chatbot for an AP English language class.

A student has an essay they would like you to review

Look for the phrases described in the source data. Surround these phrases with a double asterisk in the original text, and return it. If you are unsure, say "I am unsure."

Question:

Answer: 


In [89]:
#Chain to have all components together and query the LLM
chain_type_kwargs = {"prompt": prompt}

chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 1}),
    chain_type_kwargs=chain_type_kwargs,
)

AttributeError: 'list' object has no attribute 'as_retriever'

In [None]:
# Formatted printing
def print_response(response: str):
    print("\n".join(textwrap.wrap(response, width=80)))

In [None]:
#Running chain through LLM with query
query = "Does this text contain 'Since the beginning of time' Text: Since the beginning of time, humans have engaged in the practice of storytelling."
response = chain.run(query)
print_response(response)