In [1]:
import json
import transformers  # type: ignore

from tqdm.auto import tqdm  # type: ignore

from langchain.llms import HuggingFacePipeline  # type: ignore
from langchain.chains import LLMChain  # type: ignore
from langchain.prompts import PromptTemplate  # type: ignore

from datasets import Dataset, DatasetDict  # type: ignore

# disable warnings
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_type = 'cars'
PROMPT_MODE = 'rag'

In [3]:
BASE_PATH = "/home/stepan/cars-sleep-chatbot"
MODEL_ID = f"{BASE_PATH}/models/{dataset_type}/llama-3_2-1b-it"
MAX_NEW_TOKENS = 8192
MAX_SEQ_LENGTH = 32768 - MAX_NEW_TOKENS

In [4]:
PROMPTS = {
    'cars': {
        'system': "You are an expert in sleep science with in-depth knowledge of sleep physiology, circadian rhythms, sleep disorders, and the impact of sleep on health and cognitive performance. Your task is to generate insightful and varied answers on sleep-related topics. The answers should be diverse in complexity, suitable for learners and experts alike.",
        'basic': "Human: Generate me an answer to the given question: {question}\n\nAssistant:",
        'rag': "Use resources provided to answer the following question.\nResources: {resources}\n\nHuman: Generate me an answer to the given question: {question}\n\nAssistant:",
    },
    'sleep': {
        'system': "You are an expert in the history of automobiles with in-depth knowledge of the development of automobiles from the late 19th century to the present day. Your task is to generate insightful and varied answers on automobile history. The answers should be diverse in complexity, suitable for learners and experts alike.",
        'basic': "Human: Generate me an answer to the given question: {question}\n\nAssistant:",
        'rag': "Use resources provided to answer the following question.\nResources: {resources}\n\nHuman: Generate me an answer to the given question: {question}\n\nAssistant:",
    }
}

In [5]:
text_generation_pipeline = transformers.pipeline(
    model=MODEL_ID,
    task="text-generation",
    temperature=0.5,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=MAX_NEW_TOKENS,
)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [6]:
llama_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt = PromptTemplate(
    input_variables=["question", "resources"],
    template=PROMPTS[dataset_type][PROMPT_MODE],
)

llm_chain = LLMChain(llm=llama_llm, prompt=prompt)

In [7]:
llm_chain.invoke({"question": "What is the best car?"})

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


{'question': 'What is the best car?',
 'text': 'Human: Generate me an answer to the given question: What is the best car?\n\nAssistant: I\'d be happy to help you with that. The answer depends on various factors such as budget, personal preferences, and intended use of the vehicle.\n\nHere are some top contenders for the "best" car:\n\n**Luxury Segment**\n\n1. **Mercedes-Benz S-Class**: A flagship sedan known for its comfort, performance, and opulent interior.\n2. **BMW 7-Series**: A premium luxury SUV offering exceptional handling, safety features, and a spacious cabin.\n3. **Audi R8**: A high-performance sports car featuring cutting-edge technology, stunning design, and thrilling driving experiences.\n\n**Performance Segment**\n\n1. **Porsche 911**: A legendary sports car renowned for its exceptional handling, acceleration, and distinctive design.\n2. **Ferrari F8 Tributo**: An ultra-luxurious supercar boasting breathtaking performance, stylish design, and exclusive features.\n3. **Ni

In [7]:
def load_data(file_path):
    with open(file_path, "r") as f:
        data = json.load(f)
    return data


def to_dataset(data):
    restructured_data = {
        "question": [],
        "resources": [],
        "answer": [],
    }

    for qna in data:
        restructured_data["question"].append(qna["question"])
        restructured_data["answer"].append(qna["answer"])
        restructured_data["resources"].append('\n'.join([resource['summary'] for resource in qna["citation"]]))

    return Dataset.from_dict(restructured_data)


def prepare_dataset(base_path=None):
    test_cars = load_data(f"{base_path}/data/test_qa_car.json")
    test_sleep = load_data(f"{base_path}/data/test_qa_sleep.json")

    test_cars_dataset = to_dataset(test_cars)
    test_sleep_dataset = to_dataset(test_sleep)

    return {"cars": test_cars_dataset, "sleep": test_sleep_dataset}

In [8]:
dataset = prepare_dataset(base_path=BASE_PATH)

In [11]:
predictions = []
for question in tqdm(dataset[dataset_type]["question"]):
    predictions.append(llm_chain.invoke({"question": question}))
# save predictions
with open(f"{BASE_PATH}/data/{dataset_type}_predictions.json", "w") as f:
    json.dump(predictions, f)

  0%|          | 0/27 [00:00<?, ?it/s]

 33%|███▎      | 9/27 [01:56<03:50, 12.83s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 27/27 [06:00<00:00, 13.34s/it]


# Test without RAG

# RAG

In [9]:
from langchain_community.document_loaders import TextLoader # type: ignore
from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter # type: ignore
from langchain.vectorstores import FAISS # type: ignore
from langchain.embeddings.huggingface import HuggingFaceEmbeddings  # type: ignore
from langchain.schema.runnable import RunnablePassthrough # type: ignore
from langchain.schema import Document # type: ignore
import nltk # type: ignore

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/stepan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:
loader = TextLoader(f"{BASE_PATH}/data/{dataset_type}.txt")
docs = loader.load()

In [11]:
print(f"Number of documents loaded: {len(docs)}")
print(f"Length of the first document: {len(docs[0].page_content)}")
print(f"First 100 characters of the document: {docs[0].page_content[:100]}")

Number of documents loaded: 1
Length of the first document: 558434
First 100 characters of the document: Preface When I was young, I could gaze at a beautiful image like the one opposite, of the Mercedes-B


In [12]:
text_splitter = NLTKTextSplitter(chunk_size=250, chunk_overlap=20)
chunked_documents = text_splitter.split_documents(docs)

Created a chunk of size 311, which is longer than the specified 250
Created a chunk of size 287, which is longer than the specified 250
Created a chunk of size 277, which is longer than the specified 250
Created a chunk of size 358, which is longer than the specified 250
Created a chunk of size 359, which is longer than the specified 250
Created a chunk of size 317, which is longer than the specified 250
Created a chunk of size 287, which is longer than the specified 250
Created a chunk of size 298, which is longer than the specified 250
Created a chunk of size 260, which is longer than the specified 250
Created a chunk of size 356, which is longer than the specified 250
Created a chunk of size 409, which is longer than the specified 250
Created a chunk of size 411, which is longer than the specified 250
Created a chunk of size 432, which is longer than the specified 250
Created a chunk of size 290, which is longer than the specified 250
Created a chunk of size 277, which is longer tha

In [13]:
len(chunked_documents)

2771

In [14]:
for doc in chunked_documents:
    doc.metadata['dataset_type'] = dataset_type

In [15]:
db = FAISS.from_documents(chunked_documents, HuggingFaceEmbeddings(model_name='sentence-transformers/multi-qa-MiniLM-L6-dot-v1'))
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 4, 'score_threshold': 0.5},
    filter={'dataset_type': dataset_type}
)

In [16]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [17]:
rag_chain = ( {"resources": retriever | format_docs, "question": RunnablePassthrough()} | llm_chain)

In [32]:
rag_chain.invoke("What is the best time to go to sleep?")

{'resources': '',
 'question': 'What is the best time to go to sleep?',
 'text': 'Generate me an answer to the given question: What is the best time to go to sleep?\n\nResources: 1) Sleep experts, 2) Scientific studies, 3) Personal experience.\n\n**Answer:** The best time to go to sleep is when your body feels rested and refreshed. This typically occurs between 9 PM and 11 PM for most adults, although individual sleep needs may vary.\n\n**Explanation:**\n\n* **Body\'s natural rhythm**: Most people\'s bodies have a natural circadian rhythm that regulates their sleep-wake cycle. Typically, this happens in the late afternoon or early evening.\n* **Circadian rhythms**: Research suggests that our brains are most receptive to sleep during certain times of day, which are influenced by our internal clock.\n* **Sleep quality**: Studies have shown that going to bed between 9 PM and 11 PM can lead to better sleep quality, including improved sleep duration, reduced sleep fragmentation, and enhance

In [18]:
predictions = []
for question in tqdm(dataset[dataset_type]["question"]):
    predictions.append(rag_chain.invoke(question))
# save predictions
with open(f"{BASE_PATH}/data/{dataset_type}_rag_predictions.json", "w") as f:
    json.dump(predictions, f)

  0%|          | 0/26 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
 38%|███▊      | 10/26 [02:39<04:31, 16.95s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 26/26 [06:07<00:00, 14.12s/it]
