In [None]:
from auto_gptq import exllama_set_max_input_length
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from langchain.vectorstores.faiss import FAISS
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

import dataset

## Retriever

In [None]:
# Re-execute the local `dataset` module so that edits made to it on disk are
# picked up by this kernel; the from-import below must come after the reload
# so that it binds the freshly defined class.
import importlib
# `_ =` just discards the module object returned by reload().
_ = importlib.reload(dataset)

from dataset import StudieinfoDataset

### Dataset

In [None]:
# Build the document collection from the files under ./dataset/courses.
# `docs` is what gets embedded and indexed by FAISS further down.
docs = StudieinfoDataset(path="./dataset/courses")

# Sanity check: show the string form of the first document.
print(docs[0])

### Retriever model

In [None]:
# Sentence-embedding model used to vectorise both the documents and the queries.
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Run the encoder on the GPU and keep the raw (un-normalised) embedding vectors.
model_kwargs = dict(device="cuda")
encode_kwargs = dict(normalize_embeddings=False)

embeddings = HuggingFaceEmbeddings(
    model_name=modelPath, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# Embed every document in `docs` and build the FAISS vector index over them.
db = FAISS.from_documents(docs, embeddings)

## Language model

In [None]:
# GPTQ-quantised checkpoint, loaded from the "main" revision directly onto the GPU.
model_name_or_path = "TheBloke/Mistral-7B-OpenOrca-GPTQ"

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    revision="main",
    device_map="cuda",
    trust_remote_code=False,  # never execute code shipped with the checkpoint
)

# Set the ExLlama max input length to 4096 so that the long
# retrieval-augmented prompts built below are accepted.
model = exllama_set_max_input_length(model, 4096)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

In [None]:
# ChatML-style prompt: a system turn carrying the retrieved context, a user
# turn carrying the question, and an open assistant turn for the model to
# complete. `{context}` and `{question}` are filled in at query time.
template = """<|im_start|>system
Answer the question based only on the following context:
{context}<|im_end|>
<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
"""

# Use a plain string template rather than ChatPromptTemplate. The chat variant
# wraps the text in a single human message, and when that prompt value is
# flattened for a text-completion LLM it is rendered as "Human: <text>", which
# would put a stray "Human: " in front of the first <|im_start|> token.
# PromptTemplate renders the template verbatim and still supports
# `.invoke(...)` and `.to_string()` on the result.
prompt = PromptTemplate.from_template(template)

# Sampling configuration for answer generation.
generation_kwargs = {
    "max_new_tokens": 512,       # upper bound on the length of the answer
    "do_sample": True,           # sample instead of greedy decoding
    "temperature": 0.7,
    "top_p": 0.95,
    "top_k": 40,
    "repetition_penalty": 1.1,
    "return_full_text": True,    # pipeline output contains prompt + completion
}

# Hugging Face text-generation pipeline around the quantised model.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **generation_kwargs)

# Expose the pipeline through LangChain's LLM interface.
hf_pipe = HuggingFacePipeline(pipeline=pipe, verbose=True)

In [None]:
# Maximal-marginal-relevance retrieval: fetch 20 candidates from the index and
# return 3 of them; the low lambda_mult weights the selection towards diversity.
retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 3,
        "fetch_k": 20,
        "lambda_mult": 0.10,
    },
)

In [None]:
question = "What course is most similar to text mining?"

# 1. Retrieve the documents relevant to the question.
context = retriever.invoke(question)

# 2. Fill the prompt template: the retrieved passages are concatenated,
#    separated by blank lines, and passed in as the context.
instruction = prompt.invoke(
    {
        "context": "\n\n".join(doc.page_content for doc in context),
        "question": question,
    }
)

# 3. Generate the answer with the language model.
output = hf_pipe.invoke(instruction)

In [None]:
# Show the rendered prompt (surrounding whitespace trimmed), then the
# model's answer on the following line(s).
print(instruction.to_string().strip(), output, sep="\n")