In [1]:
import torch
print(torch.cuda.is_available())        #
print(torch.cuda.get_device_name(0))    # 

True
NVIDIA GeForce RTX 3060


In [2]:
from huggingface_hub import login
import os
from dotenv import load_dotenv

# Load token từ .env
load_dotenv()
token = os.environ["HUGGINGFACEHUB_API_TOKEN"]

# Login trực tiếp bằng code
login(token=token)

print("✅ Đã login thành công!")

  from .autonotebook import tqdm as notebook_tqdm


✅ Đã login thành công!


In [3]:
%pwd

'c:\\d\\generative AI\\universal-document-QA_with_Llama2\\research'

In [4]:
import os
os.chdir("../")

In [5]:
%pwd

'c:\\d\\generative AI\\universal-document-QA_with_Llama2'

In [6]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
#Extract Data From the PDF File
def load_pdf_file(data):
    loader= DirectoryLoader(data,
                            glob="*.pdf",
                            loader_cls=PyPDFLoader)

    documents=loader.load()

    return documents

In [8]:
extracted_data=load_pdf_file(data='data/')

In [9]:
#Split the Data into Text Chunks
def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

In [10]:
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 238


In [11]:
from langchain.embeddings import HuggingFaceEmbeddings

In [12]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings


In [13]:
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [14]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [45]:
# import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [15]:
from langchain.vectorstores import FAISS
vectorstore = FAISS.from_documents(text_chunks, embeddings)

In [16]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [17]:
retrieved_docs = retriever.invoke("What is ViVit?")

In [18]:
retrieved_docs

[Document(id='c8620253-5b19-4a64-87fe-0ef8f02b1ea9', metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-11-02T00:53:43+00:00', 'author': '', 'keywords': '', 'moddate': '2021-11-02T00:53:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data\\ViViT.pdf', 'total_pages': 14, 'page': 2, 'page_label': '3'}, page_content='3.1. Overview of Vision Transformers (ViT)\nVision Transformer (ViT) [18] adapts the transformer\narchitecture of [68] to process 2D images with minimal\nchanges. In particular, ViT extracts N non-overlapping im-\nage patches, xi ∈Rh×w, performs a linear projection and\nthen rasterises them into 1D tokens zi ∈Rd. The sequence\nof tokens input to the following transformer encoder is\nz = [zcls,Ex1,Ex2,..., ExN ] +p, (1)\nwhere the projection byE is equivalent to a 2D convolution.'),
 Document(id='399891b

In [19]:
model = "meta-llama/Llama-2-7b-chat-hf"

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer = AutoTokenizer.from_pretrained(model, token=token)
model = AutoModelForCausalLM.from_pretrained(
    model,
    device_map="auto",
    quantization_config=bnb_config,
    token=token
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:23<00:00, 11.82s/it]


In [23]:
from transformers import pipeline
pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens = 512,
                do_sample=True,
                top_k=30,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id
                )

Device set to use cuda:0


In [24]:
from langchain import HuggingFacePipeline
llm=HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0})

  llm=HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0})


In [25]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [26]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [27]:
response = rag_chain.invoke({"input": "What is ViVit?"})
print(response["answer"])

System: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.

3.1. Overview of Vision Transformers (ViT)
Vision Transformer (ViT) [18] adapts the transformer
architecture of [68] to process 2D images with minimal
changes. In particular, ViT extracts N non-overlapping im-
age patches, xi ∈Rh×w, performs a linear projection and
then rasterises them into 1D tokens zi ∈Rd. The sequence
of tokens input to the following transformer encoder is
z = [zcls,Ex1,Ex2,..., ExN ] +p, (1)
where the projection byE is equivalent to a 2D convolution.

VideoMAE-L (Tong et al. 2022) ViT-L 16×3×5 305 305 9.0 85.2 96.8
VideoMAE-L (Tong et al. 2022) ViT-L 40×3×4 305 305 47.5 86.1 97.3
Well-prepared ViT with plug-and-play modules.
TimeSformer-L (Bertasius et al. 2021)ViT-B IN-21K 96×3×1 121 121 7.1 80.7 94.7
CoCa (Yu et al. 2022) ViT