In [2]:
from pathlib import Path
 
import torch
from auto_gptq import AutoGPTQForCausalLM
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, GenerationConfig, TextStreamer, pipeline

2024-07-15 08:37:38.150859: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def load_tokenizer_model(modelPath):  
        from transformers import AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig
        bit_quantization=4
        if bit_quantization == 4:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                #load_4bit_use_double_quant=True,  #now it is deprecated and do not use anymore in latest package
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16
            )
        else:
            bnb_config = BitsAndBytesConfig(
                load_in_8bit=True,
                #load_8bit_use_double_quant=True,
                bnb_8bit_quant_type="nf4",
                bnb_8bit_compute_dtype=torch.float16
            )

        model = AutoModelForCausalLM.from_pretrained(
            modelPath, 
            quantization_config=bnb_config,
            device_map="auto", 
            #torch_dtype="auto", 
            trust_remote_code=True, 
        )
        tokenizer = AutoTokenizer.from_pretrained(modelPath, trust_remote_code=True)

        return tokenizer,model


tokenizer,model=load_tokenizer_model("internlm/internlm2_5-7b-chat")


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [3]:
# from PyPDF2 import PdfReader
# from langchain.schema import Document

# def get_pdf_text(file):
#     documents=[]
#     pdf_reader = PdfReader(file)
#     text=""
#     for page_num,page in enumerate(pdf_reader.pages):
#         text = page.extract_text()
#         if text:
#             documents.append(Document(page_content=text, metadata={"page": page_num}))
#     return documents

# data=get_pdf_text('../Embedding/arvix/arivx_pdfs/1.pdf')
# data


# #for document vectorstore = FAISS.from_documents(text_chunks, hf)

In [3]:
from PyPDF2 import PdfReader
from langchain.schema import Document

def get_pdf_text(file):
    pdf_reader = PdfReader(file)
    text=""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text

data=get_pdf_text('../Embedding/arvix/arivx_pdfs/1.pdf')
data[:200]


'X-VILA: Cross-Modality Alignment for\nLarge Language Model\nHanrong Ye1,2∗, De-An Huang1, Yao Lu1, Zhiding Yu1, Wei Ping1, Andrew Tao1,\nJan Kautz1, Song Han1,3, Dan Xu2, Pavlo Molchanov1, Hongxu Yin1\nNV'

In [4]:
def split_text(text,search_type):
        if search_type=='scalar':
            import json
            text = json.dumps(text, indent=4)
            separator_symbol=" "
        else:
            separator_symbol=" "
        from langchain.text_splitter import CharacterTextSplitter
        # Splitting up the text into smaller chunks for indexing
        text_splitter = CharacterTextSplitter(        
            separator = separator_symbol,
            chunk_size = 3900,
            chunk_overlap  = 150, #striding over the text
            length_function = len,
        )
        texts = text_splitter.split_text(text)
        return texts

splitted_text=split_text(data,'vector')

In [8]:
len(splitted_text)

18

In [5]:
def get_vectorstore(text_chunks):
        from langchain_community.vectorstores import FAISS 
        from langchain_community.embeddings import HuggingFaceBgeEmbeddings
        model_name="../Embedding/model"  #have to install microsoft/Phi-3-mini-4k-instruct
        model_kwargs = {"device": "cpu"}
        encode_kwargs = {"normalize_embeddings": True}
        hf = HuggingFaceBgeEmbeddings(
            model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
        )
        vectorstore = FAISS.from_texts(text_chunks, hf) #for text, 
        return vectorstore

ve_store=get_vectorstore(splitted_text)

No sentence-transformers model found with name ../Embedding/model. Creating a new one with MEAN pooling.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are not running the flash-attention implementation, expect numerical differences.


In [20]:
system_prompt = (
            "You are an assistant for question-answering tasks. "
            "Use the following pieces of retrieved context to answer "
            "the question. If you don't know the answer, say that you "
            "don't know. Use three sentences maximum and keep the "
            "answer concise."
            "\n\n"
            "{context}"
        )

def generate_prompt() -> str:
    from langchain_core.prompts import ChatPromptTemplate
    

    return ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                ("human", "{input}"),
            ]
        )

In [22]:
def get_llm(tokenizer,model):
        from transformers import pipeline
        from langchain import HuggingFacePipeline
        text_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,  
        )
        llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0})
        return llm

llm=get_llm(tokenizer,model)

In [6]:
retriever=get_vectorstore(splitted_text).as_retriever()

No sentence-transformers model found with name ../Embedding/model. Creating a new one with MEAN pooling.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

question_answer_chain = create_stuff_documents_chain(llm, generate_prompt())
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [24]:
response = rag_chain.invoke({"input": "What is LLM?"})
response["answer"]

"System: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.\n\nX-VILA: Cross-Modality Alignment for\nLarge Language Model\nHanrong Ye1,2∗, De-An Huang1, Yao Lu1, Zhiding Yu1, Wei Ping1, Andrew Tao1,\nJan Kautz1, Song Han1,3, Dan Xu2, Pavlo Molchanov1, Hongxu Yin1\nNVIDIA1HKUST2MIT3\nX-VILA: The image shows a \nbeach with a sandy shore.Prompt: What is in the image?Image -Language\nX-VILA: A video with people \nenjoying the beach.Prompt : Can you make a \nsimilar video with people in it?Language -Video\nX-VILA: This is the requested \naudio.Prompt: Can you make some \nsound for the video?Video -Audio\nX-VILA: Here is the image.Prompt : Can you make an \nimage based on the video?Video -Image\nPrompt : Generate a video \nfollowing the audio .Audio -Video\nX-VILA: The man start s to ski.Prompt : What do you thin

## Adding chat history

In [25]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [26]:
from langchain_core.messages import AIMessage, HumanMessage

chat_history = []

question = "What is LLM?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_1["answer"]),
    ]
)

second_question = "What is it good for?"
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})

print(ai_msg_2["answer"])

System: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.

X-VILA: Cross-Modality Alignment for
Large Language Model
Hanrong Ye1,2∗, De-An Huang1, Yao Lu1, Zhiding Yu1, Wei Ping1, Andrew Tao1,
Jan Kautz1, Song Han1,3, Dan Xu2, Pavlo Molchanov1, Hongxu Yin1
NVIDIA1HKUST2MIT3
X-VILA: The image shows a 
beach with a sandy shore.Prompt: What is in the image?Image -Language
X-VILA: A video with people 
enjoying the beach.Prompt : Can you make a 
similar video with people in it?Language -Video
X-VILA: This is the requested 
audio.Prompt: Can you make some 
sound for the video?Video -Audio
X-VILA: Here is the image.Prompt : Can you make an 
image based on the video?Video -Image
Prompt : Generate a video 
following the audio .Audio -Video
X-VILA: The man start s to ski.Prompt : What do you think 
is going to happe

In [27]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)
conversational_rag_chain.invoke(
    {"input": "What is LLM?"},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]

Parent run b524cc38-0376-49bc-b01a-14271fed1955 not found for run 783821fc-9e6e-4108-951a-b6efdf6ac988. Treating as a root run.


"System: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.\n\nX-VILA: Cross-Modality Alignment for\nLarge Language Model\nHanrong Ye1,2∗, De-An Huang1, Yao Lu1, Zhiding Yu1, Wei Ping1, Andrew Tao1,\nJan Kautz1, Song Han1,3, Dan Xu2, Pavlo Molchanov1, Hongxu Yin1\nNVIDIA1HKUST2MIT3\nX-VILA: The image shows a \nbeach with a sandy shore.Prompt: What is in the image?Image -Language\nX-VILA: A video with people \nenjoying the beach.Prompt : Can you make a \nsimilar video with people in it?Language -Video\nX-VILA: This is the requested \naudio.Prompt: Can you make some \nsound for the video?Video -Audio\nX-VILA: Here is the image.Prompt : Can you make an \nimage based on the video?Video -Image\nPrompt : Generate a video \nfollowing the audio .Audio -Video\nX-VILA: The man start s to ski.Prompt : What do you thin

In [28]:
conversational_rag_chain.invoke(
    {"input": "What are common ways of doing it?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

Parent run 8efae06e-c668-4578-aaf8-ec466b81871c not found for run 26bf9409-7829-4091-ac13-ff60373453a1. Treating as a root run.


"System: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.\n\nX-VILA: Cross-Modality Alignment for\nLarge Language Model\nHanrong Ye1,2∗, De-An Huang1, Yao Lu1, Zhiding Yu1, Wei Ping1, Andrew Tao1,\nJan Kautz1, Song Han1,3, Dan Xu2, Pavlo Molchanov1, Hongxu Yin1\nNVIDIA1HKUST2MIT3\nX-VILA: The image shows a \nbeach with a sandy shore.Prompt: What is in the image?Image -Language\nX-VILA: A video with people \nenjoying the beach.Prompt : Can you make a \nsimilar video with people in it?Language -Video\nX-VILA: This is the requested \naudio.Prompt: Can you make some \nsound for the video?Video -Audio\nX-VILA: Here is the image.Prompt : Can you make an \nimage based on the video?Video -Image\nPrompt : Generate a video \nfollowing the audio .Audio -Video\nX-VILA: The man start s to ski.Prompt : What do you thin

In [29]:
for message in store["abc123"].messages:
    if isinstance(message, AIMessage):
        prefix = "AI"
    else:
        prefix = "User"

    print(f"{prefix}: {message.content}\n")

User: What is Task Decomposition?

AI: System: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.

X-VILA: Cross-Modality Alignment for
Large Language Model
Hanrong Ye1,2∗, De-An Huang1, Yao Lu1, Zhiding Yu1, Wei Ping1, Andrew Tao1,
Jan Kautz1, Song Han1,3, Dan Xu2, Pavlo Molchanov1, Hongxu Yin1
NVIDIA1HKUST2MIT3
X-VILA: The image shows a 
beach with a sandy shore.Prompt: What is in the image?Image -Language
X-VILA: A video with people 
enjoying the beach.Prompt : Can you make a 
similar video with people in it?Language -Video
X-VILA: This is the requested 
audio.Prompt: Can you make some 
sound for the video?Video -Audio
X-VILA: Here is the image.Prompt : Can you make an 
image based on the video?Video -Image
Prompt : Generate a video 
following the audio .Audio -Video
X-VILA: The man start s to ski.Prompt

## Tested for messgaes chat with Patent_Chat2


In [14]:
from langchain_community.chat_models import ChatOllama
#llm = ChatOllama(model="llama3")
llm=ChatOllama(model='llama2:7b-chat')

In [7]:
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")

In [15]:
from langchain.chains import create_retrieval_chain
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain

contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [19]:
from langchain_core.messages import HumanMessage

chat_history = []

while True:
    question=input("Enter input: ")
    if question=='quit':
        break
    ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
    chat_history.extend([HumanMessage(content=question), ai_msg_1["answer"]])

    # second_question = "What are common ways of doing it?"
    # ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})

    print(ai_msg_1["answer"])

X-VILA is an AI model that can perform cross-modality chat, which means it can understand and generate responses in multiple modalities, including text, images, and audio. It was designed to demonstrate its ability to comprehend visual input and perform reasoning based on it, as well as to engage in natural language conversations.

In the conversation examples provided, X-VILA shows strong multi-modal understanding and generation ability, as it can recognize and respond to visual stimuli such as images and videos, as well as text-based prompts. For example, when shown an image of a snowboarder, X-VILA can generate a response related to the image, such as identifying the person in the image or providing additional information about snowboarding.

Overall, X-VILA represents a significant advancement in AI technology, demonstrating its ability to integrate and process multiple modalities of input to produce coherent and contextually appropriate responses. Its applications could potentiall

In [20]:
chat_history

[HumanMessage(content='what is X-VILA?'),
 'X-VILA is an AI model that can perform cross-modality chat, which means it can understand and generate responses in multiple modalities, including text, images, and audio. It was designed to demonstrate its ability to comprehend visual input and perform reasoning based on it, as well as to engage in natural language conversations.\n\nIn the conversation examples provided, X-VILA shows strong multi-modal understanding and generation ability, as it can recognize and respond to visual stimuli such as images and videos, as well as text-based prompts. For example, when shown an image of a snowboarder, X-VILA can generate a response related to the image, such as identifying the person in the image or providing additional information about snowboarding.\n\nOverall, X-VILA represents a significant advancement in AI technology, demonstrating its ability to integrate and process multiple modalities of input to produce coherent and contextually appropri