In [1]:
! pip install -q "gdown==5.2.0"

In [None]:
!gdown --id 1xsXrlyyQm_SXawu_J_foKZFRApC3NCpS

In [None]:
!unzip pdf.zip

In [None]:
!pip install -q sentence-transformers
!pip install -q pymupdf
!pip install -q numpy
!pip install -q openai

In [5]:
from google.colab import userdata
from openai import OpenAI

In [6]:
# The API key is read from Colab's user secrets under the name 'openai_api',
# so it never appears in the notebook itself.
client = OpenAI(
    api_key=userdata.get('openai_api'),
)

In [7]:
import fitz  # PyMuPDF

def extract_paragraphs_from_pdf(pdf_path):
    """Return the text of a PDF as a flat list of paragraph strings.

    The plain text of every page is split on blank lines (two consecutive
    newline characters) and the pieces of all pages are concatenated in
    page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        list[str]: The text pieces in page order. Pieces are neither
        stripped nor filtered, so empty or whitespace-only entries may occur.
    """
    paragraphs = []

    # The context manager closes the document (and its underlying file
    # handle) even when text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_text = page.get_text("text")
            paragraphs.extend(page_text.split('\n\n'))

    return paragraphs

# Location of the paper inside the extracted `pdf` folder in the Colab workspace.
pdf_path = '/content/pdf/LLaVA.pdf'
paragraphs = extract_paragraphs_from_pdf(pdf_path=pdf_path)


In [8]:
import textwrap

In [None]:
# Preview every extracted paragraph, wrapped to 100 columns and framed by separator rules.
separator = "-----------------------------------------------------------------"
for paragraph in paragraphs:
  print(separator)
  print(textwrap.fill(paragraph, width=100))
  print(separator)

# Embedding model

In [None]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model used for both the paragraphs and the query.
# NOTE(review): trust_remote_code=True allows model code downloaded from the hub to run
# locally — confirm the repository is trusted (consider pinning a revision).
model = SentenceTransformer(
    "Alibaba-NLP/gte-base-en-v1.5",
    trust_remote_code=True,
)

In [13]:
batch_size = 2  # Adjust batch size based on available memory

# encode() batches its input internally, so the batch size is passed through instead of
# slicing `paragraphs` by hand. list(...) keeps docs_embed a list holding one normalized
# embedding vector per paragraph.
docs_embed = list(
    model.encode(paragraphs, batch_size=batch_size, normalize_embeddings=True)
)


In [32]:
import numpy as np

# Turn the collected embedding vectors into a single NumPy array and report its shape
# (one row per paragraph).
docs_embed = np.asarray(docs_embed)
print(docs_embed.shape)


(25, 768)


In [None]:
# Inspect the embedding vector of the first paragraph.
docs_embed[0]

# Embed the Query

In [18]:
# Embed the question with the same model and the same normalization as the paragraphs,
# so that a plain dot product between the two is a cosine similarity.
query = "What are the LLMs trained on image-text pairs?"
query_embed = model.encode(query, normalize_embeddings=True)

In [19]:
# Sanity check: the query vector should have the same width as each row of docs_embed.
query_embed.shape

(768,)

# Find the Closest Paragraphs to the Query

In [20]:
import numpy as np
# One score per paragraph: matrix-vector product of the paragraph embeddings with the
# query embedding. Both were encoded with normalize_embeddings=True, so each score is a
# cosine similarity. (Transposing the 1-D query vector would be a no-op.)
similarities = np.asarray(docs_embed) @ query_embed

In [21]:
# Expect one similarity score per paragraph.
similarities.shape

(25,)

In [22]:
# Show the raw similarity scores, indexed like `paragraphs`.
similarities

array([0.60228133, 0.644054  , 0.5384798 , 0.60089993, 0.5557829 ,
       0.53296244, 0.554567  , 0.51590645, 0.570125  , 0.5707488 ,
       0.5230238 , 0.59259653, 0.5915273 , 0.5372057 , 0.5857537 ,
       0.28285468, 0.26511437, 0.34307188, 0.38001484, 0.525018  ,
       0.49184284, 0.49078333, 0.3669213 , 0.32335132, 0.21852395],
      dtype=float32)

In [23]:
# argsort is ascending, so reverse it to rank from most to least similar and keep the
# indices of the three best-scoring paragraphs as a plain Python list.
top_3_idx = np.argsort(similarities)[::-1][:3].tolist()

In [24]:
# Paragraph indices ordered from most to least similar to the query.
top_3_idx

[1, 0, 3]

In [25]:
# Look up the paragraph text for each selected index, preserving the ranking order.
most_similar_documents = [paragraphs[idx] for idx in top_3_idx]

In [26]:
# Wrap each retrieved paragraph to 100 columns once, print it between separator rules,
# and assemble the wrapped texts into the CONTEXT string used in the LLM prompt.
separator = "-----------------------------------------------------------------"
wrapped_documents = [
  textwrap.fill(document, width=100) for document in most_similar_documents
]

for wrapped_text in wrapped_documents:
  print(separator)
  print(wrapped_text)
  print(separator)

# Each paragraph is followed by a blank line so paragraphs stay visually separated.
CONTEXT = "".join(wrapped_text + "\n\n" for wrapped_text in wrapped_documents)

-----------------------------------------------------------------
utilize various machine-generated high-quality instruction-following samples to improve the LLM’s
alignment ability, reporting impressive performance compared with proprietary LLMs. Importantly,
this line of work is text-only. In this paper, we present visual instruction-tuning, the first
attempt to extend instruction-tuning to the language-image multimodal space, to pave the way towards
building a general-purpose visual assistant. In particular, our paper makes the following
contributions: • Multimodal instruction-following data. One key challenge is the lack of vision-
language instruction-following data. We present a data reformation perspective and pipeline to
convert image-text pairs into an appropriate instruction-following format, using ChatGPT/GPT-4. •
Large multimodal models. We develop a large multimodal model (LMM), by connecting the open-set
visual encoder of CLIP [40] with the language decoder Vicuna [9], an

In [27]:
# NOTE(review): this repeats the question that was embedded for retrieval earlier in the
# notebook; the two strings must stay identical or the context will not match the question.
query = "What are the LLMs trained on image-text pairs?"

In [28]:
# Build the grounded prompt: the retrieved paragraphs go in as CONTEXT, the user question
# as QUESTION, and the model is told to admit when the context does not contain the answer.
prompt = f"""
use the following CONTEXT to answer the QUESTION at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

CONTEXT: {CONTEXT}
QUESTION: {query}

"""

In [29]:
# Send the assembled prompt to the chat model as a single user message.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": prompt}],
)

In [30]:
# Print the text of the first returned choice.
print(response.choices[0].message.content)

The LLMs trained on image-text pairs mentioned in the context are:

- Flamingo
- BLIP-2
- FROMAGe
- KOSMOS-1
- PaLM-E
- OpenFlamingo
- LLaMA-Adapter
