#### Import Statements

In [1]:
import os
import pandas as pd
import pinecone
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec




#### Pre-processing Dataset

In [2]:
def clean_text(text):
    """Normalize a value into single-spaced, printable-ASCII text.

    The value is first converted with ``str()`` (so a float NaN becomes the
    string ``"nan"``). Every character outside the printable ASCII range
    0x20-0x7E is replaced by a space, runs of whitespace are collapsed to a
    single space, and leading/trailing whitespace is removed.
    """
    ascii_only = re.sub(r'[^\x20-\x7E]', ' ', str(text))
    return re.sub(r'\s+', ' ', ascii_only).strip()
    
corpus_path = "./sampling_img_kc/knowledge_corpus.csv"
try:
    corpus_df = pd.read_csv(corpus_path, encoding='latin1')
except UnicodeDecodeError as e:
    print("Error with 'latin1' encoding:", e)
    # Re-raise instead of falling through: without a loaded frame every
    # statement below would fail with a confusing NameError on `corpus_df`.
    raise
print("Knowledge Corpus successfully loaded.")

print("Available columns in the corpus:", corpus_df.columns.tolist())

# (id suffix, CSV column) for each event column that becomes its own passage.
event_columns = (("event1", "Event 1"), ("event2", "Event 2"))

# One dict per non-empty event: {"id", "text", "metadata"}.
passages = []

for idx, row in corpus_df.iterrows():
    # Fields shared by every passage built from this row.
    caption = clean_text(row["caption"]) if "caption" in row else ""
    filename = clean_text(row["filename"]) if "filename" in row else ""
    img_id = row["img_id"]

    for id_suffix, column in event_columns:
        event_text = clean_text(row[column]) if column in row else ""
        # clean_text stringifies its input, so a NaN cell arrives here as
        # the literal "nan"; skip those along with truly empty cells.
        if not event_text or event_text.lower() == 'nan':
            continue
        passages.append({
            "id": f"{idx}_{id_suffix}",
            "text": event_text,
            "metadata": {
                "filename": filename,
                "img_id": img_id,
                "caption": caption,
                "event_type": column,
                "event_text": event_text
            }
        })

print(f"Prepared {len(passages)} passages from events.")

Knowledge Corpus successfully loaded.
Available columns in the corpus: ['filename', 'img_id', 'caption', 'Event 1', 'Event 2']
Prepared 400 passages from events.


#### Loading Embedding Model

In [3]:
# Sentence-transformers checkpoint shared by passage and query embedding.
model_name = 'multi-qa-mpnet-base-cos-v1'
embedding_model = SentenceTransformer(model_name)
print("Embedding model loaded.")

Embedding model loaded.


#### Creating Pinecone Index

In [None]:
# Read the key from the environment so a real credential never has to be
# saved inside the notebook; the placeholder remains only as a fallback.
pinecone_api_key = os.environ.get("PINECONE_API_KEY", "your_api_key")
pinecone_region = "us-east-1"

pc = Pinecone(api_key=pinecone_api_key)

index_name = "llm-proj"

# The index dimension must equal the embedding size of the model loaded
# earlier in the notebook; reuse that instance instead of loading it again.
dim = embedding_model.get_sentence_embedding_dimension()

# Create the serverless index only on first run; later runs reuse it.
indexes = pc.list_indexes().names()
if index_name not in indexes:
    pc.create_index(
        name=index_name,
        dimension=dim,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region=pinecone_region
        )
    )
    print(f"Created index: {index_name}")
else:
    print(f"Using existing index: {index_name}")

# Handle used by the upsert and query cells below.
index = pc.Index(index_name)

Created index: llm-proj


#### Embedding and Upserting Passages into Pinecone Index

In [5]:
texts = [p["text"] for p in passages]
ids = [p["id"] for p in passages]

print("Computing embeddings for passages...")
embeddings = embedding_model.encode(texts, show_progress_bar=True).tolist()

# Pinecone record tuples: (id, embedding values, metadata).
vectors = [
    (passage["id"], emb, passage["metadata"])
    for passage, emb in zip(passages, embeddings)
]

# Send the records in fixed-size batches rather than one request, so the
# cell keeps working when the corpus grows past Pinecone's per-request limits.
upsert_batch_size = 100
for start in range(0, len(vectors), upsert_batch_size):
    index.upsert(vectors=vectors[start:start + upsert_batch_size])
print("Upserted separate event passages into Pinecone.")

Computing embeddings for passages...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Upserted separate event passages into Pinecone.


#### Retrieving Passages from Pinecone Index

In [6]:
def retrieve_facts(image_caption, top_k=5):
    """Return the metadata of the passages most similar to a caption.

    Args:
        image_caption: Caption text to embed and use as the search query.
        top_k: Maximum number of matches to request from the index.

    Returns:
        A list with the ``metadata`` entry of each match, in the order the
        index returned them.

    Relies on the notebook-level ``embedding_model`` and ``index`` objects.
    """
    # Embed the caption that was passed in (not a notebook global).
    raw_query_embedding = embedding_model.encode([image_caption])[0]
    # Convert to a plain list of floats for the query payload.
    query_embedding = np.asarray(raw_query_embedding, dtype=np.float32).tolist()
    query_response = index.query(
        vector=query_embedding,
        top_k=top_k,
        # Only metadata is returned to the caller, so skip the vector values.
        include_values=False,
        include_metadata=True,
    )
    return [match['metadata'] for match in query_response['matches']]

In [7]:
# Smoke-test the retrieval helper with a single sample caption.
example_caption = "a man in a red jacket riding a small horse"
try:
    retrieved_items = retrieve_facts(example_caption, top_k=5)
    # Heading on the first line, then one metadata dict per line.
    print("Retrieved Metadata:", *retrieved_items, sep="\n")
except Exception as err:
    print("Error during query:", err)

Retrieved Metadata:
{'caption': 'An old man, wearing a black beret and a black and red jacket, rides a pony in a desolate mountain location.', 'event_text': 'n the mid-20th century, Australian horseman and writer Ern Pedler embarked on a remarkable solo journey across the vast and arid landscapes of the Australian Outback. Dressed in traditional riding attire, including a wide-brimmed hat and durable riding clothes, Pedler traversed the desolate terrains on horseback, documenting his experiences and the challenges faced in such a remote environment. His journey not only showcased his resilience and horsemanship but also highlighted the profound solitude and beauty of the Australian wilderness. Pedler\'s writings, such as "The Big Lonely Horse," provide a vivid account of his adventures and the deep bond formed with his horse during this expedition.', 'event_type': 'Event 1', 'filename': '284105062.jpg', 'img_id': 8845.0}
{'caption': 'An old man, wearing a black beret and a black and re