<a href="https://www.kaggle.com/code/alessandrosolbiati/zero-shot-learning-vmware-text-embedding?scriptVersionId=222080170" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/vmware-zero-shot-information-retrieval/sample_submission.csv
/kaggle/input/vmware-zero-shot-information-retrieval/vmware_ir_content.csv
/kaggle/input/vmware-zero-shot-information-retrieval/test.csv


In [2]:
hyperparameters = {
    'chunk_size_characters': 400,
    'chunk_size_padding': 50,
}

# Chunk Documents

In [None]:
test = pd.read_csv("/kaggle/input/vmware-zero-shot-information-retrieval/test.csv")
sorted(list(test.Query), key=len)[1000:1100]

In [None]:
pp = filter(lambda x: 'network' in x, list(test.Query))
for p in pp: print(p)

In [4]:
import pandas as pd
content = pd.read_csv("/kaggle/input/vmware-zero-shot-information-retrieval/vmware_ir_content.csv")
len(content)

323963

In [5]:
content = content[content.raw_text.fillna("").apply(lambda x: 'network' in x)]
len(content)

62371

In [6]:
content['raw_text'].str.len().sum() # 750M
# num of chunks 750M / 20 = 30M 

275640277

In [7]:
sampled_content = content.sample(frac=0.01)
len(sampled_content)

624

In [8]:
from dataclasses import dataclass

chunk_size = hyperparameters['chunk_size_characters']
padding = hyperparameters['chunk_size_padding']
CHUNK_MAX_LENGTH = chunk_size + 2 * padding

def chunkify(document):
    ix, chunks = 0, []
    while ix < len(document):
        chunk_start = ix 
        chunk_end   = ix + padding + chunk_size + padding
        chunks.append(document[chunk_start:chunk_end])
        ix = ix + padding + chunk_size
    return chunks

@dataclass
class Chunk:
    document_id: int
    chunk_data: str

chunks = [] # id: chunk
for ix, (k,v) in enumerate(list(content['raw_text'].fillna("").items())):

    kchunks = chunkify(v)
    for kchunk in kchunks:
        if "network" in kchunk:
            chunks.append(Chunk(document_id=k, chunk_data=kchunk))
    if ix % 10000 == 0:
        print(f"id: {k}, passage length: {len(v)}, number of chunks: {len(kchunks)}")
print(len(chunks))

id: 3, passage length: 3918, number of chunks: 9
id: 49212, passage length: 1915, number of chunks: 5
id: 104920, passage length: 1252, number of chunks: 3
id: 157021, passage length: 3230, number of chunks: 8
id: 208526, passage length: 3191, number of chunks: 8
id: 259705, passage length: 1762, number of chunks: 4
id: 311489, passage length: 4123, number of chunks: 10
151101


In [None]:
c1 = chunks[0].chunk_data
c2 = chunks[30].chunk_data
c1, c2

# Text Embedding model

In [9]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [None]:

# Each input text should start with "query: " or "passage: ".
# For tasks other than retrieval, you can simply use the "query: " prefix.
input_texts = [f'query: {chunks[0].chunk_data}',
               f'query: {chunks[-10].chunk_data}',
               f'query: {chunks[1].chunk_data}',
               f'query: {chunks[-11].chunk_data}']
input_texts

In [10]:
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-small-v2')
model = AutoModel.from_pretrained('intfloat/e5-small-v2').to(device)
#  model alone on GPU T4 is 243MiB /  15360MiB 


def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

def model_predict(input_texts: list[str], model=model, tokenizer=tokenizer):
    # Tokenize the input texts
    batch_dict = tokenizer(input_texts, max_length=CHUNK_MAX_LENGTH, padding=True, truncation=True, return_tensors='pt')
    batch_dict = {k: v.to(device) for k, v in batch_dict.items()}
    with torch.no_grad():
        outputs = model(**batch_dict)
        embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
        # normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)
    return embeddings, batch_dict
    


In [None]:

embeddings, _ = model_predict(input_texts)
scores = (embeddings[:2] @ embeddings[2:].T) * 100
print(scores.tolist())

In [None]:
chunked_corpus = [f"query: {chunk.chunk_data}" for chunk in chunks][:3000]
len(chunked_corpus)

# 1315MiB /  15360MiB after first inference of 300
# 2709MiB /  15360MiB inference of 600
# 13913MiB /  15360MiB  inference for 3000 chunks, takes 3 seconds (3 * 10^3 seconds)

# total number of chunks is 3*10^7 = 10^4 seconds to embed them all = 3 hours
# I have two GPUs and I can do it in parallel and is going to take 1 hour

# 3000 embeddings are 3000 * 348 emb size * 4bytes = 1MB in memory  = 10^6
# I have 10GB memory 10^10 , can hold 3 * 10^3 * 10^4 = 3 10^7 embeddings

In [None]:
sum([len(c) for c in chunked_corpus])

In [None]:
from time import time
start_time = time()
embeddings = model_predict(chunked_corpus)
end_time = time()
print(f"Model inference for {len(chunked_corpus)} chunks: {end_time - start_time:.2f} seconds")

In [None]:
print(embeddings.shape)
end_time = time()
print(f"Model inference for {len(chunked_corpus)} chunks: {end_time - start_time:.2f} seconds")

In [13]:
!nvidia-smi


Wed Feb 12 01:22:27 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   55C    P0             28W /   70W |     243MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [None]:
torch.cuda.empty_cache()

In [None]:
embeddings.shape

In [11]:
from torch.utils.data import Dataset, DataLoader


class ChunkTextDataset(Dataset):
    def __init__(self, chunks: list[Chunk]):
        # Prepend "query: " to each chunk's text.
        self.texts = [f"query: {chunk.chunk_data}" for chunk in chunks]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]


dataset = ChunkTextDataset(chunks)
dataloader = DataLoader(dataset, batch_size=1000, shuffle=False)
    

In [12]:
torch.cuda.empty_cache()

In [16]:
all_embeddings = []
import gc
    
# Disable gradients for inference.
with torch.no_grad():
    for batch in dataloader:

        from time import time
        start_time = time()

        # Each 'batch' is a list of strings (already with "query: " prepended).
        embeddings, batch_dict = model_predict(batch)  # Assume this returns a tensor of shape [batch_size, embedding_dim]

        end_time = time()
        
        torch.cuda.empty_cache()
        print(f"Processed batch of {len(batch)} chunks, {end_time - start_time:.2f} seconds")

        # Move embeddings to CPU (if they are on GPU) and append.
        all_embeddings.append(embeddings.cpu())

        # Clean up GPU memory:
        del batch_dict
        del embeddings
        del batch
        torch.cuda.empty_cache()
        gc.collect()
              
        

        
        
# Concatenate all the batch embeddings into a single tensor.
all_embeddings = torch.cat(all_embeddings, dim=0)

Processed batch of 1000 chunks, 0.24 seconds
Processed batch of 1000 chunks, 0.24 seconds
Processed batch of 1000 chunks, 0.24 seconds
Processed batch of 1000 chunks, 0.23 seconds
Processed batch of 1000 chunks, 0.32 seconds
Processed batch of 1000 chunks, 0.27 seconds
Processed batch of 1000 chunks, 0.35 seconds
Processed batch of 1000 chunks, 0.35 seconds
Processed batch of 1000 chunks, 0.36 seconds
Processed batch of 1000 chunks, 0.28 seconds
Processed batch of 1000 chunks, 0.31 seconds
Processed batch of 1000 chunks, 0.38 seconds
Processed batch of 1000 chunks, 0.35 seconds
Processed batch of 1000 chunks, 0.30 seconds
Processed batch of 1000 chunks, 0.30 seconds
Processed batch of 1000 chunks, 0.30 seconds
Processed batch of 1000 chunks, 0.40 seconds
Processed batch of 1000 chunks, 0.33 seconds
Processed batch of 1000 chunks, 0.34 seconds
Processed batch of 1000 chunks, 0.30 seconds
Processed batch of 1000 chunks, 0.30 seconds
Processed batch of 1000 chunks, 0.28 seconds
Processed 

In [25]:
len(chunks)

import pickle



# Save the chunks list to disk
with open('/kaggle/working/chunks151101_11_Feb_A__(network).pkl', 'wb') as f:
    pickle.dump(chunks, f)

In [24]:
all_embeddings.shape

np.save('/kaggle/working/embeddings151101x384_11_Feb_A__(network).npy', all_embeddings)


In [None]:
embeddings = all_embeddings

# Indexing

In [None]:


# Get the dimensionality of your embeddings.


dimension = embeddings.shape[1]

# Create a FAISS index. For example, use IndexFlatL2 for exact L2 (Euclidean) distance search:
index = faiss.IndexFlatL2(dimension)
# For inner product similarity, you could use:
# index = faiss.IndexFlatIP(dimension)

# Add your embeddings to the index.
index.add(embeddings)
print(f"Number of embeddings indexed: {index.ntotal}")

# Appendix

In [None]:
content.document_group.value_counts()

In [None]:
for k,v in content[content.document_group == 'docs'].iloc[0].to_dict().items():
    print(k,v)


In [None]:
l = list(content[content.document_group == 'docs'][content['raw_text'].notna()]['raw_text'])

In [None]:
lengths = sorted(list(map(len, l)))[:-100]
sum(lengths)/len(l), max(lengths), min(lengths)

In [None]:
import matplotlib.pyplot as plt
plt.hist(lengths[:-1000])
plt.title("histogram of number of documents with given length")
plt.xlabel("document length")
plt.ylabel("# of documents")
plt.show()

In [None]:
for k,v in content[content.document_group == 'blog'].iloc[100].to_dict().items():
    print(k,v)


In [None]:
l = list(content[content.document_group == 'blog'][content['raw_text'].notna()]['raw_text'])

In [None]:
lengths = sorted(list(map(len, l)))
sum(lengths)/len(l), max(lengths), min(lengths)

In [None]:
import matplotlib.pyplot as plt
plt.hist(lengths[:-1000])
plt.title("histogram of number of documents with given length")
plt.xlabel("document length")
plt.ylabel("# of documents")
plt.show()

# Queries EDA

In [None]:
import pandas as pd
queries = pd.read_csv("/kaggle/input/vmware-zero-shot-information-retrieval/test.csv")

In [None]:
queries = list(queries['Query'])
len(queries)

In [None]:
from random import random


In [None]:
for _ in range(20):
    print(queries[int(random() * len(queries))])
    

In [None]:

fqueries = filter(lambda text: ('how to' not in text) and ('what is' not in text), queries)
how_to = filter(lambda text: 'how to' in text, queries)
what_is = filter(lambda text: 'what is' in text, queries)

In [None]:
def stats(fqueries):
    fqueries = list(fqueries)
    print(len(fqueries), len(fqueries)/len(queries))
    for _ in range(20):
        print(fqueries[int(random() * len(fqueries) - 1)])
    


In [None]:
stats(fqueries)

In [None]:
stats(how_to)

In [None]:
stats(what_is)

In [None]:
sample_submission = pd.read_csv("/kaggle/input/vmware-zero-shot-information-retrieval/sample_submission.csv")

In [None]:
list(sample_submission['DocumentId'])[:10]