In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/vmware-zero-shot-information-retrieval/sample_submission.csv
/kaggle/input/vmware-zero-shot-information-retrieval/vmware_ir_content.csv
/kaggle/input/vmware-zero-shot-information-retrieval/test.csv


In [2]:
# Settings for the character-based document chunking done further down.
hyperparameters = dict([
    ('chunk_size_characters', 200),  # read into `chunk_size` by the chunking cell
    ('chunk_size_padding', 50),      # read into `padding` by the chunking cell
])

# Chunk Documents

In [30]:
import pandas as pd
# Read the content CSV of the competition into a DataFrame.
content_csv_path = "/kaggle/input/vmware-zero-shot-information-retrieval/vmware_ir_content.csv"
content = pd.read_csv(content_csv_path)

In [75]:
# Total number of characters summed over the raw_text column.
content['raw_text'].str.len().sum() # 750M
# Rough number of chunks: chunkify starts a new chunk every
# chunk_size + padding = 200 + 50 = 250 characters, so 750M / 250 = ~3M chunks
# (plus up to one shorter trailing chunk per document).

750988279.0

In [31]:
# Prototype on a 1% random sample of the rows.
# A fixed random_state makes the sample, and everything derived from it,
# identical after a kernel restart.
sampled_content = content.sample(frac=0.01, random_state=42)

In [40]:
# Unpack the chunking settings from the hyperparameters dict.
chunk_size, padding = (
    hyperparameters['chunk_size_characters'],
    hyperparameters['chunk_size_padding'],
)

def chunkify(document, size=None, pad=None):
    """Split a document into overlapping character chunks.

    A new chunk starts every ``size + pad`` characters and spans up to
    ``size + 2 * pad`` characters, so consecutive chunks share ``pad``
    characters. The final chunk may be shorter.

    Args:
        document: Text to split. ``None`` or a float NaN (the value pandas
            uses for a missing cell) produces an empty list instead of an
            error.
        size: Core chunk length in characters. Defaults to the module-level
            ``chunk_size``.
        pad: Extra characters added to the stride and to the chunk end.
            Defaults to the module-level ``padding``.

    Returns:
        A list of chunk strings in document order; empty for empty or
        missing input.

    Raises:
        ValueError: If ``size + pad`` is not positive, because the start
            position would never advance.
    """
    if size is None:
        size = chunk_size
    if pad is None:
        pad = padding

    # NaN is the only float that is not equal to itself.
    if document is None or (isinstance(document, float) and document != document):
        return []

    stride = size + pad
    if stride <= 0:
        raise ValueError("size + pad must be positive, got %r" % (stride,))

    window = stride + pad  # pad + size + pad characters per chunk
    return [document[start:start + window]
            for start in range(0, len(document), stride)]


chunks = {} # document id -> list of chunks
# Take the first 10 sampled documents that actually have text: rows whose
# raw_text is missing would make len(v) fail. head(10) also avoids building a
# list of the whole sample just to slice it.
# The loop variables keep the names k and v because a later cell calls chunkify(v).
for k, v in sampled_content['raw_text'].dropna().head(10).items():
    chunks[k] = chunkify(v)
    print(f"id: {k}, passage length: {len(v)}, number of chunks: {len(chunks[k])}")

id: 129637, passage length: 2061, number of chunks: 9
id: 315611, passage length: 4791, number of chunks: 20
id: 120563, passage length: 3339, number of chunks: 14
id: 67449, passage length: 287, number of chunks: 2
id: 152059, passage length: 71, number of chunks: 1
id: 149767, passage length: 861, number of chunks: 4
id: 288907, passage length: 9009, number of chunks: 37
id: 244542, passage length: 722, number of chunks: 3
id: 25297, passage length: 6475, number of chunks: 26
id: 48129, passage length: 1945, number of chunks: 8


In [42]:
# Fetch the two example passages from `content` by document id instead of from
# `chunks`: `chunks` only holds ids that happened to be in the random sample,
# so indexing it with fixed ids raises KeyError on a re-run.
c1 = chunkify(content.loc[129637, 'raw_text'])[0]
c2 = chunkify(content.loc[288907, 'raw_text'])[0]
c1, c2

('When you customize the request form for a vRealize Automation blueprint, you can base the behavior of some fields on the results of a vRealize Orchestrator action.\nThere are several ways that you can use vRealize Orchestrator actions.\nYou might have an action that pulls the data from a third source,',
 'The following metrics are available for each ElastiCache Cache Node instance in your vRealize Operations Manager environment.\nFor a description of each metric, see the Amazon Web Service documentation at http://docs.aws.amazon.com/AmazonElastiCache/latest/UserGuide/CacheMetrics.Redis.html, http://do')

In [60]:


# NOTE(review): `v` is not defined in this cell. It is whatever document the
# loop variable of the earlier chunking loop was last bound to, so the
# displayed chunks depend on kernel state.
chunkify(v)
        
        
        
    

['You can uninstall the Log Insight Linux Agent RPM package.\nPrerequisites\nLog in as root or use sudo to run console commands.\nLog in to the Linux machine on which you installed the Log Insight Linux Agent, open a terminal console and run pgrep liagent to verify that the VMware Log Insight Linux Agent',
 ' to verify that the VMware Log Insight Linux Agent is installed and running.\nProcedure\n♦Run the following command replacing VERSION and BUILD_NUMBER with the version and build number of the installed agent.\nrpm -e VMware-Log-Insight-Agent-VERSION-BUILD_NUMBER\nResults\nThe uninstaller stops the VMware',
 'LD_NUMBER\nResults\nThe uninstaller stops the VMware Log Insight Linux Agent daemon and removes all its files except its own logs from the system.']

# Text Embedding Model

In [16]:
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Average the hidden states over dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are set to 0.0 before summing,
    and the sum is divided by the number of non-zero mask entries per row.
    """
    keep = attention_mask.unsqueeze(-1).bool()
    zeroed = last_hidden_states.masked_fill(keep.logical_not(), 0.0)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / token_counts


# Each input text should start with "query: " or "passage: ".
# For tasks other than retrieval, you can simply use the "query: " prefix.
# Order matters: the scoring code below treats the first two entries as
# queries and the remaining entries as passages.
input_texts = [
    'query: what node metrics are available?',
    'query: how do I customize the request form for my blueprint?',
    f"passage: {c1}",
    f"passage: {c2}",  # the closing brace was missing here, which is a SyntaxError
]

# Tokenizer and encoder are loaded from the same checkpoint.
checkpoint = 'intfloat/e5-small-v2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Tokenize the input texts into one padded batch of PyTorch tensors.
batch_dict = tokenizer(
    input_texts,
    max_length=512,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
outputs = model(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# L2-normalize each embedding, then score every query against every passage.
embeddings = F.normalize(embeddings, p=2, dim=1)
query_embeddings = embeddings[:2]
passage_embeddings = embeddings[2:]
scores = (query_embeddings @ passage_embeddings.T) * 100
print(scores.tolist())


[[92.2717056274414, 71.97750854492188], [73.05535888671875, 90.54825592041016]]


In [20]:
# List the fields the tokenizer returned for the batch.
batch_dict.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [24]:
# Shapes of the token ids and of the attention mask for the batch.
batch_dict['input_ids'].shape, batch_dict['attention_mask'].shape

(torch.Size([4, 75]), torch.Size([4, 75]))

In [13]:
# Shape of the model's last hidden state for the batch.
outputs.last_hidden_state.shape

torch.Size([4, 75, 384])

In [25]:
# Re-run the pooling step on its own to inspect the (not yet normalized) result.
hidden_states = outputs.last_hidden_state
embeddings = average_pool(hidden_states, batch_dict['attention_mask'])
embeddings.shape

torch.Size([4, 384])

In [26]:
# L2-normalize along dim 1 (p=2); overwrites `embeddings` in place of the pooled values.
embeddings = F.normalize(embeddings, p=2, dim=1)

In [29]:
# Display the current value of `embeddings`.
embeddings

tensor([[-0.0645, -0.0157,  0.0472,  ...,  0.0176, -0.0115, -0.0080],
        [-0.0903,  0.0329,  0.0039,  ..., -0.0069,  0.0005,  0.0178],
        [-0.0526,  0.0210,  0.0388,  ...,  0.0021, -0.0063,  0.0017],
        [-0.0766,  0.0658, -0.0060,  ...,  0.0067,  0.0048,  0.0367]],
       grad_fn=<DivBackward0>)

In [9]:
import torch.nn as nn

class ModelWrapper(nn.Module):
    """Adapter that makes a wrapped model return a plain tensor.

    The wrapped model is called with ``input_ids`` and ``attention_mask`` as
    keyword arguments and only the ``last_hidden_state`` attribute of its
    output is returned.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return its ``last_hidden_state``."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state

# Wrap your model so that tracing sees a plain tensor output
wrapped_model = ModelWrapper(model)

# Create a dummy input tuple (make sure these are on the same device as your model)
dummy_input = (batch_dict['input_ids'], batch_dict['attention_mask'])

from torch.utils.tensorboard import SummaryWriter

# The context manager closes the writer even if add_graph raises, so the
# event file is not left open after a failed trace.
with SummaryWriter("runs/my_experiment") as writer:
    writer.add_graph(wrapped_model, dummy_input)


In [11]:
# Load the TensorBoard extension
# (when it is already loaded, IPython asks for %reload_ext instead)
%load_ext tensorboard

# Launch TensorBoard pointing to the log directory you used ("runs/my_experiment")
%tensorboard --logdir runs/my_experiment

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 119), started 0:00:26 ago. (Use '!kill 119' to kill it.)

<IPython.core.display.Javascript object>

In [12]:
 # Reload the TensorBoard extension (the form to use once it is already loaded).
 %reload_ext tensorboard

# Appendix

In [None]:
# Count how many rows carry each document_group value.
content['document_group'].value_counts()

In [None]:
# Print every field of the first row whose document_group is 'docs'.
first_docs_row = content[content.document_group == 'docs'].iloc[0]
for field, value in first_docs_row.to_dict().items():
    print(field, value)


In [None]:
# Combine both conditions into one mask over `content`. Applying the
# unfiltered notna() mask to the already-filtered frame makes pandas reindex
# the mask and warn "Boolean Series key will be reindexed".
is_docs_with_text = (content.document_group == 'docs') & content['raw_text'].notna()
l = list(content.loc[is_docs_with_text, 'raw_text'])

In [None]:
# Document lengths in ascending order with the 100 longest dropped.
lengths = sorted(map(len, l))[:-100]
# Mean, max and min are all computed over the same trimmed list. The mean
# previously divided the trimmed sum by the untrimmed count len(l).
sum(lengths)/len(lengths), max(lengths), min(lengths)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the sorted lengths with the 1000 largest left out.
fig, ax = plt.subplots()
ax.hist(lengths[:-1000])
ax.set_title("histogram of number of documents with given length")
ax.set_xlabel("document length")
ax.set_ylabel("# of documents")
plt.show()

In [None]:
# Print every field of the row at position 100 among the 'blog' rows.
blog_rows = content.loc[content['document_group'] == 'blog']
blog_row_fields = blog_rows.iloc[100].to_dict()
for field, value in blog_row_fields.items():
    print(field, value)


In [None]:
# Combine both conditions into one mask over `content`. Applying the
# unfiltered notna() mask to the already-filtered frame makes pandas reindex
# the mask and warn "Boolean Series key will be reindexed".
is_blog_with_text = (content.document_group == 'blog') & content['raw_text'].notna()
l = list(content.loc[is_blog_with_text, 'raw_text'])

In [None]:
# Lengths of all texts in ascending order, then (mean, max, min).
lengths = sorted(len(text) for text in l)
mean_length = sum(lengths) / len(l)
mean_length, max(lengths), min(lengths)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the sorted lengths with the 1000 largest left out.
trimmed_lengths = lengths[:-1000]
plt.hist(trimmed_lengths)
plt.gca().set(
    title="histogram of number of documents with given length",
    xlabel="document length",
    ylabel="# of documents",
)
plt.show()

# Queries EDA

In [None]:
import pandas as pd
# Read the test CSV of the competition into a DataFrame.
queries_csv_path = "/kaggle/input/vmware-zero-shot-information-retrieval/test.csv"
queries = pd.read_csv(queries_csv_path)

In [None]:
# Replace the DataFrame with the plain list of query strings.
# The isinstance guard makes this cell safe to re-run: once `queries` is
# already a list, indexing it with 'Query' would raise TypeError.
if isinstance(queries, pd.DataFrame):
    queries = list(queries['Query'])
len(queries)

In [None]:
from random import random


In [None]:
# Print 20 queries picked uniformly at random (with replacement).
n_queries = len(queries)
for _ in range(20):
    pick = int(random() * n_queries)
    print(queries[pick])
    

In [None]:

# Split the queries into three buckets by substring match.
# Lists are used instead of filter(): a filter object can be consumed only
# once, so a second pass over the same bucket would see it empty.
fqueries = [text for text in queries if ('how to' not in text) and ('what is' not in text)]
how_to = [text for text in queries if 'how to' in text]
what_is = [text for text in queries if 'what is' in text]

In [None]:
def stats(fqueries, total=None, n_samples=20):
    """Print the size of a query subset, its share of all queries, and a sample.

    Args:
        fqueries: Iterable of queries; it is materialized into a list first.
        total: Denominator for the share. Defaults to ``len(queries)``, the
            module-level list of all queries.
        n_samples: Number of random examples to print (drawn with replacement).

    Prints one line with the count and the share, followed by ``n_samples``
    randomly chosen entries. Nothing is sampled when the subset is empty.
    """
    fqueries = list(fqueries)
    if total is None:
        total = len(queries)
    share = len(fqueries) / total if total else 0.0
    print(len(fqueries), share)

    if not fqueries:
        # Nothing to sample from; indexing an empty list would raise IndexError.
        return

    for _ in range(n_samples):
        # random() is in [0, 1), so this index covers 0 .. len - 1 uniformly.
        # The previous "- 1" inside int() never reached the last element and
        # double-weighted the first one.
        print(fqueries[int(random() * len(fqueries))])
    


In [None]:
# Queries that contain neither 'how to' nor 'what is'.
stats(fqueries)

In [None]:
# Queries that contain 'how to'.
stats(how_to)

In [None]:
# Queries that contain 'what is'.
stats(what_is)

In [None]:
# Read the sample submission CSV of the competition into a DataFrame.
sample_submission_csv_path = "/kaggle/input/vmware-zero-shot-information-retrieval/sample_submission.csv"
sample_submission = pd.read_csv(sample_submission_csv_path)

In [None]:
# First 10 values of the DocumentId column as a plain list.
sample_submission['DocumentId'].head(10).tolist()