In [4]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer


def test_passage_encoding(
    test_text="Where an aircraft passes through a cloud, it can disperse the cloud in its path.",
    model_name="vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
):
    """Embed a single passage with a DPR context encoder.

    Args:
        test_text: Passage to encode. Defaults to the sample sentence this
            notebook also uploads to Weaviate, so both vectors can be compared.
        model_name: Hugging Face checkpoint used for both the tokenizer and
            the encoder.

    Returns:
        The encoder's ``pooler_output`` for the passage as a ``torch.Tensor``,
        computed without gradient tracking (this is inference only).
    """
    # Local import keeps the cell self-contained; the cell's own import line
    # only pulls in the transformers classes.
    import torch

    tokenizer = DPRContextEncoderTokenizer.from_pretrained(model_name)
    model = DPRContextEncoder.from_pretrained(model_name)

    input_ids = tokenizer(test_text, return_tensors="pt")["input_ids"]

    # No training happens here, so skip building the autograd graph.
    with torch.no_grad():
        embeddings = model(input_ids).pooler_output

    return embeddings

In [16]:
# Encode the sample sentence once and keep the embedding for the inspection cells below.
# The tokenizer-class warning shown in the output is emitted while the tokenizer is loaded.
test_y = test_passage_encoding()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


In [33]:
# Check which type the encoder returned (the recorded output is torch.Tensor).
type(test_y)

torch.Tensor

In [30]:
# Display the full embedding so it can be compared by eye with the vector Weaviate returns later.
test_y

tensor([[-0.3305, -0.5552,  0.5714, -0.5512, -0.5668, -0.1285,  0.8487, -0.2048,
          0.7291, -0.5411, -0.7498, -0.6651, -0.4083,  0.1175,  0.0721,  0.9723,
          0.5629, -0.7849,  0.3385,  1.1212,  1.5044,  1.4968,  1.2595, -1.3583,
         -0.4184, -1.8523, -0.1297, -0.6417,  0.6851,  1.2277, -0.5808,  0.0867,
         -0.1943, -0.3925, -0.0498,  0.2869,  0.5289,  0.1897, -0.8804, -0.0174,
         -0.6025,  0.8758,  0.5837, -0.5072,  0.2960, -0.4323, -0.1153, -0.7718,
          0.3938,  0.8190, -1.8458, -2.0164,  0.2391,  0.1442, -0.2318,  0.5177,
         -0.3433, -0.8197, -0.8672,  1.3128,  0.5167,  0.4904, -0.7432,  0.8290,
          1.3873, -0.8285,  0.3470, -0.0715, -0.6253, -1.1326, -0.0498,  0.4060,
         -1.0165, -0.0336, -0.2082, -0.8550, -0.6420,  0.8271, -0.8328,  0.5363,
         -0.3401, -1.0566,  0.0377,  0.0340, -0.3640,  0.0858, -0.4931,  0.4870,
          0.6979,  1.2391, -0.1718, -0.0192, -0.1439,  0.2411, -0.4914,  0.2170,
          0.3045,  0.3282,  

In [23]:
# Dimensionality of the embedding: number of entries in the first (only) row.
len(test_y[0])

128

In [None]:
from pathlib import Path

# Directory holding the debug text files.
input_dir = "/askem/data/debug_data"
# Lazily walk the directory tree for every .txt file (one-shot generator).
files = Path(input_dir).rglob("*.txt")


In [6]:
import os

import weaviate

# The API key is read from the environment so that no secret lives in the notebook.
# NOTE(review): a key was previously hardcoded in this cell; treat it as leaked and rotate it.
weaviate_api_key = os.environ.get("WEAVIATE_API_KEY")
if not weaviate_api_key:
    raise RuntimeError(
        "Set the WEAVIATE_API_KEY environment variable before connecting to Weaviate."
    )

# Client for the local Weaviate instance, authenticated with the API key.
client = weaviate.Client(
    "http://localhost:8080",
    auth_client_secret=weaviate.auth.AuthApiKey(weaviate_api_key),
)


In [7]:
# NOTE(review): this call fails; the recorded output is
# "AttributeError: 'Client' object has no attribute 'vectorizer'".
# In this notebook the vectorizer is instead chosen per class, through the
# "vectorizer" key of the schema passed to client.schema.create_class.
client.vectorizer.create("")

AttributeError: 'Client' object has no attribute 'vectorizer'

In [None]:
# Show the metadata reported by the connected Weaviate instance.
client.get_meta()


### Preprocessing

In [None]:
from haystack import Pipeline
from haystack.nodes import PreProcessor, TextConverter

# Indexing pipeline layout: File -> text_converter -> preprocessor.
pipeline = Pipeline()

# First node, fed directly from the pipeline's "File" input.
text_converter = TextConverter()
pipeline.add_node(component=text_converter, name="text_converter", inputs=["File"])

# Second node: cleaning switches plus word-based splitting
# (split_length=200, split_overlap=5, sentence boundaries not respected).
preprocessor = PreProcessor(
    split_by="word",
    split_length=200,
    split_overlap=5,
    split_respect_sentence_boundary=False,
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=True,
)
pipeline.add_node(component=preprocessor, name="preprocessor", inputs=["text_converter"])


#### Run preprocessing

In [None]:
# Collect every .txt file below the debug data directory (one-shot generator).
input_dir = "/askem/data/debug_data"
files = Path(input_dir).rglob("*.txt")
# The pipeline is given plain string paths, so convert each Path first.
docs = pipeline.run(file_paths=list(map(str, files)))

### Upload data to Weaviate

In [24]:
# Clear the Weaviate schema so the Passage class can be created from scratch in the next cell.
# NOTE(review): presumably this also removes every stored object; confirm before
# running against a shared instance.
client.schema.delete_all()

In [25]:
import json

# Weaviate class definition for passages (paragraph-sized chunks of a document).
passage_schema = {
    "class": "Passage",
    "description": "Paragraph chunk of a document",
    "vectorizer": "text2vec-transformers",
    "moduleConfig": {
        "text2vec-transformers": {
            "vectorizeClassName": False,
        },
    },
    # DPR is designed for dot-product similarity
    "vectorIndexConfig": {
        "distance": "dot",
    },
    "properties": [
        {
            "name": "content",
            "dataType": ["text"],
        },
        {
            "name": "paper_id",
            "dataType": ["text"],
            # paper_id is flagged "skip" for the text2vec-transformers module.
            "moduleConfig": {
                "text2vec-transformers": {"skip": True},
            },
        },
    ],
}

# Register the class with the running Weaviate instance.
client.schema.create_class(passage_schema)

# Persist the schema as Weaviate reports it back, pretty-printed.
with open("askem/schema/passage.json", "w") as f:
    f.write(json.dumps(client.schema.get("passage"), indent=2))

#### Load data into Weaviate

In [26]:
# One sample passage; the keys match the Passage properties "content" and "paper_id".
data_obj = dict(
    content="Where an aircraft passes through a cloud, it can disperse the cloud in its path.",
    paper_id="Title of the document",
)

# Send the object through the client's batch context manager.
with client.batch as batch:
    batch.batch_size = 1
    batch.dynamic = True
    batch.add_data_object(data_obj, class_name="passage")

In [27]:
# Count the objects stored in the Passage class (the recorded output shows 1 after the upload above).
client.query.aggregate("passage").with_meta_count().do()

{'data': {'Aggregate': {'Passage': [{'meta': {'count': 1}}]}}}

In [28]:
# Semantic search over Passage for the concept "animal", returning the passage text
# together with the stored vector, the distance and the object id.
# NOTE(review): the recorded distance is negative, presumably because the class
# uses the "dot" distance metric; confirm against the Weaviate docs.
y = (
    client.query.get("passage", ["content"])
    .with_near_text({"concepts": ["animal"]})
    .with_additional(["vector", "distance", "id"])
    .do()
)

In [32]:
# Display the raw query response, including the vector Weaviate stored for the passage.
y

{'data': {'Get': {'Passage': [{'_additional': {'distance': -0.87897265,
      'id': 'bd2e916d-1e72-4282-89a3-3c28f8d35778',
      'vector': [-0.330512,
       -0.5551501,
       0.5713861,
       -0.55120754,
       -0.56681365,
       -0.1284562,
       0.84872055,
       -0.20482332,
       0.72912794,
       -0.5411461,
       -0.74979305,
       -0.6650699,
       -0.40830395,
       0.11752158,
       0.07209335,
       0.9723168,
       0.5628617,
       -0.7848689,
       0.33852035,
       1.121164,
       1.5043601,
       1.4968308,
       1.2595344,
       -1.3583002,
       -0.41837773,
       -1.8523325,
       -0.12974942,
       -0.6417205,
       0.6850843,
       1.2277026,
       -0.5807784,
       0.08673872,
       -0.19425073,
       -0.39245838,
       -0.049770813,
       0.28685445,
       0.52892125,
       0.18966648,
       -0.8804232,
       -0.017353758,
       -0.6024761,
       0.8758074,
       0.583739,
       -0.50723875,
       0.2959907,
       -0.43

In [15]:
# Length of the vector stored for the first hit (recorded output: 128, the same as the local embedding).
len(y["data"]["Get"]["Passage"][0]["_additional"]["vector"])

128

In [None]:
# Second sample passage to upload.
text = "I am not sure about this."

with client.batch as batch:
    batch.batch_size = 50
    batch.dynamic = True

    # The Passage class defined in this notebook stores its text in "content";
    # it has no "passage" property, so the object must use the "content" key.
    batch.add_data_object({"content": text}, class_name="passage")


In [None]:
# Fetch the stored passages with their vectors. The Passage class only has the
# "content" and "paper_id" properties; the vector is requested through
# with_additional, the same way the near-text query in this notebook does.
client.query.get("passage", ["content"]).with_additional(["vector"]).do()


In [None]:
# Clear the Weaviate schema again.
# NOTE(review): presumably this also removes every stored object; confirm before
# running against a shared instance.
client.schema.delete_all()

In [None]:
# Look up the schema Weaviate currently holds for the Passage class.
client.schema.get("passage")


In [None]:
# Count the objects currently stored in the Passage class.
client.query.aggregate("Passage").with_meta_count().do()


In [None]:
# List every class definition in the current schema.
client.schema.get()["classes"]


In [None]:
# Show which metadata fields the instance reports for its text2vec-transformers module.
client.get_meta()["modules"]["text2vec-transformers"].keys()