In [1]:
import weaviate

# Connect to the Weaviate instance at localhost:8080.
client = weaviate.Client("http://localhost:8080")


# Client-side batching defaults: fixed batches of 100 objects (dynamic sizing off),
# results passed to weaviate's check_batch_result callback, and writes sent with
# consistency level ALL instead of the default.
# timeout_retries=3: presumably the number of retries after a timeout — confirm in the client docs.
client.batch.configure(
    batch_size=100,
    dynamic=False,
    timeout_retries=3,
    callback=weaviate.util.check_batch_result,
    consistency_level=weaviate.data.replication.ConsistencyLevel.ALL,  # default QUORUM
)

<weaviate.batch.crud_batch.Batch at 0x7f039c2f4c10>

In [5]:
# Create a class vectorized by the text2vec-transformers module; no properties are
# declared here. The schema output recorded in this notebook lists the class as
# 'Passage' (capitalized).
class_obj = {"class": "passage", "vectorizer": "text2vec-transformers"}
client.schema.create_class(class_obj)

In [13]:
# Show the stored schema for the class; the recorded output includes a `passage`
# text property that was generated by Weaviate's auto-schema feature.
client.schema.get("passage")

{'class': 'Passage',
 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
  'cleanupIntervalSeconds': 60,
  'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
 'moduleConfig': {'text2vec-transformers': {'poolingStrategy': 'masked_mean',
   'vectorizeClassName': True}},
 'properties': [{'dataType': ['text'],
   'description': "This property was generated by Weaviate's auto-schema feature on Fri May  5 21:29:13 2023",
   'indexFilterable': True,
   'indexSearchable': True,
   'moduleConfig': {'text2vec-transformers': {'skip': False,
     'vectorizePropertyName': False}},
   'name': 'passage',
   'tokenization': 'word'}],
 'replicationConfig': {'factor': 1},
 'shardingConfig': {'virtualPerPhysical': 128,
  'desiredCount': 1,
  'actualCount': 1,
  'desiredVirtualCount': 128,
  'actualVirtualCount': 128,
  'key': '_id',
  'strategy': 'hash',
  'function': 'murmur3'},
 'vectorIndexConfig': {'skip': False,
  'cleanupIntervalSeconds': 300,
  'maxConnections': 64,
  'efCon

In [17]:
# Near-text query on the `passage` property for the given concept, additionally
# requesting each hit's vector, distance and id (returned under `_additional`).
client.query.get("passage", ["passage"]).with_near_text(
    {"concepts": ["are you sure?"]}
).with_additional(["vector", "distance", "id"]).do()

{'data': {'Get': {'Passage': [{'_additional': {'distance': 0.69266975,
      'id': 'e04656ea-4d11-4e3a-be28-34d3e3cd4f14',
      'vector': [-0.13605526,
       -0.35305062,
       0.68230677,
       -0.77277356,
       -1.0995371,
       -0.48000315,
       -0.6515379,
       0.6029717,
       -0.0536196,
       -0.18245442,
       -0.6804116,
       -0.5405119,
       -0.6405677,
       0.20868714,
       0.23363034,
       0.30014825,
       0.39301825,
       0.64479864,
       -0.18261391,
       0.87439567,
       0.3610479,
       0.71831435,
       0.2807732,
       -0.98202753,
       -0.19589275,
       -1.121207,
       -0.17648916,
       -0.5177107,
       0.63387245,
       0.19702686,
       -0.81997204,
       0.2750283,
       0.021586474,
       -0.43989575,
       -0.72645265,
       0.64203036,
       0.14953962,
       -0.32458213,
       -0.016760755,
       -0.3329675,
       -1.4782261,
       1.4495848,
       0.13901362,
       0.35032842,
       0.43210477,
  

In [3]:
# Fetch server metadata; the recorded output shows the hostname and, per module,
# the configuration of the loaded model.
client.get_meta()


{'hostname': 'http://[::]:8080',
 'modules': {'text2vec-transformers': {'passage': {'model': {'_name_or_path': './models/model',
     'add_cross_attention': False,
     'architectures': ['DPRContextEncoder'],
     'attention_probs_dropout_prob': 0.1,
     'bad_words_ids': None,
     'begin_suppress_tokens': None,
     'bos_token_id': None,
     'chunk_size_feed_forward': 0,
     'cross_attention_hidden_size': None,
     'decoder_start_token_id': None,
     'diversity_penalty': 0,
     'do_sample': False,
     'early_stopping': False,
     'encoder_no_repeat_ngram_size': 0,
     'eos_token_id': None,
     'exponential_decay_length_penalty': None,
     'finetuning_task': None,
     'forced_bos_token_id': None,
     'forced_eos_token_id': None,
     'gradient_checkpointing': False,
     'hidden_act': 'gelu',
     'hidden_dropout_prob': 0.1,
     'hidden_size': 768,
     'id2label': {'0': 'LABEL_0', '1': 'LABEL_1'},
     'initializer_range': 0.02,
     'intermediate_size': 3072,
     'is_d

In [8]:
# Sample passage to import.
text = "I am not sure about this."

with client.batch as batch:
    # Override the batch settings on the batch object for this import
    # (the configure() call in this notebook uses batch_size=100, dynamic=False).
    batch.batch_size = 50
    batch.dynamic = True

    # The class was created without properties; the schema output recorded in this
    # notebook shows `passage` being added by Weaviate's auto-schema feature.
    batch.add_data_object({"passage": text}, class_name="passage")

In [12]:
# NOTE(review): "vector" is requested here as a regular property, but the recorded
# output contains only `passage`. The near-text query in this notebook obtains the
# vector via .with_additional(["vector", ...]) — presumably that is what was
# intended here as well; confirm.
client.query.get("passage", ["passage", "vector"]).do()

{'data': {'Get': {'Passage': [{'passage': 'I am not sure about this.'}]}}}

In [2]:
# Remove the entire schema.
# NOTE(review): presumably this also deletes every stored object — confirm before
# running against an instance that holds data you need.
client.schema.delete_all()


In [6]:
# Show the stored schema for the class; in the recorded output `properties` is
# still an empty list.
client.schema.get("passage")

{'class': 'Passage',
 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
  'cleanupIntervalSeconds': 60,
  'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
 'moduleConfig': {'text2vec-transformers': {'poolingStrategy': 'masked_mean',
   'vectorizeClassName': True}},
 'properties': [],
 'replicationConfig': {'factor': 1},
 'shardingConfig': {'virtualPerPhysical': 128,
  'desiredCount': 1,
  'actualCount': 1,
  'desiredVirtualCount': 128,
  'actualVirtualCount': 128,
  'key': '_id',
  'strategy': 'hash',
  'function': 'murmur3'},
 'vectorIndexConfig': {'skip': False,
  'cleanupIntervalSeconds': 300,
  'maxConnections': 64,
  'efConstruction': 128,
  'ef': -1,
  'dynamicEfMin': 100,
  'dynamicEfMax': 500,
  'dynamicEfFactor': 8,
  'vectorCacheMaxObjects': 1000000000000,
  'flatSearchCutoff': 40000,
  'distance': 'cosine',
  'pq': {'enabled': False,
   'bitCompression': False,
   'segments': 0,
   'centroids': 256,
   'encoder': {'type': 'kmeans', 'distribution': '

In [None]:
# Aggregate query requesting the meta count for the `Document` class.
# NOTE(review): no visible cell creates a `Document` class — confirm it exists.
client.query.aggregate("Document").with_meta_count().do()

In [None]:
# Fetch the full schema and show its "classes" entry.
client.schema.get()["classes"]

In [None]:
# Top-level keys of the text2vec-transformers module metadata
# (the recorded get_meta() output shows a 'passage' entry there).
client.get_meta()["modules"]["text2vec-transformers"].keys()


In [None]:
with client.batch as batch:
    # First object: no vector is passed. Use this form when a vectorization
    # module should produce the vector, or when no vector should be set.
    batch.add_data_object(
        data_object=first_object_props,
        class_name="Author",
        uuid="36ddd591-2dee-4e7e-a3cc-eb86d30a4303",
    )
    # Second object: the vector is supplied explicitly.
    batch.add_data_object(
        data_object=second_object_props,
        class_name="Author",
        uuid="36ddd591-2dee-4e7e-a3cc-eb86d30a4304",
        vector=[0.1, 0.2, 0.3],
    )


In [None]:
# Recursively collect every .txt file below the data directory.
data_path = "data/covid1000/"
paths = list(Path(data_path).glob("**/*.txt"))

In [None]:
# Convert the files under data/covid1000/ into documents, with split_paragraphs enabled.
# NOTE(review): convert_files_to_docs is only imported in a later cell of this notebook.
all_docs = convert_files_to_docs("data/covid1000/", split_paragraphs=True)

In [None]:
# Display the converted documents.
# NOTE(review): this renders the whole collection; a slice may be easier to read.
all_docs

In [None]:
# Configure preprocessing: empty-line and whitespace cleaning on, header/footer
# cleaning off, splitting by word with split_length=100 while respecting
# sentence boundaries.
# NOTE(review): presumably this yields chunks of roughly 100 words that never cut a
# sentence — confirm against the PreProcessor documentation. PreProcessor itself
# is not imported in any visible cell.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
# Run the preprocessor over the converted documents.
docs_default = preprocessor.process(all_docs)


Pipeline:

1. Cleaning
2. Split into small chunks (e.g., paragraphs)
3. Convert to documents in Haystack format
4. Push to vector store


In [None]:
# Recursively list the .txt files under the data directory.
file_dir = Path("./data/covid1000/")
file_paths = list(file_dir.glob("**/*.txt"))


In [None]:
# NOTE(review): `doc` is not assigned in any visible cell — presumably a single
# converted document; confirm where it comes from.
doc.content

In [None]:
from askem.preprocessing import TextProcessor, convert_files_to_docs

# Text-processing helper built with default arguments; its `to_paragraphs` method
# is passed as `clean_func` elsewhere in this notebook.
tp = TextProcessor()


In [None]:
def

In [None]:
import logging
import re
import string
from pathlib import Path
from typing import Callable, Dict, List, Optional
from tqdm.autonotebook import tqdm
import nltk
from haystack.nodes.file_converter import (
    BaseConverter,
    DocxToTextConverter,
    PDFToTextConverter,
    TextConverter,
)
from haystack.schema import Document
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import pipeline

In [None]:
def to_sentences(text: str) -> List[str]:
    """Clean `text` sentence by sentence.

    Each sentence is tokenized, stripped of punctuation tokens, lower-cased,
    filtered against the English stop-word list and lemmatized.

    Args:
        text: Raw input text.

    Returns:
        One space-joined string of cleaned tokens per detected sentence, in the
        original sentence order. A sentence whose tokens are all removed yields
        an empty string, so the list length matches the number of sentences.
    """

    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    punctuation = set(string.punctuation)

    cleaned = []
    for sentence in sent_tokenize(text):
        tokens = []
        for token in word_tokenize(sentence):
            # Drop tokens made up solely of punctuation characters. A plain
            # `token in string.punctuation` is a substring test and lets
            # multi-character tokens such as "...", "--", "``" or "''" through.
            if all(char in punctuation for char in token):
                continue
            token = token.lower()
            # Stop words are matched after lower-casing.
            if token in stop_words:
                continue
            tokens.append(lemmatizer.lemmatize(token))
        cleaned.append(" ".join(tokens))

    return cleaned

In [None]:
def to_paragraphs(self, text: str) -> str:
    r"""Regroup `text` into paragraphs joined by `\n\n`.

    Whitespace runs are collapsed to single spaces, the text is split into
    sentences, and sentences no longer than `self.min_characters` characters
    are dropped. The remaining sentences are merged into paragraphs according
    to the flags returned by `self._get_is_continuous`, with each paragraph
    kept below 1024 word tokens.

    Args:
        text: Raw input text.

    Returns:
        The paragraphs joined by a blank line, or an empty string when no
        sentence survives the length filter.
    """

    # Raw string: "\s" in a regular literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text).strip()
    sentences = sent_tokenize(text)
    sentences = [s for s in sentences if len(s) > self.min_characters]
    if not sentences:
        # Nothing left to group; avoids an IndexError on sentences[0] below.
        return ""
    is_continuous = self._get_is_continuous(sentences)

    # Group sentences into paragraphs
    paragraphs = [sentences[0]]
    # NOTE(review): `i` starts at 0 for the *second* sentence, so is_continuous[i]
    # is read at the index of the preceding sentence — confirm this matches the
    # convention used by _get_is_continuous.
    for i, sentence in enumerate(sentences[1:]):
        not_too_long = len(word_tokenize(paragraphs[-1] + " " + sentence)) < 1024
        if is_continuous[i] and not_too_long:
            paragraphs[-1] += " " + sentence  # Append to last paragraph
        else:
            paragraphs.append(sentence)  # Start a new paragraph

    return "\n\n".join(paragraphs)


In [None]:
!pip install nltk

In [None]:
import nltk
from nltk.corpus import stopwords

# Download the stop-word corpus quietly, then display the English list.
# NOTE(review): sent_tokenize / word_tokenize and WordNetLemmatizer, used elsewhere
# in this notebook, presumably need further NLTK data (e.g. punkt, wordnet) —
# confirm those are downloaded as well.
nltk.download("stopwords", quiet=True)
stopwords.words("english")

In [None]:
# Convert the files under doc_dir into documents, cleaning each one with
# tp.to_paragraphs and with split_paragraphs enabled.
doc_dir = "data/covid1000/"
docs = convert_files_to_docs(
    dir_path=doc_dir, clean_func=tp.to_paragraphs, split_paragraphs=True
)

# FAISS-backed store using a "Flat" index and 128-dimensional embeddings.
# NOTE(review): embedding_dim presumably has to match the output size of the
# passage/query encoders configured below — confirm they emit 128-d vectors.
# FAISSDocumentStore and DensePassageRetriever are not imported in any visible cell.
document_store = FAISSDocumentStore(embedding_dim=128, faiss_index_factory_str="Flat")
document_store.write_documents(docs)
# Dense retriever with separate query and passage encoder checkpoints.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki",
    passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
)
# Update the stored documents' embeddings using the retriever.
document_store.update_embeddings(retriever)


In [None]:
# Build the generator from the vblagoje/bart_lfqa checkpoint and combine it with
# the retriever into a generative QA pipeline.
# NOTE(review): Seq2SeqGenerator and GenerativeQAPipeline are not imported in any
# visible cell, and `retriever` must already exist in the kernel.
generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")
pipe = GenerativeQAPipeline(generator, retriever)