In [1]:
import weaviate
import json
import pandas

In [2]:
# Connect to the Weaviate instance served at localhost:8080.
client = weaviate.Client(url="http://localhost:8080")

In [9]:
# Print the schema currently reported by the server.
# NOTE(review): in file order this cell runs before the create_class cell
# below, so on a fresh server "Paper" may not be listed yet -- confirm the
# intended cell order.
print(client.schema.get())

{'classes': [{'class': 'Paper', 'description': 'Articles', 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2}, 'cleanupIntervalSeconds': 60, 'stopwords': {'additions': None, 'preset': 'en', 'removals': None}}, 'moduleConfig': {'text2vec-transformers': {'poolingStrategy': 'masked_mean', 'vectorizeClassName': True}}, 'properties': [{'dataType': ['string'], 'description': 'The id', 'indexInverted': False, 'moduleConfig': {'text2vec-transformers': {'skip': True, 'vectorizePropertyName': False}}, 'name': 'pdfId', 'tokenization': 'word'}, {'dataType': ['text'], 'description': 'The abstract', 'indexInverted': True, 'moduleConfig': {'text2vec-transformers': {'skip': False, 'vectorizePropertyName': False}}, 'name': 'abstract', 'tokenization': 'word'}, {'dataType': ['string[]'], 'description': 'The categories', 'indexInverted': False, 'moduleConfig': {'text2vec-transformers': {'options': {'useCache': True, 'useGPU': True, 'waitForModel': True}, 'skip': True, 'vectorizePropertyName': False}},

In [8]:
# Register the "Paper" class with the text2vec-transformers vectorizer.
# `abstract` is the only property declared with "skip": False; the other
# properties set "skip": True and disable the inverted index.
client.schema.create_class({
    "class": "Paper",
    "description": "Articles",
    "vectorizer": "text2vec-transformers",
    "properties": [
        {
            "name": "pdfId",
            "description": "The id",
            "dataType": ["string"],
            "indexInverted": False,
            "moduleConfig": {"text2vec-transformers": {"skip": True}},
        },
        {
            "name": "abstract",
            "description": "The abstract",
            "dataType": ["text"],
            "indexInverted": True,
            "moduleConfig": {"text2vec-transformers": {"skip": False}},
        },
        {
            "name": "categories",
            "description": "The categories",
            "dataType": ["string[]"],
            "indexInverted": False,
            "moduleConfig": {
                "text2vec-transformers": {
                    "skip": True,
                    "options": {
                        "waitForModel": True,
                        "useGPU": True,
                        "useCache": True,
                    },
                },
            },
        },
        {
            "name": "version",
            "description": "The year of the most recent version of the paper",
            "dataType": ["int"],
            "indexInverted": False,
            "moduleConfig": {"text2vec-transformers": {"skip": True}},
        },
    ],
})

In [65]:
# Fetch the pdfId field of at most 500 Paper objects.
data = (
    client.query
    .get("Paper", ["pdfId"])
    .with_limit(500)
    .do()
)

In [14]:
# Request the aggregate "meta {count}" field for the Paper class.
data = (
    client.query
    .aggregate("Paper")
    .with_fields("meta {count}")
    .do()
)

In [17]:
# Near-text search argument: a single free-text concept describing LDA.
# The adjacent string literals below are concatenated into ONE list element.
nearText = {
    "concepts": [
        (
            "Specifically the LDA equation has two arguments. "
            "The first being a dirichlet distribution of documents in a space "
            "in which each axis is a topic. "
            "The second being a word space with each word being an axis with "
            "the topics placed within that space to show how words are "
            "weighted between topics. "
            "These arguments are then refined to try to get the best "
            "probability of reproducing the original documents. "
            "These refined arguments can then be used on a new document to "
            "place it into the space mentioned above and assign a probability "
            "of being a certain topic."
        ),
    ],
}

In [20]:
# Fetch the pdfId of the 5 Paper objects returned for the nearText query.
data = (
    client.query
    .get("Paper", ["pdfId"])
    .with_near_text(nearText)
    .with_limit(5)
    .do()
)

In [6]:
# Intentionally disabled: uncomment to delete the "Paper" class before re-creating it.
#client.schema.delete_class(class_name="Paper")

In [21]:
# Pretty-print the most recent query response held in `data` (4-space indent).
print(json.dumps(data, indent=4))

{
    "data": {
        "Get": {
            "Paper": [
                {
                    "pdfId": "2102-04449"
                },
                {
                    "pdfId": "1606-00577"
                },
                {
                    "pdfId": "1812-05813"
                },
                {
                    "pdfId": "1506-08422"
                },
                {
                    "pdfId": "1410-6991"
                }
            ]
        }
    }
}


In [10]:
def loadFromJson(path="ArxivCSOnlyDataset.json"):
    """
    Lazily yield one dict per record of a JSON-lines dataset.

    Parameters
    ----------
    path : str, optional
        Location of the JSON-lines file. Defaults to the dataset file the
        notebook was written against, so existing ``loadFromJson()`` calls
        keep working.

    Yields
    ------
    dict
        The record's columns as keys, with every "." in ``pdfId`` replaced
        by "-".

    Raises
    ------
    KeyError
        If the dataset has no ``pdfId`` column.
    """
    # Force pdfId to stay a string. With the default dtype inference pandas
    # coerces numeric-looking strings to float, so an id such as "1506.08420"
    # would silently become 1506.0842 and lose its trailing zero.
    dataset = pandas.read_json(path, lines=True, dtype={"pdfId": str})

    # The positional index is not part of the record, so it is simply ignored
    # instead of being materialised as a column and deleted again.
    for _, row in dataset.iterrows():
        record = row.to_dict()
        record["pdfId"] = str(record["pdfId"]).replace(".", "-")
        yield record

# Batch Efficiency Testing

batch-size  num-workers  time-to-complete (10,000 inserts)

5000        1           1m 51.1s
5000        2           1m 50.5s

1000        1           1m 48.7s
1000        2           1m 48.3s

500         1           1m 50.9s
500         2           1m 51.4s

10          1           1m 51s
10          2           1m 53s


In [12]:
def check_batch_result(results: dict):
  """
  Print every batch entry whose result reports an error.

  Parameters
  ----------
  results : dict
      The value handed to the batch callback; ``None`` is accepted and
      ignored. It is iterated, and each item is inspected for a
      ``"result"`` mapping that has an ``"errors"`` member containing
      ``"error"``.
  """

  if results is None:
    return

  for entry in results:
    if "result" not in entry:
      continue
    outcome = entry["result"]
    # Only report entries that actually carry an error payload.
    if "errors" in outcome and "error" in outcome["errors"]:
      print(outcome)

In [13]:
# Batch settings; check_batch_result is registered as the batch callback.
client.batch.configure(
    batch_size=1000,
    dynamic=True,
    num_workers=1,
    callback=check_batch_result,
)

# Queue every record produced by loadFromJson() as a "Paper" object.
with client.batch as batch:
    for obj in loadFromJson():
        batch.add_data_object(obj, class_name="Paper")