# Création de la base de données vectorielle

### Connexion à Weaviate

In [1]:
import weaviate
from weaviate.classes.init import Auth
import os
from dotenv import load_dotenv

# Load the variables declared in the project's .env file into the environment.
load_dotenv()

# os.getenv returns None for an unset variable; fail fast with a readable
# message instead of handing None to the Weaviate client or the request headers.
required_env_vars = ("MISTRAL_API_KEY", "WEAVIATE_URL", "WEAVIATE_API_KEY")
missing_env_vars = [name for name in required_env_vars if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

mistral_key = os.getenv("MISTRAL_API_KEY")
weaviate_url = os.getenv("WEAVIATE_URL")
weaviate_api_key = os.getenv("WEAVIATE_API_KEY")

# Extra request header carrying the Mistral key alongside the Weaviate credentials.
headers = {
    "X-Mistral-Api-Key": mistral_key,
}

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
    headers=headers
)

# Sanity check: prints whether the cluster reports itself ready.
print(client.is_ready())

True


### Création de la collection

In [2]:
from weaviate.classes.config import Configure, Property, DataType, VectorDistances

# Vector index constructors to compare, keyed by the tag used in vector names.
vector_indexs = dict(
    hnsw=Configure.VectorIndex.hnsw,
    flat=Configure.VectorIndex.flat,
)

# Distance metrics to compare, keyed by a three-letter tag.
vector_distances = dict(
    cos=VectorDistances.COSINE,
    dot=VectorDistances.DOT,
    l2s=VectorDistances.L2_SQUARED,
    ham=VectorDistances.HAMMING,
    man=VectorDistances.MANHATTAN,
)

# One index configuration per (index, distance) pair, stored under the key
# "vector_<index tag>_<distance tag>". The index kind varies slowest, so all
# hnsw entries come before the flat ones.
vector_config_dict = {
    f"vector_{index_tag}_{metric_tag}": build_index(distance_metric=metric)
    for index_tag, build_index in vector_indexs.items()
    for metric_tag, metric in vector_distances.items()
}

print(vector_config_dict)

{'vector_hnsw_cos': _VectorIndexConfigHNSWCreate(distance=<VectorDistances.COSINE: 'cosine'>, multivector=None, quantizer=None, cleanupIntervalSeconds=None, dynamicEfMin=None, dynamicEfMax=None, dynamicEfFactor=None, efConstruction=None, ef=None, filterStrategy=None, flatSearchCutoff=None, maxConnections=None, vectorCacheMaxObjects=None), 'vector_hnsw_dot': _VectorIndexConfigHNSWCreate(distance=<VectorDistances.DOT: 'dot'>, multivector=None, quantizer=None, cleanupIntervalSeconds=None, dynamicEfMin=None, dynamicEfMax=None, dynamicEfFactor=None, efConstruction=None, ef=None, filterStrategy=None, flatSearchCutoff=None, maxConnections=None, vectorCacheMaxObjects=None), 'vector_hnsw_l2s': _VectorIndexConfigHNSWCreate(distance=<VectorDistances.L2_SQUARED: 'l2-squared'>, multivector=None, quantizer=None, cleanupIntervalSeconds=None, dynamicEfMin=None, dynamicEfMax=None, dynamicEfFactor=None, efConstruction=None, ef=None, filterStrategy=None, flatSearchCutoff=None, maxConnections=None, vector

In [4]:
# Scalar fields stored on every object, as (property name, data type) pairs.
property_specs = [
    ("year", DataType.INT),
    ("page", DataType.INT),
    ("category", DataType.TEXT),
    ("chunk", DataType.TEXT),
]
schema_properties = [
    Property(name=field_name, data_type=field_type)
    for field_name, field_type in property_specs
]

# One named vector per entry of vector_config_dict; every one of them is
# computed from the "chunk" property only and differs by its index configuration.
# NOTE(review): each named vector presumably triggers its own Mistral embedding
# request per object -- confirm the API cost is acceptable for this comparison.
named_vectors = []
for vector_name, index_config in vector_config_dict.items():
    named_vectors.append(
        Configure.Vectors.text2vec_mistral(
            name=vector_name,
            source_properties=["chunk"],
            vector_index_config=index_config,
        )
    )

# Kept as the last expression so the notebook displays the created collection.
# NOTE(review): this presumably fails if "val_de_fensch" already exists -- confirm
# before re-running the notebook against the same cluster.
client.collections.create(
    "val_de_fensch",
    properties=schema_properties,
    vector_config=named_vectors,
)

<weaviate.collections.collection.sync.Collection at 0x22aab300550>

### Chargement du Dataset

In [5]:
# Handle on the "val_de_fensch" collection; the batch import below goes through it.
val_de_fensch_collection = client.collections.use("val_de_fensch")

In [6]:
import polars as pl

# Cleaned chunks dataset; the path is relative to the notebook's working directory.
chunks_csv_path = "../datasets/chunks_cleaned.csv"
df = pl.read_csv(chunks_csv_path)

# Preview the first five rows (columns shown in the output: year, page, category, chunk).
df.head(n=5)

year,page,category,chunk
i64,i64,str,str
2012,27,"""Bâtiments""","""Bâtiments > Copropriété les Ti…"
2012,21,"""Bâtiments""","""Bâtiments > Culture, patrimoin…"
2019,20,"""Bâtiments et Cultes""","""Bâtiments et Cultes > Sports >…"
2020,21,"""INFRASTRUCTURES""","""INFRASTRUCTURES > Perspectives…"
2022,18,"""28""","""28 > 628 kg De déchets produit…"


### Peuplement de la collection

In [7]:
# Import every dataframe row as one object; each row dict (column -> value) is
# used directly as the object's properties.
# NOTE(review): 60 is presumably the requests-per-minute cap of the rate-limited
# batch -- confirm it matches the Mistral API quota.
with val_de_fensch_collection.batch.rate_limit(60) as batch:
    for row in df.iter_rows(named=True):
        batch.add_object(properties=row)
        # Abort early rather than keep sending requests once errors pile up.
        if batch.number_errors > 10:
            print("Batch import stopped due to excessive errors.")
            break

# Per-object failures do not raise inside the batch context, so report them
# explicitly once the batch has been flushed; otherwise a partially imported
# collection would go unnoticed.
failed_objects = val_de_fensch_collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")