In [2]:
import os
import weaviate
from weaviate.classes.init import Auth
from dotenv import load_dotenv
import os



### Load envs
load_dotenv()
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")

# # Best practice: store your credentials in environment variables
# weaviate_url = os.environ["WEAVIATE_URL"]
# weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)


client.is_ready()

True

In [None]:
import weaviate.classes.config as wvc
### Try creating a collection and defining the schema
"""
class_obj = {
    "class": "FOSSProject",
    "description": "Open source projects with name and description",
    "vectorizer": "none",  # Since you're supplying your own vectors
    "properties": [
        {
            "name": "name",
            "dataType": ["text"],
            "description": "Name of the project",
            "hash": "Hash of FOSS name"
        },
        {
            "name": "description",
            "dataType": ["text"],
            "description": "Project description",
            "hash": "Hash of FOSS name"
        }
    ]
}"""



# For Python client v4
foss_wvc_collection = client.collections.create(
    name="FOSSProject",
    description="Open source projects with name and description",
    vectorizer_config=[
        wvc.Configure.NamedVectors.none(name="name_vector"),
        wvc.Configure.NamedVectors.none(name="combined_vector")
    ],
    properties=[
        wvc.Property(name="name", data_type=wvc.DataType.TEXT, description="Name of the project"),
        wvc.Property(name="description", data_type=wvc.DataType.TEXT, description="Project description"),
        wvc.Property(name="foss_hash", data_type=wvc.DataType.TEXT,description="Hash of FOSS project name")
    ]
)

In [4]:
import ollama
from nomic import embed as NOMIC_EMBED
import sentence_transformers as SBERT
import numpy as np



OLLAMA_NOMIC_MODEL = 'nomic-embed-text'



def ollama_nomic_embed(prompt: str) -> list[float]:

    response = ollama.embeddings(
        model=OLLAMA_NOMIC_MODEL,
        prompt=prompt
    )

    return response['embedding']

string = ["ollama:ollama"]
# Get embeddings
embeddings = NOMIC_EMBED.text(string)
print(embeddings[0])

response = ollama_nomic_embed(string[0])
print(response)


ValueError: You have not configured your Nomic API token. Run `nomic login` to configure.

In [None]:
### Creating semantic embeddings using nomic via ollama model
### Test cases ###

response = ollama.embeddings(
    model='nomic-embed-text',
    prompt='facebook/react'
)

print(response['embedding'])  # This is your vector!

response2 = ollama.embeddings(
    model='nomic-embed-text',
    prompt='vuejs/vue'
)

vec1: np.array = np.array(response['embedding'])
vec2: np.array = np.array(response2['embedding'])
vectors: np.array = np.array([vec1,vec2])

cosine_similarity(response['embedding'], response2['embedding'])





[0.036939382553100586, -0.42223700881004333, -3.664456844329834, 0.4896526038646698, 2.703233003616333, -1.3221014738082886, 1.1219252347946167, 0.42916998267173767, 1.3650630712509155, -2.458561897277832, -0.4048089385032654, 0.9663305878639221, 1.2907313108444214, 1.5350327491760254, -0.72764652967453, -0.6362704634666443, 0.2352897673845291, -0.24649213254451752, -0.5770887732505798, -0.06262727081775665, -0.7517426609992981, -0.17988982796669006, -1.3437236547470093, 1.3741759061813354, 2.0462088584899902, 1.1033308506011963, 0.2465982586145401, 0.5096945762634277, -2.1097898483276367, -0.2841247022151947, -0.5877325534820557, 0.8428031802177429, -1.3818109035491943, 0.40201863646507263, 0.4478963315486908, -0.5781090259552002, -0.9265671372413635, -0.09855188429355621, -0.8459641337394714, -0.10208415240049362, -0.42278945446014404, -0.6062077879905701, 0.0696372538805008, -0.8224694132804871, 2.199431896209717, -0.6523141860961914, 0.5519704222679138, 0.6619137525558472, 1.800322

np.float64(0.432604400645278)

In [22]:
from pathlib import Path
import csv
import json
import hashlib

foss_proj_space_csv: Path = Path("../csv_github_data_cleaned/FOSS_projects_space.csv")
foss_name_description_json: Path = Path("../json_github_data_cleaned/github_repositories_final_ordered.json")


def create_data_object_and_store(json_file: str) -> None:

    data_objects = []
    with open(json_file,'r') as file:

        # load the json data
        data = json.load(file)

        for project in data:
            

            ### get project name from json
            project_name = project["FOSS project name"]
            print("processing " + project_name + "...")

            ### Hash project name
            hash_object = hashlib.sha1(project_name.encode())
            hashed_foss_name = hash_object.hexdigest()

            ### Get project description from json
            description = project["description"]

            ### Create combined string for vectorization
            if not project_name:
                print(f"Skipping entry with missing name: {project}")
                continue

            name_description = project_name + " " + (description or "")
            
            
            # Create data object which will be used for Weaviate
            data_object = {
                "name": project_name,
                "description": description,
                "foss_hash": hashed_foss_name
            }

            ### Create vector represenations of the project names & the names + project descriptions
            vectorized_name_description: list[float] = ollama_nomic_embed(name_description)
            vectorized_name: list[float] = ollama_nomic_embed(project_name)
            

            data_objects.append((data_object, vectorized_name, vectorized_name_description))
    print("#############################################")
    print("Starting to import the data into Weaviate!!!!")
    # Now batch import with error handling
    with foss_wvc_collection.batch.dynamic() as batch:
        for data_object, name_vector, combined_vector in data_objects:
            batch.add_object(
                properties=data_object,
                vector={
                "name_vector": name_vector,
                "combined_vector": combined_vector
                }
            )
            # Monitor errors during insertion
            if batch.number_errors > 10:
                print("Batch import stopped due to excessive errors.")
                break
            

    # Check for failed objects after batch completes
    failed_objects = foss_wvc_collection.batch.failed_objects
    if failed_objects:
        print(f"Number of failed imports: {len(failed_objects)}")
        for i, obj in enumerate(failed_objects[:5]):  # Print first 5 failures
            print(f"Failed object {i+1}: {obj}")
            
    

In [None]:
### Runnit 
#create_data_object_and_store(foss_name_description_json)

In [5]:
from weaviate.classes.query import MetadataQuery
### Test Queries ###

query_string = "freebsd:freebsd"
vector_query = ollama_nomic_embed(query_string)

### Get the collection ###
FOSS_COLLECTION = client.collections.get("FOSSProject")

### Get the response ###
response = FOSS_COLLECTION.query.near_vector(
    near_vector=vector_query,
    target_vector="name_vector",
    return_metadata=MetadataQuery(distance=True,certainty=True)
    # distance=
    # certainty
)

for o in response.objects:
    print(o.properties['name'])
    print(f"Distance: {o.metadata.distance:.3f}")
    print(f"Certainty: {o.metadata.certainty:.3f}")
    print("############")

freebsd freebsd-src
Distance: 0.240
Certainty: 0.880
############
FreeCAD FreeCAD
Distance: 0.276
Certainty: 0.862
############
Freeboard freeboard
Distance: 0.338
Certainty: 0.831
############
lTbgykio Books-Free-Books
Distance: 0.344
Certainty: 0.828
############
FreeRDP FreeRDP
Distance: 0.350
Certainty: 0.825
############
EbookFoundation free-programming-books
Distance: 0.362
Certainty: 0.819
############
freefq free
Distance: 0.364
Certainty: 0.818
############
Pawdroid Free-servers
Distance: 0.376
Certainty: 0.812
############
ARMmbed mbed-os
Distance: 0.379
Certainty: 0.811
############
justjavac free-programming-books-zh_CN
Distance: 0.386
Certainty: 0.807
############


In [13]:
for o in response.objects:
    print(o.metadata)
  

MetadataReturn(creation_time=None, last_update_time=None, distance=0.24045562744140625, certainty=0.8797721862792969, score=None, explain_score=None, is_consistent=None, rerank_score=None)
MetadataReturn(creation_time=None, last_update_time=None, distance=0.2764824628829956, certainty=0.8617587685585022, score=None, explain_score=None, is_consistent=None, rerank_score=None)
MetadataReturn(creation_time=None, last_update_time=None, distance=0.3378521203994751, certainty=0.8310739398002625, score=None, explain_score=None, is_consistent=None, rerank_score=None)
MetadataReturn(creation_time=None, last_update_time=None, distance=0.3442188501358032, certainty=0.8278905749320984, score=None, explain_score=None, is_consistent=None, rerank_score=None)
MetadataReturn(creation_time=None, last_update_time=None, distance=0.3495115041732788, certainty=0.8252442479133606, score=None, explain_score=None, is_consistent=None, rerank_score=None)
MetadataReturn(creation_time=None, last_update_time=None, d

In [11]:
### Get the response ###
response = FOSS_COLLECTION.query.near_vector(
    near_vector=vector_query,
    target_vector="combined_vector",
    return_metadata=MetadataQuery(distance=True,certainty=True)
    # distance=
    # certainty
)

for o in response.objects:
    print(o.properties['name'])
    print(f"Distance: {o.metadata.distance:.3f}")
    print(f"Certainty: {o.metadata.certainty:.3f}")
    print("############")

EbookFoundation free-programming-books
Distance: 0.311
Certainty: 0.844
############
freebsd freebsd-src
Distance: 0.322
Certainty: 0.839
############
openzfs zfs
Distance: 0.360
Certainty: 0.820
############
auctors free-lunch
Distance: 0.369
Certainty: 0.816
############
FreeRTOS FreeRTOS
Distance: 0.375
Certainty: 0.812
############
FreeCAD FreeCAD
Distance: 0.378
Certainty: 0.811
############
BlackrockDigital startbootstrap
Distance: 0.378
Certainty: 0.811
############
OpenMathLib OpenBLAS
Distance: 0.382
Certainty: 0.809
############
justjavac free-programming-books-zh_CN
Distance: 0.389
Certainty: 0.805
############
Freeboard freeboard
Distance: 0.391
Certainty: 0.804
############


In [6]:
### Close the connection to the weaviate database
client.close()

In [None]:
import weaviate

client = weaviate.connect_to_local()

print(client.is_ready())  # Should print: `True`

client.close()  # Free up resources

True
