In [10]:
import os
import weaviate
from weaviate.classes.init import Auth
from dotenv import load_dotenv
import os



### Load envs
load_dotenv()
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")

# # Best practice: store your credentials in environment variables
# weaviate_url = os.environ["WEAVIATE_URL"]
# weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)


client.is_ready()

True

In [16]:
import weaviate.classes.config as wvc
### Try creating a collection and defining the schema
"""
class_obj = {
    "class": "FOSSProject",
    "description": "Open source projects with name and description",
    "vectorizer": "none",  # Since you're supplying your own vectors
    "properties": [
        {
            "name": "name",
            "dataType": ["text"],
            "description": "Name of the project",
            "hash": "Hash of FOSS name"
        },
        {
            "name": "description",
            "dataType": ["text"],
            "description": "Project description",
            "hash": "Hash of FOSS name"
        }
    ]
}"""



# For Python client v4
foss_wvc_collection = client.collections.create(
    name="FOSSProject",
    description="Open source projects with name and description",
    vectorizer_config=[
        wvc.Configure.NamedVectors.none(name="name_vector"),
        wvc.Configure.NamedVectors.none(name="combined_vector")
    ],
    properties=[
        wvc.Property(name="name", data_type=wvc.DataType.TEXT, description="Name of the project"),
        wvc.Property(name="description", data_type=wvc.DataType.TEXT, description="Project description"),
        wvc.Property(name="foss_hash", data_type=wvc.DataType.TEXT,description="Hash of FOSS project name")
    ]
)

In [17]:
import ollama
from nomic import embed
import sentence_transformers as SBERT
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from typing import Optional, List



OLLAMA_NOMIC_MODEL = 'nomic-embed-text'

def cosine_similarity(vec1: list[float], vec2: list[float]):
    """Compute cosine similarity between two vectors."""
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def ollama_nomic_embed(prompt: str) -> list[float]:

    response = ollama.embeddings(
        model=OLLAMA_NOMIC_MODEL,
        prompt=prompt
    )

    return response['embedding']



In [None]:
### verify 

In [None]:
### Creating semantic embeddings using nomic via ollama model
### Test cases ###

response = ollama.embeddings(
    model='nomic-embed-text',
    prompt='facebook/react'
)

print(response['embedding'])  # This is your vector!

response2 = ollama.embeddings(
    model='nomic-embed-text',
    prompt='vuejs/vue'
)

vec1: np.array = np.array(response['embedding'])
vec2: np.array = np.array(response2['embedding'])
vectors: np.array = np.array([vec1,vec2])

cosine_similarity(response['embedding'], response2['embedding'])





[0.036939382553100586, -0.42223700881004333, -3.664456844329834, 0.4896526038646698, 2.703233003616333, -1.3221014738082886, 1.1219252347946167, 0.42916998267173767, 1.3650630712509155, -2.458561897277832, -0.4048089385032654, 0.9663305878639221, 1.2907313108444214, 1.5350327491760254, -0.72764652967453, -0.6362704634666443, 0.2352897673845291, -0.24649213254451752, -0.5770887732505798, -0.06262727081775665, -0.7517426609992981, -0.17988982796669006, -1.3437236547470093, 1.3741759061813354, 2.0462088584899902, 1.1033308506011963, 0.2465982586145401, 0.5096945762634277, -2.1097898483276367, -0.2841247022151947, -0.5877325534820557, 0.8428031802177429, -1.3818109035491943, 0.40201863646507263, 0.4478963315486908, -0.5781090259552002, -0.9265671372413635, -0.09855188429355621, -0.8459641337394714, -0.10208415240049362, -0.42278945446014404, -0.6062077879905701, 0.0696372538805008, -0.8224694132804871, 2.199431896209717, -0.6523141860961914, 0.5519704222679138, 0.6619137525558472, 1.800322

np.float64(0.432604400645278)

In [22]:
from pathlib import Path
import csv
import json
import hashlib

foss_proj_space_csv: Path = Path("../csv_github_data_cleaned/FOSS_projects_space.csv")
foss_name_description_json: Path = Path("../json_github_data_cleaned/github_repositories_final_ordered.json")


def create_data_object_and_store(json_file: str) -> None:

    data_objects = []
    with open(json_file,'r') as file:

        # load the json data
        data = json.load(file)

        for project in data:
            

            ### get project name from json
            project_name = project["FOSS project name"]
            print("processing " + project_name + "...")

            ### Hash project name
            hash_object = hashlib.sha1(project_name.encode())
            hashed_foss_name = hash_object.hexdigest()

            ### Get project description from json
            description = project["description"]

            ### Create combined string for vectorization
            if not project_name:
                print(f"Skipping entry with missing name: {project}")
                continue

            name_description = project_name + " " + (description or "")
            
            
            # Create data object which will be used for Weaviate
            data_object = {
                "name": project_name,
                "description": description,
                "foss_hash": hashed_foss_name
            }

            ### Create vector represenations of the project names & the names + project descriptions
            vectorized_name_description: list[float] = ollama_nomic_embed(name_description)
            vectorized_name: list[float] = ollama_nomic_embed(project_name)
            

            data_objects.append((data_object, vectorized_name, vectorized_name_description))
    print("#############################################")
    print("Starting to import the data into Weaviate!!!!")
    # Now batch import with error handling
    with foss_wvc_collection.batch.dynamic() as batch:
        for data_object, name_vector, combined_vector in data_objects:
            batch.add_object(
                properties=data_object,
                vector={
                "name_vector": name_vector,
                "combined_vector": combined_vector
                }
            )
            # Monitor errors during insertion
            if batch.number_errors > 10:
                print("Batch import stopped due to excessive errors.")
                break
            

    # Check for failed objects after batch completes
    failed_objects = foss_wvc_collection.batch.failed_objects
    if failed_objects:
        print(f"Number of failed imports: {len(failed_objects)}")
        for i, obj in enumerate(failed_objects[:5]):  # Print first 5 failures
            print(f"Failed object {i+1}: {obj}")
            
    

In [23]:
### Runnit 
create_data_object_and_store(foss_name_description_json)

processing freeCodeCamp freeCodeCamp...
processing codecrafters-io build-your-own-x...
processing sindresorhus awesome...
processing EbookFoundation free-programming-books...
processing public-apis public-apis...
processing jwasham coding-interview-university...
processing kamranahmedse developer-roadmap...
processing donnemartin system-design-primer...
processing 996icu 996.ICU...
processing vinta awesome-python...
processing facebook react...
processing awesome-selfhosted awesome-selfhosted...
processing practical-tutorials project-based-learning...
processing vuejs vue...
processing TheAlgorithms Python...
processing torvalds linux...
processing trekhleb javascript-algorithms...
processing tensorflow tensorflow...
processing getify You-Dont-Know-JS...
processing CyC2018 CS-Notes...
processing ossu computer-science...
processing ohmyzsh ohmyzsh...
processing Significant-Gravitas AutoGPT...
processing twbs bootstrap...
processing microsoft vscode...
processing flutter flutter...
proce

In [24]:
### Close the connection to the weaviate database
client.close()

In [None]:
### Below is the schema that I'm going to use for my data in the vector database
### I'm including the fields of the metadata that I want

# Embedding 1: Foss project name (with space)

'''
Foss project name (with a space)
UID: aka hash of project name (with a space)
foss project description


'''

# Embed 2: Foss project name (with space) + foss project description
'''
Foss project name + foss project description (with a space)
UID: hash of project name (with a space)
foss project description

'''

In [None]:
#foss_name_vectors_space: list[list[str]] = vectorize_foss_names(foss_proj_space_csv)