### Custom Embedding

In [1]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import json

2025-06-19 16:03:16.693383: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750328296.713580 4086181 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750328296.719191 4086181 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750328296.735246 4086181 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750328296.735275 4086181 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750328296.735277 4086181 computation_placer.cc:177] computation placer alr

In [None]:
qdrant_api_key = ""

In [None]:
# 1. Load your data (example: movies, places, animals)
data = {
    "movies": ["Inception", "The Matrix", "Interstellar", "Titanic"],
    "places": ["Paris", "Tokyo", "New York", "Mount Everest"],
    "animals": ["Tiger", "Elephant", "Dolphin", "Eagle"]
}

In [None]:
# 2. Generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dim
all_texts = data["movies"] + data["places"] + data["animals"]
embeddings = model.encode(all_texts).tolist()  # Convert to list


In [None]:
# 3. Prepare metadata
ids = [f"item_{i}" for i in range(len(all_texts))]
metadatas = [{"type": "movie"} for _ in data["movies"]] + \
            [{"type": "place"} for _ in data["places"]] + \
            [{"type": "animal"} for _ in data["animals"]]


In [None]:
# 4. Store in Qdrant Cloud (free tier)
client = QdrantClient(
    url="https://2147d34e-a19b-4156-a7e8-7c8cb4b79b98.us-west-2-0.aws.cloud.qdrant.io:6333", 
    api_key=qdrant_api_key,
)

client.create_collection(
    collection_name="game_data",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
)

client.upsert(
    collection_name="game_data",
    points=[
        {
            "id": idx,
            "vector": embedding,
            "payload": {"text": text, "type": metadata["type"]}
        }
        for idx, (embedding, text, metadata) in enumerate(zip(embeddings, all_texts, metadatas))
    ]
)

print("Embeddings uploaded to Qdrant Cloud!")

In [2]:
def split_glove_file(file_path, lines_per_file=200000):
    file_number = 1
    line_count = 0
    output = open(f"{file_path}_part{file_number}.txt", 'w', encoding='utf-8')
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            output.write(line)
            line_count += 1

            if line_count >= lines_per_file:
                output.close()
                file_number += 1
                line_count = 0
                output = open(f"{file_path}_part{file_number}.txt", 'w', encoding='utf-8')
    
    output.close()
    print(f"Finished splitting {file_path} into {file_number} parts.")

In [3]:
split_glove_file('glove.6B.50d.txt')

Finished splitting glove.6B.50d.txt into 3 parts.


### GloVa Embedding

In [None]:
## To get the data/embeddings

# wget http://nlp.stanford.edu/data/glove.6B.zip
# unzip glove.6B.zip glove.6B.50d.txt

In [None]:
# prepare_glove_embeddings.py
from qdrant_client import QdrantClient, models
import numpy as np
import os
from datetime import time
from dotenv import load_dotenv
import random
import mmap

In [None]:
load_dotenv()

In [None]:
# Configuration
QDRANT_URL = "https://2147d34e-a19b-4156-a7e8-7c8cb4b79b98.us-west-2-0.aws.cloud.qdrant.io:6333"  # From Qdrant Cloud dashboard
api_key = os.getenv('qdrant_api_key')
GLOVE_PATH = "glove.6B.50d.txt"  # 50-dim
BATCH_SIZE = 1000

In [None]:
# Initialize Qdrant
client = QdrantClient(
    url=QDRANT_URL,
    api_key=api_key,
    timeout=120  # Increased timeout for large uploads
)

In [None]:
# Create collection (only run once!)
client.recreate_collection(
    collection_name="word_game_full",
    vectors_config=models.VectorParams(
        size=50,  # 50 dimensions for GloVe-50d
        distance=models.Distance.COSINE,
    )
)

In [None]:
GLOVE_PARTS = [
    "glove.6B.50d_1.txt",
    "glove.6B.50d_2.txt",
]

In [None]:
def process_split_glove():
    word_count = 0
    batch = []
    start_time = time.time()

    for glove_path in GLOVE_PARTS:
        print(f"Processing {glove_path}...")
        with open(glove_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.rstrip().split()
                if len(parts) != 51:  # Skip malformed lines
                    continue

                word = parts[0]
                vector = [float(x) for x in parts[1:51]]  # 50 dimensions

                batch.append({
                    "id": word_count,  # Using line number as ID
                    "vector": vector,
                    "payload": {
                        "word": word,
                        "length": len(word)
                    }
                })

                word_count += 1

                if len(batch) >= BATCH_SIZE:
                    client.upsert(
                        collection_name="word_game_full",
                        points=batch,
                        wait=True
                    )
                    batch = []
                    print(f"Processed {word_count} words | {word_count/(time.time()-start_time):.1f} words/sec")

        print(f"Finished processing {glove_path}")

    if batch:
        client.upsert(
            collection_name="word_game_full",
            points=batch
        )

    print(f"\nFinished! Processed {word_count} total words")
    print(f"Average speed: {word_count/(time.time()-start_time):.1f} words/sec")


In [None]:
# Run the processing
process_full_glove()

In [None]:
# 1. Load your data (example: movies, places, animals)
data = {
    "movies": ["Inception", "The Matrix", "Interstellar", "Titanic"],
    "places": ["Paris", "Tokyo", "New York", "Mount Everest"],
    "animals": ["Tiger", "Elephant", "Dolphin", "Eagle"]
}# 2. Generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dim
all_texts = data["movies"] + data["places"] + data["animals"]
embeddings = model.encode(all_texts).tolist()  # Convert to list
# 3. Prepare metadata
ids = [f"item_{i}" for i in range(len(all_texts))]
metadatas = [{"type": "movie"} for _ in data["movies"]] + \
            [{"type": "place"} for _ in data["places"]] + \
            [{"type": "animal"} for _ in data["animals"]]
# 4. Store in Qdrant Cloud (free tier)
client = QdrantClient(
    url="https://2147d34e-a19b-4156-a7e8-7c8cb4b79b98.us-west-2-0.aws.cloud.qdrant.io:6333", 
    api_key=qdrant_api_key,
)

client.create_collection(
    collection_name="game_data",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
)

client.upsert(
    collection_name="game_data",
    points=[
        {
            "id": idx,
            "vector": embedding,
            "payload": {"text": text, "type": metadata["type"]}
        }
        for idx, (embedding, text, metadata) in enumerate(zip(embeddings, all_texts, metadatas))
    ]
)

print("Embeddings uploaded to Qdrant Cloud!")