### Custom Embedding

In [15]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models
from qdrant_client.models import Distance, VectorParams
import json
import time
import os
from dotenv import load_dotenv

In [None]:
# Configuration
QDRANT_URL = "https://2147d34e-a19b-4156-a7e8-7c8cb4b79b98.us-west-2-0.aws.cloud.qdrant.io:6333"  # From Qdrant Cloud dashboard
api_key = os.getenv('qdrant_api_key')
GLOVE_PATH = "glove.6B.50d.txt"  # 50-dim
BATCH_SIZE = 1000

In [None]:
# 1. Load GRE words data (example - replace with your actual GRE word list)
gre_words = {
    "targets": ["abstruse", "convoluted", "recondite", "esoteric", "arcane"],
    "similar": {
        "abstruse": ["obscure", "cryptic", "enigmatic", "profound", "complex"],
        "convoluted": ["intricate", "tangled", "complicated", "byzantine", "tortuous"],
        "recondite": ["arcane", "abstruse", "erudite", "hermetic", "occult"],
        "esoteric": ["mystical", "obscure", "abstruse", "arcane", "recondite"],
        "arcane": ["mysterious", "esoteric", "occult", "cryptic", "abstruse"]
    }
}

In [None]:
# 2. Generate embeddings for all words
model = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dim
all_words = gre_words["targets"] + [word for sublist in gre_words["similar"].values() for word in sublist]
unique_words = list(set(all_words))  # Remove duplicates
embeddings = model.encode(unique_words).tolist()

In [None]:
# 3. Prepare metadata
word_to_embedding = {word: embedding for word, embedding in zip(unique_words, embeddings)}

In [None]:
# 4. Store in Qdrant Cloud
client = QdrantClient(
    url=QDRANT_URL, 
    api_key=api_key,
)

client.recreate_collection(
    collection_name="gre_word_game",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE)
)

# Prepare points for upload
points = []
for idx, word in enumerate(unique_words):
    points.append({
        "id": idx,
        "vector": word_to_embedding[word],
        "payload": {
            "word": word,
            "is_target": word in gre_words["targets"]
        }
    })

client.upsert(
    collection_name="gre_word_game",
    points=points
)

print("GRE word embeddings uploaded to Qdrant Cloud!")


### GloVa Embedding

In [2]:
def split_glove_file(file_path, lines_per_file=200000):
    file_number = 1
    line_count = 0
    output = open(f"{file_path}_part{file_number}.txt", 'w', encoding='utf-8')
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            output.write(line)
            line_count += 1

            if line_count >= lines_per_file:
                output.close()
                file_number += 1
                line_count = 0
                output = open(f"{file_path}_part{file_number}.txt", 'w', encoding='utf-8')
    
    output.close()
    print(f"Finished splitting {file_path} into {file_number} parts.")

In [3]:
split_glove_file('glove.6B.50d.txt')

Finished splitting glove.6B.50d.txt into 3 parts.


In [None]:
## To get the data/embeddings

# wget http://nlp.stanford.edu/data/glove.6B.zip
# unzip glove.6B.zip glove.6B.50d.txt

In [12]:
# prepare_glove_embeddings.py
from qdrant_client import QdrantClient, models
import numpy as np
import os
import time
from dotenv import load_dotenv
import random
import mmap

In [5]:
load_dotenv()

True

In [6]:
# Configuration
QDRANT_URL = "https://2147d34e-a19b-4156-a7e8-7c8cb4b79b98.us-west-2-0.aws.cloud.qdrant.io:6333"  # From Qdrant Cloud dashboard
api_key = os.getenv('qdrant_api_key')
GLOVE_PATH = "glove.6B.50d.txt"  # 50-dim
BATCH_SIZE = 1000

In [7]:
# Initialize Qdrant
client = QdrantClient(
    url=QDRANT_URL,
    api_key=api_key,
    timeout=120  # Increased timeout for large uploads
)

In [8]:
# Create collection (only run once!)
client.recreate_collection(
    collection_name="word_game_full",
    vectors_config=models.VectorParams(
        size=50,  # 50 dimensions for GloVe-50d
        distance=models.Distance.COSINE,
    )
)

  client.recreate_collection(


True

In [9]:
GLOVE_PARTS = [
    "glove.6B.50d_1.txt",
    "glove.6B.50d_2.txt",
]

In [13]:
def process_split_glove():
    word_count = 0
    batch = []
    start_time = time.time()

    for glove_path in GLOVE_PARTS:
        print(f"Processing {glove_path}...")
        with open(glove_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.rstrip().split()
                if len(parts) != 51:  # Skip malformed lines
                    continue

                word = parts[0]
                vector = [float(x) for x in parts[1:51]]  # 50 dimensions

                batch.append({
                    "id": word_count,  # Using line number as ID
                    "vector": vector,
                    "payload": {
                        "word": word,
                        "length": len(word)
                    }
                })

                word_count += 1

                if len(batch) >= BATCH_SIZE:
                    client.upsert(
                        collection_name="word_game_full",
                        points=batch,
                        wait=True
                    )
                    batch = []
                    print(f"Processed {word_count} words | {word_count/(time.time()-start_time):.1f} words/sec")

        print(f"Finished processing {glove_path}")

    if batch:
        client.upsert(
            collection_name="word_game_full",
            points=batch
        )

    print(f"\nFinished! Processed {word_count} total words")
    print(f"Average speed: {word_count/(time.time()-start_time):.1f} words/sec")


In [14]:
# Run the processing
process_split_glove()

Processing glove.6B.50d_1.txt...
Processed 1000 words | 394.0 words/sec
Processed 2000 words | 586.4 words/sec
Processed 3000 words | 662.4 words/sec
Processed 4000 words | 677.3 words/sec
Processed 5000 words | 684.9 words/sec
Processed 6000 words | 669.7 words/sec
Processed 7000 words | 612.3 words/sec
Processed 8000 words | 554.4 words/sec
Processed 9000 words | 501.9 words/sec
Processed 10000 words | 438.4 words/sec
Processed 11000 words | 421.7 words/sec
Processed 12000 words | 432.4 words/sec
Processed 13000 words | 446.2 words/sec
Processed 14000 words | 457.1 words/sec
Processed 15000 words | 464.3 words/sec
Processed 16000 words | 464.0 words/sec
Processed 17000 words | 461.6 words/sec
Processed 18000 words | 453.6 words/sec
Processed 19000 words | 447.8 words/sec
Processed 20000 words | 441.9 words/sec
Processed 21000 words | 439.7 words/sec
Processed 22000 words | 437.7 words/sec
Processed 23000 words | 438.2 words/sec
Processed 24000 words | 425.5 words/sec
Processed 25000 