In [1]:
# Setup: imports and output directory for the GloVe download and conversion steps
import os
import requests
import zipfile
import io
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

# Make sure the folder that receives the downloaded files is present
# (no error if it already exists)
os.makedirs(name='downloaded_model', exist_ok=True)



In [2]:
# Download GloVe vectors (skipped when the extracted file is already on disk)
url = "https://nlp.stanford.edu/data/glove.6B.zip"
glove_input_file = 'downloaded_model/glove.6B.100d.txt'
word2vec_output_file = 'downloaded_model/glove.6B.100d.word2vec.txt'

if not os.path.exists(glove_input_file):
    print("Downloading GloVe vectors...")
    zip_path = 'downloaded_model/glove.6B.zip'

    # Stream the archive to disk in chunks instead of holding the whole
    # payload in memory (response.content plus a BytesIO copy of it).
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors rather than trying to unzip an error page
        response.raise_for_status()
        with open(zip_path, 'wb') as zip_file:
            for chunk in response.iter_content(chunk_size=1 << 20):
                zip_file.write(chunk)

    # Extract every member of the archive, as before
    with zipfile.ZipFile(zip_path) as z:
        z.extractall("downloaded_model")

    # Remove the temporary archive so it does not linger in the output folder
    os.remove(zip_path)
else:
    print(f"Found {glove_input_file}; skipping download")

# Convert to word2vec format (adds the "<vocab_size> <dims>" header line)
print("Converting GloVe format to Word2Vec format...")
glove2word2vec(glove_input_file, word2vec_output_file)




Downloading GloVe vectors...
Converting GloVe format to Word2Vec format...


  glove2word2vec(glove_input_file, word2vec_output_file)


(400000, 100)

In [3]:
# Read the converted text-format vectors into a gensim KeyedVectors object
# and report how many words it contains
print("Loading the model...")
model = KeyedVectors.load_word2vec_format(
    fname=word2vec_output_file,
    binary=False,
)
print(f"Model loaded successfully with {len(model.key_to_index)} words")


Loading the model...
Model loaded successfully with 400000 words


In [4]:
# Sanity check: print the five nearest neighbours of 'computer' with their
# similarity scores
print("\nWords similar to 'computer':")
similar_words = model.most_similar(positive='computer', topn=5)
for word, score in similar_words:
    print(f"  {word}: {score:.4f}")




Words similar to 'computer':
  computers: 0.8752
  software: 0.8373
  technology: 0.7642
  pc: 0.7366
  hardware: 0.7290


In [5]:
import pandas as pd
import torch
import os
import numpy as np

def extract_glove_data(glove_file_path, existing_vocab_path=None, output_dir='downloaded_model'):
    """
    Extract GloVe embeddings and vocabulary from a GloVe word2vec format file.
    Can optionally use an existing vocabulary mapping to maintain ID consistency.

    The input file must start with a "<vocab_size> <dims>" header line followed
    by one "<word> <v1> ... <vN>" line per word. Two files are written to
    ``output_dir``: ``glove_ids_to_words.csv`` (columns ``Token_ID``, ``Word``)
    and ``glove_embeddings.pt`` (a float32 tensor). Both outputs are ordered by
    ``Token_ID``, so row ``k`` of the CSV always describes row ``k`` of the
    tensor.

    Args:
        glove_file_path: Path to the GloVe word2vec format file (e.g., glove.6B.100d.word2vec.txt)
        existing_vocab_path: Path to existing vocabulary CSV file (optional).
            It is ignored when falsy or when the file does not exist. Words
            found in it keep their Token_ID; new words get IDs starting after
            the largest existing Token_ID (or at 1 without an existing vocabulary).
        output_dir: Directory to save the output files

    Returns:
        Vocabulary DataFrame and embeddings tensor, row-aligned with each other

    Raises:
        ValueError: if the header line is not two integers, or a vector line
            does not hold exactly ``dims`` numeric values.
    """
    print(f"Reading GloVe embeddings from {glove_file_path}...")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Optional word -> Token_ID mapping carried over from a previous run
    word_to_id = {}
    next_token_id = 1
    if existing_vocab_path and os.path.exists(existing_vocab_path):
        print(f"Loading existing vocabulary from {existing_vocab_path}")
        # keep_default_na=False + str dtype: otherwise pandas parses real
        # vocabulary words such as "null", "nan" or "NA" as missing values
        # and their existing IDs can never be matched again.
        existing_vocab = pd.read_csv(
            existing_vocab_path,
            dtype={'Word': str},
            keep_default_na=False,
        )
        word_to_id = dict(zip(existing_vocab['Word'], existing_vocab['Token_ID']))
        print(f"Found {len(word_to_id)} existing word mappings")
        # An empty vocabulary has no maximum; keep the default start of 1
        if not existing_vocab.empty:
            next_token_id = int(existing_vocab['Token_ID'].max()) + 1

    words = []
    id_mappings = []

    # Single pass over the file: header first, then one vector per line
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        first_line = f.readline().strip()
        vocab_size, dims = map(int, first_line.split())
        print(f"Vocabulary size: {vocab_size}, Dimensions: {dims}")

        vectors = np.zeros((vocab_size, dims), dtype=np.float32)

        for i, line in enumerate(f):
            # Never read past the row count declared in the header
            if i >= vocab_size:
                break

            values = line.rstrip().split(' ')
            word = values[0]
            vector = np.array(values[1:], dtype=np.float32)

            # Reuse the existing ID when known, otherwise allocate the next one
            if word in word_to_id:
                token_id = word_to_id[word]
            else:
                token_id = next_token_id
                next_token_id += 1

            words.append(word)
            id_mappings.append(token_id)
            vectors[i] = vector

            # Show progress
            if (i + 1) % 10000 == 0:
                print(f"Processed {i + 1}/{vocab_size} words")

    # A file shorter than its header would otherwise leave trailing all-zero
    # rows in the embeddings with no matching vocabulary entry.
    rows_read = len(words)
    if rows_read < vocab_size:
        print(f"Warning: header declares {vocab_size} rows but only {rows_read} were read")
        vectors = vectors[:rows_read]

    # Create vocabulary CSV file
    print("Creating vocabulary mapping file...")

    # Sort by Token_ID for readability, applying the SAME permutation to the
    # vectors so CSV rows and tensor rows stay aligned. Without reused IDs the
    # permutation is the identity and the output matches file order.
    order = np.argsort(np.asarray(id_mappings), kind='stable')
    vocab_df = (
        pd.DataFrame({'Token_ID': id_mappings, 'Word': words})
        .iloc[order]
        .reset_index(drop=True)
    )
    # Fancy indexing copies, so the saved tensor owns exactly its own rows
    vectors = vectors[order]

    vocab_path = os.path.join(output_dir, 'glove_ids_to_words.csv')
    vocab_df.to_csv(vocab_path, index=False)
    print(f"Vocabulary saved to {vocab_path}")

    # Convert numpy array to PyTorch tensor
    print("Converting embeddings to PyTorch tensor...")
    embeddings_tensor = torch.from_numpy(vectors)

    # Save embeddings as PyTorch tensor
    embeddings_path = os.path.join(output_dir, 'glove_embeddings.pt')
    torch.save(embeddings_tensor, embeddings_path)
    print(f"Embeddings saved to {embeddings_path}")

    return vocab_df, embeddings_tensor

if __name__ == "__main__":
    # Input: the word2vec-format GloVe text file to read
    glove_file = 'downloaded_model/glove.6B.100d.word2vec.txt'

    # Optional CSV with a previous Token_ID/Word mapping to reuse;
    # None means no existing mapping is loaded
    existing_vocab = None

    # Parse the file, write the CSV and tensor, and keep both results
    vocab_df, embeddings = extract_glove_data(
        glove_file_path=glove_file,
        existing_vocab_path=existing_vocab,
    )

    # Preview the first ten vocabulary rows
    print("\nSample of vocabulary mapping:")
    print(vocab_df.head(10))

    # Summary: number of words and width of each embedding vector
    print(f"\nTotal vocabulary size: {len(vocab_df)}")
    print(f"Embedding dimensions: {embeddings.shape[1]}")

Reading GloVe embeddings from downloaded_model/glove.6B.100d.word2vec.txt...
Vocabulary size: 400000, Dimensions: 100
Processed 10000/400000 words
Processed 20000/400000 words
Processed 30000/400000 words
Processed 40000/400000 words
Processed 50000/400000 words
Processed 60000/400000 words
Processed 70000/400000 words
Processed 80000/400000 words
Processed 90000/400000 words
Processed 100000/400000 words
Processed 110000/400000 words
Processed 120000/400000 words
Processed 130000/400000 words
Processed 140000/400000 words
Processed 150000/400000 words
Processed 160000/400000 words
Processed 170000/400000 words
Processed 180000/400000 words
Processed 190000/400000 words
Processed 200000/400000 words
Processed 210000/400000 words
Processed 220000/400000 words
Processed 230000/400000 words
Processed 240000/400000 words
Processed 250000/400000 words
Processed 260000/400000 words
Processed 270000/400000 words
Processed 280000/400000 words
Processed 290000/400000 words
Processed 300000/4000

In [7]:
from huggingface_hub import HfApi
import os
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment so the
# token below does not have to be hardcoded in the notebook
load_dotenv()

# Build the Hub client with the token taken from the environment
# (os.getenv returns None when HF_TOKEN is not set)
hf_token = os.getenv("HF_TOKEN")
api = HfApi(token=hf_token)

# Push everything inside downloaded_model to the dataset repository
api.upload_folder(
    repo_id="nodozi/MLX_Week2",
    repo_type="dataset",
    folder_path="downloaded_model",
)

2025_04_18__14_41_55.5.cbow.pth:   0%|          | 0.00/65.2M [00:00<?, ?B/s]
[A

[A[A


[A[A[A



2025_04_18__14_41_55.5.cbow.pth:   2%|▏         | 1.52M/65.2M [00:00<00:04, 15.2MB/s]

[A[A


[A[A[A
[A

2025_04_18__14_41_55.5.cbow.pth:   5%|▍         | 3.05M/65.2M [00:00<00:05, 11.3MB/s]

[A[A
[A
[A


2025_04_18__14_41_55.5.cbow.pth:   7%|▋         | 4.24M/65.2M [00:00<00:10, 5.62MB/s]


[A[A[A
[A


2025_04_18__14_41_55.5.cbow.pth:  10%|█         | 6.67M/65.2M [00:00<00:07, 8.29MB/s]
[A

[A[A


2025_04_18__14_41_55.5.cbow.pth:  16%|█▋        | 10.6M/65.2M [00:00<00:03, 14.7MB/s]

[A[A
[A


2025_04_18__14_41_55.5.cbow.pth:  20%|██        | 13.2M/65.2M [00:01<00:03, 16.0MB/s]

2025_04_18__14_41_55.5.cbow.pth:  24%|██▎       | 15.4M/65.2M [00:01<00:02, 17.2MB/s]

[A[A


[A[A[A

2025_04_18__14_41_55.5.cbow.pth:  27%|██▋       | 17.4M/65.2M [00:01<00:04, 10.3MB/s]

2025_04_18__14_41_55.5.cbow.pth:  30%|██▉       | 19.5M/65.2M [00:01<00:04, 11.2MB/s]

[A[A




CommitInfo(commit_url='https://huggingface.co/datasets/nodozi/MLX_Week2/commit/c298b6de87552649624a3908c2f4ffbf5ff7a7c5', commit_message='Upload folder using huggingface_hub', commit_description='', oid='c298b6de87552649624a3908c2f4ffbf5ff7a7c5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/nodozi/MLX_Week2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='nodozi/MLX_Week2'), pr_revision=None, pr_num=None)