In [1]:
import urllib.request
import zipfile
import os

# URL for the FastText embeddings
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip"
zip_path = "wiki-news-300d-1M.vec.zip"
extract_dir = "./data"

# Download the zip file
print("Downloading embeddings...")
urllib.request.urlretrieve(url, zip_path)

# Extract the contents
print("Extracting embeddings...")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Verify the extracted file
embedding_path = os.path.join(extract_dir, "wiki-news-300d-1M.vec")
print(f"Embeddings extracted to: {embedding_path}")
print("Download and extraction complete.")


Downloading embeddings...
Extracting embeddings...
Embeddings extracted to: ./data/wiki-news-300d-1M.vec
Download and extraction complete.


In [None]:
from gensim.models import KeyedVectors

# Path to the extracted FastText embeddings
fasttext_path = './data/wiki-news-300d-1M.vec'

# Load the embeddings
print("Loading FastText embeddings...")
model = KeyedVectors.load_word2vec_format(fasttext_path, binary=False)
print("Embeddings loaded successfully!")

# Test: Find the vector for 'apple'
print("\nVector for 'apple':")
print(model['apple'])

# Test: Find the most similar words to 'apple'
print("\nMost similar words to 'apple':")
similar_words = model.most_similar('apple', topn=5)
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")


Loading FastText embeddings...
Embeddings loaded successfully!

Vector for 'apple':
[-0.066  -0.027  -0.0403  0.0651 -0.0168 -0.0405  0.1743  0.0953 -0.0455
  0.0235 -0.2745 -0.0228 -0.1808  0.0835 -0.0733  0.127  -0.076   0.0418
 -0.0321 -0.1173 -0.2255  0.1108 -0.0487 -0.0328 -0.1202 -0.0645 -0.0133
  0.1224 -0.1095  0.1556  0.355   0.2831  0.0757  0.0459  0.0502  0.0282
  0.036   0.1501 -0.1976 -0.0697 -0.0221  0.0708 -0.0812 -0.0199  0.0299
  0.2296  0.1669  0.1569 -0.1004  0.0126 -0.0841 -0.2551 -0.7234 -0.1224
 -0.1237  0.1448  0.0594 -0.0535  0.0048  0.0465  0.1258 -0.1345 -0.1895
 -0.1805  0.0839 -0.2655  0.2866 -0.0662 -0.1446 -0.0867 -0.1296  0.1133
 -0.1888  0.0754 -0.0438  0.0416  0.2305  0.0672  0.1048 -0.0222  0.0622
  0.1131 -0.0529 -0.1493  0.2413  0.2086 -0.0391  0.1578  0.2828 -0.1328
  0.0055  0.0099  0.209  -0.1234  0.201   0.0484 -0.0601 -0.0774  0.115
 -0.2089 -0.1844  0.1884 -0.0387  0.0153  0.0716 -0.1435  0.0174  0.1532
  0.0087  0.1044 -0.0441  0.1227  0.1691 

In [3]:
import pandas as pd
from gensim.models import KeyedVectors
import numpy as np

# Load the FastText embeddings
print("Loading FastText embeddings...")
fasttext_path = './data/wiki-news-300d-1M.vec'
model = KeyedVectors.load_word2vec_format(fasttext_path, binary=False)
print("Embeddings loaded successfully!")

# Create a word-to-index mapping from the embeddings
word_to_index = {word: idx for idx, word in enumerate(model.index_to_key)}




Loading FastText embeddings...
Embeddings loaded successfully!


In [5]:
def get_embedding(word, model, word_to_index):
    """Retrieve the embedding for a given word."""
    return model[word]

def get_k_nearest_words(k, result_embedding, model, word_to_index):
    """Find the k nearest words to the given embedding."""
    most_similar = model.similar_by_vector(result_embedding, topn=k)
    return [word for word, _ in most_similar]

def test_analogy(model, word_to_index, analogy_file, subset_size=None):
    """
    Method to test accuracy of embeddings on analogy tasks.

    Arguments
    ---------
    model : KeyedVectors
        Trained word embeddings model.
    word_to_index : Dictionary
        Dictionary mapping words to indices {word: index}.
    analogy_file : String
        File containing analogy tasks.
    subset_size : int, optional
        Number of rows to use from the dataset (for testing purposes).

    Returns
    -------
    accuracy : float
        Accuracy of the model on the analogy tasks.
    """
    # Load the CSV file
    df = pd.read_csv(analogy_file)

    # Filter by the 'capital-common-countries' category
    df = df[df['category'] == 'capital-common-countries']

    # Reduce the dataset size for testing, if subset_size is provided
    if subset_size:
        df = df.head(subset_size)

    correct = 0
    total = 0
    skipped = 0  # Counter for skipped tasks

    # Iterate through each analogy task
    for index, row in df.iterrows():
        word_one = row['word_one'].lower()
        word_two = row['word_two'].lower()
        word_three = row['word_three'].lower()
        word_four = row['word_four'].lower()

        # Skip tasks if any word is not in the vocabulary
        if (word_one not in word_to_index) or (word_two not in word_to_index) or \
           (word_three not in word_to_index) or (word_four not in word_to_index):
            skipped += 1
            continue

        # Get embeddings for the words
        embedding_word_one = get_embedding(word_one, model, word_to_index)
        embedding_word_two = get_embedding(word_two, model, word_to_index)
        embedding_word_three = get_embedding(word_three, model, word_to_index)

        # Compute the resulting analogy vector
        result_embedding = embedding_word_two - embedding_word_one + embedding_word_three

        # Find the top 10 nearest words to the result
        predictions = get_k_nearest_words(10, result_embedding, model, word_to_index)

        # Print detailed results
        is_correct = word_four in predictions
        print(f"Analogy: {word_one} -> {word_two} :: {word_three} -> {word_four} | Prediction: {predictions}, Correct: {is_correct}")

        if is_correct:
            correct += 1

        total += 1

    # Print final stats
    print(f"Total tasks skipped due to missing words: {skipped}")
    if total == 0:
        print("No valid tasks were processed.")
        return 'No word was found in the embeddings'

    accuracy = correct / total
    print(f"Analogy task accuracy: {accuracy:.4f}")
    return accuracy

# Example usage: Test the model on a reduced subset for quick debugging
accuracy = test_analogy(model, word_to_index, 'TestSet_sample.csv', subset_size=100)
print(f"Final Accuracy: {accuracy}")

Analogy: athens -> greece :: baghdad -> iraq | Prediction: ['baghdad', 'greece', 'iraq', 'kurdistan', 'afganistan', 'syria', 'Iraq.', 'israel', 'irak', 'syria.'], Correct: True
Analogy: athens -> greece :: bangkok -> thailand | Prediction: ['bangkok', 'greece', 'thailand', 'italy', 'europe', 'hungary', 'spain', 'china.', 'france', 'Thailand.'], Correct: True
Analogy: athens -> greece :: beijing -> china | Prediction: ['beijing', 'greece', 'china.', 'russia', 'xinjiang', 'taiwan', 'CHina', 'chinas', 'china', 'chinese'], Correct: True
Analogy: athens -> greece :: berlin -> germany | Prediction: ['berlin', 'germany', 'greece', 'poland', 'france', 'europe', 'italy', 'russia', 'ww2', 'germany.'], Correct: True
Analogy: athens -> greece :: bern -> switzerland | Prediction: ['bern', 'greece', 'slovenia', 'croatia', 'slovakia', 'france', 'israel', 'spain', 'denmark', 'italy'], Correct: False
Analogy: athens -> greece :: cairo -> egypt | Prediction: ['cairo', 'greece', 'egypt', 'israel', 'ethio

In [7]:
def get_embedding(word, model, word_to_index):
    """Retrieve the embedding for a given word."""
    return model[word]

def get_k_nearest_words(k, result_embedding, model, word_to_index):
    """Find the k nearest words to the given embedding."""
    most_similar = model.similar_by_vector(result_embedding, topn=k)
    return [word for word, _ in most_similar]

def test_analogy(model, word_to_index, analogy_file, subset_size=None):
    """
    Method to test accuracy of embeddings on analogy tasks.

    Arguments
    ---------
    model : KeyedVectors
        Trained word embeddings model.
    word_to_index : Dictionary
        Dictionary mapping words to indices {word: index}.
    analogy_file : String
        File containing analogy tasks.
    subset_size : int, optional
        Number of rows to use from the dataset (for testing purposes).

    Returns
    -------
    accuracy : float
        Accuracy of the model on the analogy tasks.
    """
    # Load the CSV file
    df = pd.read_csv(analogy_file)

    # Filter by the 'capital-common-countries' category
    df = df[df['category'] == 'family']

    # Reduce the dataset size for testing, if subset_size is provided
    if subset_size:
        df = df.head(subset_size)

    correct = 0
    total = 0
    skipped = 0  # Counter for skipped tasks

    # Iterate through each analogy task
    for index, row in df.iterrows():
        word_one = row['word_one'].lower()
        word_two = row['word_two'].lower()
        word_three = row['word_three'].lower()
        word_four = row['word_four'].lower()

        # Skip tasks if any word is not in the vocabulary
        if (word_one not in word_to_index) or (word_two not in word_to_index) or \
           (word_three not in word_to_index) or (word_four not in word_to_index):
            skipped += 1
            continue

        # Get embeddings for the words
        embedding_word_one = get_embedding(word_one, model, word_to_index)
        embedding_word_two = get_embedding(word_two, model, word_to_index)
        embedding_word_three = get_embedding(word_three, model, word_to_index)

        # Compute the resulting analogy vector
        result_embedding = embedding_word_two - embedding_word_one + embedding_word_three

        # Find the top 10 nearest words to the result
        predictions = get_k_nearest_words(10, result_embedding, model, word_to_index)

        # Print detailed results
        is_correct = word_four in predictions
        print(f"Analogy: {word_one} -> {word_two} :: {word_three} -> {word_four} | Prediction: {predictions}, Correct: {is_correct}")

        if is_correct:
            correct += 1

        total += 1

    # Print final stats
    print(f"Total tasks skipped due to missing words: {skipped}")
    if total == 0:
        print("No valid tasks were processed.")
        return 'No word was found in the embeddings'

    accuracy = correct / total
    print(f"Analogy task accuracy: {accuracy:.4f}")
    return accuracy

# Example usage: Test the model on a reduced subset for quick debugging
accuracy = test_analogy(model, word_to_index, 'TestSet_sample.csv', subset_size=100)
print(f"Final Accuracy: {accuracy}")

Analogy: boy -> girl :: brother -> sister | Prediction: ['brother', 'sister', 'cousin', 'sister-in-law', 'brother-in-law', 'niece', 'sisters', 'nephew', 'uncle', 'half-brother'], Correct: True
Analogy: boy -> girl :: brothers -> sisters | Prediction: ['brothers', 'sisters', 'brother', 'siblings', 'Brothers', 'cousins', 'sister', 'sisters-in-law', 'husbands', 'Sisters'], Correct: True
Analogy: boy -> girl :: dad -> mom | Prediction: ['dad', 'mom', 'mum', 'stepdad', 'grandma', 'stepmom', 'mother', 'step-dad', 'step-mom', 'boyfriend'], Correct: True
Analogy: boy -> girl :: father -> mother | Prediction: ['father', 'mother', 'husband', 'daughter', 'grandmother', 'wife', 'grandfather', 'ex-husband', 'brother', 'boyfriend'], Correct: True
Analogy: boy -> girl :: grandfather -> grandmother | Prediction: ['grandfather', 'grandmother', 'granddaughter', 'great-grandfather', 'uncle', 'aunt', 'great-grandmother', 'grandson', 'niece', 'father'], Correct: True
Analogy: boy -> girl :: grandpa -> gran

In [None]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go  # <-- Make sure this import is here
from gensim.models import KeyedVectors

word_to_index = {word: idx for idx, word in enumerate(model.index_to_key)}

# Step 2: Load the analogy dataset
analogy_df = pd.read_csv('TestSet_sample.csv')

# Step 3: Extract words by category
def extract_words_by_category(df, category):
    words = set(df[df['category'] == category][['word_one', 'word_two', 'word_three', 'word_four']].values.flatten())
    words = [word.lower() for word in words if word.lower() in word_to_index]
    return words

family_words = extract_words_by_category(analogy_df, 'family')
capital_words = extract_words_by_category(analogy_df, 'capital-common-countries')

# Debugging: Print unmatched words
unmatched_family = [word for word in family_words if word not in word_to_index]
unmatched_capital = [word for word in capital_words if word not in word_to_index]
print(f"Unmatched family words: {unmatched_family}")
print(f"Unmatched capital words: {unmatched_capital}")

# Ensure categories are not empty
if not family_words or not capital_words:
    raise ValueError("One of the categories has no valid words in the vocabulary.")

# Step 4: Extract embeddings for each category
def get_embeddings_for_words(words):
    return np.array([model[word] for word in words])

family_embeddings = get_embeddings_for_words(family_words)
capital_embeddings = get_embeddings_for_words(capital_words)

# Combine all embeddings and labels
all_embeddings = np.vstack([family_embeddings, capital_embeddings])
all_labels = family_words + capital_words

# Step 5: Apply t-SNE on the combined embeddings
print("Running t-SNE...")
tsne = TSNE(n_components=2, random_state=0, perplexity=30)
reduced_embeddings = tsne.fit_transform(all_embeddings)

# Step 6: Create a scatter plot using Plotly with color-coding by category
colors = ['blue'] * len(family_words) + ['green'] * len(capital_words)  # Assign colors based on category

fig = go.Figure()

# Add scatter plot with color-coding
fig.add_trace(go.Scatter(
    x=reduced_embeddings[:, 0],
    y=reduced_embeddings[:, 1],
    mode='markers+text',
    text=all_labels,
    textposition='top center',
    marker=dict(size=10, color=colors, showscale=False),
    hoverinfo='text'
))

# Set plot title and axis labels
fig.update_layout(
    title="t-SNE Visualization of Family and Capital Words",
    xaxis=dict(title='t-SNE Dimension 1'),
    yaxis=dict(title='t-SNE Dimension 2'),
    height=800
)

# Display the plot
fig.show()


Unmatched family words: []
Unmatched capital words: []
Running t-SNE...


In [None]:
# Check the vocabulary size
vocab_size = len(model.index_to_key)
print(f"Vocabulary size: {vocab_size}")


Vocabulary size: 999994


In [12]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from gensim.models import KeyedVectors

# Load the word embeddings (make sure the model is already loaded)
word_to_index = {word: idx for idx, word in enumerate(model.index_to_key)}

# Step 2: Load the analogy dataset
analogy_df = pd.read_csv('TestSet_sample.csv')

# Step 3: Extract words by category
def extract_words_by_category(df, category):
    words = set(df[df['category'] == category][['word_one', 'word_two', 'word_three', 'word_four']].values.flatten())
    words = [word.lower() for word in words if word.lower() in word_to_index]
    return words

family_words = extract_words_by_category(analogy_df, 'family')
capital_words = extract_words_by_category(analogy_df, 'capital-common-countries')

# Debugging: Print unmatched words
unmatched_family = [word for word in family_words if word not in word_to_index]
unmatched_capital = [word for word in capital_words if word not in word_to_index]
print(f"Unmatched family words: {unmatched_family}")
print(f"Unmatched capital words: {unmatched_capital}")

# Ensure categories are not empty
if not family_words or not capital_words:
    raise ValueError("One of the categories has no valid words in the vocabulary.")

# Step 4: Extract embeddings for each category
def get_embeddings_for_words(words):
    return np.array([model[word] for word in words])

family_embeddings = get_embeddings_for_words(family_words)
capital_embeddings = get_embeddings_for_words(capital_words)

# Combine all embeddings and labels
all_embeddings = np.vstack([family_embeddings, capital_embeddings])
all_labels = family_words + capital_words

# Step 5: Apply t-SNE on the combined embeddings (3D version)
print("Running 3D t-SNE...")
tsne = TSNE(n_components=3, random_state=0, perplexity=30)
reduced_embeddings = tsne.fit_transform(all_embeddings)

# Step 6: Create a 3D scatter plot using Plotly with color-coding by category
colors = ['blue'] * len(family_words) + ['green'] * len(capital_words)  # Assign colors based on category

fig = go.Figure()

# Add 3D scatter plot with color-coding
fig.add_trace(go.Scatter3d(
    x=reduced_embeddings[:, 0],
    y=reduced_embeddings[:, 1],
    z=reduced_embeddings[:, 2],
    mode='markers+text',
    text=all_labels,
    textposition='top center',
    marker=dict(size=8, color=colors, opacity=0.8),
    hoverinfo='text'
))

# Set plot title and axis labels
fig.update_layout(
    title="3D t-SNE Visualization of Family and Capital Words",
    scene=dict(
        xaxis=dict(title='t-SNE Dimension 1'),
        yaxis=dict(title='t-SNE Dimension 2'),
        zaxis=dict(title='t-SNE Dimension 3')
    ),
    height=800
)

# Display the plot
fig.show()


Unmatched family words: []
Unmatched capital words: []
Running 3D t-SNE...
