# Odia Word Embeddings Evaluation

This notebook helps you evaluate and visualize Odia word embeddings trained with Word2Vec and GloVe.


In [7]:
# Install required packages if running in a fresh environment
# !pip install gensim numpy pandas matplotlib scikit-learn


## Load Word2Vec Model


In [8]:
from gensim.models import Word2Vec
import os

w2v_path = '../models/word2vec.model'
if os.path.exists(w2v_path):
    w2v_model = Word2Vec.load(w2v_path)
    print('Word2Vec model loaded!')
else:
    print('Word2Vec model not found!')

w2vsg_path = '../models/word2vec_sg.model'
if os.path.exists(w2v_path):
    w2vsg_model = Word2Vec.load(w2vsg_path)
    print('Word2Vec_sg model loaded!')
else:
    print('Word2Vec_sg model not found!')

Word2Vec model loaded!
Word2Vec_sg model loaded!


## Load GloVe Embeddings


In [9]:
import numpy as np
import json

glove_emb_path = '../models/glove_embeddings.npy'
glove_vocab_path = '../models/glove_vocab.json'
if os.path.exists(glove_emb_path) and os.path.exists(glove_vocab_path):
    glove_embeddings = np.load(glove_emb_path)
    with open(glove_vocab_path, 'r', encoding='utf-8') as f:
        glove_vocab = json.load(f)
    print('GloVe embeddings loaded!')
else:
    print('GloVe embeddings not found!')


GloVe embeddings loaded!


## Nearest Neighbors (Word2Vec)


In [10]:
# Example Odia words (edit as needed)
odia_words = [
    "ଭଲ",     # good
    "ମନ୍ଦ",     # bad
    "ପ୍ରେମ",   # love
    "ଘୃଣା",    # hate
    "ଶିକ୍ଷା",   # education
    "ଗ୍ରାମ",   # village
    "ନଗର",     # city
    "ପାଣି",     # water
    "ଅଗ୍ନି",    # fire
    "ବାୟୁ"     # air
]
for word in odia_words:
    if word in w2v_model.wv:
        print(f'Words similar to {word}:')
        for sim_word, score in w2v_model.wv.most_similar(word, topn=5):
            print(f'  {sim_word}: {score:.4f}')
    else:
        print(f'{word} not in vocabulary.')


Words similar to ଭଲ:
  ଖରାପ: 0.5172
  ସହଜ: 0.4499
  ଖୁସି: 0.4427
  ନିଶ୍ଚିତ: 0.4412
  ବଡ଼: 0.4389
Words similar to ମନ୍ଦ:
  ଲାଗୁନାହିଁ: 0.5991
  ପାଉଥିଲି: 0.5783
  ଲାଗିବ୤: 0.5746
  ବୁଝିବା: 0.5681
  ଜରାରୋଗ: 0.5467
Words similar to ପ୍ରେମ:
  ଅନାବିଳ: 0.6509
  ପ୍ରେମିକା: 0.6298
  ସ୍ନେହ: 0.6243
  ଆବେଗ: 0.6104
  ପ୍ରୀତି: 0.6099
Words similar to ଘୃଣା:
  ଉପହାସ: 0.7648
  ଭୟ: 0.7437
  ଈର୍ଷା: 0.7320
  ପ୍ରତିବାଦ: 0.7018
  ହତାଦର: 0.7011
Words similar to ଶିକ୍ଷା:
  ଶିକ୍ଷ୍ୟା: 0.6877
  ବିଦ୍ୟା: 0.6349
  ଉଚ୍ଚଶିକ୍ଷା: 0.6348
  ଗଣିତ: 0.6232
  ତାଲିମ: 0.6125
Words similar to ଗ୍ରାମ:
  ତହସିଲ: 0.6835
  ଗ୍ରାମରୁ: 0.6628
  ବ୍ଲକ: 0.6588
  ଥାନା: 0.6482
  ସହର: 0.6467
Words similar to ନଗର:
  ଜୟପୁର: 0.7284
  ମଥୁରା: 0.7102
  କାନପୁର: 0.7065
  ବାରଣାସୀ: 0.7030
  ଉତ୍ତରପ୍ରଦେଶ: 0.7027
Words similar to ପାଣି:
  ପାଣିରେ: 0.6755
  କ୍ଷୀର: 0.6646
  ଚିନି: 0.6448
  ତେଲ: 0.6447
  ବାମ୍ଫ: 0.6403
Words similar to ଅଗ୍ନି:
  ଦେବତାଙ୍କର: 0.6402
  ସୂର୍ଯ୍ୟ: 0.6254
  ଗ୍ରହ: 0.6251
  ପୁଷ୍କରିଣୀର: 0.6113
  ବୃହସ୍ପତି: 0.6009
Words similar to ବାୟୁ:
  ପ୍ରଦୂଷଣ: 0.7

In [14]:
for word in odia_words:
    if word in w2vsg_model.wv:
        print(f'Words similar to {word}:')
        for sim_word, score in w2vsg_model.wv.most_similar(word, topn=5):
            print(f'  {sim_word}: {score:.4f}')
    else:
        print(f'{word} not in vocabulary.')


Words similar to ଭଲ:
  ଖରାପ: 0.5290
  ଉତ୍ତମ: 0.4792
  ଖୁସି: 0.4586
  ନିଶ୍ଚିତ: 0.4577
  ସହଜ: 0.4529
Words similar to ମନ୍ଦ:
  ଲାଗିବ୤: 0.6047
  ଲାଗୁନାହିଁ: 0.5954
  ପାଉଥିଲି: 0.5425
  ପାଉଥାଏ: 0.5334
  ଲାଗେନା: 0.5321
Words similar to ପ୍ରେମ:
  ପ୍ରେମର: 0.6476
  ପ୍ରେମିକା: 0.6306
  ଅନାବିଳ: 0.6268
  ସ୍ନେହ: 0.6148
  ପ୍ରୀତି: 0.6131
Words similar to ଘୃଣା:
  ଭୟ: 0.7651
  ଉପହାସ: 0.7594
  ଈର୍ଷା: 0.7435
  ହତାଦର: 0.7241
  ଶୋଷଣ: 0.7096
Words similar to ଶିକ୍ଷା:
  ଶିକ୍ଷ୍ୟା: 0.6527
  ଉଚ୍ଚଶିକ୍ଷା: 0.6374
  ତାଲିମ: 0.6238
  ବିଦ୍ୟା: 0.5987
  ଅଧ୍ୟୟନ: 0.5887
Words similar to ଗ୍ରାମ:
  ତହସିଲ: 0.7054
  ବ୍ଲକ: 0.7006
  ଘାଟି: 0.6615
  ଗ୍ରାମରୁ: 0.6537
  ଜିଲାର: 0.6533
Words similar to ନଗର:
  ବିହାରର: 0.7376
  ଉତ୍ତରପ୍ରଦେଶ: 0.7351
  କାଶ୍ମୀର: 0.7270
  ମଥୁରା: 0.7248
  ମଧ୍ୟପ୍ରଦେଶ: 0.7216
Words similar to ପାଣି:
  ପାଣିରେ: 0.6840
  ଚିନି: 0.6542
  ଘିଅ: 0.6522
  କ୍ଷୀର: 0.6510
  ଦୁଧ: 0.6498
Words similar to ଅଗ୍ନି:
  ସୂର୍ଯ୍ୟ: 0.6372
  ଦେବତାଙ୍କର: 0.6263
  ଗ୍ରହ: 0.6018
  ରୋହିଣୀ: 0.6011
  ଦାରୁବ୍ରହ୍ମ: 0.6007
Words similar to ବାୟୁ:
  ପ୍ରଦୂଷ

## Nearest Neighbors (GloVe)


In [12]:
from sklearn.metrics.pairwise import cosine_similarity
# Reverse vocab for index to word
glove_idx_to_word = {idx: word for word, idx in glove_vocab.items()}
def get_glove_neighbors(word, topn=5):
    if word not in glove_vocab:
        print(f'{word} not in GloVe vocabulary.')
        return
    idx = glove_vocab[word]
    vec = glove_embeddings[idx].reshape(1, -1)
    sims = cosine_similarity(vec, glove_embeddings)[0]
    best = np.argsort(-sims)[1:topn+1]
    print(f'Words similar to {word}:')
    for i in best:
        print(f'  {glove_idx_to_word[i]}: {sims[i]:.4f}')
# Example
for word in odia_words:
    get_glove_neighbors(word)


Words similar to ଭଲ:
  ଶାଲ: 0.3077
  ଭାବରେ: 0.3075
  ହୋଇଥାଏ: 0.3072
  ଜରାରୋଗ: 0.2985
  ଦକ୍ଷ: 0.2975
Words similar to ମନ୍ଦ:
  ଗ୍ରେଗୋରୀ: 0.2659
  କୁମୁଦିନୀ: 0.2593
  ବିରିୟାନୀ: 0.2561
  ପନି: 0.2488
  ଘଟନା: 0.2458
Words similar to ପ୍ରେମ:
  ଶାନ୍ତି: 0.2872
  ୱେଲ୍: 0.2722
  ୟୁ: 0.2615
  କୁକୁରର: 0.2586
  ଗୁଡ୍: 0.2558
Words similar to ଘୃଣା:
  ବୋହୁଙ୍କୁ: 0.2974
  ଅନୁକୂଳ: 0.2802
  ପରିବହନର: 0.2659
  ଆବଶ୍ୟକତାକୁ: 0.2625
  ଥକାପଣ: 0.2613
Words similar to ଶିକ୍ଷା:
  ନୃତ୍ୟ: 0.4376
  ଉବ୍ଦେଶ୍ୟରେ: 0.4098
  ଦର୍ଶନ: 0.3414
  ଆହାରଣ: 0.3355
  ପ୍ରଶିକ୍ଷଣ: 0.3240
Words similar to ଗ୍ରାମ:
  ପଞ୍ଚାଯତ: 0.3266
  ପଞ୍ଚାୟତ: 0.3161
  ପଞ୍ଚାୟତର: 0.3001
  ସହର: 0.2956
  ହାତଲେଖା: 0.2908
Words similar to ନଗର:
  କବିସୂର୍ଯ୍ୟ: 0.3220
  ମୋନାରି: 0.2792
  ହାଇସ୍କୁଲ୍: 0.2777
  ବିରଚିତ: 0.2709
  ମଉସା: 0.2639
Words similar to ପାଣି:
  ରକ୍: 0.3196
  ଗାଧୋଇବା: 0.3051
  ବୁହେ: 0.3007
  କୁକର: 0.3005
  ଝରେ: 0.2866
Words similar to ଅଗ୍ନି:
  ନିର୍ବାପକ: 0.3221
  ଆଛା: 0.2892
  ପ୍ରାକ୍: 0.2814
  ପାଣି: 0.2780
  ନିର୍ବାପିତ: 0.2771
Words similar to ବାୟୁ:
  ନାଟ୍ୟ

## t-SNE Visualization


In [13]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def plot_tsne(embeddings, vocab, words, title):
    idxs = [vocab[w] for w in words if w in vocab]
    vecs = embeddings[idxs]
    tsne = TSNE(n_components=2, random_state=42)
    reduced = tsne.fit_transform(vecs)
    plt.figure(figsize=(8, 6))
    plt.scatter(reduced[:, 0], reduced[:, 1])
    for i, word in enumerate(words):
        if word in vocab:
            plt.annotate(word, (reduced[i, 0], reduced[i, 1]), fontsize=12)
    plt.title(title)
    plt.show()
# Example words for visualization
words_to_plot = odia_words
if 'w2v_model' in globals():
    w2v_vecs = [w2v_model.wv[w] for w in words_to_plot if w in w2v_model.wv]
    if w2v_vecs:
        plot_tsne(np.array(w2v_vecs), {w: i for i, w in enumerate(words_to_plot)}, words_to_plot, 'Word2Vec t-SNE')
if 'glove_embeddings' in globals():
    plot_tsne(glove_embeddings, glove_vocab, words_to_plot, 'GloVe t-SNE')


ValueError: perplexity must be less than n_samples