<a href="https://colab.research.google.com/github/Twinkle-gawri/Word2Vec/blob/main/Intrinsic_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Intrinsic Evaluation is a way to evaluate word embeddings directly — by testing how well they capture the semantic and syntactic properties of words within the vector space, without using them in a downstream task like sentiment analysis or translation.

In [2]:
import pandas as pd
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [1]:
from gensim import downloader

In [2]:
# Fetch (or reuse a cached copy of) the pre-trained 'word2vec-google-news-300'
# embeddings through gensim's downloader; every later section queries `wv`.
wv = downloader.load("word2vec-google-news-300")



# 1. WORD SIMILARITY

In [3]:
import pandas as pd

# WordSim-353: one word pair per row (`Word_1`, `Word_2`) together with the
# human similarity rating in `score`. Preview the first rows after loading.
df = pd.read_csv("/content/wordsim353.csv")
df.head()

Unnamed: 0,Word_1,Word_2,score
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.0
3,book,paper,7.46
4,computer,keyboard,7.62


In [4]:
# Each row contains a pair of words whose similarity is computed with the
# pre-trained Word2Vec model.
#
# wv.key_to_index -- dictionary containing all the words that the Word2Vec
# model knows. If a word appears in it, the model has an embedding (vector)
# for that word, so the pair can be scored.
similarity = []

# Iterate over the two columns in parallel (positional, no df[col][i] lookups).
for word_1, word_2 in zip(df['Word_1'], df['Word_2']):
  if word_1 in wv.key_to_index and word_2 in wv.key_to_index:
    similarity.append(wv.similarity(word_1, word_2))
  else:
    # Use a real float NaN, not the string 'NaN': pandas' dropna() only
    # recognises genuine missing values, and a string placeholder would also
    # turn the whole score column into object dtype.
    similarity.append(float('nan'))

In [5]:
similarity

[0.2639377,
 0.5172962,
 0.99999994,
 0.3634626,
 0.39639163,
 0.40686232,
 0.3779698,
 0.3402561,
 0.33218452,
 0.6114971,
 0.3899161,
 0.24085768,
 0.6417261,
 0.5678562,
 0.63195235,
 0.21336083,
 0.4206618,
 0.066302165,
 0.47047195,
 0.34156868,
 0.4680556,
 0.123267554,
 0.06611791,
 0.036066912,
 0.104177676,
 0.25652346,
 0.044471763,
 0.07456468,
 0.32453123,
 0.26132065,
 0.41751626,
 0.615122,
 0.05661814,
 0.118131705,
 0.6510957,
 0.20875593,
 0.42778125,
 0.66387475,
 0.63425124,
 0.10070594,
 0.22339173,
 0.2888383,
 0.73135483,
 0.66824675,
 0.5051179,
 0.392008,
 0.31498638,
 0.32154804,
 0.24712518,
 0.34465772,
 0.26419905,
 0.44535527,
 0.17430326,
 0.47419044,
 0.36640593,
 0.4371983,
 0.08025178,
 0.08876197,
 0.59717494,
 0.6881493,
 0.24563539,
 0.093469374,
 0.17304002,
 0.50702006,
 0.583977,
 0.15679939,
 0.5838368,
 0.6210811,
 0.68308526,
 0.5886159,
 0.5083667,
 0.2525393,
 0.48634958,
 0.5527407,
 0.60839105,
 0.3740926,
 0.36290243,
 0.30286193,
 0.21234

In [6]:
# Attach the model's similarity for each pair next to the human `score`
# (one entry of `similarity` per row of `df`, in row order).
df["model_scores"] = similarity

In [7]:
# Remove pairs that could not be scored before computing the correlation.
# The scores are first coerced to numbers so that any non-numeric placeholder
# (for example the string 'NaN') becomes a genuine NaN that dropna() detects.
# The result is rebound to `df` (instead of mutating in place) because the
# following cells read `df`.
df = (
    df.assign(model_scores=pd.to_numeric(df['model_scores'], errors='coerce'))
      .dropna(axis=0, how='any')
)

In [9]:
df.head(5)

Unnamed: 0,Word_1,Word_2,score,model_scores
0,love,sex,6.77,0.263938
1,tiger,cat,7.35,0.517296
2,tiger,tiger,10.0,1.0
3,book,paper,7.46,0.363463
4,computer,keyboard,7.62,0.396392


* If p-value < 0.05, we usually say: The correlation is statistically significant

* If p-value > 0.05, then: The correlation is not statistically significant (could be due to random chance).

* Value of the correlation coefficient --
  * +1: Perfect positive correlation — rankings match exactly.
  * 0: No correlation in rankings.
  * -1: Perfect negative correlation — rankings are opposite.

In [10]:
# How well the model's similarity predictions align with human judgment:
# Spearman rank correlation between the two columns.
#   score        -- human ratings
#   model_scores -- similarities produced by the Word2Vec model
# The submodule is imported explicitly because a bare `import scipy` is not
# guaranteed to make `scipy.stats` available on every SciPy release.
import scipy.stats
print(scipy.stats.spearmanr(df['score'],df['model_scores']))

SignificanceResult(statistic=0.7000166486272194, pvalue=2.86866666051422e-53)


# 2. ANALOGY

In [11]:
# Analogy questions: each row holds word1, word2, word3 and the expected
# answer word4 (word1 is to word2 as word3 is to word4).
df1 = pd.read_csv("/content/Analogy.csv")

In [12]:
len(df1)

19544

In [13]:
df1.head(5)

Unnamed: 0,word1,word2,word3,word4
0,Athens,Greece,Baghdad,Iraq
1,Athens,Greece,Bangkok,Thailand
2,Athens,Greece,Beijing,China
3,Athens,Greece,Berlin,Germany
4,Athens,Greece,Bern,Switzerland


In [14]:
# Answer the first N_ANALOGY_SAMPLES analogy questions
# "word1 is to word2 as word3 is to ?" with the Word2Vec model.
# Only a prefix of df1 is evaluated; raise the constant to cover more rows.
N_ANALOGY_SAMPLES = 100

analogy = []
# Positional slicing never overruns df1, even if it has fewer rows than requested.
analogy_subset = df1.iloc[:N_ANALOGY_SAMPLES]
for a, b, c in zip(analogy_subset['word1'], analogy_subset['word2'], analogy_subset['word3']):
  if a in wv.key_to_index and b in wv.key_to_index and c in wv.key_to_index:
    # Words close to b and c but far from a; only the best match is used,
    # so ask for a single neighbour instead of the default top list.
    result = wv.most_similar(positive=[b, c], negative=[a], topn=1)
    analogy.append(result[0][0])  # result[0][0] is the top word from the returned similarity list.
  else:
    # Sentinel for questions containing an out-of-vocabulary word; the
    # accuracy cell skips entries equal to 'nan'.
    analogy.append('nan')

In [15]:
# Accuracy of the analogy predictions: the share of evaluated questions whose
# top predicted word equals the expected `word4`. Questions marked with the
# 'nan' sentinel (out-of-vocabulary input) are excluded from the count.
total = 0
correct = 0

# zip pairs each prediction with the expected answer of the same row and stops
# at the end of `analogy`, so only the evaluated prefix of df1 is compared.
for predicted, expected in zip(analogy, df1['word4']):
  if predicted != 'nan':
    total += 1
    if predicted == expected:
      correct += 1

if total:
  print(correct/total)
else:
  # Avoid ZeroDivisionError when nothing could be evaluated.
  print('No analogy question could be evaluated (no in-vocabulary questions).')

0.81


In [16]:
analogy

['Iraqi',
 'Thailand',
 'China',
 'Germany',
 'Switzerland',
 'Egypt',
 'Australia',
 'Viet_Nam',
 'Cuba',
 'Finland',
 'Pakistan',
 'Afghan',
 'Britain',
 'Spain',
 'Russia',
 'Norway',
 'Canada',
 'France',
 'Italy',
 'Sweden',
 'Iran',
 'Japan',
 'Thailand',
 'China',
 'Germany',
 'coach_Bobby_Curlings',
 'Egypt',
 'Mr_Rudd',
 'Vietnam',
 'Cuba',
 'Finland',
 'Pakistan',
 'Afghanistan',
 'Britain',
 'Spain',
 'Russia',
 'Norway',
 'Prime_Minister_Jean_Chrétien',
 'France',
 'Italy',
 'Sweden',
 'Iran',
 'Japan',
 'Greece',
 'China',
 'Germany',
 'Switzerland',
 'Egypt',
 'Australia',
 'Viet_Nam',
 'Cuba',
 'Finland',
 'Pakistan',
 'Afghanistan',
 'Britain',
 'Spain',
 'Russia',
 'Norway',
 'Canada',
 'France',
 'Italy',
 'Sweden',
 'Iran',
 'Japan',
 'Greece',
 'Iraqi',
 'Germany',
 'Bern_NC',
 'Egypt',
 'Australia',
 'Viet_Nam',
 'Cuba',
 'Finland',
 'Pakistan',
 'Afghan',
 'UK',
 'Spain',
 'Russia',
 'Norway',
 'Canada',
 'France',
 'Italy',
 'Sweden',
 'Iran',
 'Japan',
 'Greece'

# 3. CLUSTERING

In [17]:
# Probe words for the clustering experiment, listed group by group.
words = [
    'cat', 'dog', 'lion', 'tiger',   # animals
    'apple', 'banana', 'grape',      # fruit
    'car', 'bus', 'train',           # vehicles
]

In [20]:
# Look up the embedding of every probe word, keeping the order of `words`
# so that vectors[k] belongs to words[k].
vectors = [wv[token] for token in words]

In [21]:
vectors

[array([ 0.0123291 ,  0.20410156, -0.28515625,  0.21679688,  0.11816406,
         0.08300781,  0.04980469, -0.00952148,  0.22070312, -0.12597656,
         0.08056641, -0.5859375 , -0.00445557, -0.296875  , -0.01312256,
        -0.08349609,  0.05053711,  0.15136719, -0.44921875, -0.0135498 ,
         0.21484375, -0.14746094,  0.22460938, -0.125     , -0.09716797,
         0.24902344, -0.2890625 ,  0.36523438,  0.41210938, -0.0859375 ,
        -0.07861328, -0.19726562, -0.09082031, -0.14160156, -0.10253906,
         0.13085938, -0.00346375,  0.07226562,  0.04418945,  0.34570312,
         0.07470703, -0.11230469,  0.06738281,  0.11230469,  0.01977539,
        -0.12353516,  0.20996094, -0.07226562, -0.02783203,  0.05541992,
        -0.33398438,  0.08544922,  0.34375   ,  0.13964844,  0.04931641,
        -0.13476562,  0.16308594, -0.37304688,  0.39648438,  0.10693359,
         0.22167969,  0.21289062, -0.08984375,  0.20703125,  0.08935547,
        -0.08251953,  0.05957031,  0.10205078, -0.1

In [22]:
from sklearn.cluster import KMeans

# Partition the word vectors into three clusters (the probe list appears to be
# drawn from three groups: animals, fruit, vehicles).
# random_state fixes the centroid initialisation so the cluster ids are the
# same on every run; n_init is set explicitly because its default differs
# between scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
labels = kmeans.fit_predict(vectors)

In [23]:
# Report the cluster id assigned to each probe word, one line per word.
for token, cluster_id in zip(words, labels):
    print(f"{token} → Cluster {cluster_id}")

cat → Cluster 1
dog → Cluster 1
lion → Cluster 1
tiger → Cluster 1
apple → Cluster 0
banana → Cluster 0
grape → Cluster 0
car → Cluster 2
bus → Cluster 2
train → Cluster 2
