In [None]:
import pandas as pd
import gensim
import nltk
import unicodedata

from time import time
from gensim.models import Word2Vec

from nltk.corpus import stopwords
from nltk.corpus import gutenberg, brown, movie_reviews

In [None]:
# Corpora that supply the training sentences, plus the stopword list and
# WordNet used for filtering and evaluation further down.
nltk.download('brown')
nltk.download('gutenberg')
nltk.download('movie_reviews')
nltk.download('stopwords')
# word_tokenize (called in preprocessing) needs the Punkt sentence-tokenizer
# models, which none of the packages above provide. Older NLTK releases look
# for 'punkt', newer ones for 'punkt_tab', so both are requested here.
# NOTE(review): on a release whose index lacks one of the two names the call
# is expected to just log an error and return False - confirm on your version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def get_df(corpus):
  """Build a one-column DataFrame of unique sentences from a corpus.

  Args:
    corpus: object exposing ``sents()``; each item it yields is a sequence
      of string tokens that is joined with single spaces.

  Returns:
    A DataFrame with a single ``text`` column holding each distinct sentence
    once (first occurrence kept, original order), indexed 0..n-1.
  """
  sentences = [' '.join(sent) for sent in corpus.sents()]
  df = pd.DataFrame(sentences, columns=['text'])
  # Reset the index only after every row-dropping step; resetting before
  # drop_duplicates (as previously done) leaves gaps in the returned index.
  return df.dropna().drop_duplicates().reset_index(drop=True)

In [None]:
# One frame of unique sentences per NLTK corpus, in the order
# Gutenberg, Brown, movie reviews.
df1, df2, df3 = [get_df(corpus) for corpus in (gutenberg, brown, movie_reviews)]

In [None]:
# Preview the sentences extracted from the Gutenberg corpus.
df1

Unnamed: 0,text
0,[ Emma by Jane Austen 1816 ]
1,VOLUME I
2,CHAPTER I
3,"Emma Woodhouse , handsome , clever , and rich ..."
4,She was the youngest of the two daughters of a...
...,...
98547,Now for my last -- let me look back a moment ;...
98548,"Long have we lived , joy ' d , caress ' d toge..."
98549,"Yet let me not be too hasty , Long indeed have..."
98550,May - be it is you the mortal knob really undo...


In [None]:
# Preview the sentences extracted from the Brown corpus.
df2

Unnamed: 0,text
0,The Fulton County Grand Jury said Friday an in...
1,The jury further said in term-end presentments...
2,The September-October term jury had been charg...
3,`` Only a relative handful of such reports was...
4,The jury said it did find that many of Georgia...
...,...
57335,S. J. Perelman
57336,revulsion in the desert
57337,"the doors of the D train slid shut , and as I ..."
57338,She was a living doll and no mistake -- the bl...


In [None]:
# Preview the sentences extracted from the movie_reviews corpus.
df3

Unnamed: 0,text
0,"plot : two teen couples go to a church party ,..."
1,they get into an accident .
2,"one of the guys dies , but his girlfriend cont..."
3,what ' s the deal ?
4,"watch the movie and "" sorta "" find out ."
...,...
71527,"it ' s a quick , straight shot to the movie ' ..."
71528,"in terms of overall quality , i would compare ..."
71529,both films are well made with interesting stor...
71530,but neither film really felt like it capitaliz...


In [None]:
from nltk.tokenize import word_tokenize
import string

def preprocessing(text_dataframe, set_stopwords=None):
    """Normalise the ``text`` column and tokenise it into a ``words`` column.

    Steps: lowercase, replace every ASCII punctuation character and digit
    with a space, tokenise with ``word_tokenize``, optionally drop stopwords,
    then keep only rows that still have more than 4 tokens.

    Args:
        text_dataframe: DataFrame with a string ``text`` column. It is not
            modified; the work is done on a copy.
        set_stopwords: optional iterable of tokens to remove after
            tokenisation. ``None`` (default) skips stopword removal.

    Returns:
        A new DataFrame with the cleaned ``text`` column and a ``words``
        column of token lists, restricted to rows with more than 4 tokens.
        Surviving rows keep their original index labels.
    """
    # Work on a copy so the caller's frame is not silently rewritten.
    processed = text_dataframe.copy()
    # Lowercase the text
    processed['text'] = processed['text'].str.lower()
    # Replace punctuation and digits with spaces. A translation table is used
    # instead of str.replace with a hand-built "[...]" pattern: that pattern is
    # only treated as a regex when regex=True is passed (the default changed in
    # pandas 2.0), and its unescaped "\]" meant a backslash was never matched.
    strip_table = str.maketrans({ch: ' ' for ch in string.punctuation + string.digits})
    processed['text'] = processed['text'].str.translate(strip_table)
    # Tokenize the text
    processed['words'] = processed['text'].apply(word_tokenize)
    # Optional: remove stopwords
    if set_stopwords is not None:
        # Callers may pass a list; a set keeps each membership test O(1).
        stopword_lookup = set(set_stopwords)
        processed['words'] = processed['words'].apply(
            lambda tokens: [w for w in tokens if w not in stopword_lookup]
        )
    # Keep only sentences with more than 4 remaining tokens
    return processed[processed['words'].apply(len) > 4]


In [None]:
# Stack the three corpus frames row-wise and renumber the rows from 0.
df = pd.concat(
    (df1, df2, df3),
    ignore_index=True,
)
df

Unnamed: 0,text
0,[ Emma by Jane Austen 1816 ]
1,VOLUME I
2,CHAPTER I
3,"Emma Woodhouse , handsome , clever , and rich ..."
4,She was the youngest of the two daughters of a...
...,...
215999,"it ' s a quick , straight shot to the movie ' ..."
216000,"in terms of overall quality , i would compare ..."
216001,both films are well made with interesting stor...
216002,but neither film really felt like it capitaliz...


In [None]:
# Clean and tokenise the combined corpus, dropping English stopwords.
# Note that this rebinds `df` to the preprocessed frame.
df = preprocessing(
    text_dataframe=df,
    set_stopwords=stopwords.words('english'),
)
df

  text_dataframe['text'] = text_dataframe['text'].str.replace('[{}]'.format(string.punctuation+string.digits), ' ')


Unnamed: 0,text,words
3,emma woodhouse handsome clever and rich ...,"[emma, woodhouse, handsome, clever, rich, comf..."
4,she was the youngest of the two daughters of a...,"[youngest, two, daughters, affectionate, indul..."
5,her mother had died too long ago for her to ha...,"[mother, died, long, ago, indistinct, remembra..."
6,sixteen years had miss taylor been in mr woo...,"[sixteen, years, miss, taylor, mr, woodhouse, ..."
8,even before miss taylor had ceased to hold the...,"[even, miss, taylor, ceased, hold, nominal, of..."
...,...,...
215999,it s a quick straight shot to the movie ...,"[quick, straight, shot, movie, end]"
216000,in terms of overall quality i would compare ...,"[terms, overall, quality, would, compare, trum..."
216001,both films are well made with interesting stor...,"[films, well, made, interesting, stories, set,..."
216002,but neither film really felt like it capitaliz...,"[neither, film, really, felt, like, capitalize..."


In [None]:
from nltk.corpus import wordnet

# Train a single-epoch Word2Vec model on the token lists; words seen fewer
# than 5 times are left out of the vocabulary.
model = Word2Vec(
    sentences=df['words'],
    min_count=5,
    window=10,
    seed=1,
    epochs=1,
)
total_words = len(model.wv.index_to_key)
print("Tokens:", total_words)

# Coverage = share of vocabulary entries for which WordNet returns at least
# one synset, trying the token as-is, lowercased and uppercased.
wordnet_words = len([
    token
    for token in model.wv.index_to_key
    if any(wordnet.synsets(variant) for variant in (token, token.lower(), token.upper()))
])
coverage = (wordnet_words / total_words) * 100
print(f"Coverage: {coverage:.2f}%")

Tokens: 28924
Coverage: 85.08%


In [None]:
from itertools import combinations
from nltk.stem import WordNetLemmatizer

# Compare embedding-based "synonyms" (cosine similarity >= threshold) with
# WordNet-based synonyms (the two lemmatised words share at least one synset)
# over all pairs of the first 1000 non-stopword vocabulary entries.

# Build the stopword set once; calling stopwords.words() inside the
# comprehension rebuilt the whole list for every vocabulary word.
english_stopwords = set(stopwords.words('english'))
non_stopword_words = [w for w in model.wv.index_to_key if w.lower() not in english_stopwords][:1000]
thresholds = [0.6, 0.7, 0.8]

# Lemmatise and look up synsets once per word rather than once per pair and
# per threshold: neither depends on the pair or on the threshold.
lemmatizer = WordNetLemmatizer()
synsets_by_word = {
    w: set(wordnet.synsets(lemmatizer.lemmatize(w)))
    for w in non_stopword_words
}

# (embedding similarity, shares-a-synset flag) for every pair; both values are
# threshold-independent, so they are computed a single time.
pair_scores = [
    (model.wv.similarity(w1, w2), not synsets_by_word[w1].isdisjoint(synsets_by_word[w2]))
    for w1, w2 in combinations(non_stopword_words, 2)
]
# Number of WordNet-synonym pairs (the recall denominator).
syn_wn = sum(1 for _, shares_synset in pair_scores if shares_synset)

for threshold in thresholds:
    # Pairs the embedding calls synonyms, and those WordNet agrees with.
    syn_emb = sum(1 for sim_emb, _ in pair_scores if sim_emb >= threshold)
    com_syn = sum(1 for sim_emb, shares_synset in pair_scores if shares_synset and sim_emb >= threshold)
    p = com_syn / syn_emb if syn_emb > 0 else 0.0
    r = com_syn / syn_wn if syn_wn > 0 else 0.0
    f1 = 2 * (p * r) / (p + r) if (p + r) > 0 else 0.0
    print(f"Threshold: {threshold} - Precision: {p} - Recall: {r} - F1-Score: {f1}")

Threshold: 0.6 - Precision: 0.004665683006943244 - Recall: 0.9995693367786391 - F1-Score: 0.009288012421366032
Threshold: 0.7 - Precision: 0.00469718311290516 - Recall: 0.9952627045650302 - F1-Score: 0.009350237397308217
Threshold: 0.8 - Precision: 0.004747555449308731 - Recall: 0.9750215331610681 - F1-Score: 0.009449101521497331


In [None]:
# Vocabulary entries that WordNet cannot resolve to any synset. The same
# lookup as the coverage figure is used (token as-is, lowercased, uppercased)
# so that the words listed here are exactly the ones counted as uncovered.
# Comparing raw tokens against wordnet.all_lemma_names() instead also listed
# inflected forms that wordnet.synsets() does resolve.
emb_words = model.wv.index_to_key
# Iterating the vocabulary list (rather than a set) keeps the output order
# the same from run to run.
coverage_errors = [
    word
    for word in emb_words
    if not any(wordnet.synsets(variant) for variant in (word, word.lower(), word.upper()))
]
print("Coverage Errors", coverage_errors[:100])

Coverage Errors ['abilities', 'filed', 'thespians', 'leysure', 'chops', 'soundtracks', 'anybody', 'giffen', 'ariz', 'vermejo', 'prettiest', 'trials', 'spouts', 'rosie', 'cheere', 'reviled', 'fishermen', 'expelled', 'mountains', 'hetman', 'orson', 'bystanders', 'beresford', 'bestowed', 'teats', 'benhadad', 'angie', 'rejoiced', 'bouquets', 'finest', 'surrendered', 'horrour', 'cy', 'rags', 'vignettes', 'seeth', 'charges', 'bucking', 'wreathen', 'ashes', 'slighted', 'mocketh', 'horses', 'mazzello', 'overpowered', 'impacts', 'chins', 'marcie', 'halliwell', 'interviewing', 'farouk', 'realtors', 'restraints', 'dares', 'icons', 'doubts', 'crucified', 'carla', 'slept', 'tornadoes', 'gaped', 'crews', 'macleane', 'declaring', 'pumping', 'interiors', 'fighters', 'intriguingly', 'ritter', 'reproaches', 'minde', 'saunders', 'rewarded', 'pronouncements', 'confronts', 'heads', 'mulder', 'shane', 'nerdy', 'giuseppe', 'fx', 'interruptions', 'peered', 'unrolled', 'recounts', 'supports', 'buenos', 'origin

In [None]:
# Precision errors: pairs the embedding treats as synonyms (similarity >=
# threshold) although their lemmatised forms share no WordNet synset.

# Build the stopword set once instead of once per vocabulary word.
english_stopwords = set(stopwords.words('english'))
non_stopword_words = [word for word in model.wv.index_to_key if word.lower() not in english_stopwords][:1000]
threshold = 0.8

# Synsets are looked up once per word, on the lemmatised form, which is the
# definition of "WordNet synonym" used by the precision/recall cell.
lemmatizer = WordNetLemmatizer()
synsets_by_word = {
    word: set(wordnet.synsets(lemmatizer.lemmatize(word)))
    for word in non_stopword_words
}

precision_errors = [
    (word1, word2)
    for word1, word2 in combinations(non_stopword_words, 2)
    if model.wv.similarity(word1, word2) >= threshold
    and synsets_by_word[word1].isdisjoint(synsets_by_word[word2])
]
print("Precision Errors:", precision_errors[:100])


Precision Errors: [('one', 'shall'), ('one', 'film'), ('one', 'like'), ('one', 'man'), ('one', 'time'), ('one', 'could'), ('one', 'even'), ('one', 'two'), ('one', 'movie'), ('one', 'upon'), ('one', 'little'), ('one', 'also'), ('one', 'much'), ('one', 'first'), ('one', 'people'), ('one', 'see'), ('one', 'made'), ('one', 'may'), ('one', 'day'), ('one', 'come'), ('one', 'make'), ('one', 'great'), ('one', 'ye'), ('one', 'every'), ('one', 'way'), ('one', 'came'), ('one', 'men'), ('one', 'many'), ('one', 'house'), ('one', 'new'), ('one', 'life'), ('one', 'old'), ('one', 'long'), ('one', 'king'), ('one', 'son'), ('one', 'hand'), ('one', 'might'), ('one', 'mr'), ('one', 'take'), ('one', 'yet'), ('one', 'get'), ('one', 'though'), ('one', 'world'), ('one', 'things'), ('one', 'still'), ('one', 'away'), ('one', 'back'), ('one', 'another'), ('one', 'children'), ('one', 'thing'), ('one', 'went'), ('one', 'without'), ('one', 'last'), ('one', 'place'), ('one', 'ever'), ('one', 'israel'), ('one', 'work

In [None]:
# Recall errors: pairs whose lemmatised forms share a WordNet synset but whose
# embedding similarity falls below the threshold.

# Build the stopword set once instead of once per vocabulary word.
english_stopwords = set(stopwords.words('english'))
non_stopword_words = [word for word in model.wv.index_to_key if word.lower() not in english_stopwords][:1000]
threshold = 0.8

# One lemmatizer and one synset lookup per word; previously a new
# WordNetLemmatizer was created and both lookups were repeated for every pair.
lemmatizer = WordNetLemmatizer()
synsets_by_word = {
    word: set(wordnet.synsets(lemmatizer.lemmatize(word)))
    for word in non_stopword_words
}

recall_errors = [
    (word1, word2)
    for word1, word2 in combinations(non_stopword_words, 2)
    if model.wv.similarity(word1, word2) < threshold
    and not synsets_by_word[word1].isdisjoint(synsets_by_word[word2])
]
print("Recall Errors:", recall_errors[:100])

Recall Errors: [('said', 'state'), ('said', 'states'), ('man', 'men'), ('thou', 'thousand'), ('god', 'gods'), ('good', 'full'), ('good', 'near'), ('see', 'saw'), ('see', 'looked'), ('come', 'came'), ('come', 'fell'), ('make', 'took'), ('make', 'named'), ('make', 'building'), ('us', 'u'), ('know', 'living'), ('know', 'bed'), ('know', 'lived'), ('go', 'went'), ('go', 'last'), ('go', 'passed'), ('go', 'became'), ('go', 'running'), ('go', 'broken'), ('go', 'ran'), ('go', 'turns'), ('go', 'led'), ('go', 'died'), ('go', 'lived'), ('go', 'moved'), ('go', 'moving'), ('go', 'started'), ('came', 'get'), ('came', 'done'), ('came', 'follow'), ('men', 'human'), ('say', 'state'), ('say', 'order'), ('say', 'states'), ('say', 'supposed'), ('let', 'got'), ('let', 'gets'), ('hand', 'give'), ('take', 'took'), ('get', 'went'), ('get', 'took'), ('went', 'die'), ('years', 'class'), ('last', 'going'), ('last', 'death'), ('last', 'live'), ('give', 'left'), ('give', 'opened'), ('give', 'opening'), ('saw', 'fin

In [None]:
# Recall errors under a looser WordNet criterion: the pair's embedding
# similarity is below the threshold, yet some pair of their synsets has a
# non-zero Wu-Palmer similarity.

# Use a separate name for the stopword set. Rebinding `stopwords` replaced
# the NLTK corpus reader with a set, so this cell, and every earlier cell
# calling stopwords.words(...), raised AttributeError when run again.
english_stopwords = set(stopwords.words('english'))
words = [w for w in model.wv.index_to_key if w.lower() not in english_stopwords][:1000]
threshold = 0.8

# Look up each word's synsets once instead of once per pair.
raw_synsets_by_word = {w: wordnet.synsets(w) for w in words}

recall_errors = [
    (w1, w2)
    for w1, w2 in combinations(words, 2)
    if model.wv.similarity(w1, w2) < threshold
    and any(
        s1.wup_similarity(s2)
        for s1 in raw_synsets_by_word[w1]
        for s2 in raw_synsets_by_word[w2]
    )
]
print("Recall Errors:", recall_errors[:100])

Recall Errors: [('one', 'said'), ('one', 'lord'), ('one', 'thou'), ('one', 'god'), ('one', 'good'), ('one', 'well'), ('one', 'us'), ('one', 'know'), ('one', 'go'), ('one', 'must'), ('one', 'say'), ('one', 'never'), ('one', 'let'), ('one', 'father'), ('one', 'years'), ('one', 'give'), ('one', 'think'), ('one', 'love'), ('one', 'better'), ('one', 'mother'), ('one', 'voice'), ('one', 'john'), ('one', 'tell'), ('one', 'want'), ('one', 'art'), ('one', 'jesus'), ('one', 'understand'), ('one', 'ago'), ('said', 'thou'), ('said', 'time'), ('said', 'two'), ('said', 'first'), ('said', 'day'), ('said', 'great'), ('said', 'every'), ('said', 'came'), ('said', 'men'), ('said', 'house'), ('said', 'new'), ('said', 'old'), ('said', 'long'), ('said', 'king'), ('said', 'son'), ('said', 'hand'), ('said', 'mr'), ('said', 'away'), ('said', 'back'), ('said', 'another'), ('said', 'children'), ('said', 'went'), ('said', 'years'), ('said', 'last'), ('said', 'place'), ('said', 'three'), ('said', 'land'), ('said',