In [None]:
import nltk
nltk.download('gutenberg')
nltk.download('punkt')
from nltk.corpus import wordnet
nltk.download('wordnet')
import re
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import Word2Vec
from nltk.corpus import stopwords
nltk.download('stopwords')
from itertools import combinations
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer




[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the raw text of the NLTK Gutenberg corpus; raw() is called without
# file ids, and the result is fed to preprocess_text below.
corpus = gutenberg.raw()

def preprocess_text(text):
  """Turn raw text into a list of sentences made of lowercase, letters-only tokens.

  Newlines are replaced by spaces, the text is split into sentences with
  sent_tokenize and each sentence into tokens with word_tokenize. Every token
  is lowercased and stripped of all characters other than ASCII letters;
  tokens that end up empty are discarded. A sentence whose tokens are all
  discarded is still kept, as an empty list.

  Args:
    text: the raw text to process.

  Returns:
    A list with one list of cleaned tokens per detected sentence.
  """
  single_line_text = text.replace('\n', ' ')

  cleaned_sentences = []
  for raw_sentence in sent_tokenize(single_line_text):
    # Lowercase, then strip digits/punctuation from each token.
    stripped_tokens = (
      re.sub(r'[^a-zA-Z]', '', token.lower())
      for token in word_tokenize(raw_sentence)
    )
    # Tokens that were pure punctuation or digits are now '' and are dropped.
    cleaned_sentences.append([token for token in stripped_tokens if token])

  return cleaned_sentences

# Tokenized corpus: one list of cleaned tokens per sentence, used as Word2Vec training input below.
preprocessed_corpus = preprocess_text(corpus)


In [None]:
# Number of sentences (token lists) produced by preprocess_text.
print(f"Length of preprocessed_corpus: {len(preprocessed_corpus)}")


Length of preprocessed_corpus: 94428


In [None]:
# Train a CBOW Word2Vec model on the preprocessed sentences.
# sg=0 selects CBOW (sg=1 would select skip-gram); words seen fewer than
# 3 times are dropped from the vocabulary and the context window is 8 tokens.
# NOTE(review): gensim documents that a fixed seed alone may not give
# reproducible vectors when several worker threads are used - confirm whether
# workers=1 is wanted here.
cbow_model = Word2Vec(min_count=3, window=8, sg=0, seed=1)

# The vocabulary must be built before training; a single epoch is run.
cbow_model.build_vocab(preprocessed_corpus)
cbow_model.train(
    preprocessed_corpus,
    total_examples=cbow_model.corpus_count,
    epochs=1,
)

# Vocabulary of the trained model.
voc = cbow_model.wv.index_to_key
print("Tokens:", len(voc))


Tokens: 21192


In [None]:
# Train a skip-gram Word2Vec model with the same hyperparameters as the CBOW
# model; only sg differs (sg=1 selects skip-gram, sg=0 would select CBOW).
# NOTE(review): gensim documents that a fixed seed alone may not give
# reproducible vectors when several worker threads are used - confirm whether
# workers=1 is wanted here.
skipgram_model = Word2Vec(min_count=3, window=8, sg=1, seed=1)

# The vocabulary must be built before training; a single epoch is run.
skipgram_model.build_vocab(preprocessed_corpus)
skipgram_model.train(
    preprocessed_corpus,
    total_examples=skipgram_model.corpus_count,
    epochs=1,
)

# Vocabulary of the trained model (rebinds the name used for the CBOW vocabulary).
voc = skipgram_model.wv.index_to_key
print("Tokens:", len(voc))


Tokens: 21192


PUNCTUL 1

In [None]:
def compute_wordnet_coverage(embedding_model):
  """Return the percentage of the model's vocabulary that WordNet has synsets for.

  A vocabulary word counts as covered when wordnet.synsets() returns at least
  one synset for the word as stored, lowercased, or uppercased.

  Args:
    embedding_model: object exposing its vocabulary as wv.index_to_key.

  Returns:
    The coverage as a float percentage (0-100). An empty vocabulary yields
    0.0 instead of raising ZeroDivisionError.
  """
  vocabulary = embedding_model.wv.index_to_key
  total_words = len(vocabulary)
  if total_words == 0:
    # Nothing to look up; avoid dividing by zero below.
    return 0.0

  # NOTE(review): NLTK's WordNet reader appears to lowercase the lemma itself,
  # so the lower()/upper() variants are probably redundant - confirm before
  # dropping them. any() stops at the first variant that has synsets.
  wordnet_words = sum(
    1 for word in vocabulary
    if any(wordnet.synsets(form) for form in (word, word.lower(), word.upper()))
  )

  return (wordnet_words / total_words) * 100

# WordNet coverage (in percent) of each model's vocabulary.
cbow_coverage, skipgram_coverage = (
    compute_wordnet_coverage(trained_model)
    for trained_model in (cbow_model, skipgram_model)
)

# Report both figures rounded to two decimals.
print("CBOW WordNet Coverage: {:.2f}%".format(cbow_coverage))
print("Skipgram WordNet Coverage: {:.2f}%".format(skipgram_coverage))


CBOW WordNet Coverage: 83.19%
Skipgram WordNet Coverage: 83.19%


PUNCTUL 2

In [None]:
def evaluate_model(model, thresholds=(0.6, 0.7, 0.8), max_words=1000):
  """Print precision/recall/F1 of embedding-based synonymy against WordNet.

  The first `max_words` non-stop-word entries of the model vocabulary are
  paired exhaustively. A pair is an "embedding synonym" at a threshold when
  model.wv.similarity() is at least that threshold, and a "WordNet synonym"
  when the lemmatized forms of both words share at least one synset.
  Precision is common / embedding synonyms, recall is common / WordNet
  synonyms; each falls back to 0.0 when its denominator is zero.

  Args:
    model: object exposing wv.index_to_key and wv.similarity(word1, word2).
    thresholds: similarity cut-offs to report, in print order.
    max_words: how many non-stop-word vocabulary entries to evaluate.

  Returns:
    None; one summary line per threshold is printed.
  """
  # Build the stop-word set once instead of re-reading the list for every
  # vocabulary word.
  english_stopwords = set(stopwords.words('english'))
  non_stopword_words = [
    word for word in model.wv.index_to_key
    if word.lower() not in english_stopwords
  ][:max_words]

  # Lemmatize and look up synsets once per word rather than once per pair
  # (and per threshold); each word takes part in many pairs.
  lemmatizer = WordNetLemmatizer()
  synsets_by_word = {
    word: set(wn.synsets(lemmatizer.lemmatize(word)))
    for word in non_stopword_words
  }

  # The WordNet-synonym count does not depend on the threshold, so a single
  # pass over the pairs updates the counters of every threshold.
  synonyms_wn = 0
  synonyms_emb = {threshold: 0 for threshold in thresholds}
  common_synonyms = {threshold: 0 for threshold in thresholds}

  for word1, word2 in combinations(non_stopword_words, 2):
    similarity_emb = model.wv.similarity(word1, word2)
    shares_synset = not synsets_by_word[word1].isdisjoint(synsets_by_word[word2])

    if shares_synset:
      synonyms_wn += 1

    for threshold in thresholds:
      if similarity_emb >= threshold:
        synonyms_emb[threshold] += 1
        if shares_synset:
          common_synonyms[threshold] += 1

  for threshold in thresholds:
    found = synonyms_emb[threshold]
    common = common_synonyms[threshold]
    precision = common / found if found > 0 else 0.0
    recall = common / synonyms_wn if synonyms_wn > 0 else 0.0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    print(f"Threshold: {threshold} - Precision: {precision} - Recall: {recall} - F1-Score: {f1_score}")




In [None]:
# Print precision/recall/F1 of the CBOW embeddings against WordNet for each threshold.
print("Evaluate CBOW Model")
evaluate_model(cbow_model)

Evaluate CBOW Model
Threshold: 0.6 - Precision: 0.0036946419849346436 - Recall: 0.9181514476614699 - F1-Score: 0.007359668658701503
Threshold: 0.7 - Precision: 0.003778582736872307 - Recall: 0.8212694877505567 - F1-Score: 0.007522554914650878
Threshold: 0.8 - Precision: 0.0038470985888205806 - Recall: 0.6191536748329621 - F1-Score: 0.007646684660214203


In [None]:
# Print precision/recall/F1 of the skip-gram embeddings against WordNet for each threshold.
print("Evaluate Skip_Gram Model")
evaluate_model(skipgram_model)


Evaluate Skip_Gram Model
Threshold: 0.6 - Precision: 0.003993884912854987 - Recall: 0.8858574610244989 - F1-Score: 0.007951918631530282
Threshold: 0.7 - Precision: 0.00426456139141227 - Recall: 0.6887527839643652 - F1-Score: 0.008476637851580033
Threshold: 0.8 - Precision: 0.004910874268346097 - Recall: 0.41202672605790647 - F1-Score: 0.009706063666531133


PUNCTUL 3


In [None]:
#a. Coverage Errors
def coverage_errors(model, limit=100):
  """Return the first vocabulary words for which WordNet has no synsets.

  A word is a coverage error when wn.synsets() returns nothing for it, which
  is the same lookup compute_wordnet_coverage uses to count covered words.
  Words are reported in vocabulary order (model.wv.index_to_key), so the
  result is the same on every run rather than an arbitrary slice of a set.

  Args:
    model: object exposing its vocabulary as wv.index_to_key.
    limit: maximum number of words to return.

  Returns:
    A list of at most `limit` uncovered words.
  """
  uncovered = []
  for word in model.wv.index_to_key:
    # Stop as soon as enough examples are collected; each lookup is costly.
    if len(uncovered) >= limit:
      break
    if not wn.synsets(word):
      uncovered.append(word)
  return uncovered

# Show the sample of CBOW vocabulary words returned by coverage_errors.
print("Coverage errors: CBOW Model")
print(coverage_errors(cbow_model))


Coverage errors: CBOW Model
['hor', 'screamed', 'custome', 'fulness', 'sufferings', 'falshood', 'bethaven', 'pathros', 'pharez', 'hailing', 'personall', 'realities', 'thirsted', 'bowlinggreen', 'fanatics', 'oughtest', 'tithes', 'receiveth', 'unskilful', 'undoubted', 'confederates', 'zerah', 'bethzur', 'shadows', 'seyward', 'stedfastly', 'obtained', 'uncircumcision', 'lep', 'drowne', 'seizes', 'spades', 'duncans', 'purchased', 'deceiveth', 'goings', 'endured', 'jehoahaz', 'testifying', 'jenning', 'drowning', 'opens', 'perches', 'purposing', 'embraced', 'remaineth', 'eluded', 'seacaptains', 'fords', 'draweth', 'courtiers', 'falleth', 'pelatiah', 'shoved', 'thieves', 'phantoms', 'forests', 'earning', 'wisest', 'intuitions', 'overcharged', 'owes', 'shells', 'craftsmen', 'cic', 'conceived', 'tophet', 'intending', 'shaul', 'whiles', 'this', 'delusions', 'groaned', 'ministering', 'knew', 'pulpits', 'skipping', 'reu', 'illustrates', 'aske', 'azaziah', 'excepting', 'fellowservants', 'flattered'

In [None]:
# b. Precision Errors
# Pairs the CBOW embeddings rate as similar (similarity >= threshold) although
# their lemmatized forms share no WordNet synset.

# Build the stop-word set once instead of re-reading the list for every vocabulary word.
english_stopwords = set(stopwords.words('english'))
non_stopword_words = [word for word in cbow_model.wv.index_to_key if word.lower() not in english_stopwords][:1000]
threshold = 0.8

# Lemmatize and look up synsets once per word rather than once per pair.
lemmatizer = WordNetLemmatizer()
synsets_by_word = {word: set(wn.synsets(lemmatizer.lemmatize(word))) for word in non_stopword_words}

precision_errors = [
  (word1, word2)
  for word1, word2 in combinations(non_stopword_words, 2)
  if cbow_model.wv.similarity(word1, word2) >= threshold
  and synsets_by_word[word1].isdisjoint(synsets_by_word[word2])
]
print("Precision Errors:", precision_errors[:100])

Precision Errors: [('shall', 'let'), ('shall', 'may'), ('shall', 'must'), ('said', 'know'), ('said', 'well'), ('said', 'think'), ('said', 'therefore'), ('said', 'behold'), ('said', 'thought'), ('said', 'saith'), ('said', 'hear'), ('said', 'cried'), ('said', 'love'), ('said', 'oh'), ('said', 'sir'), ('said', 'always'), ('said', 'speak'), ('said', 'sure'), ('said', 'indeed'), ('said', 'enough'), ('said', 'answered'), ('said', 'dear'), ('said', 'keep'), ('said', 'fear'), ('said', 'hope'), ('said', 'yes'), ('said', 'believe'), ('said', 'call'), ('said', 'asked'), ('said', 'mean'), ('said', 'live'), ('said', 'alone'), ('said', 'matter'), ('said', 'wish'), ('said', 'answer'), ('said', 'replied'), ('said', 'ask'), ('said', 'help'), ('said', 'remember'), ('said', 'understand'), ('said', 'able'), ('said', 'ought'), ('said', 'master'), ('said', 'glad'), ('said', 'talk'), ('said', 'bad'), ('said', 'ca'), ('said', 'need'), ('said', 'wonder'), ('said', 'wanted'), ('said', 'added'), ('said', 'please

In [None]:
# c. Recall Errors
# Pairs whose lemmatized forms share a WordNet synset although the CBOW
# embeddings rate them below the similarity threshold.

# Build the stop-word set once instead of re-reading the list for every vocabulary word.
english_stopwords = set(stopwords.words('english'))
non_stopword_words = [word for word in cbow_model.wv.index_to_key if word.lower() not in english_stopwords][:1000]
threshold = 0.8

# Lemmatize and look up synsets once per word rather than once per pair.
lemmatizer = WordNetLemmatizer()
synsets_by_word = {word: set(wn.synsets(lemmatizer.lemmatize(word))) for word in non_stopword_words}

recall_errors = [
  (word1, word2)
  for word1, word2 in combinations(non_stopword_words, 2)
  if cbow_model.wv.similarity(word1, word2) < threshold
  and not synsets_by_word[word1].isdisjoint(synsets_by_word[word2])
]
print("Recall Errors:", recall_errors[:100])

Recall Errors: [('said', 'told'), ('said', 'read'), ('said', 'state'), ('said', 'order'), ('thou', 'thousand'), ('man', 'men'), ('man', 'world'), ('man', 'gentleman'), ('man', 'human'), ('man', 'gentlemen'), ('man', 'pieces'), ('god', 'gods'), ('come', 'came'), ('come', 'done'), ('come', 'fell'), ('come', 'coming'), ('come', 'got'), ('come', 'seed'), ('come', 'comes'), ('come', 'followed'), ('come', 'number'), ('come', 'follow'), ('come', 'fallen'), ('like', 'wish'), ('like', 'care'), ('came', 'done'), ('came', 'get'), ('came', 'coming'), ('came', 'got'), ('came', 'fall'), ('came', 'comes'), ('came', 'followed'), ('came', 'number'), ('came', 'follow'), ('came', 'fallen'), ('day', 'days'), ('king', 'power'), ('king', 'kings'), ('know', 'love'), ('know', 'knew'), ('know', 'known'), ('know', 'living'), ('know', 'bed'), ('know', 'lived'), ('know', 'loved'), ('know', 'knowing'), ('see', 'saw'), ('see', 'heard'), ('see', 'found'), ('see', 'looked'), ('see', 'seen'), ('see', 'looking'), ('see