In [5]:
import csv
import nltk
from nltk.corpus import wordnet as wn

def load_corpus_as_set(corpus_file):
    """Collect every distinct whitespace-separated token of a text file.

    Args:
        corpus_file: Path of a UTF-8 encoded text file, read line by line.

    Returns:
        A set holding each unique token that appears anywhere in the file.
    """
    vocabulary = set()
    with open(corpus_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # split() without arguments already ignores leading/trailing
            # whitespace (including the newline) and never yields empty tokens.
            tokens = raw_line.split()
            vocabulary.update(tokens)
    return vocabulary

def extract_word_definition_pairs():
    """Enumerate a (word, definition, synset) triple for every WordNet lemma.

    Returns:
        A list of 3-tuples, one per lemma of every synset yielded by
        ``wn.all_synsets()``: the lemma's name, the definition of the synset
        the lemma was reached through, and that synset object itself.
    """
    return [
        (lemma.name(), synset.definition(), synset)
        for synset in wn.all_synsets()
        for lemma in synset.lemmas()
    ]

def create_tsv_with_presence_check(wordnet_data, corpus_file, output_file):
    """Write a TSV of WordNet neighbours for lemmas that are absent from a corpus.

    Each (word, definition, synset) triple is considered in order. A triple is
    skipped when its word occurs in the corpus or contains a space or an
    underscore. For the remaining triples, the name of the first lemma of every
    direct hypernym and hyponym of the synset is collected; if at least one
    such name exists, one row is written.

    Row layout (after the header ``index, word, definition, label``):
        * the position of the triple in ``wordnet_data``,
        * the name of the synset's first lemma,
        * the collected neighbour names (written as the ``str()`` of a list),
        * the constant ``1``.

    Args:
        wordnet_data: Iterable of (word, definition, synset) triples in the
            shape produced by ``extract_word_definition_pairs()``. Pass
            ``None`` to have the triples extracted here instead.
        corpus_file: Path of the corpus text file, forwarded to
            ``load_corpus_as_set()``.
        output_file: Path of the TSV file to create; an existing file is
            overwritten.
    """
    corpus_word_set = load_corpus_as_set(corpus_file)

    # Use the triples supplied by the caller so WordNet is not walked twice.
    if wordnet_data is None:
        wordnet_data = extract_word_definition_pairs()

    with open(output_file, 'w', newline='', encoding='utf-8') as tsv_file:
        tsv_writer = csv.writer(tsv_file, delimiter='\t')
        # NOTE(review): the third and fourth columns actually carry the
        # neighbour list and a constant 1 -- confirm the header names are
        # what downstream consumers expect.
        tsv_writer.writerow(['index', 'word', 'definition', 'label'])

        for index, (word, _definition, synset) in enumerate(wordnet_data):
            # Only single-token words that are missing from the corpus qualify.
            if word in corpus_word_set or " " in word or "_" in word:
                continue

            # Hypernyms first, then hyponyms, each represented by its first lemma.
            similar_words_list = [
                related.lemmas()[0].name()
                for related in synset.hypernyms() + synset.hyponyms()
            ]
            if not similar_words_list:
                # A synset without direct hypernyms/hyponyms yields no row.
                continue

            # NOTE(review): the row is keyed by the synset's first lemma, not by
            # `word`, so several missing lemmas of one synset produce rows that
            # differ only in their index -- confirm this is intended.
            tsv_writer.writerow(
                [index, synset.lemmas()[0].name(), similar_words_list, 1]
            )

# Input corpus (a large, roughly 4GB text file) and the TSV file to generate.
corpus_file = '/media/saketh/New Volume/NAACL 2025/Datasets/en/en_10M_splits.txt'
output_file = 'english/WaW_en.tsv'

# Extract the WordNet triples, then write the rows for words missing from the corpus.
create_tsv_with_presence_check(
    wordnet_data=extract_word_definition_pairs(),
    corpus_file=corpus_file,
    output_file=output_file,
)

print(f"TSV file with word presence check created: {output_file}")


TSV file with word presence check created: english/WaW_en.tsv
