In [9]:
import pyiwn
import csv

# Function to read the large corpus and create a set of words
def load_corpus_as_set(corpus_file):
    word_set = set()
    with open(corpus_file, 'r', encoding='utf-8') as file:
        for line in file:
            # Assuming each line contains space-separated words
            words = line.strip().split()
            word_set.update(words)
    return word_set

# Function to extract WordNet word-definition pairs
def extract_word_definition_pairs():
    synsets = iwn.all_synsets()
    pairs = []
    # Iterate through each synset
    for synset in synsets:
        pairs.append((synset.head_word(), synset.gloss(), synset))

    return pairs
    

def create_tsv_with_presence_check(wordnet_data, corpus_file, output_file):
    corpus_word_set = load_corpus_as_set(corpus_file)

    word_def_pairs = extract_word_definition_pairs()

    with open(output_file, 'w', newline='', encoding='utf-8') as tsv_file:
        tsv_writer = csv.writer(tsv_file, delimiter='\t')
        tsv_writer.writerow(['index', 'word', 'definition', 'label'])

        # Write the word-definition pairs with the is_present flag
        for index, (word, definition, synset) in enumerate(word_def_pairs):
            # Check if the word is in the corpus
            is_present = 1 if word in corpus_word_set else 0
            # tsv_writer.writerow([index, is_present, word, definition, 1])
            if is_present == 0 and "_" not in word and " " not in word:
                tsv_writer.writerow([index, word, definition, 1])

# Example usage
LANG = "te"
LANG_FULL = "telugu"
corpus_file = f'/media/saketh/New Volume/NAACL 2025/Datasets/{LANG}/{LANG}_10M_splits.txt'  # Your 4GB large corpus file
output_file = f'{LANG_FULL}/WaD_{LANG}_temp_0.tsv'

iwn = pyiwn.IndoWordNet(lang=pyiwn.Language.HINDI)

# Create the TSV file
create_tsv_with_presence_check(extract_word_definition_pairs(), corpus_file, output_file)

print(f"TSV file with word presence check created: {output_file}")


2024-10-08:01:06:24,486 INFO     [iwn.py:43] Loading telugu language synsets...


TSV file with word presence check created: telugu/WaD_te_full.tsv


# Create negative samples

In [17]:
import pyiwn
import csv
import random
from difflib import SequenceMatcher

def load_corpus_as_set(corpus_file):
    word_set = set()
    with open(corpus_file, 'r', encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            word_set.update(words)
    return word_set

# def create_negative_samples(word_def_pairs):
#     words, definitions, _ = zip(*word_def_pairs)
#     shuffled_definitions = list(definitions)
#     random.shuffle(shuffled_definitions)
    
#     negative_pairs = []
#     for word, shuffled_def in zip(words, shuffled_definitions):
#         negative_pairs.append((word, shuffled_def))
#     return negative_pairs

def lexical_similarity(word1, word2):
    return SequenceMatcher(None, word1, word2).ratio()

def select_lexically_similar_negatives(word_def_pairs, non_selected_pos_pairs, num_pos_samples):
    selected_negatives = []
    for index, (word, definition, _) in enumerate(non_selected_pos_pairs):
        similarities = [(neg_word, pos_def_of_neg_word, lexical_similarity(word, neg_word)) 
                        for neg_word, pos_def_of_neg_word, _ in word_def_pairs if neg_word != word]
        similarities.sort(key=lambda x: x[2], reverse=True)
        # print(f"-"*90)
        # print(f"ACTUAL WORD: {word}")
        # print(f"SIMILAR WORDS: ")
        # for similar in similarities[:10]:
            # print(f"{similar[0]}")
        # if similarities:
        #     selected_negatives.append((word, similarities[0][1]))

        i = 0
        temperature = 0
        for i in range(0, len(similarities)):
            if similarities[i][0] == word:
                continue
            else:
                if temperature > 0:
                    if definition != similarities[i][1]:
                        temperature -= 1
                    continue
                else:
                    break
        
        # Print selected negative sample
        # print(f"WORD: {word}")
        # print(f"POS_DEFINITION: {definition}")
        # print(f"LEXICALLY_SIMILAR_WORD: {similarities[i][0]}")
        
        neg_sample_idx = i # i = 3/3++
        i = 0
        neg_definition = ""
        for i in range(0, len(word_def_pairs)):
            if word_def_pairs[i][0] == similarities[neg_sample_idx][0]:
                # print(f"NEG_DEFINITION: {word_def_pairs[i][1]}")
                neg_definition = word_def_pairs[i][1]
                break

        selected_negatives.append((word, neg_definition))
        
        if len(selected_negatives) >= num_pos_samples:
            break

        if index % 10 == 0:
            print(f"iteration: {index}/{num_pos_samples}")
    
    print(f"num_neg_samples: {len(selected_negatives)}")

    return selected_negatives

def create_tsv_with_balanced_samples(word_def_pairs, corpus_file, output_file):
    corpus_word_set = load_corpus_as_set(corpus_file)
    
    all_positive_samples = []
    for (word, definition, _) in word_def_pairs:
        if word not in corpus_word_set and " " not in word and "_" not in word:
            all_positive_samples.append((word, definition, 1))
    
    random.shuffle(all_positive_samples)

    num_pos_samples = int(len(all_positive_samples) / 2)
    print(f"num_pos_samples: {num_pos_samples}")

    positive_samples = all_positive_samples[:num_pos_samples] # To keep only one occurance of a word in word_def pairs (comment this if we want to keep word_pos_def_pair and word_neg_def_pair both)
    # positive_samples = all_positive_samples # and uncomment this
    non_selected_pos_pairs = all_positive_samples[num_pos_samples:]

    negative_samples = select_lexically_similar_negatives(word_def_pairs, non_selected_pos_pairs, num_pos_samples)

    with open(output_file, 'w', newline='', encoding='utf-8') as tsv_file:
        tsv_writer = csv.writer(tsv_file, delimiter='\t')
        tsv_writer.writerow(['index', 'word', 'definition', 'label'])
        
        for index, (word, definition, _) in enumerate(positive_samples):
            tsv_writer.writerow([index, word, definition, 1])
        
        for index, (word, definition) in enumerate(negative_samples):
            tsv_writer.writerow([index, word, definition, 0])

if __name__ == "__main__":
    # Example usage
    LANG = "te"
    LANG_FULL = "telugu"
    corpus_file = f'/media/saketh/New Volume/NAACL 2025/Datasets/{LANG}/{LANG}_10M_splits.txt'
    output_file = f'{LANG_FULL}/WaD_neg_samples_10.tsv'

    iwn = pyiwn.IndoWordNet(lang=pyiwn.Language.TELUGU)

    def extract_word_definition_pairs():
        synsets = iwn.all_synsets()
        pairs = []
        for synset in synsets:
            if " " not in synset.head_word() and "_" not in synset.head_word():
                pairs.append((synset.head_word(), synset.gloss(), synset))
        return pairs

    word_def_pairs = extract_word_definition_pairs()
    create_tsv_with_balanced_samples(word_def_pairs, corpus_file, output_file)

    print(f"TSV file with balanced positive and negative samples created: {output_file}")

2024-10-16:18:24:30,399 INFO     [iwn.py:43] Loading telugu language synsets...


num_pos_samples: 3275
iteration: 0/3275
iteration: 10/3275
iteration: 20/3275
iteration: 30/3275
iteration: 40/3275
iteration: 50/3275
iteration: 60/3275
iteration: 70/3275
iteration: 80/3275
iteration: 90/3275
iteration: 100/3275
iteration: 110/3275
iteration: 120/3275
iteration: 130/3275
iteration: 140/3275
iteration: 150/3275
iteration: 160/3275
iteration: 170/3275
iteration: 180/3275
iteration: 190/3275
iteration: 200/3275
iteration: 210/3275
iteration: 220/3275
iteration: 230/3275
iteration: 240/3275
iteration: 250/3275
iteration: 260/3275
iteration: 270/3275
iteration: 280/3275
iteration: 290/3275
iteration: 300/3275
iteration: 310/3275
iteration: 320/3275
iteration: 330/3275
iteration: 340/3275
iteration: 350/3275
iteration: 360/3275
iteration: 370/3275
iteration: 380/3275
iteration: 390/3275
iteration: 400/3275
iteration: 410/3275
iteration: 420/3275
iteration: 430/3275
iteration: 440/3275
iteration: 450/3275
iteration: 460/3275
iteration: 470/3275
iteration: 480/3275
iteration

In [21]:
import pyiwn
import csv
import random
from difflib import SequenceMatcher

def load_corpus_as_set(corpus_file):
    word_set = set()
    with open(corpus_file, 'r', encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            word_set.update(words)
    return word_set

# def create_negative_samples(word_def_pairs):
#     words, definitions, _ = zip(*word_def_pairs)
#     shuffled_definitions = list(definitions)
#     random.shuffle(shuffled_definitions)
    
#     negative_pairs = []
#     for word, shuffled_def in zip(words, shuffled_definitions):
#         negative_pairs.append((word, shuffled_def))
#     return negative_pairs

def lexical_similarity(word1, word2):
    return SequenceMatcher(None, word1, word2).ratio()

def select_lexically_similar_negatives(word_def_pairs, non_selected_pos_pairs, num_pos_samples):
    selected_negatives = []
    for index, (word, definition, _) in enumerate(non_selected_pos_pairs):
        similarities = [(neg_word, pos_def_of_neg_word, lexical_similarity(word, neg_word)) 
                        for neg_word, pos_def_of_neg_word, _ in word_def_pairs if neg_word != word]
        similarities.sort(key=lambda x: x[2], reverse=True)
        # print(f"-"*90)
        # print(f"ACTUAL WORD: {word}")
        # print(f"SIMILAR WORDS: ")
        # for similar in similarities[:10]:
            # print(f"{similar[0]}")
        # if similarities:
        #     selected_negatives.append((word, similarities[0][1]))

        i = 0
        temperature = 0
        for i in range(0, len(similarities)):
            if similarities[i][0] == word:
                continue
            else:
                if temperature > 0:
                    if definition != similarities[i][1]:
                        temperature -= 1
                    continue
                else:
                    break
        
        # Print selected negative sample
        # print(f"WORD: {word}")
        # print(f"POS_DEFINITION: {definition}")
        # print(f"LEXICALLY_SIMILAR_WORD: {similarities[i][0]}")
        
        neg_sample_idx = i # i = 3/3++
        i = 0
        neg_definition = ""
        for i in range(0, len(word_def_pairs)):
            if word_def_pairs[i][0] == similarities[neg_sample_idx][0]:
                # print(f"NEG_DEFINITION: {word_def_pairs[i][1]}")
                neg_definition = word_def_pairs[i][1]
                break

        selected_negatives.append((word, neg_definition))
        
        if len(selected_negatives) >= num_pos_samples:
            break

        if index % 10 == 0:
            print(f"iteration: {index}/{num_pos_samples}")
    
    print(f"num_neg_samples: {len(selected_negatives)}")

    return selected_negatives

def create_tsv_with_balanced_samples(word_def_pairs, corpus_file, output_file):
    corpus_word_set = load_corpus_as_set(corpus_file)
    
    all_positive_samples = []
    for (word, definition, _) in word_def_pairs:
        if word not in corpus_word_set and " " not in word and "_" not in word:
            all_positive_samples.append((word, definition, 1))
    
    random.shuffle(all_positive_samples)

    num_pos_samples = int(len(all_positive_samples) / 2)
    print(f"num_pos_samples: {num_pos_samples}")

    positive_samples = all_positive_samples[:num_pos_samples] # To keep only one occurance of a word in word_def pairs (comment this if we want to keep word_pos_def_pair and word_neg_def_pair both)
    # positive_samples = all_positive_samples # and uncomment this
    non_selected_pos_pairs = all_positive_samples[num_pos_samples:]

    negative_samples = select_lexically_similar_negatives(word_def_pairs, non_selected_pos_pairs, num_pos_samples)

    with open(output_file, 'w', newline='', encoding='utf-8') as tsv_file:
        tsv_writer = csv.writer(tsv_file, delimiter='\t')
        tsv_writer.writerow(['index', 'word', 'definition', 'label'])
        
        for index, (word, definition, _) in enumerate(positive_samples):
            tsv_writer.writerow([index, word, definition, 1])
        
        for index, (word, definition) in enumerate(negative_samples):
            tsv_writer.writerow([index, word, definition, 0])

if __name__ == "__main__":
    # Example usage
    LANG = "hi"
    LANG_FULL = "hindi"
    corpus_file = f'/media/saketh/New Volume/NAACL 2025/Datasets/{LANG}/{LANG}_10M_splits.txt'
    output_file = f'{LANG_FULL}/WaD.tsv'

    iwn = pyiwn.IndoWordNet(lang=pyiwn.Language.HINDI)

    def extract_word_definition_pairs():
        synsets = iwn.all_synsets()
        pairs = []
        for synset in synsets:
            if " " not in synset.head_word() and "_" not in synset.head_word():
                pairs.append((synset.head_word(), synset.gloss(), synset))
        return pairs

    word_def_pairs = extract_word_definition_pairs()
    create_tsv_with_balanced_samples(word_def_pairs, corpus_file, output_file)

    print(f"TSV file with balanced positive and negative samples created: {output_file}")

2024-10-16:19:07:19,331 INFO     [iwn.py:43] Loading hindi language synsets...


num_pos_samples: 4320
iteration: 0/4320
iteration: 10/4320
iteration: 20/4320
iteration: 30/4320
iteration: 40/4320
iteration: 50/4320
iteration: 60/4320
iteration: 70/4320
iteration: 80/4320
iteration: 90/4320
iteration: 100/4320
iteration: 110/4320
iteration: 120/4320
iteration: 130/4320
iteration: 140/4320
iteration: 150/4320
iteration: 160/4320
iteration: 170/4320
iteration: 180/4320
iteration: 190/4320
iteration: 200/4320
iteration: 210/4320
iteration: 220/4320
iteration: 230/4320
iteration: 240/4320
iteration: 250/4320
iteration: 260/4320
iteration: 270/4320
iteration: 280/4320
iteration: 290/4320
iteration: 300/4320
iteration: 310/4320
iteration: 320/4320
iteration: 330/4320
iteration: 340/4320
iteration: 350/4320
iteration: 360/4320
iteration: 370/4320
iteration: 380/4320
iteration: 390/4320
iteration: 400/4320
iteration: 410/4320
iteration: 420/4320
iteration: 430/4320
iteration: 440/4320
iteration: 450/4320
iteration: 460/4320
iteration: 470/4320
iteration: 480/4320
iteration

In [19]:
import csv
import random
from collections import defaultdict

def split_dataset(output_file, train_ratio=0.7, test_ratio=0.2, val_ratio=0.1):
    # Read the data from the TSV file
    with open(output_file, 'r', newline='', encoding='utf-8') as tsv_file:
        reader = csv.DictReader(tsv_file, delimiter='\t')
        data = list(reader)

    # Separate positive and negative samples
    positive_samples = [sample for sample in data if sample['label'] == '1']
    negative_samples = [sample for sample in data if sample['label'] == '0']

    # Ensure equal number of positive and negative samples
    min_samples = min(len(positive_samples), len(negative_samples))
    positive_samples = random.sample(positive_samples, min_samples)
    negative_samples = random.sample(negative_samples, min_samples)

    # Combine and shuffle the data
    balanced_data = positive_samples + negative_samples
    random.shuffle(balanced_data)

    # Calculate split sizes
    total_samples = len(balanced_data)
    train_size = int(total_samples * train_ratio)
    test_size = int(total_samples * test_ratio)
    val_size = total_samples - train_size - test_size

    # Split the data
    train_data = balanced_data[:train_size]
    test_data = balanced_data[train_size:train_size+test_size]
    val_data = balanced_data[train_size+test_size:]

    # Function to write data to TSV file
    def write_to_tsv(data, filename):
        with open(filename, 'w', newline='', encoding='utf-8') as tsv_file:
            fieldnames = ['index', 'word', 'definition', 'label']
            writer = csv.DictWriter(tsv_file, fieldnames=fieldnames, delimiter='\t')
            writer.writeheader()
            for index, row in enumerate(data):
                row['index'] = index
                writer.writerow(row)

    # Write splits to separate files
    write_to_tsv(train_data, output_file.replace('.tsv', '.train.tsv'))
    write_to_tsv(test_data, output_file.replace('.tsv', '.test.tsv'))
    write_to_tsv(val_data, output_file.replace('.tsv', '.dev.tsv'))

    # Print statistics
    print(f"Total samples: {total_samples}")
    print(f"Train samples: {len(train_data)} ({len(train_data)/total_samples:.2%})")
    print(f"Test samples: {len(test_data)} ({len(test_data)/total_samples:.2%})")
    print(f"Validation samples: {len(val_data)} ({len(val_data)/total_samples:.2%})")

    # Verify balance in each split
    for split_name, split_data in [("Train", train_data), ("Test", test_data), ("Validation", val_data)]:
        pos_count = sum(1 for sample in split_data if sample['label'] == '1')
        neg_count = sum(1 for sample in split_data if sample['label'] == '0')
        print(f"{split_name} - Positive: {pos_count}, Negative: {neg_count}")

# Usage example:
LANG = "te"
LANG_FULL = "telugu"
output_file = f'{LANG_FULL}/WaD.tsv'
split_dataset(output_file)

Total samples: 6550
Train samples: 4585 (70.00%)
Test samples: 1310 (20.00%)
Validation samples: 655 (10.00%)
Train - Positive: 2331, Negative: 2254
Test - Positive: 619, Negative: 691
Validation - Positive: 325, Negative: 330
