# **Project: Hypernym Discovery**
---




Team Name: Semantic Insighters  

## Word Embedding Generation

### Importing datasets

In [None]:
# SemEval-2018 Task 9, subtask 1A (general English): query ("data") files hold
# one hyponym per line, gold files hold the tab-separated hypernyms for the
# query on the same line.
ENGLISH_HYPONYMS_TRAIN_FILE = "/content/drive/MyDrive/SemEval2018-Task9/training/data/1A.english.training.data.txt"
ENGLISH_HYPERNYMS_TRAIN_FILE = "/content/drive/MyDrive/SemEval2018-Task9/training/gold/1A.english.training.gold.txt"
ENGLISH_HYPONYMS_DEV_FILE = "/content/drive/MyDrive/SemEval2018-Task9/trial/data/1A.english.trial.data.txt"
ENGLISH_HYPONYMS_TEST_FILE = "/content/drive/MyDrive/SemEval2018-Task9/test/data/1A.english.test.data.txt"
# The English gold files must be the 1A ones; pointing them at the 2A medical
# gold files would put medical hypernyms into the English vocabulary and leave
# the real English dev/test hypernyms out of it.
ENGLISH_HYPERNYMS_DEV_FILE = "/content/drive/MyDrive/SemEval2018-Task9/trial/gold/1A.english.trial.gold.txt"
ENGLISH_HYPERNYMS_TEST_FILE = "/content/drive/MyDrive/SemEval2018-Task9/test/gold/1A.english.test.gold.txt"
# Output file (relative to the working directory) for the merged English vocabulary.
ENGLISH_VOCAB_FILENAME = "english_merged_vocab.txt"

# SemEval-2018 Task 9, subtask 2A (medical domain), same layout as above.
MEDICAL_HYPONYMS_TRAIN_FILE = "/content/drive/MyDrive/SemEval2018-Task9/training/data/2A.medical.training.data.txt"
MEDICAL_HYPERNYMS_TRAIN_FILE = "/content/drive/MyDrive/SemEval2018-Task9/training/gold/2A.medical.training.gold.txt"
MEDICAL_HYPONYMS_TEST_FILE = "/content/drive/MyDrive/SemEval2018-Task9/test/data/2A.medical.test.data.txt"
MEDICAL_HYPONYMS_DEV_FILE = "/content/drive/MyDrive/SemEval2018-Task9/trial/data/2A.medical.trial.data.txt"
MEDICAL_HYPERNYMS_DEV_FILE = "/content/drive/MyDrive/SemEval2018-Task9/trial/gold/2A.medical.trial.gold.txt"
MEDICAL_HYPERNYMS_TEST_FILE = "/content/drive/MyDrive/SemEval2018-Task9/test/gold/2A.medical.test.gold.txt"
# Output file (relative to the working directory) for the merged medical vocabulary.
MEDICAL_VOCAB_FILENAME = "medical_merged_vocab.txt"

In [None]:
def read_file(FILENAME):
    """Read a tab-separated file and normalise every field.

    Each line is stripped and split on tabs; in every field the spaces are
    replaced by underscores and the text is lower-cased.

    Args:
        FILENAME: Path of the file to read (decoded as UTF-8, undecodable
            bytes are dropped).

    Returns:
        One string per input line, made of the normalised fields each
        followed by a tab character (so every entry ends with a tab).
    """
    with open(FILENAME, encoding='utf-8', errors='ignore') as handle:
        raw_lines = handle.readlines()

    def normalise(field):
        # Multi-word terms become single tokens: "New York" -> "new_york".
        return field.replace(" ", "_").lower()

    return [
        "".join(normalise(field) + "\t" for field in raw.strip().split("\t"))
        for raw in raw_lines
    ]

def merge_hypernyms(hypernyms):
    """Flatten a list of hypernym lists into a list of distinct hypernyms.

    Args:
        hypernyms: Iterable of iterables of hypernym strings.

    Returns:
        A list holding every distinct hypernym once; the order is the
        (arbitrary) iteration order of a set.
    """
    return list({term for group in hypernyms for term in group})

### Creating Vocabulary

In [None]:
def create_vocab_file(hyponyms_train_file, hypernyms_train_file, hyponyms_dev_file, hypernyms_dev_file, hyponyms_test_file, hypernyms_test_file, output_filename):
    """Write the merged vocabulary of all hyponyms and hypernyms to a file.

    The vocabulary is the set of normalised (lower-cased, underscored) terms
    found in the train, dev and test splits: the first field of every line of
    each hyponym file plus every field of each hypernym file.

    Args:
        hyponyms_train_file, hyponyms_dev_file, hyponyms_test_file: Paths of
            the query files (first tab-separated field is the hyponym).
        hypernyms_train_file, hypernyms_dev_file, hypernyms_test_file: Paths
            of the gold files (tab-separated hypernyms per line).
        output_filename: Path of the vocabulary file to create; one term per
            line, UTF-8 encoded, sorted alphabetically.
    """
    splits = (
        (hyponyms_train_file, hypernyms_train_file),
        (hyponyms_dev_file, hypernyms_dev_file),
        (hyponyms_test_file, hypernyms_test_file),
    )

    vocab = set()
    for hyponyms_file, hypernyms_file in splits:
        vocab.update(line.strip().split("\t")[0] for line in read_file(hyponyms_file))
        vocab.update(merge_hypernyms([line.strip().split("\t") for line in read_file(hypernyms_file)]))

    # Blank input lines yield an empty token; an empty vocabulary entry would
    # later produce an embedding row with no word in front of the vector.
    vocab.discard("")

    # Sorted so that word ids (line numbers in this file) are reproducible
    # between runs; written as UTF-8 because the vocabulary is read back as UTF-8.
    with open(output_filename, "w", encoding="utf-8") as f:
        for word in sorted(vocab):
            f.write(word + "\n")

    print("New vocab file {} created successfully.".format(output_filename))

In [None]:
# Build the merged English vocabulary (train + dev + test, hyponyms and
# hypernyms) and write it to ENGLISH_VOCAB_FILENAME.
create_vocab_file(ENGLISH_HYPONYMS_TRAIN_FILE, ENGLISH_HYPERNYMS_TRAIN_FILE,
                  ENGLISH_HYPONYMS_DEV_FILE, ENGLISH_HYPERNYMS_DEV_FILE,
                  ENGLISH_HYPONYMS_TEST_FILE, ENGLISH_HYPERNYMS_TEST_FILE,
                  ENGLISH_VOCAB_FILENAME)

# Same for the medical-domain data, written to MEDICAL_VOCAB_FILENAME.
create_vocab_file(MEDICAL_HYPONYMS_TRAIN_FILE, MEDICAL_HYPERNYMS_TRAIN_FILE,
                  MEDICAL_HYPONYMS_DEV_FILE, MEDICAL_HYPERNYMS_DEV_FILE,
                  MEDICAL_HYPONYMS_TEST_FILE, MEDICAL_HYPERNYMS_TEST_FILE,
                  MEDICAL_VOCAB_FILENAME)

### Creating Dataloaders

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm

In [None]:
class Hypernymy_Dataset(Dataset):
    """Dataset of (hyponym, hypernym, negatives) id triples for skip-gram training.

    Item ``ndx`` is a list with one triple per gold hypernym of hyponym
    ``ndx``: ``(hyponym_id, hypernym_id, [negative_id, ...])``. Pairs whose
    hyponym or hypernym is missing from the vocabulary are skipped, so an item
    may be an empty list. Negatives are drawn at random on every access.

    Attributes set by ``__init__``:
        hyponyms: normalised hyponym per line of the hyponym file.
        hypernyms: list of normalised gold hypernyms per line of the gold file.
        vocab: vocabulary terms in file order; the position is the word id.
        all_hypernyms: distinct hypernyms over the whole gold file.
        num_negs: negatives generated per positive pair.
        word_id_map / id_word_map: term -> id and id -> term lookups.
        data_size: number of hyponyms.
    """

    def __init__(self, HYPONYMS_FILENAME, HYPERNYMS_FILENAME, VOCAB_FILENAME, num_negs=5):
        """Load the hyponym, hypernym and vocabulary files and build the id maps.

        Args:
            HYPONYMS_FILENAME: Query file; the first tab field of each line is used.
            HYPERNYMS_FILENAME: Gold file; tab-separated hypernyms per line.
            VOCAB_FILENAME: Vocabulary file; the first tab field of each line is used.
            num_negs: Number of negative samples per positive pair.
        """
        self.hyponyms = [line.strip().split("\t")[0] for line in self.read_file(HYPONYMS_FILENAME)]
        self.hypernyms = [line.strip().split("\t") for line in self.read_file(HYPERNYMS_FILENAME)]
        self.vocab = [line.strip().split("\t")[0] for line in self.read_file(VOCAB_FILENAME)]
        self.all_hypernyms = self.merge_hypernyms()
        self.num_negs = num_negs
        self.word_id_map = {}
        self.id_word_map = {}
        self.data_size = len(self.hyponyms)

        self.generate_mappings()

    def read_file(self, FILENAME):
        """Return the lines of FILENAME with every tab field lower-cased and
        its spaces replaced by underscores; fields stay tab-separated."""
        with open(FILENAME, encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()
        modified_lines = []
        for line in lines:
            fields = [word.replace(" ", "_").lower() for word in line.strip().split("\t")]
            modified_lines.append("\t".join(fields).strip())
        return modified_lines

    def merge_hypernyms(self):
        """Return the distinct hypernyms over all gold lines (arbitrary order)."""
        return list({hyper for hypers in self.hypernyms for hyper in hypers})

    def generate_mappings(self):
        """Fill word_id_map / id_word_map from the vocabulary order."""
        for ndx, word in enumerate(self.vocab):
            self.word_id_map[word] = ndx
            self.id_word_map[ndx] = word

    def generate_negative_samples(self, ndx):
        """Draw negative hypernyms for hyponym ``ndx``.

        Negatives are hypernyms of the corpus that are not gold hypernyms of
        this hyponym and that exist in the vocabulary, so each of them can be
        mapped to an id.

        Returns:
            A list of ``num_negs * len(self.hypernyms[ndx])`` terms. The terms
            are distinct when enough candidates exist; otherwise they are
            drawn with replacement so the count is still exact.

        Raises:
            ValueError: if negatives are required but no candidate exists.
        """
        needed = self.num_negs * len(self.hypernyms[ndx])
        if needed <= 0:
            return []

        positives = set(self.hypernyms[ndx])
        candidates = [h for h in self.all_hypernyms
                      if h not in positives and h in self.word_id_map]
        if not candidates:
            raise ValueError(
                "no negative hypernym candidates available for item {}".format(ndx))

        # Rejection sampling for distinct negatives never terminates when the
        # candidate pool is smaller than the request; fall back to sampling
        # with replacement in that case.
        allow_repeats = needed > len(candidates)
        picks = np.random.choice(len(candidates), size=needed, replace=allow_repeats)
        return [candidates[i] for i in picks]

    @staticmethod
    def collate(batches):
        """Flatten a batch of items into three tensors.

        Args:
            batches: list of items as returned by ``__getitem__``.

        Returns:
            ``(u, v, neg)`` tensors built from the hyponym ids, the hypernym
            ids and the per-pair negative id lists of every triple in the batch.
        """
        triples = [triple for item in batches for triple in item]
        u = [triple[0] for triple in triples]
        v = [triple[1] for triple in triples]
        neg = [triple[2] for triple in triples]

        return torch.tensor(u), torch.tensor(v), torch.tensor(neg)

    def __getitem__(self, ndx):
        """Return the list of ``(hyponym_id, hypernym_id, negative_ids)`` triples
        for hyponym ``ndx``; empty if the hyponym is not in the vocabulary."""
        hyponym_id = self.word_id_map.get(self.hyponyms[ndx])
        if hyponym_id is None:
            return []

        negative_samples = self.generate_negative_samples(ndx)
        items = []
        start = 0
        for hypernym in self.hypernyms[ndx]:
            hypernym_id = self.word_id_map.get(hypernym)
            if hypernym_id is None:
                continue
            # Every sampled negative is in the vocabulary, so each pair gets
            # exactly num_negs ids and collate() can build a rectangular tensor.
            chunk = negative_samples[start:start + self.num_negs]
            start += self.num_negs
            items.append((hyponym_id,
                          hypernym_id,
                          [self.word_id_map[neg] for neg in chunk]))
        return items

    def __len__(self):
        """Number of hyponyms (one item per hyponym)."""
        return len(self.hyponyms)

In [None]:
# Mount Google Drive so that the dataset paths under /content/drive resolve.
# NOTE(review): the vocabulary-creation cell earlier in the notebook already
# reads files under /content/drive, so in a fresh session this cell has to be
# executed before it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Skip-gram training data for English: training hyponyms/hypernyms mapped to
# ids of the merged English vocabulary (default of 5 negatives per pair).
sg_dataset = Hypernymy_Dataset(ENGLISH_HYPONYMS_TRAIN_FILE, ENGLISH_HYPERNYMS_TRAIN_FILE, ENGLISH_VOCAB_FILENAME)
# Number of hyponyms (items) in the English training set.
print(len(sg_dataset))

1500


In [None]:
# Skip-gram training data for the medical domain, built the same way as the
# English dataset but from the medical training files and vocabulary.
sg_dataset_medical = Hypernymy_Dataset(MEDICAL_HYPONYMS_TRAIN_FILE, MEDICAL_HYPERNYMS_TRAIN_FILE, MEDICAL_VOCAB_FILENAME)

In [None]:
# Number of hyponyms (items) in the medical training set.
print(len(sg_dataset_medical))

500


### Word Embedding using Skipgram Model

#### Importing Libraries

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
import torch.optim as optim

#### Model Architecture

In [None]:
class SkipGramModel(nn.Module):
    """Skip-gram model with negative sampling over a single embedding table.

    The same table ``u_embeddings`` is used for hyponyms, hypernyms and
    negative samples.
    """

    def __init__(self, vocab_size, emb_dimension):
        """Create the embedding table.

        Args:
            vocab_size: Number of rows (word ids) in the embedding table.
            emb_dimension: Size of each embedding vector.
        """
        super(SkipGramModel, self).__init__()
        self.vocab_size = vocab_size
        self.emb_dimension = emb_dimension
        # sparse=True yields sparse gradients, which need a sparse-aware
        # optimizer such as optim.SparseAdam.
        self.u_embeddings = nn.Embedding(vocab_size, emb_dimension, sparse=True)

        initrange = 1.0 / self.emb_dimension
        init.uniform_(self.u_embeddings.weight.data, -initrange, initrange)

    def forward(self, pos_u, pos_v, neg_v):
        """Return the mean negative-sampling loss of a batch.

        For every pair the loss is
        ``-log sigmoid(u.v) - sum_k log sigmoid(-u.n_k)``.

        Args:
            pos_u: 1-D LongTensor of hyponym ids, length B.
            pos_v: 1-D LongTensor of hypernym ids, length B.
            neg_v: 2-D LongTensor of negative ids, shape (B, K).

        Returns:
            Scalar tensor: the loss averaged over the batch.
        """
        emb_u = self.u_embeddings(pos_u)
        emb_v = self.u_embeddings(pos_v)
        emb_neg_v = self.u_embeddings(neg_v)

        score = torch.sum(emb_u * emb_v, dim=1)
        score = -F.logsigmoid(score)

        # One dot product per negative. squeeze(2) removes only the trailing
        # singleton axis, so a batch of size 1 keeps its batch dimension.
        neg_score = emb_neg_v.matmul(emb_u.unsqueeze(2)).squeeze(2)
        # The log-sigmoid has to be taken per negative and then summed;
        # applying it to the summed scores lets negatives cancel each other.
        neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1)

        return torch.mean(score + neg_score)

    def save_embedding(self, id_word_map, OUTFILE_NAME):
        """Write the embeddings in word2vec text format.

        The first line is ``"<vocab_size> <emb_dimension>"``; every following
        line is a word followed by its space-separated vector components.

        Args:
            id_word_map: Mapping of word id -> word; one line is written per entry.
            OUTFILE_NAME: Path of the file to (over)write.
        """
        embedding = self.u_embeddings.weight.cpu().data.numpy()
        with open(OUTFILE_NAME, 'w') as f:
            f.write('%d %d\n' % (self.vocab_size, self.emb_dimension))
            for wid, w in id_word_map.items():
                e = ' '.join(map(str, embedding[wid]))
                f.write('%s %s\n' % (w, e))


#### Model Initialization

In [None]:
# Batches of 32 hyponyms; collate flattens them into (hyponym, hypernym,
# negatives) id tensors.
sg_dataloader = DataLoader(sg_dataset, batch_size=32, shuffle=True, collate_fn=sg_dataset.collate)
# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size_eng = len(sg_dataset.vocab)
# Embedding dimensionality of the skip-gram vectors.
embedding_len = 300
word2vec_sg_model = SkipGramModel(vocab_size_eng, embedding_len).to(device)
# Prefix of the embedding output file ("<prefix>_sg_embed.txt").
OUTFILE_name_eng = "english"
epochs = 10

#### Model Training

In [None]:
def train(model, dataset, dataloader, epochs, device, OUTFILE_name, lr=0.00001):
    """Train a skip-gram model and save its embeddings.

    Args:
        model: Model to train; called as ``model(pos_u, pos_v, neg_v)`` and
            expected to return a scalar loss.
        dataset: Dataset providing ``id_word_map`` for saving the embeddings.
        dataloader: Iterable of ``(pos_u, pos_v, neg_v)`` tensor batches.
        epochs: Number of passes over ``dataloader``.
        device: Device the batches are moved to.
        OUTFILE_name: Prefix of the output file; embeddings are written to
            ``OUTFILE_name + "_sg_embed.txt"``.
        lr: Learning rate of the SparseAdam optimizer.

    Returns:
        List with the exponentially smoothed loss at the end of each epoch.
    """
    # The optimizer must own the parameters of the model being trained, not
    # those of some other model that happens to exist at module level.
    optimizer = optim.SparseAdam(list(model.parameters()), lr=lr)
    loss_logs = []

    for epoch in tqdm(range(epochs)):
        running_loss = 0.0
        for sample_batched in dataloader:
            # Batches with fewer than two pairs (including empty ones) are skipped.
            if len(sample_batched[0]) <= 1:
                continue

            pos_u = sample_batched[0].to(device)
            pos_v = sample_batched[1].to(device)
            neg_v = sample_batched[2].to(device)

            optimizer.zero_grad()
            loss = model(pos_u, pos_v, neg_v)
            loss.backward()
            optimizer.step()

            # Exponential moving average of the batch losses.
            running_loss = running_loss * 0.9 + loss.item() * 0.1

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss}")
        loss_logs.append(running_loss)

    model.save_embedding(dataset.id_word_map, OUTFILE_name + "_sg_embed.txt")

    return loss_logs

In [None]:
# Train the English skip-gram model; the embeddings are written to
# "english_sg_embed.txt" and the per-epoch loss list is the cell's value.
train(word2vec_sg_model, sg_dataset, sg_dataloader, epochs, device, OUTFILE_name_eng)

 10%|█         | 1/10 [00:01<00:09,  1.04s/it]

Epoch 1/10, Loss: 1.3764923380693797


 20%|██        | 2/10 [00:01<00:07,  1.02it/s]

Epoch 2/10, Loss: 1.3764882917768733


 30%|███       | 3/10 [00:02<00:06,  1.05it/s]

Epoch 3/10, Loss: 1.376485367245891


 40%|████      | 4/10 [00:04<00:06,  1.08s/it]

Epoch 4/10, Loss: 1.3764788227309286


 50%|█████     | 5/10 [00:05<00:05,  1.17s/it]

Epoch 5/10, Loss: 1.3764719413280901


 60%|██████    | 6/10 [00:06<00:04,  1.23s/it]

Epoch 6/10, Loss: 1.37646724532988


 70%|███████   | 7/10 [00:07<00:03,  1.13s/it]

Epoch 7/10, Loss: 1.376462029616177


 80%|████████  | 8/10 [00:08<00:02,  1.06s/it]

Epoch 8/10, Loss: 1.3764573591073914


 90%|█████████ | 9/10 [00:09<00:01,  1.01s/it]

Epoch 9/10, Loss: 1.376450362706667


100%|██████████| 10/10 [00:10<00:00,  1.05s/it]

Epoch 10/10, Loss: 1.3764440483107785





[1.3764923380693797,
 1.3764882917768733,
 1.376485367245891,
 1.3764788227309286,
 1.3764719413280901,
 1.37646724532988,
 1.376462029616177,
 1.3764573591073914,
 1.376450362706667,
 1.3764440483107785]

In [None]:
# Same setup for the medical domain: its own dataloader, vocabulary size and
# model; the embedding size, epoch count and device are reused from above.
sg_dataloader_medical = DataLoader(sg_dataset_medical, batch_size=32, shuffle=True, collate_fn=sg_dataset_medical.collate)
vocab_size_med = len(sg_dataset_medical.vocab)
word2vec_sg_model_med = SkipGramModel(vocab_size_med, embedding_len).to(device)
# Prefix of the embedding output file ("medical_sg_embed.txt").
OUTFILE_name_med = "medical"
train(word2vec_sg_model_med, sg_dataset_medical, sg_dataloader_medical, epochs, device, OUTFILE_name_med)

 10%|█         | 1/10 [00:00<00:03,  2.78it/s]

Epoch 1/10, Loss: 1.1294110627291358


 20%|██        | 2/10 [00:01<00:04,  1.63it/s]

Epoch 2/10, Loss: 1.1294139980778541


 30%|███       | 3/10 [00:02<00:05,  1.21it/s]

Epoch 3/10, Loss: 1.1294143866665842


 40%|████      | 4/10 [00:03<00:06,  1.08s/it]

Epoch 4/10, Loss: 1.1294117478907326


 50%|█████     | 5/10 [00:05<00:06,  1.34s/it]

Epoch 5/10, Loss: 1.1294136441314913


 60%|██████    | 6/10 [00:07<00:06,  1.61s/it]

Epoch 6/10, Loss: 1.1294119358432149


 70%|███████   | 7/10 [00:10<00:06,  2.16s/it]

Epoch 7/10, Loss: 1.1294123254583848


 80%|████████  | 8/10 [00:13<00:04,  2.41s/it]

Epoch 8/10, Loss: 1.1294119304338195


 90%|█████████ | 9/10 [00:16<00:02,  2.63s/it]

Epoch 9/10, Loss: 1.1294127939634486


100%|██████████| 10/10 [00:20<00:00,  2.10s/it]

Epoch 10/10, Loss: 1.129412335080064





[1.1294110627291358,
 1.1294139980778541,
 1.1294143866665842,
 1.1294117478907326,
 1.1294136441314913,
 1.1294119358432149,
 1.1294123254583848,
 1.1294119304338195,
 1.1294127939634486,
 1.129412335080064]

## Hypernym Prediction

### Main Function

In [None]:
def _read_queries(path):
    """Return the query term of every line of *path*, normalised like the
    embedding vocabulary (first tab field, whitespace -> '_', lower-cased)."""
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return ["_".join(line.split("\t")[0].split()).lower() for line in f]


def _read_gold(path):
    """Return the raw lines (tab-separated hypernyms) of a gold file."""
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read().splitlines()


def _load_embeddings(path, dim=300):
    """Load a word2vec-style text file into a vocabulary list and a matrix.

    The first line of the file (the header) is skipped. Row 0 of the matrix is
    a random vector for the "UNK" token; row i is the vector of ``vocab[i]``.
    """
    with open(path, "r") as f:
        lines = f.read().splitlines()

    matrix = np.zeros((len(lines), dim))
    vocab = ["UNK"]
    matrix[0] = np.random.random(dim)
    for row, line in enumerate(lines[1:], start=1):
        vocab.append(line.split()[0])
        values = line.strip().split(' ')[1:]
        matrix[row, :len(values)] = [np.float64(value) for value in values]
    return vocab, matrix


def hypernym_discovery_baseline(task, model="gru"):
    """Train a hypernym classifier for one subtask and write its predictions.

    Loads the skip-gram embeddings of the subtask, builds query/hypernym pair
    datasets for the training, test and trial splits, trains the chosen model,
    predicts hypernyms for the training and test queries and writes them to
    ``train_predicted_hypernym_<model>_<domain>.txt`` and
    ``test_predicted_hypernym_<model>_<domain>.txt``.

    Args:
        task: "1A" (English) or "2A" (medical).
        model: "gru" selects the GRU model; any other value selects the LSTM.

    Returns:
        The loss of the trained model on the test split, as returned by
        ``model_training``.

    Raises:
        ValueError: if *task* is not "1A" or "2A".
    """
    domains = {"1A": ("English", "english"), "2A": ("Medical", "medical")}
    if task not in domains:
        raise ValueError("unknown task {!r}; expected one of {}".format(task, sorted(domains)))
    dataset, domain = domains[task]

    base = "/content/drive/MyDrive/SemEval2018-Task9"
    training_set = f"{base}/training/data/{task}.{domain}.training.data.txt"
    training_hypernym_set = f"{base}/training/gold/{task}.{domain}.training.gold.txt"
    testing_set = f"{base}/test/data/{task}.{domain}.test.data.txt"
    testing_hypernym_set = f"{base}/test/gold/{task}.{domain}.test.gold.txt"
    validation_set = f"{base}/trial/data/{task}.{domain}.trial.data.txt"
    validation_hypernym_set = f"{base}/trial/gold/{task}.{domain}.trial.gold.txt"

    model_tag = "gru" if model == "gru" else "lstm"
    hypernyms_to_be_saved_for_trainset_data = f"train_predicted_hypernym_{model_tag}_{domain}.txt"
    hypernyms_to_be_saved_for_testset_data = f"test_predicted_hypernym_{model_tag}_{domain}.txt"

    word_vocab, embedding_matrix = _load_embeddings(f"{domain}_sg_embed.txt")

    # Queries are lower-cased because the embedding vocabulary was built from
    # lower-cased terms; otherwise capitalised queries never match it.
    queryset = _read_queries(training_set)
    queryset_test = _read_queries(testing_set)
    queryset_validation = _read_queries(validation_set)

    training_hypernyms = _read_gold(training_hypernym_set)
    testing_hypernyms = _read_gold(testing_hypernym_set)
    validation_hypernyms = _read_gold(validation_hypernym_set)

    # Creating the train, test and validation datasets
    training_query_hypernym_pair, y_train = dataset_preparation(queryset, word_vocab, training_hypernyms,
                                                                embedding_matrix)
    testing_query_hypernym_pair, y_test = dataset_preparation(queryset_test, word_vocab, testing_hypernyms,
                                                              embedding_matrix)
    validation_query_hypernym_pair, y_validation = dataset_preparation(queryset_validation, word_vocab,
                                                                       validation_hypernyms, embedding_matrix)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Training the model
    trained_model, evaluation_test_set_score = model_training(training_query_hypernym_pair, y_train,
                                                              testing_query_hypernym_pair, y_test,
                                                              validation_query_hypernym_pair, y_validation,
                                                              dataset, model, device)

    # Predicting hypernyms
    final_total_hypernyms_predicted_trainset = predict_hypernyms(queryset, word_vocab, embedding_matrix, trained_model,
                                                                model, device)
    final_total_hypernyms_predicted_testset = predict_hypernyms(queryset_test, word_vocab, embedding_matrix,
                                                                trained_model, model, device)

    # Writing the hypernyms in a text file
    write_hypernyms(hypernyms_to_be_saved_for_trainset_data, final_total_hypernyms_predicted_trainset)
    write_hypernyms(hypernyms_to_be_saved_for_testset_data, final_total_hypernyms_predicted_testset)

    return evaluation_test_set_score

### Writing the results

In [None]:
def write_hypernyms(hypernyms_to_be_saved, total_hypernyms_predicted, max_hypernyms=15):
    """Write the top predicted hypernyms of every query to a text file.

    One line is written per query, in input order: the first
    ``max_hypernyms`` hypernyms, each followed by a tab, then a newline. The
    file is overwritten, so line *i* always belongs to query *i* even when the
    function is run again with the same file name. Every written hypernym is
    also echoed to stdout.

    Args:
        hypernyms_to_be_saved: Path of the output file.
        total_hypernyms_predicted: One list per query of ``[hypernym, score]``
            entries, best first; only element 0 of each entry is written.
        max_hypernyms: Maximum number of hypernyms written per query.
    """
    print("within write_hypernyms method")
    # Open once for the whole run instead of once per written token.
    with open(hypernyms_to_be_saved, 'w', encoding='utf-8') as f:
        for hyp in total_hypernyms_predicted:
            for entry in hyp[:max_hypernyms]:
                f.write("%s\t" % entry[0])
                print(entry[0])
            print("------------------")
            f.write("\n")

### Model Architecture, Training and Prediction

In [None]:
class GRUModel(nn.Module):
    """Binary classifier over a (query, candidate) embedding pair using GRUs.

    The input is a batch of length-2 sequences of 300-d vectors (the indexing
    in ``forward`` requires a 3-D, batch-first tensor). The output is one
    sigmoid score per pair.
    """

    def __init__(self):
        super(GRUModel, self).__init__()
        # nn.GRU only applies its `dropout` argument between stacked layers,
        # so with num_layers=1 it is ignored (and PyTorch warns about it).
        # The dropout is therefore applied with explicit modules instead.
        self.gru1 = nn.GRU(input_size=300, hidden_size=300, num_layers=1, batch_first=True)
        self.dropout1 = nn.Dropout(0.3)
        self.gru2 = nn.GRU(input_size=300, hidden_size=300, num_layers=1, batch_first=True)
        self.dropout2 = nn.Dropout(0.2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(300, 300)
        self.fc2 = nn.Linear(300, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return a (batch, 1) tensor of scores in [0, 1] for the input pairs."""
        out, _ = self.gru1(x)
        # Keep only the state after the last element of the sequence and feed
        # it to the second GRU as a length-1 sequence.
        out = self.dropout1(out[:, -1, :])
        out, _ = self.gru2(out.unsqueeze(1))
        out = self.dropout2(self.flatten(out))
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

class LSTMModel(nn.Module):
    """Binary classifier over a (query, candidate) embedding pair using LSTMs.

    The input is a batch of length-2 sequences of 300-d vectors (the indexing
    in ``forward`` requires a 3-D, batch-first tensor). The output is one
    sigmoid score per pair.
    """

    def __init__(self):
        super(LSTMModel, self).__init__()
        # nn.LSTM only applies its `dropout` argument between stacked layers,
        # so with num_layers=1 it is ignored (and PyTorch warns about it).
        # The dropout is therefore applied with explicit modules instead.
        self.lstm1 = nn.LSTM(input_size=300, hidden_size=300, num_layers=1, batch_first=True)
        self.dropout1 = nn.Dropout(0.3)
        self.lstm2 = nn.LSTM(input_size=300, hidden_size=300, num_layers=1, batch_first=True)
        self.dropout2 = nn.Dropout(0.2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(300, 300)
        self.fc2 = nn.Linear(300, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return a (batch, 1) tensor of scores in [0, 1] for the input pairs."""
        out, _ = self.lstm1(x)
        # Keep only the output after the last element of the sequence and feed
        # it to the second LSTM as a length-1 sequence.
        out = self.dropout1(out[:, -1, :])
        out, _ = self.lstm2(out.unsqueeze(1))
        out = self.dropout2(self.flatten(out))
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

def model_training(training_query_hypernym_pair, y_train, testing_query_hypernym_pair, y_test,
                   validation_query_hypernym_pair, y_validation, dataset, model_type, device, batch_size=32,
                   epochs=10, lr=0.003):
    """Train a GRU or LSTM pair classifier and evaluate it on the test split.

    Args:
        training_query_hypernym_pair, y_train: Training pairs and 0/1 labels.
        testing_query_hypernym_pair, y_test: Test pairs and 0/1 labels.
        validation_query_hypernym_pair, y_validation: Accepted for interface
            compatibility; not used by the training loop.
        dataset: Dataset name, used only in the checkpoint file name.
        model_type: "gru" selects GRUModel; any other value selects LSTMModel.
        device: Device used for the model and the data.
        batch_size: Mini-batch size for training and evaluation.
        epochs: Number of training epochs.
        lr: Learning rate of the Adam optimizer.

    Returns:
        ``(model, evaluation_test_set_score)`` where the score is the mean
        binary cross-entropy over the test samples (NaN if there are none).

    Side effects:
        Saves the state dict to ``"<model_type>_<dataset>_trained_model.pth"``.
    """
    if model_type == "gru":
        model = GRUModel().to(device)
    else:
        model = LSTMModel().to(device)

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # as_tensor accepts tensors as well as arrays/lists without the
    # copy-construct warning that torch.tensor(tensor) emits.
    X_train_tensor = torch.as_tensor(training_query_hypernym_pair, dtype=torch.float32).to(device)
    y_train_tensor = torch.as_tensor(y_train, dtype=torch.float32).to(device)
    X_test_tensor = torch.as_tensor(testing_query_hypernym_pair, dtype=torch.float32).to(device)
    y_test_tensor = torch.as_tensor(y_test, dtype=torch.float32).to(device)

    train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

    for epoch in range(epochs):
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            # The model returns (batch, 1); give the labels the same shape.
            loss = criterion(outputs, targets.unsqueeze(1))
            loss.backward()
            optimizer.step()

    model.eval()
    eval_loss = 0.0
    total_samples = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            # Weight each batch mean by its size to get a per-sample average.
            eval_loss += criterion(outputs, targets.unsqueeze(1)).item() * inputs.size(0)
            total_samples += inputs.size(0)

    # An empty test split has no defined loss; avoid dividing by zero.
    evaluation_test_set_score = eval_loss / total_samples if total_samples else float("nan")

    torch.save(model.state_dict(), f"{model_type}_{dataset}_trained_model.pth")

    return model, evaluation_test_set_score

def predict_hypernyms(queryset, given_vocab, embedding_matrix, model, model_type="gru", device='cuda'):
    """Score every vocabulary word as a hypernym of every query.

    Each query is paired with each vocabulary word except index 0 (the "UNK"
    entry); candidates scoring below 0.5 are dropped.

    Args:
        queryset: Iterable of query terms; terms missing from ``given_vocab``
            use the embedding at index 0.
        given_vocab: List of words; index i corresponds to row i of
            ``embedding_matrix`` and index 0 is the unknown-word entry.
        embedding_matrix: 2-D array with one embedding row per vocabulary word.
        model: Trained ``nn.Module`` mapping a (N, 2, dim) float tensor to
            (N, 1) scores.
        model_type: Unused; kept for interface compatibility.
        device: Device the input batches are moved to.

    Returns:
        One list per query of ``[word, score]`` entries sorted by descending score.
    """
    embedding_matrix = np.asarray(embedding_matrix)
    vocab_size = len(given_vocab)

    # First occurrence wins, matching list.index, with O(1) lookups.
    word_to_index = {}
    for idx, word in enumerate(given_vocab):
        word_to_index.setdefault(word, idx)

    prediction_array = np.zeros((vocab_size - 1, 2, embedding_matrix.shape[1]))
    # The candidate half of every pair is the same for all queries: row j
    # holds the embedding of vocabulary word j + 1.
    prediction_array[:, 1] = embedding_matrix[1:vocab_size]

    total_hypernyms_predicted = []
    model.eval()
    with torch.no_grad():
        for query in queryset:
            prediction_array[:, 0] = embedding_matrix[word_to_index.get(query, 0)]

            prediction_array_tensor = torch.tensor(prediction_array, dtype=torch.float32).to(device)
            predicted_hypernyms = model(prediction_array_tensor).detach().cpu().numpy()

            target_hypernyms = []
            for z in range(len(predicted_hypernyms)):
                if predicted_hypernyms[z][0] < 0.5:
                    continue
                # Row z was built from embedding_matrix[z + 1], so the word
                # it scores is given_vocab[z + 1].
                target_hypernyms.append([given_vocab[z + 1], predicted_hypernyms[z][0]])
            target_hypernyms.sort(key=lambda x: x[1], reverse=True)
            total_hypernyms_predicted.append(target_hypernyms)

    return total_hypernyms_predicted

### Dataset Preparation

In [None]:
import torch
import random

def dataset_preparation(queryset, word_vocab, hypernyms_tab_sep, embedding_matrix, num_negatives=5):
    """Build labelled (query, candidate) embedding pairs for the classifier.

    For every query found in the vocabulary, each gold hypernym yields one
    positive pair (label 1) followed by ``num_negatives`` negative pairs
    (label 0) whose candidate is a random vocabulary entry that is neither the
    query nor one of its gold hypernyms. A gold hypernym missing from the
    vocabulary is printed and paired through index 0 (the unknown-word entry).
    Queries missing from the vocabulary are skipped.

    Args:
        queryset: Query terms; ``queryset[i]`` belongs to ``hypernyms_tab_sep[i]``.
        word_vocab: List of words; index i corresponds to row i of
            ``embedding_matrix``.
        hypernyms_tab_sep: Gold lines, tab-separated hypernyms per query.
        embedding_matrix: One embedding row per vocabulary word.
        num_negatives: Negative pairs generated per positive pair.

    Returns:
        ``(pairs, labels)``: a float32 tensor of shape (N, 2, dim) and a
        float32 tensor of shape (N,), shuffled together. Both are empty
        (N == 0) when no query is in the vocabulary.
    """
    # First occurrence wins, matching list.index, with O(1) lookups.
    vocab_index = {}
    for idx, word in enumerate(word_vocab):
        vocab_index.setdefault(word, idx)
    vocab_len = len(word_vocab)

    def lookup(term):
        # Fall back to the lower-cased term for vocabularies built from
        # lower-cased text.
        found = vocab_index.get(term)
        return found if found is not None else vocab_index.get(term.lower())

    def embedding_of(row):
        return torch.tensor(embedding_matrix[row], dtype=torch.float32)

    query_hypernym_training_embedding = []
    y_label = []

    for i, query in enumerate(queryset):
        index = vocab_index.get(query)
        if index is None:
            continue
        query_embedd = embedding_of(index)

        gold_indices = []
        for hypernyms_found in hypernyms_tab_sep[i].split("\t"):
            hypernyms_found = "_".join(hypernyms_found.split())
            hypernym_index = lookup(hypernyms_found)
            if hypernym_index is None:
                print(hypernyms_found)
                hypernym_index = 0  # out-of-vocabulary hypernym -> unknown-word row
            gold_indices.append(hypernym_index)

        # A negative must not be the query itself or any of its gold hypernyms.
        excluded = set(gold_indices)
        excluded.add(index)
        can_sample = vocab_len > len(excluded)

        for hypernym_index in gold_indices:
            query_hypernym_pair = torch.stack((query_embedd, embedding_of(hypernym_index)), dim=0)
            query_hypernym_training_embedding.append(query_hypernym_pair)
            y_label.append(1)  # positive label for each positive pair

            # Rejection sampling; skipped entirely when nothing is left to
            # draw from, so the loop always terminates.
            negative_samples_counter = 0
            while can_sample and negative_samples_counter < num_negatives:
                random_index = random.randint(0, vocab_len - 1)
                if random_index in excluded:
                    continue
                negative_pair = torch.stack((query_embedd, embedding_of(random_index)), dim=0)
                query_hypernym_training_embedding.append(negative_pair)
                y_label.append(0)  # negative label for negative pairs
                negative_samples_counter += 1

    if not query_hypernym_training_embedding:
        dim = len(embedding_matrix[0]) if len(embedding_matrix) else 0
        return torch.empty((0, 2, dim), dtype=torch.float32), torch.empty((0,), dtype=torch.float32)

    # Shuffling the dataset
    combined_dataset = list(zip(query_hypernym_training_embedding, y_label))
    random.shuffle(combined_dataset)
    query_hypernym_training_embedding, y_label = zip(*combined_dataset)

    query_hypernym_training_embedding = torch.stack(query_hypernym_training_embedding, dim=0)
    y_label = torch.tensor(y_label, dtype=torch.float32)

    return query_hypernym_training_embedding, y_label

In [None]:
# Call the main function for dataset 1A (English), once with the GRU model
# and once with the LSTM model.
hypernym_discovery_baseline("1A", "gru")
hypernym_discovery_baseline("1A", "lstm")