In [None]:
"""Pilot experiment for computational annotator"""

In [None]:
"""some helper functions"""
import torch, pandas as pd

def get_sentences_from_dataset(path_of_dataset):
    """Read a tab-separated dataset file and return its two sentence columns.

    The file is parsed without a header row into the columns ``word``,
    ``category``, ``index_string``, ``sentence_left`` and ``sentence_right``.
    As a side effect, the largest space-separated word count found in either
    sentence column is printed together with the file name.

    Args:
        path_of_dataset: Path of the TSV file. The part after the last "/" is
            used as the file name in the printed message.

    Returns:
        A tuple ``(sentences_left, sentences_right)`` of two lists, one entry
        per row of the file.
    """
    column_names = ["word", "category", "index_string", "sentence_left", "sentence_right"]
    frame = pd.read_csv(path_of_dataset, delimiter="\t", header=None, names=column_names)

    # Longest sentence (counted in space-separated words) of each side.
    longest_per_side = [
        frame[column].str.split(" ").str.len().max()
        for column in ("sentence_left", "sentence_right")
    ]
    sentence_maximum_length = max(longest_per_side)

    file_name = path_of_dataset.split("/")[-1]
    print("maximum sentence length of the dataset " + file_name + " is: " + str(sentence_maximum_length))

    return frame["sentence_left"].tolist(), frame["sentence_right"].tolist()

from collections import defaultdict
def get_index_of_duplicates(seq):
    """Map every non-None item of ``seq`` to the 1-based positions where it occurs.

    Despite the name, items that occur only once are kept as well (with a
    single-element position list); nothing is filtered out except ``None``.

    Positions are counted from 1, not 0. The callers in this file pass
    ``word_ids()[1:-1]``, i.e. a sequence whose first element has been cut
    off, so the +1 makes the positions refer to the uncut sequence again.

    Args:
        seq: Iterable of hashable items; ``None`` entries are skipped but
            still advance the position counter.

    Returns:
        A plain ``dict`` ``{item: [position, ...]}`` with positions in
        ascending order.
    """
    tally = defaultdict(list)
    for position, item in enumerate(seq, start=1):
        # Identity check: `!= None` would defer to the item's own __ne__.
        if item is not None:
            tally[item].append(position)
    return dict(tally)

def get_max_sentence_length_of_a_dataset_by_tokenizer(tokenizer, path_of_dataset):
    """Report the longest tokenized sentence of a dataset file.

    Every sentence of both sentence columns is encoded with
    ``tokenizer.encode(sent, add_special_tokens=True)`` and the largest number
    of returned ids is printed and returned.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=True)``
            that returns a sized sequence of token ids.
        path_of_dataset: Path of the headerless TSV file with the columns
            ``word``, ``category``, ``index_string``, ``sentence_left``,
            ``sentence_right``.

    Returns:
        The maximum encoded length as an ``int`` (0 for a file without rows).
        Earlier versions only printed the value; returning it lets callers use
        it, e.g. to choose a padding length.
    """
    df = pd.read_csv(path_of_dataset, delimiter="\t", header=None,
                     names=["word", "category", "index_string", "sentence_left", "sentence_right"])

    max_len = 0
    for sent in df.sentence_left.tolist() + df.sentence_right.tolist():
        # The special tokens are counted too, since they occupy positions in
        # the encoded sequence.
        input_ids = tokenizer.encode(sent, add_special_tokens=True)
        max_len = max(max_len, len(input_ids))

    print('Max sentence length of whole dataset by tokenizer is:', max_len)
    return max_len

from transformers import BertTokenizerFast, XLMRobertaTokenizerFast
def get_BERT_tokenizer():
    """Announce and return the pretrained ``bert-base-uncased`` fast tokenizer.

    The tokenizer is requested with ``do_lower_case=True``.
    """
    print('Loading BERT tokenizer...')
    return BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True)

def get_XLMRTokenizer():
    """Announce and return the pretrained ``xlm-roberta-base`` fast tokenizer."""
    print('Loading XLMR tokenizer...')
    return XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

def get_word_from_id_for_XLMR(id, tokenizer=None):
    """Decode a single vocabulary id into its text form.

    Args:
        id: Token id to decode (the parameter name shadows the builtin ``id``;
            it is kept so existing keyword callers continue to work).
        tokenizer: Optional object exposing ``decode(list_of_ids)``. When
            omitted, the pretrained ``xlm-roberta-base`` fast tokenizer is
            used. It is loaded on the first such call only and then cached on
            the function object, instead of being re-loaded for every id.

    Returns:
        Whatever ``tokenizer.decode([id])`` returns.
    """
    if tokenizer is None:
        tokenizer = getattr(get_word_from_id_for_XLMR, "_cached_tokenizer", None)
        if tokenizer is None:
            tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')
            # Remember the instance so later calls skip the expensive load.
            get_word_from_id_for_XLMR._cached_tokenizer = tokenizer
    return tokenizer.decode([id])

def _encode_sentences(tokenizer, sentences, max_length):
    """Encode each sentence separately and collect ids, masks and subword maps."""
    input_ids = []
    attention_masks = []
    subword_index_list = []

    for sent in sentences:
        # `encode_plus` tokenizes the sentence, adds the special tokens, maps
        # tokens to ids, pads to `max_length` and builds the attention mask.
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            # Requested explicitly: the Hugging Face docs ask for `truncation`
            # to be set whenever `max_length` is given.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # `[1:-1]` cuts the first and the last position. Positions without a
        # word id (None) are skipped by get_index_of_duplicates, which counts
        # from 1, so the stored positions index into the full encoded sequence.
        subword_index_list.append(get_index_of_duplicates(encoded_dict.word_ids()[1:-1]))
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # One tensor per side: the per-sentence tensors are concatenated on dim 0.
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), subword_index_list


def get_input_ids_and_so_on(tokenizer, path_of_dataset, max_length=64):
    """Tokenize both sentence columns of a dataset file.

    Also prints the first left sentence and its token ids as a sample.

    Args:
        tokenizer: Tokenizer exposing ``encode_plus`` whose result supports
            ``word_ids()`` and the keys ``'input_ids'`` / ``'attention_mask'``
            (presumably a Hugging Face *fast* tokenizer — confirm for new
            tokenizers).
        path_of_dataset: Path of the headerless TSV file with the columns
            ``word``, ``category``, ``index_string``, ``sentence_left``,
            ``sentence_right``.
        max_length: Length every sentence is padded/truncated to. Defaults to
            64, the value that used to be hard-coded.

    Returns:
        A 6-tuple ``(input_ids_left, input_ids_right, attention_masks_left,
        attention_masks_right, subword_index_list_left,
        subword_index_list_right)``. The first four are tensors with one row
        per sentence; the last two are lists holding, per sentence, a dict
        ``{word_id: [token positions]}``.
    """
    df = pd.read_csv(path_of_dataset, delimiter="\t", header=None,
                     names=["word", "category", "index_string", "sentence_left", "sentence_right"])
    sentences_left = df.sentence_left.tolist()
    sentences_right = df.sentence_right.tolist()

    input_ids_left, attention_masks_left, subword_index_list_left = _encode_sentences(
        tokenizer, sentences_left, max_length)
    input_ids_right, attention_masks_right, subword_index_list_right = _encode_sentences(
        tokenizer, sentences_right, max_length)

    # Print sentence 0, now as a list of IDs.
    print('sample sentence original: ', sentences_left[0])
    print('sample sentence Token IDs:', input_ids_left[0])

    return (input_ids_left, input_ids_right, attention_masks_left, attention_masks_right,
            subword_index_list_left, subword_index_list_right)

def get_padded_index(tokenizer, path_of_dataset, subword_index_list_left, subword_index_list_right):
    """Translate each row's target-word indices into token positions.

    ``index_string`` of every row has the form ``"<left>-<right>"``; the two
    numbers are used as keys into that row's entries of
    ``subword_index_list_left`` / ``subword_index_list_right``.

    Besides computing the result, the function prints the first ten index
    strings, the first ten converted positions, the first sentence pair and
    the ``input_ids`` the tokenizer produces for that pair.

    Args:
        tokenizer: Object exposing ``encode_plus``; only used for the sample
            printout.
        path_of_dataset: Path of the headerless TSV file with the columns
            ``word``, ``category``, ``index_string``, ``sentence_left``,
            ``sentence_right``.
        subword_index_list_left: Per row, a mapping from word index to the
            list of token positions of that word (left sentence).
        subword_index_list_right: Same for the right sentence.

    Returns:
        ``(index_left_padded, index_right_padded)``: two lists with one entry
        per row. An entry is a bare position when the word maps to exactly one
        token, and the whole list of positions otherwise.
    """
    def _positions_of(word_to_positions, word_index):
        # A single-token word is reported as a scalar, a multi-token word as a list.
        positions = word_to_positions[word_index]
        return positions if len(positions) > 1 else positions[0]

    def _print_input_ids(sentence, banner):
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=64,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        print(banner)
        print(encoded_dict['input_ids'])

    df = pd.read_csv(path_of_dataset, delimiter="\t", header=None,
                     names=["word", "category", "index_string", "sentence_left", "sentence_right"])
    sentences_left = df.sentence_left.tolist()
    sentences_right = df.sentence_right.tolist()
    index_list = df.index_string

    index_left_padded = []
    index_right_padded = []
    for row, index_string in enumerate(index_list):
        parts = index_string.split("-")
        word_index_left = int(parts[0])
        word_index_right = int(parts[1])
        index_left_padded.append(_positions_of(subword_index_list_left[row], word_index_left))
        index_right_padded.append(_positions_of(subword_index_list_right[row], word_index_right))

    # Sample printout for a quick visual check.
    print('indices of key word in first 10 sentences:')
    print(index_list[:10])
    print('indices of key word in first 10 sentences after padding:')
    print("on the left: " + str(index_left_padded[:10]))
    print("on the right: " + str(index_right_padded[:10]))
    print("first sentence on the left: ", end='')
    print(sentences_left[0])
    print("first sentence on the right: ", end='')
    print(sentences_right[0])
    _print_input_ids(sentences_left[0], 'input ids of the first sentence on the left:')
    _print_input_ids(sentences_right[0], 'input ids of the first sentence on the right:')

    return (index_left_padded, index_right_padded)

from transformers import BertModel
import numpy as np
def save_embeddings(device, input_ids, attention_masks, index_padded, saving_path, model, layers_list=(1, 12)):
    """Extract one embedding per sentence for its target word and save them.

    Each sentence is run through ``model`` on its own (batch of one). The
    hidden states of the layers in ``layers_list`` are summed, and the summed
    vectors at the target word's token positions are averaged. The resulting
    2-D array (one row per sentence) is written with ``np.save``.

    Args:
        device: Torch device the tensors and the model are moved to.
        input_ids: Tensor with one row of token ids per sentence.
        attention_masks: Tensor with one attention-mask row per sentence.
        index_padded: Per sentence, either one token position (int) or a
            sequence of token positions for a word split into several tokens.
        saving_path: Destination passed to ``np.save``. A missing parent
            directory is created when a path is given.
        model: Model returning hidden states, either as a ``hidden_states``
            attribute or as element 2 of its output (presumably a Hugging
            Face model created with ``output_hidden_states=True`` — confirm
            for other models).
        layers_list: Indices into the hidden-state sequence whose vectors are
            summed. Defaults to ``(1, 12)``, the previously hard-coded choice.

    Returns:
        None. Progress and shape information is printed.
    """
    import os  # local import: only needed for the directory handling below

    print("enter save_embeddings function")

    input_ids = input_ids.to(device)
    attention_masks = attention_masks.to(device)
    print(input_ids.is_cuda)
    print(attention_masks.is_cuda)

    model = model.to(device)
    model.eval()

    set_length = len(input_ids)
    print(set_length)

    layer_indices = [int(layer) for layer in layers_list]
    token_embeddings_output = []

    with torch.no_grad():
        for index in range(set_length):
            if index % 10 == 0:
                print(index)

            outputs = model(input_ids[index].unsqueeze(0), token_type_ids=None,
                            attention_mask=attention_masks[index].unsqueeze(0))

            # Prefer the named field; fall back to the positional slot that
            # was used before (tuple-style outputs).
            hidden_states = getattr(outputs, "hidden_states", None)
            if hidden_states is None:
                hidden_states = outputs[2]

            # Stack the layers and drop the batch axis of size one, leaving
            # (layer, token, feature).
            layer_stack = torch.stack(hidden_states, dim=0).squeeze(1)
            # Sum the selected layers -> (token, feature).
            summed_layers = layer_stack[layer_indices].sum(dim=0)

            positions = index_padded[index]
            if isinstance(positions, (int, np.integer)):
                # Also accepts numpy integers, which `type(x) is int` rejected.
                positions = [positions]
            positions = [int(position) for position in positions]

            # Average over the word's tokens (a no-op for a single token).
            word_vector = summed_layers[positions].mean(dim=0)
            token_embeddings_output.append(word_vector.cpu().numpy())

    token_embeddings_output = np.array(token_embeddings_output)
    print(token_embeddings_output.shape)

    if isinstance(saving_path, (str, os.PathLike)):
        parent_directory = os.path.dirname(os.fspath(saving_path))
        if parent_directory:
            # np.save does not create directories itself.
            os.makedirs(parent_directory, exist_ok=True)
    np.save(saving_path, token_embeddings_output)
    

In [None]:
'''use GPU if possible'''
# Select the device once; later cells pass `device` to save_embeddings.
if not torch.cuda.is_available():
    # No CUDA device visible to PyTorch: stay on the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report which one.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
'''prepare data for train set'''
sentences_left_train, sentences_right_train = get_sentences_from_dataset("WiC_dataset/train/train.data.txt")

# XLM-R is the active tokenizer; get_BERT_tokenizer() is the BERT alternative.
tokenizer = get_XLMRTokenizer()

get_max_sentence_length_of_a_dataset_by_tokenizer(tokenizer, "WiC_dataset/train/train.data.txt")

(
    input_ids_left,
    input_ids_right,
    attention_masks_left,
    attention_masks_right,
    subword_index_list_left,
    subword_index_list_right,
) = get_input_ids_and_so_on(tokenizer, "WiC_dataset/train/train.data.txt")

# Show what a few low vocabulary ids decode to.
for word_id in (0, 2, 5, 6):
    print(f"The corresponding token for word id {word_id} is: {get_word_from_id_for_XLMR(word_id)}")

index_left_padded, index_right_padded = get_padded_index(
    tokenizer,
    "WiC_dataset/train/train.data.txt",
    subword_index_list_left,
    subword_index_list_right,
)

# Spot-check three more ids.
for word_id in (85358, 2258, 10484):
    print(f"The corresponding token for word id {word_id} is: {get_word_from_id_for_XLMR(word_id)}")





In [None]:
'''save embeddings for train set'''
from transformers import XLMRobertaModel

print(input_ids_left[0].shape)
print(input_ids_left[0].unsqueeze(0).shape)

# The pretrained model is loaded once and shared by both calls:
# save_embeddings only runs it in eval mode under torch.no_grad().
model = XLMRobertaModel.from_pretrained('xlm-roberta-base', output_hidden_states=True)

# Both sides must be written: the training cell loads
# 'npy_train/token_embeddings_left.npy' as well as the right-hand file, so a
# fresh run fails if the left side is skipped.
save_embeddings(device, input_ids_left, attention_masks_left, index_left_padded, 'npy_train/token_embeddings_left.npy', model)
save_embeddings(device, input_ids_right, attention_masks_right, index_right_padded, 'npy_train/token_embeddings_right.npy', model)

In [None]:
'''prepare data for dev set'''
sentences_left_dev, sentences_right_dev = get_sentences_from_dataset("WiC_dataset/dev/dev.data.txt")

# Reuses the `tokenizer` created in the train-set preparation cell.
get_max_sentence_length_of_a_dataset_by_tokenizer(tokenizer, "WiC_dataset/dev/dev.data.txt")

# NOTE: these names are the same ones the train cells used, so from here on
# they hold the dev-set tensors and indices.
(
    input_ids_left,
    input_ids_right,
    attention_masks_left,
    attention_masks_right,
    subword_index_list_left,
    subword_index_list_right,
) = get_input_ids_and_so_on(tokenizer, "WiC_dataset/dev/dev.data.txt")

index_left_padded, index_right_padded = get_padded_index(
    tokenizer,
    "WiC_dataset/dev/dev.data.txt",
    subword_index_list_left,
    subword_index_list_right,
)

In [None]:
'''save embeddings for dev set'''
# Load the pretrained model once. save_embeddings only runs it in eval mode
# under torch.no_grad(), so the same instance can serve both sides instead of
# being re-loaded for the second call.
model = XLMRobertaModel.from_pretrained('xlm-roberta-base', output_hidden_states=True)

save_embeddings(device, input_ids_left, attention_masks_left, index_left_padded, 'npy_dev/token_embeddings_left.npy', model)
save_embeddings(device, input_ids_right, attention_masks_right, index_right_padded, 'npy_dev/token_embeddings_right.npy', model)

In [None]:
"""prepare test set"""
import numpy as np
# NOTE: the "test" data used for evaluation is the WiC *dev* split
# (dev.gold.txt plus the embeddings saved under npy_dev/).
labels_test = pd.read_csv("WiC_dataset/dev/dev.gold.txt", delimiter="\t", header=None, names=["label"])

# Encode F/T as 0/1 with an explicit mapping rather than an in-place
# `replace`, which depends on pandas silently downcasting the object column.
labels_test = labels_test["label"].map({"F": 0, "T": 1})
if labels_test.isna().any():
    # `map` turns unknown labels into NaN; fail here instead of passing NaN on.
    raise ValueError("WiC_dataset/dev/dev.gold.txt contains a label other than 'F' or 'T'")
# Column vector, i.e. the same (n, 1) layout `DataFrame.values` produced.
labels_test = labels_test.to_numpy(dtype=np.int64).reshape(-1, 1)
print(len(labels_test))

embeddings_left_test = np.load('npy_dev/token_embeddings_left.npy')
print(embeddings_left_test.shape)
embeddings_right_test = np.load('npy_dev/token_embeddings_right.npy')
print(embeddings_right_test.shape)

# One feature row per sentence pair: left embedding followed by right embedding.
concatenation_test = np.hstack((embeddings_left_test, embeddings_right_test))
print(concatenation_test.shape)

print(concatenation_test[0])


In [None]:
"""training process"""

embeddings_left = np.load('npy_train/token_embeddings_left.npy')
embeddings_right = np.load('npy_train/token_embeddings_right.npy')
# One feature row per sentence pair: left embedding followed by right embedding.
concatenation = np.hstack((embeddings_left, embeddings_right))

labels = pd.read_csv("WiC_dataset/train/train.gold.txt", delimiter="\t", header=None, names=["label"])
# Encode F/T as 0/1 with an explicit mapping rather than an in-place
# `replace`, which depends on pandas silently downcasting the object column.
labels = labels["label"].map({"F": 0, "T": 1})
if labels.isna().any():
    # `map` turns unknown labels into NaN; fail here instead of training on NaN.
    raise ValueError("WiC_dataset/train/train.gold.txt contains a label other than 'F' or 'T'")
# Column vector, i.e. the same (n, 1) layout `DataFrame.values` produced.
labels = labels.to_numpy(dtype=np.int64).reshape(-1, 1)

X_train, y_train = concatenation, labels

number_samples, number_features = concatenation.shape
print("number of sample is " + str(number_samples))
print("number of feature is " + str(number_features))

# `concatenation_test` and `labels_test` come from the "prepare test set" cell.
X_test, y_test = concatenation_test, labels_test

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Fit the scaler on the training features only and reuse it for the test set.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.astype(np.float32))
y_test = torch.from_numpy(y_test.astype(np.float32))

# Targets as (n, 1) column vectors, matching the model's single output unit.
y_train = y_train.view(y_train.shape[0], 1)
y_test = y_test.view(y_test.shape[0], 1)
print(y_train.shape)

In [None]:
# 1) Model
# Linear model f = wx + b , sigmoid at the end
import torch.nn as nn

class Model(nn.Module):
    """Binary classifier: Linear(n_input_features, 100) -> ReLU -> Linear(100, 1) -> sigmoid."""

    def __init__(self, n_input_features):
        """Create the two linear layers; ``n_input_features`` is the input width."""
        super().__init__()
        self.l1 = nn.Linear(n_input_features, 100)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(100, 1)

    def forward(self, x):
        """Return the sigmoid of the network output, one value per input row."""
        hidden = self.relu(self.l1(x))
        return torch.sigmoid(self.l2(hidden))

# 2) Loss and optimizer
# NOTE(review): no random seed is set in this cell, so the initial weights
# (and therefore the reported accuracies) differ between runs.
num_epochs = 200
learning_rate = 0.01
criterion = nn.BCELoss()
print(number_features)
# NOTE: `model` is rebound here; earlier cells used this name for the XLM-R encoder.
model = Model(number_features)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# 3) Training loop: every epoch is one step on the complete training set.
for epoch in range(num_epochs):
    # Forward pass and loss
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)

    # Backward pass, parameter update, then clear gradients for the next epoch
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Evaluate on the held-out set on every fifth epoch only.
    if (epoch + 1) % 5 != 0:
        continue

    with torch.no_grad():
        y_predicted = model(X_test)
        # Rounding the sigmoid output gives the predicted class.
        y_predicted_cls = torch.round(y_predicted)
        acc = (y_predicted_cls == y_test).sum() / float(len(y_test))
        print(f'epoch: {epoch+1} accuracy: {acc.item():.4f}')