In [None]:
import copy

import numpy as np
import scipy.io as sio
import torch
from gensim.models import KeyedVectors
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from scipy.stats import pearsonr
from tqdm.auto import tqdm

# Directory that holds the Run<i>.mat stimulus files.
stimuli_location = "Stimuli/Text"

# Creating the Surprisal values.

Read in the words (the `wordVec` field), divide them into sentences based on the sentence boundaries provided, and read in the word vectors as a gensim model.

In [None]:
# input: string dictating the location of the RunX.mat files
# output: a three dimensional list with the words of shape: #runs x #sentences x #words

def read_mat(location):
    """Load the stimulus words of all 20 runs, grouped per sentence.

    Args:
        location: directory that contains the files Run1.mat ... Run20.mat.

    Returns:
        A nested list indexed as [run][sentence][word] with the lower-cased words.
    """
    all_runs = []

    for run_nr in range(1, 21):  # for each run
        # Build the path from the directory passed by the caller, so the
        # function no longer depends on a module-level global.
        mat = sio.loadmat("{}/Run{}.mat".format(location, run_nr))
        word_cells = mat['wordVec']
        onsets = mat['onset_time']
        boundaries = mat['sentence_boundaries'][0]

        if run_nr == 4:
            # Fix the wrong last sentence boundary in run 4 (source file mistake).
            boundaries[-1] = 181

        run = []
        j = 0  # index of the next word that has not been assigned to a sentence yet
        for boundary in boundaries:  # for each sentence
            sentence = []
            # A word belongs to the current sentence while its onset lies before
            # the sentence boundary; the first later word starts the next sentence.
            while j < len(word_cells) and onsets[j] < boundary:
                sentence.append(word_cells[j][0][0].lower())
                j += 1
            run.append(sentence)
        all_runs.append(run)

    return all_runs

### Calculating the pearsons correlations.

Here the Pearson correlation is calculated between each word's embedding and the average embedding of the previously seen words in the sentence.
When there are no previous words in the sentence, we look at the full previous sentence instead.
But what if it is the first word of a run? The Broderick paper does not document this case.

We set the similarity to 1, which later on gets changed to a dissimilarity of 0 and hence gets ignored by the regression model.
Another issue arises when the previous sentence is empty (this happens sometimes because we only look at content words). What should be done then? Again, there is no documentation. We look back until a non-empty sentence is found, but this might not be the right solution; it might be better to just ignore these words.


In [None]:
# input: the word embedding and the embeddings of the sentence it occurs in. (up to the word)
# output: the correlation between the word embedding and the average of the sentence embeddings 

def calc_cor(word, sentence, model):
    """Correlate a word's embedding with the mean embedding of its context.

    Args:
        word: key of the target word in ``model``.
        sentence: non-empty sequence of context words (keys in ``model``).
        model: mapping from a word to its embedding vector.

    Returns:
        The Pearson correlation coefficient between the embedding of ``word``
        and the element-wise mean of the context embeddings.
    """
    context_vectors = np.stack([model[context_word] for context_word in sentence])
    context_mean = context_vectors.mean(axis=0)
    correlation, _p_value = pearsonr(model[word], context_mean)
    return correlation


# Calculate the similarity values.
# Note that you run into problems when you encounter sentences of size 0;
# we simply look back multiple times to fix this.

# input: 3d words list generated by read_mat
# output: same format as input but now contains similarity values instead of the words themselves

def broderick_values(all_runs_words):
    """Compute a similarity value for every word from its preceding context.

    For a word that is not the first of its sentence, the context is the
    preceding words of the same sentence. For the first word of a sentence,
    the context is the nearest earlier non-empty sentence of the same run.
    When no such context exists (first word of a run, or only empty sentences
    before it) the similarity is set to 1.

    Args:
        all_runs_words: nested list [run][sentence][word] as produced by read_mat.

    Returns:
        A nested list of the same layout holding similarity values instead of words.
    """
    # read in the relevant word vectors.
    model = KeyedVectors.load_word2vec_format("limited_vectors.txt", binary=False, encoding="utf8")

    all_runs_sim = []
    for run in all_runs_words:  # for all runs
        run_sim = []
        for i, sentence in enumerate(run):  # for all sentences
            sentence_sim = []
            for j, word in enumerate(sentence):  # for all words
                if j != 0:
                    # average over the words already seen in this sentence.
                    context = sentence[:j]
                else:
                    # First word of the sentence: walk back to the nearest non-empty
                    # earlier sentence. Slicing with run[:i] never wraps around to the
                    # end of the run, and the search is not limited to three sentences.
                    context = next((prev for prev in reversed(run[:i]) if prev), None)

                if context:
                    sentence_sim.append(calc_cor(word, context, model))
                else:
                    # no usable context: append 1 as similarity,
                    # which becomes a dissimilarity of 0 later.
                    sentence_sim.append(1)
            run_sim.append(sentence_sim)
        all_runs_sim.append(run_sim)

    return all_runs_sim

## Using BERT masked language task instead.

The BERT language model is used to predict the word in its sentence. Its prediction score is used as a measure of similarity: the lower the score, the higher the dissimilarity. The scores get normalized at the end.

This part is somewhat unfinished. While we have initial results, there are still a few problems in the implementation that need to be addressed. They are:
 - The model also looks ahead in the sentence.
 - Some words get split into subwords by the tokenizer; they are then ignored in this implementation, so a better solution needs to be found.
 - Short sentences often don't perform too well, as there is not enough information to base a good prediction on.
     - A possible solution to this problem is to give the last n sentences as input instead of only the current one; this also solves the first-word-of-a-sentence problem once the model is no longer allowed to look into the future of the sentence.

For each word, create a copy of the sentence it is in, with the word itself masked as the prediction task.

In [None]:
#
def create_masked_input(tokenizer, inputs, context_size, future_look=False):
    """Build one masked token sequence per target word for the BERT prediction task.

    Every sentence of ``inputs`` is paired, in order, with one line of
    'full_text.txt'. That line is tokenized and each target word found in it
    is replaced by '[MASK]'. The tokens of up to ``context_size`` previous
    sentences are prepended to every masked sequence.

    Args:
        tokenizer: object whose ``tokenize(text)`` returns a list of tokens.
        inputs: nested list [run][sentence][word] of target words (not modified).
        context_size: number of previous full sentences to prepend as context.
        future_look: when True, the tokens after the masked word are kept as
            well; otherwise the sequence ends at the mask.

    Returns:
        A tuple (all_masks, all_ids, all_runs2):
        all_masks[run][sentence] holds the masked token lists, one per word
        that was found; all_ids[run][sentence] holds, per target word, the
        position of the mask in its token list, or None when the word was not
        found; all_runs2 is the copy of ``inputs`` after the dataset-specific
        deletions below.
    """
    # creating a deep copy that we can adjust as needed
    all_runs2 = copy.deepcopy(inputs)

    # these empty sentences need to be removed because they do not occur in the real sentence
    # there are other empty sentences, but those can be explained by being short sentences
    # with no target words. Not sure why they are contained in the provided dataset, a bug?
    # NOTE: each deletion shifts the indices of the sentences that follow it.
    del all_runs2[3][16]
    del all_runs2[3][36]
    del all_runs2[3][38]
    del all_runs2[4][15]

    # loading in the txt file that contains the full sentences.
    with open('full_text.txt', 'r', encoding="utf8") as f:
        lines = f.readlines()

    sen = 0  # index of the current line of the full text, running across all runs
    all_masks = []
    all_ids = []

    # Queue that contains the last "context_size" tokenized sentences; these are
    # passed along to BERT together with a portion of the current sentence.
    que = []

    for run in all_runs2:
        run_ids = []
        masked_run = []

        for line in run:
            line_ids = []
            masked_line = []

            t_sentence = tokenizer.tokenize(lines[sen])
            flatten_que = [wword for ssentence in que for wword in ssentence]

            # Position right after the previously matched token. Searching only
            # from here on maps repeated words to successive occurrences.
            seen_upto = 0
            for word in line:
                try:
                    ind = t_sentence.index(word, seen_upto)
                except ValueError:
                    # Word not present in the unseen part of the sentence (e.g. it
                    # was split into subwords): mark as None to catch in next steps.
                    line_ids.append(None)
                    continue

                mask = t_sentence[:ind] + ['[MASK]']
                if future_look:
                    mask = mask + t_sentence[ind + 1:]

                line_ids.append(len(flatten_que) + ind)
                masked_line.append(flatten_que + mask)
                seen_upto = ind + 1

            # queue management: keep at most context_size previous sentences
            que.append(t_sentence)
            if len(que) > context_size:
                que.pop(0)

            # add to higher lists
            masked_run.append(masked_line)
            run_ids.append(line_ids)
            sen += 1

        # add to higher lists
        all_masks.append(masked_run)
        all_ids.append(run_ids)

    return (all_masks, all_ids, all_runs2)

    
#test1, test2 = create_masked_input(all_runs,1,False)

In [None]:
def bert_prediction(tokenizer, masked_sentence_runs, all_runs_ids, words, printt=False):
    """Score every target word with BERT's masked-language-model head.

    Args:
        tokenizer: tokenizer used to convert tokens to vocabulary ids.
        masked_sentence_runs: masked token lists indexed as [run][sentence][k],
            containing one entry per word whose id is not None.
        all_runs_ids: per target word the position of its mask in the token
            list (without the leading '[CLS]'), or None when the word has no
            masked input.
        words: the original target words indexed as [run][sentence][word].
        printt: when True, print the input, the target word, its score and the
            top 5 predictions for every scored word.

    Returns:
        A nested list [run][sentence][word] with the score the model gives the
        original word at the masked position, or None for skipped words.
    """
    # load the pretrained model.
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()

    bert_scores_runs = []

    for l, run in enumerate(tqdm(all_runs_ids)):  # for all runs
        bert_scores_lines = []

        for i, line in enumerate(run):  # for all sentences.
            # Index of the next masked input of this sentence; it only advances
            # for words that were found, because skipped words have no input.
            ii = 0
            bert_scores_words = []

            for j, idx in enumerate(line):
                if idx is None:
                    bert_scores_words.append(None)
                    continue

                inp = ['[CLS]'] + masked_sentence_runs[l][i][ii]
                ii += 1
                indexed_tokens = tokenizer.convert_tokens_to_ids(inp)
                segments_ids = [0] * len(indexed_tokens)

                # Tensors are created on the CPU; append .cuda() here (and move the
                # model) to run on a GPU instead.
                tokens_tensor = torch.tensor([indexed_tokens])
                segments_tensors = torch.tensor([segments_ids])

                # ask bert for predictions for the masked sentence.
                with torch.no_grad():
                    predictions = model(tokens_tensor, segments_tensors)

                orig_word = words[l][i][j]
                orig_id = tokenizer.convert_tokens_to_ids([orig_word])

                # Scores at the masked position; +1 accounts for the prepended '[CLS]'.
                # NOTE(review): presumably a vector of scores over the vocabulary -
                # confirm against the BertForMaskedLM documentation.
                position_scores = predictions[0, idx + 1]

                # get the score of this word.
                score = position_scores[orig_id]
                bert_scores_words.append(score.item())

                if printt:
                    # The top 5 is only needed for the debug output, so it is
                    # computed here, once, instead of for every word.
                    top_scores, top_indices = position_scores.topk(5)
                    predicted_index = top_indices.cpu().numpy()
                    predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)

                    print('>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<< run: ', l,' line: ', i)
                    print(" ")

                    print("sentence:", inp)
                    print("target word:",orig_word)
                    print("token id:",orig_id)
                    print("score given:",score.item())
                    print("top 5 predictions",predicted_token)
                    print("scores of top 5:",top_scores.cpu().numpy())
                    print(" ")

            bert_scores_lines.append(bert_scores_words)

        bert_scores_runs.append(bert_scores_lines)

    return bert_scores_runs

#bert_prediction(test1, test2,True)

Those scores are of course not normalized; they seem to lie somewhere around -3 to 10 and need to be normalized first.
This is done with the calculation (value - minimum) / (maximum - minimum).
The resulting scores are normalized between 0 and 1.

In [None]:
def normalise_bert(bert_scores_runs):
    """Min-max normalise the BERT scores to the range [0, 1].

    Every score is mapped to (score - minimum) / (maximum - minimum), where the
    minimum and maximum are taken over all scores that are not None.

    Args:
        bert_scores_runs: nested list [run][sentence][word] of scores, with
            None for words that have no score.

    Returns:
        A nested list of the same layout with the normalised scores. Words
        without a score get 0. When all scores are equal (so there is no range
        to scale by) every score is mapped to 0.0.
    """
    # flat list of all available scores, used for the statistics below.
    flat_lst = [
        word
        for run in bert_scores_runs
        for line in run
        for word in line
        if word is not None
    ]

    if flat_lst:
        # some prints and precalculating some values
        print('average', sum(flat_lst) / len(flat_lst))
        print('sd', np.std(flat_lst))
        mi = np.min(flat_lst)
        norm = np.max(flat_lst) - mi
    else:
        # No scores at all: nothing to report and nothing to scale.
        mi = 0
        norm = 0

    def _scale(word):
        # Map a single score (or None) to its normalised value.
        if word is None:
            return 0  #### should be 1?
        if norm == 0:
            # All scores are identical; avoid dividing by zero.
            return 0.0
        return (word - mi) / norm

    # applying the normalizations.
    return [
        [[_scale(word) for word in line] for line in run]
        for run in bert_scores_runs
    ]
    

                


In [None]:
# input: method, one of "pearson", "bert" or "static"

def create_disimilarity_values(method, context_size = 1, future_look = False, printt = False, save=True):
    """Create the per-word values for one of the supported methods.

    Args:
        method: "pearson" (correlation with the preceding context), "bert"
            (normalised BERT masked-word scores) or "static" (a constant 1 for
            every word).
        context_size: number of previous sentences given to BERT as context;
            only used for method "bert".
        future_look: whether BERT may see the words after the masked word;
            only used for method "bert".
        printt: print details for every BERT prediction; only used for "bert".
        save: when True, store the result as a .npy file under 'vectors/'.

    Returns:
        A nested list [run][sentence][word] with the values.

    Raises:
        ValueError: if ``method`` is not one of the supported methods.
    """
    # Fail early, before any file is read, instead of hitting an unbound
    # ``result`` further down.
    if method not in ("pearson", "bert", "static"):
        raise ValueError(
            'unknown method {!r}; expected "pearson", "bert" or "static"'.format(method)
        )

    all_runs_words = read_mat("Stimuli/Text")

    if method == "pearson":
        result = broderick_values(all_runs_words)
    elif method == "bert":
        # tokenizer used
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        masked_sentence_runs, all_runs_ids, all_runs = create_masked_input(tokenizer, all_runs_words, context_size, future_look)
        bert_scores_runs = bert_prediction(tokenizer, masked_sentence_runs, all_runs_ids, all_runs, printt)
        result = normalise_bert(bert_scores_runs)
    else:  # "static"
        result = [[[1 for word in sen] for sen in run] for run in all_runs_words]

    if save:
        if method == "bert":
            file_name = 'vectors/' + method + '_' + str(context_size) + '_' + str(future_look) + '.npy'
        else:
            file_name = 'vectors/' + method + '.npy'
        # The nested lists are ragged (sentences differ in length), so an object
        # array is requested explicitly; recent NumPy versions refuse to build an
        # array from ragged sequences otherwise.
        np.save(file_name, np.array(result, dtype=object))

    return result
    
    
    

Below are the commands to create the main variants of the surprisal values discussed in the thesis.

In [None]:
# Each call stores its result under vectors/ (save defaults to True);
# the trailing comment names the file that is written.
# res = create_disimilarity_values("pearson") # pearson.npy
# res = create_disimilarity_values("static") # static.npy

#res = create_disimilarity_values("bert", 0, True) # bert_0_True.npy
# res = create_disimilarity_values("bert", 1, True) # bert_1_True.npy
# res = create_disimilarity_values("bert", 2, True) # bert_2_True.npy
# res = create_disimilarity_values("bert", 3, True) # bert_3_True.npy
# res = create_disimilarity_values("bert", 4, True) # bert_4_True.npy

# res = create_disimilarity_values("bert", 0, False) # bert_0_False.npy
# res = create_disimilarity_values("bert", 1, False) # bert_1_False.npy
# res = create_disimilarity_values("bert", 2, False) # bert_2_False.npy
# res = create_disimilarity_values("bert", 3, False) # bert_3_False.npy
res = create_disimilarity_values("bert", 6, False) # bert_6_False.npy

# test = create_disimilarity_values("static", 5, False, False)