In [1]:
import numpy as np
import pandas as pd
import torch
import pickle
import re
import matplotlib.pyplot as plt
from itertools import chain
from collections import Counter
import torch.nn as nn
import glob
import random
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm
import time

import sys
sys.path.insert(0, '../../src/models/')
sys.path.insert(0, '../../src/features/')
#sys.path.insert(0, '../src/visualization/')

from predict_model import loadBERT
from predict_model import SpanPredictor as classify
from build_features import text_cleaner
from build_features import get_prediction_results
#import visualize as vis

%matplotlib inline

In [2]:
# specify device
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Load the BERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Bert mode
bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
sents = ['The birds has a black bill.',
         'The bill of the bird is black.', 
         'This is something else.',
         'The European has a orange bill.']

In [4]:
class BERT(nn.Module):
    def __init__(self, bert):

        super(BERT, self).__init__()

        # Distil Bert model
        self.bert = bert

    #define the forward pass
    def forward(self, **kwargs):

        #pass the inputs to the model BERT  
        cls_hs = self.bert(**kwargs)
        hidden_state = cls_hs.last_hidden_state

        return hidden_state

In [5]:
model = BERT(bert)

In [6]:
def similarity_matrix(sentence_list):
    
    """
    Calculates a similarity cosine matrix based on a list of
    sentences.
    """
    # Initialize dictionary to store tokenized sentences
    tokens = {'input_ids': [], 'attention_mask': []}

    for sentence in sentence_list:
        # encode each sentence and append to dictionary
        new_tokens = tokenizer.encode_plus(sentence, max_length=128,
                                           truncation=True, padding='max_length',
                                           return_tensors='pt')
        # Drop the batch dimension
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])
    
    # Reformat list of tensors into single tensor
    tokens['input_ids'] = torch.stack(tokens['input_ids'])
    tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
    
    # Get vectors
    hiddenstates = model(**tokens)
    # Sum along first axis
    summed_hs = torch.sum(hiddenstates, 1)
    # Detach
    summed_hs_np = summed_hs.detach().numpy()
    # Get the matrix
    return cosine_similarity(summed_hs_np, summed_hs_np).round(5)

In [7]:
text= """
The brown bear (Ursus arctos) is a large bear species found across Eurasia and North America.[1][2] In North America, the populations of brown bears are called grizzly bears, while the subspecies that inhabits the Kodiak Islands of Alaska is known as the Kodiak bear. It is one of the largest living terrestrial members of the order Carnivora, rivaled in size only by its closest relative, the polar bear (Ursus maritimus), which is much less variable in size and slightly bigger on average.[3][4][5][6][7] The brown bear's range includes parts of Russia, Central Asia, China, Canada, the United States, Hokkaido, Scandinavia, Finland, the Balkans, the Picos de Europa and the Carpathian region (especially Romania), Iran, Anatolia, and the Caucasus.[1][8] The brown bear is recognized as a national and state animal in several European countries.[9]
"""

In [8]:
textlist = text_cleaner(text)

In [11]:
similarity_matrix(textlist)

array([[1.     , 0.84999, 0.77776, 0.84397, 0.76749],
       [0.84999, 1.     , 0.84509, 0.88701, 0.81679],
       [0.77776, 0.84509, 1.     , 0.86323, 0.75792],
       [0.84397, 0.88701, 0.86323, 1.     , 0.84067],
       [0.76749, 0.81679, 0.75792, 0.84067, 1.     ]], dtype=float32)

In [10]:
textlist

['It is one of the largest living terrestrial members of the order Carnivora, rivaled in size only by its closest relative, the polar bear , which is much less variable in size and slightly bigger on average.',
 'In North America, the populations of brown bears are called grizzly bears, while the subspecies that inhabits the Kodiak Islands of Alaska is known as the Kodiak bear.',
 "The brown bear's range includes parts of Russia, Central Asia, China, Canada, the United States, Hokkaido, Scandinavia, Finland, the Balkans, the Picos de Europa and the Carpathian region , Iran, Anatolia, and the Caucasus.",
 'The brown bear  is a large bear species found across Eurasia and North America.',
 'The brown bear is recognized as a national and state animal in several European countries.']