In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import torch
import logging
logging.basicConfig(level=logging.ERROR)
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [2]:
!pip install transformers==2.8.0

Collecting transformers==2.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 8.6MB/s 
[?25hCollecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/57/3d/386cc84db1e57aa7782eed00bcbdb884e496bdb1689c7f4c09a22572846d/boto3-1.17.35-py2.py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 18.5MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d6/e3/5e49e9a83fb605aaa34a1c1173e607302fecae529428c28696fb18f1c2c9/tokenizers-0.5.2-cp37-cp37m-manylinux1_x86_64.whl (5.6MB)
[K     |████████████████████████████████| 5.6MB 18.4MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl 

In [3]:
!pip install tensorflow==1.13.1

Collecting tensorflow==1.13.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/29/6b4f1e02417c3a1ccc85380f093556ffd0b35dc354078074c5195c8447f2/tensorflow-1.13.1-cp37-cp37m-manylinux1_x86_64.whl (92.6MB)
[K     |████████████████████████████████| 92.6MB 32kB/s 
Collecting tensorboard<1.14.0,>=1.13.0
[?25l  Downloading https://files.pythonhosted.org/packages/0f/39/bdd75b08a6fba41f098b6cb091b9e8c7a80e1b4d679a581a0ccd17b10373/tensorboard-1.13.1-py3-none-any.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 49.1MB/s 
Collecting tensorflow-estimator<1.14.0rc0,>=1.13.0
[?25l  Downloading https://files.pythonhosted.org/packages/bb/48/13f49fc3fa0fdf916aa1419013bb8f2ad09674c275b4046d5ee669a46873/tensorflow_estimator-1.13.0-py2.py3-none-any.whl (367kB)
[K     |████████████████████████████████| 368kB 35.8MB/s 
Collecting keras-applications>=1.0.6
[?25l  Downloading https://files.pythonhosted.org/packages/71/e3/19762fdfc62877ae9102edf6342d71b28fbfd9dea3d2f96a882ce099b03

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!cp -r "/content/drive/MyDrive/Information_retrieval_project/khan_acad/model_save_categorized_reduced_khan_acad" /content

In [6]:
!cp "/content/drive/MyDrive/Information_retrieval_project/khan_acad/train_khan_acad.csv" /content
!cp "/content/drive/MyDrive/Information_retrieval_project/khan_acad/test_khan_acad.csv" /content
!cp "/content/drive/MyDrive/Information_retrieval_project/khan_acad/val_khan_acad.csv" /content


In [7]:
import pandas as pd
train_data = pd.read_csv("train_khan_acad.csv")
test_data = pd.read_csv("test_khan_acad.csv")
val_data = pd.read_csv("val_khan_acad.csv")
train_data


Unnamed: 0,video_transcripts,hierarchy
0,In the last couple of videos we saw that we c...,math>>multivariable-calculus>>multivariable-de...
1,- What we're going to do in this video is gi...,science>>ap-biology>>natural-selection
2,"So once again, we have three equal, or we say...",math>>pre-algebra>>pre-algebra-equations-expre...
3,- Liz's math test included a survey question...,math>>engageny-alg-1>>alg1-2
4,- The following two equations form a linear s...,math>>algebra-home>>alg-system-of-equations
...,...,...
4183,- Hello everyone. So this is what I might ca...,math>>multivariable-calculus>>multivariable-de...
4184,- Let's try now to subtract some two-digit n...,math>>early-math>>cc-early-math-add-sub-100
4185,- Let's say that I have a circle. My best att...,math>>engageny-geo>>geo-5
4186,- So let's look at the female reproductive cy...,science>>health-and-medicine>>human-anatomy-an...


In [8]:
from google.colab import files

In [9]:
import re
def clean_sentence(question):
  # print(question)
  question = re.sub('<[^>]*>', ' ',question)
  question = re.sub(' +', ' ', question)
  question = re.sub('\xa0','',question)
  question = question.rstrip()
  question = re.sub('nan','',question)
  question = re.sub(u'\u2004','',question)
  question = re.sub(u'\u2009','',question)

  # question = question.decode("utf-8")
  # question = question.replace(u'\u200\d*','').encode("utf-8")
  question = re.sub('&nbsp','',question)
  question = re.sub('&ndash','',question)
  question = re.sub('\r','',question)
  question = re.sub('\t','',question)
  question = re.sub('\n',' ',question)

  question = re.sub('MathType@.*','',question)
  question = re.sub('&thinsp','',question)
  question = re.sub('&times','',question)
  question = re.sub('\u200b','',question)
  question = re.sub('&rarr;;;','',question)

  return question

In [10]:
train_data["hierarchy"].value_counts()

science>>health-and-medicine>>circulatory-system-diseases     99
science>>health-and-medicine>>human-anatomy-and-physiology    65
science>>health-and-medicine>>respiratory-system-diseases     55
science>>health-and-medicine>>circulatory-system              54
science>>health-and-medicine>>infectious-diseases             52
                                                              ..
math>>engageny-geo>>geo-3                                      1
science>>ap-physics-1>>ap-one-dimensional-motion               1
science>>ap-physics-1>>ap-forces-newtons-laws                  1
math>>old-ap-calculus-ab>>ab-existence-theorems                1
math>>old-integral-calculus>>riemann-sums-ic                   1
Name: hierarchy, Length: 569, dtype: int64

In [11]:
# final_data_1 = final_data.loc[0:71003,:]
# final_data_1

In [12]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [13]:
test_data["hierarchy"].value_counts()

science>>health-and-medicine>>human-anatomy-and-physiology    24
science>>health-and-medicine>>circulatory-system-diseases     22
science>>health-and-medicine>>circulatory-system              17
math>>algebra-home>>alg-polynomials                           11
science>>health-and-medicine>>infectious-diseases             11
                                                              ..
math>>engageny-alg-1>>alg1-1                                   1
math>>precalculus>>x9e81a4f98389efdf:complex                   1
science>>physics>>thermodynamics                               1
math>>geometry-home>>geometry-coordinate-plane                 1
math>>old-integral-calculus>>riemann-sums-ic                   1
Name: hierarchy, Length: 416, dtype: int64

In [14]:

from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
LE.fit_transform(pd.concat([train_data['hierarchy'],test_data['hierarchy']]))
train_data['label'] = LE.transform(train_data['hierarchy'])
train_data.head()

Unnamed: 0,video_transcripts,hierarchy,label
0,In the last couple of videos we saw that we c...,math>>multivariable-calculus>>multivariable-de...,354
1,- What we're going to do in this video is gi...,science>>ap-biology>>natural-selection,422
2,"So once again, we have three equal, or we say...",math>>pre-algebra>>pre-algebra-equations-expre...,384
3,- Liz's math test included a survey question...,math>>engageny-alg-1>>alg1-2,231
4,- The following two equations form a linear s...,math>>algebra-home>>alg-system-of-equations,99


In [15]:
def get_labels(prediction):
    predicted_label =  LE.inverse_transform([prediction])
    return predicted_label[0]

In [16]:
get_labels(204)

'math>>cc-seventh-grade-math>>cc-7th-fractions-decimals'

In [17]:
train_data.iloc[14,1]

'economics-finance-domain>>macroeconomics>>monetary-system-topic'

In [18]:
train_data

Unnamed: 0,video_transcripts,hierarchy,label
0,In the last couple of videos we saw that we c...,math>>multivariable-calculus>>multivariable-de...,354
1,- What we're going to do in this video is gi...,science>>ap-biology>>natural-selection,422
2,"So once again, we have three equal, or we say...",math>>pre-algebra>>pre-algebra-equations-expre...,384
3,- Liz's math test included a survey question...,math>>engageny-alg-1>>alg1-2,231
4,- The following two equations form a linear s...,math>>algebra-home>>alg-system-of-equations,99
...,...,...,...
4183,- Hello everyone. So this is what I might ca...,math>>multivariable-calculus>>multivariable-de...,354
4184,- Let's try now to subtract some two-digit n...,math>>early-math>>cc-early-math-add-sub-100,226
4185,- Let's say that I have a circle. My best att...,math>>engageny-geo>>geo-5,240
4186,- So let's look at the female reproductive cy...,science>>health-and-medicine>>human-anatomy-an...,497


In [19]:
# LE_test = LabelEncoder()

test_data['label'] = LE.transform(test_data['hierarchy'])
test_data.head()

Unnamed: 0,video_transcripts,hierarchy,label
0,- What I hope to do in this video is get fam...,math>>math1>>x89d82521517266d4:functions,335
1,In the last video we were able to set up this...,math>>old-ap-calculus-ab>>ab-applications-defi...,357
2,- In previous videos we talk about GDP as th...,economics-finance-domain>>ap-macroeconomics>>e...,3
3,- So what we're gonna do in this video is se...,math>>old-integral-calculus>>definite-integral...,378
4,- So I've said that if you have a vector fie...,math>>multivariable-calculus>>multivariable-de...,354


In [20]:
val_data['label'] = LE.transform(val_data['hierarchy'])
val_data.head()

Unnamed: 0,video_transcripts,hierarchy,label
0,Find the probability of rolling doubles on tw...,math>>precalculus>>x9e81a4f98389efdf:prob-comb,395
1,"After the food is swallowed, it leaves the m...",science>>health-and-medicine>>human-anatomy-an...,497
2,Let's now talk about what is easily one of th...,math>>geometry>>hs-geo-trig,256
3,The goal in this video is to essentially prov...,science>>chemistry>>thermodynamics-chemistry,472
4,"A line goes through the points (-1, 6) and (5...",math>>in-in-grade-11-ncert>>in-in-class11-stra...,304


In [21]:
train_features, test_features, train_labels, test_labels = train_data["video_transcripts"],test_data["video_transcripts"],train_data["label"],test_data["label"]
val_features,val_labels = val_data["video_transcripts"], val_data["label"]

In [22]:
train_labels.value_counts()

489    99
497    65
505    55
488    54
498    52
       ..
382     1
359     1
195     1
471     1
216     1
Name: label, Length: 569, dtype: int64

In [23]:
test_labels.value_counts()

497    24
489    22
488    17
93     11
485    11
       ..
192     1
191     1
398     1
187     1
291     1
Name: label, Length: 416, dtype: int64

In [24]:
get_labels(268)

'math>>in-in-class-3rd-math-cbse>>x80b2f4aa70819288:represent-and-interpret-data'

In [25]:
question_answer = train_features.values
categories = train_labels.values

In [26]:
question_answer

array([" In the last couple of videos we saw that we can describe a curves by a position vector-valued function. And in very general terms, it would be the x position as a function of time times the unit vector in the horizontal direction. Plus the y position as a function of time times the unit victor in the vertical direction. And this will essentially describe this-- though, if you can imagine a particle and let's say the parameter t represents time. It'll describe where the particle is at any given time. And if we wanted a particular curve we can say, well, this only applies for some curve-- we're dealing, it's r of t. And it's only applicable between t being greater than a and less than b. And you know, that would describe some curve in two dimensions. Just me just draw it here. This is all a review of really, the last two videos. So this curve, it might look something like that where this is where t is equal to a. That's where t is equal to b. And so r of a will be this vector ri

In [27]:
len(categories)

4188

In [28]:
input_ids = []
attention_masks = []

for sent in question_answer:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        truncation=True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)


# Print sentence 0, now as a list of IDs.
print('Original: ', question_answer[0])
print('Token IDs:', input_ids[0])

Original:   In the last couple of videos we saw that we can describe a curves by a position vector-valued function. And in very general terms, it would be the x position as a function of time times the unit vector in the horizontal direction. Plus the y position as a function of time times the unit victor in the vertical direction. And this will essentially describe this-- though, if you can imagine a particle and let's say the parameter t represents time. It'll describe where the particle is at any given time. And if we wanted a particular curve we can say, well, this only applies for some curve-- we're dealing, it's r of t. And it's only applicable between t being greater than a and less than b. And you know, that would describe some curve in two dimensions. Just me just draw it here. This is all a review of really, the last two videos. So this curve, it might look something like that where this is where t is equal to a. That's where t is equal to b. And so r of a will be this vector

In [29]:
input_ids_val = []
attention_masks_val = []

for sent in val_features:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        truncation=True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids_val.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks_val.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids_val = torch.cat(input_ids_val, dim=0)
attention_masks_val = torch.cat(attention_masks_val, dim=0)


# Print sentence 0, now as a list of IDs.
print('Original: ', question_answer[0])
print('Token IDs:', input_ids[0])

Original:   In the last couple of videos we saw that we can describe a curves by a position vector-valued function. And in very general terms, it would be the x position as a function of time times the unit vector in the horizontal direction. Plus the y position as a function of time times the unit victor in the vertical direction. And this will essentially describe this-- though, if you can imagine a particle and let's say the parameter t represents time. It'll describe where the particle is at any given time. And if we wanted a particular curve we can say, well, this only applies for some curve-- we're dealing, it's r of t. And it's only applicable between t being greater than a and less than b. And you know, that would describe some curve in two dimensions. Just me just draw it here. This is all a review of really, the last two videos. So this curve, it might look something like that where this is where t is equal to a. That's where t is equal to b. And so r of a will be this vector

In [30]:
print('Original: ', question_answer[1])
print('Token IDs:', input_ids[1])

Original:   -  What we're going to do in this video is give ourselves a little bit of a tour of eukaryotic cells. And the first place to start is just to remind ourselves what it means for a cell to be eukaryotic. It means that inside the cell, there are membrane-bound organelles. Now, what does that mean? Well, you could view it as sub-compartments within the cell. Membrane-bound organelles. And in this video in particular, we're going to highlight some of these membrane-bound organelles that make the cells eukaryotic. So let's just start with some of the ingredients that we know is true of all cells. So you'll have your cellular membrane here. I drew it big, so that we have a lot of space to draw things in. So this is our cellular membrane. I'll do some nice shading so you appreciate that it'll actually be three-dimensional. We see so many slices of cells that sometimes we forget that they are more spherical, or that they have three-dimensional shape to them. They're not all spherica

In [31]:
labels = torch.tensor(categories)

In [32]:
get_labels(419)

'science>>ap-biology>>ecology-ap'

In [33]:
get_labels(311)

'math>>in-in-grade-12-ncert>>in-in-determinants'

In [34]:
num_classes = len(list(set(categories)))
num_classes

569

In [35]:
from torch.utils.data import TensorDataset, random_split
# train_poincare_tensor = torch.tensor(poincare_embeddings_final,dtype=torch.float)
# train_poincare_tensor = torch.tensor(poincare_embeddings_final_train,dtype=torch.float)
# val_poincare_tensor = torch.tensor(poincare_embeddings_final_val, dtype=torch.float)
train_labels = torch.tensor(categories)
val_labels = torch.tensor(val_labels.values)
# Combine the training inputs into a TensorDataset.
train_dataset = TensorDataset(input_ids, attention_masks, train_labels)
val_dataset = TensorDataset(input_ids_val,attention_masks_val,val_labels)


In [36]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


In [37]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
batch_size = 32
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), 
            batch_size = batch_size 
        )

In [38]:

# run this cell to prepare model for inference
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Loads BertForSequenceClassification, the pretrained BERT model with a single 
model = BertForSequenceClassification.from_pretrained(
    "model_save_categorized_reduced_khan_acad", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 572,   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = True, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [39]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [40]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [41]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [42]:
test_features = test_features.values
labels = test_labels.values

In [43]:
test_features

array([' -  What I hope to do in this video is get familiar with the notion of an interval, and also think about ways that we can show an interval, or interval notation. Right over here I have a number line. Let\'s say I wanted to talk about the interval on the number line that goes from negative three to two. So I care about this-- Let me use a different color. Let\'s say I care about this interval right over here. I care about all the numbers from negative three to two. So in order to be more precise, I have to be clear. Am I including negative three and two, or am I not including negative three and two, or maybe I\'m just including one of them. So if I\'m including negative three and two, then I would fill them in. So this right over here, I\'m filling negative three and two in, which means that negative three and two are part of this interval. And when you include the endpoints, this is called a closed interval. Closed interval. And I just showed you how I can depict it on a number

In [44]:
train_labels = train_data["hierarchy"].values
len(train_labels)

4188

In [45]:
len(input_ids)

4188

In [46]:
# train_embeddings = []
# with torch.no_grad():
#   outputs = model(input_ids.to(device),attention_masks.to(device))
# train_embeddings = torch.mean(outputs[1][0].squeeze(),dim=1)
# train_embeddings.shape

In [282]:
import numpy as np
class_emb = {cls:[] for cls in list(set(train_data["hierarchy"].values))}
for index,label in enumerate(list(set(train_data["hierarchy"].values))):
  sample_indices_for_label = np.where(train_labels == label)[0][:2]
  input_ids_for_class = input_ids[sample_indices_for_label]
  attention_masks_class = attention_masks[sample_indices_for_label]
  input_ids_for_class = input_ids_for_class.to(device)
  attention_masks_class = attention_masks_class.to(device)
  
  with torch.no_grad():
    outputs = model(input_ids_for_class,attention_masks_class)
  class_emb[label] = torch.cat((outputs[1][-3][0][0],outputs[1][-5][0][0],outputs[1][-6][0][0]),dim=-1)


In [283]:
class_keys = [item[0] for item in class_emb.items()]

In [284]:
class_values = [item[1] for item in class_emb.items()]

In [285]:
class_keys = np.array(class_keys)

In [286]:
class_prototype_embeddings = torch.stack(class_values,dim=0)

In [287]:
class_prototype_embeddings.shape

torch.Size([569, 2304])

In [288]:
class_keys[0]

'math>>old-integral-calculus>>riemann-sums-ic'

In [289]:

len(input_ids)

4188

In [290]:
test_features

array([' -  What I hope to do in this video is get familiar with the notion of an interval, and also think about ways that we can show an interval, or interval notation. Right over here I have a number line. Let\'s say I wanted to talk about the interval on the number line that goes from negative three to two. So I care about this-- Let me use a different color. Let\'s say I care about this interval right over here. I care about all the numbers from negative three to two. So in order to be more precise, I have to be clear. Am I including negative three and two, or am I not including negative three and two, or maybe I\'m just including one of them. So if I\'m including negative three and two, then I would fill them in. So this right over here, I\'m filling negative three and two in, which means that negative three and two are part of this interval. And when you include the endpoints, this is called a closed interval. Closed interval. And I just showed you how I can depict it on a number

In [291]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
test_input_ids = []
test_attention_masks = []
for sent in test_features:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        truncation=True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    test_input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    test_attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)
labels = torch.tensor(labels)

# Set the batch size.  
batch_size = 32  
# test_poincare_tensor = torch.tensor(taxonomy_vectors,dtype=torch.float)

# Create the DataLoader.
prediction_data = TensorDataset(test_input_ids, test_attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [292]:
test_poincare_tensor = class_prototype_embeddings# torch.tensor(taxonomy_vectors,dtype=torch.float)


In [293]:
test_labels = np.array(test_labels)

In [294]:
test_labels[0]

'math>>old-integral-calculus>>riemann-sums-ic'

In [295]:


len(input_ids)

4188

In [314]:
print('Predicting labels for {:,} test sentences...'.format(len(test_input_ids)))

# Put model in evaluation mode
model.eval()
cos = torch.nn.CosineSimilarity(dim=-1, eps=1e-6)

test_input_ids = test_input_ids.to('cuda')
test_attention_masks = test_attention_masks.to('cuda')
class_prototype_embeddings = class_prototype_embeddings.to('cuda')
# Tracking variables1
predictions , true_labels = [], []
for input_id,attention_mask in zip(test_input_ids, test_attention_masks):
  with torch.no_grad():
    outputs = model(input_id.reshape(1,-1),attention_mask.reshape(1,-1))
  # print(torch.mean(outputs[1][0].squeeze(),dim=0).shape)
  distances = cos(torch.cat((outputs[1][-3][0][0], outputs[1][-5][0][0],outputs[1][-6][0][0])),class_prototype_embeddings)
  distances,indices = torch.topk(distances,10,largest=True)
  predictions.append(class_keys[indices.cpu().numpy()])
print(len(predictions))

Predicting labels for 1,047 test sentences...
1047


In [315]:
labels=test_data['label'].values

In [316]:
labels

array([335, 357,   3, ..., 537, 395, 450])

In [317]:
labels

array([335, 357,   3, ..., 537, 395, 450])

In [318]:
final_predictions = []
for prediction in predictions:
  final_predictions.append(LE.transform(prediction))


In [301]:

import tensorflow as tf
y_true = np.array(labels)
y_true = tf.identity(y_true)
y_pred = np.array(final_predictions)
y_pred = tf.identity(y_pred)
print(y_pred.shape,y_true.shape)
k = 5
recall, update_recall = tf.compat.v1.metrics.recall_at_top_k(y_true, y_pred, 5)
precision, update_precision = tf.compat.v1.metrics.precision_at_top_k(y_true, y_pred, 5)

tmp_rank = tf.nn.top_k(y_pred, 5)
stream_vars = [i for i in tf.local_variables()]

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    print("precision",sess.run(update_precision))
    # print("precision",sess.run(precision))

    print("update_recall: ",sess.run(update_recall ))
    print("recall",sess.run(recall))

    print("STREAM_VARS: ",(sess.run(stream_vars)))
    print("TMP_RANK: ",sess.run(tmp_rank))

(1047, 5) (1047,)
precision 0.06208213944603629
update_recall:  0.3104106972301815
recall 0.3104106972301815
STREAM_VARS:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 325.0, 722.0, 325.0, 4910.0]
TMP_RANK:  TopKV2(values=array([[245, 162, 135, 107,  78],
       [357, 313, 298, 158, 127],
       [397, 346,   7,   3,   1],
       ...,
       [413, 385, 299, 256,  39],
       [406, 385, 278, 126,  39],
       [508, 506, 473, 457, 450]]), indices=array([[1, 0, 3, 2, 4],
       [3, 1, 2, 4, 0],
       [1, 2, 3, 0, 4],
       ...,
       [1, 0, 4, 2, 3],
       [0, 4, 1, 2, 3],
       [3, 1, 2, 0, 4]],

Following four cells show metrics Recall@5, R@10, R@15, R@20 for prototype inspired baseline code above


In [None]:
import tensorflow as tf
y_true = np.array(labels)
y_true = tf.identity(y_true)
y_pred = np.array(final_predictions)
y_pred = tf.identity(y_pred)
print(y_pred.shape,y_true.shape)
k = 8
recall, update_recall = tf.compat.v1.metrics.recall_at_top_k(y_true, y_pred, 5)
precision, update_precision = tf.compat.v1.metrics.precision_at_top_k(y_true, y_pred, 5)

tmp_rank = tf.nn.top_k(y_pred, 5)
stream_vars = [i for i in tf.local_variables()]

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    print("precision",sess.run(update_precision))
    # print("precision",sess.run(precision))

    print("update_recall: ",sess.run(update_recall ))
    print("recall",sess.run(recall))

    print("STREAM_VARS: ",(sess.run(stream_vars)))
    print("TMP_RANK: ",sess.run(tmp_rank))

(1047, 5) (1047,)
precision 0.05291308500477555
update_recall:  0.26456542502387775
recall 0.26456542502387775
STREAM_VARS:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 277.0, 770.0, 277.0, 4958.0]
TMP_RANK:  TopKV2(values=array([[411, 192, 162, 115, 107],
       [347, 337, 298, 170, 158],
       [ 17,  14,  11,   3,   1],
       ...,
       [413, 298, 265, 170, 158],
       [284, 273, 158,  88,  32],
       [506, 465, 457, 451, 448]]), indices=array([[3, 0, 2, 4, 1],
       [2, 3, 0, 1, 4],
       [2, 4, 1, 0, 3],
       ...,
       [4, 1, 3, 0, 2],
       [3, 1, 2, 4, 0],
       [1, 2, 0, 3, 4]], dtype=int32))


In [320]:
import tensorflow as tf
y_true = np.array(labels)
y_true = tf.identity(y_true)
y_pred = np.array(final_predictions)
y_pred = tf.identity(y_pred)
print(y_pred.shape,y_true.shape)
k = 8
recall, update_recall = tf.compat.v1.metrics.recall_at_top_k(y_true, y_pred, 10)
precision, update_precision = tf.compat.v1.metrics.precision_at_top_k(y_true, y_pred, 10)

tmp_rank = tf.nn.top_k(y_pred, 10)
stream_vars = [i for i in tf.local_variables()]

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    print("precision",sess.run(update_precision))
    # print("precision",sess.run(precision))

    print("update_recall: ",sess.run(update_recall ))
    print("recall",sess.run(recall))

    print("STREAM_VARS: ",(sess.run(stream_vars)))
    print("TMP_RANK: ",sess.run(tmp_rank))

(1047, 10) (1047,)
precision 0.03925501432664757
update_recall:  0.39255014326647564
recall 0.39255014326647564
STREAM_VARS:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 411.0, 636.0, 411.0, 10059.0]
TMP_RANK:  TopKV2(values=array([[356, 245, 210, ...,  78,  63,  30],
       [381, 380, 358, ..., 158, 127,  66],
       [397, 346,  19, ...,   5,   3,   1],
       ...,
       [413, 385, 357, ..., 158, 127,  39],
       [413, 406, 385, ..., 126,  95,  39],
       [508, 506, 473, ..., 442, 422, 397]]), indices=array([[8, 1

In [313]:
import tensorflow as tf
y_true = np.array(labels)
y_true = tf.identity(y_true)
y_pred = np.array(final_predictions)
y_pred = tf.identity(y_pred)
print(y_pred.shape,y_true.shape)
k = 8
recall, update_recall = tf.compat.v1.metrics.recall_at_top_k(y_true, y_pred, 15)
precision, update_precision = tf.compat.v1.metrics.precision_at_top_k(y_true, y_pred, 15)

tmp_rank = tf.nn.top_k(y_pred, 15)
stream_vars = [i for i in tf.local_variables()]

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    print("precision",sess.run(update_precision))
    # print("precision",sess.run(precision))

    print("update_recall: ",sess.run(update_recall ))
    print("recall",sess.run(recall))

    print("STREAM_VARS: ",(sess.run(stream_vars)))
    print("TMP_RANK: ",sess.run(tmp_rank))

(1047, 15) (1047,)
precision 0.029290035020694046
update_recall:  0.4393505253104107
recall 0.4393505253104107
STREAM_VARS:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 460.0, 587.0, 460.0, 15245.0]
TMP_RANK:  TopKV2(values=array([[411, 356, 263, ...,  78,  63,  30],
       [381, 380, 364, ..., 127,  71,  66],
       [540, 506, 397, ...,   5,   3,   1],
       ...,
       [413, 385, 357, ..., 127, 100,  39],
       [413, 406, 385, ..., 126,  95,  39],
       [508, 506, 473, ..., 416, 397,  19]]), indices=array([[10,  8, 14, ...,  4,  6,  5],
       [ 8,  9,

In [None]:
import tensorflow as tf
y_true = np.array(labels)
y_true = tf.identity(y_true)
y_pred = np.array(final_predictions)
y_pred = tf.identity(y_pred)
print(y_pred.shape,y_true.shape)
k = 8
recall, update_recall = tf.compat.v1.metrics.recall_at_top_k(y_true, y_pred, 20)
precision, update_precision = tf.compat.v1.metrics.precision_at_top_k(y_true, y_pred, 20)

tmp_rank = tf.math.top_k(y_pred, 20,sorted=False)
stream_vars = [i for i in tf.local_variables()]

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    print("precision",sess.run(update_precision))
    # print("precision",sess.run(precision))

    print("update_recall: ",sess.run(update_recall ))
    print("recall",sess.run(recall))

    print("STREAM_VARS: ",(sess.run(stream_vars)))
    print("TMP_RANK: ",sess.run(y_pred))

(1047, 20) (1047,)
precision 0.020678127984718242
update_recall:  0.41356255969436484
recall 0.41356255969436484
STREAM_VARS:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 433.0, 614.0, 433.0, 20507.0]
TMP_RANK:  [[192 107 162 ... 383 114 356]
 [298 170 347 ... 225 211 358]
 [  3  11  17 ... 383 397 506]
 ...
 [170 298 158 ... 284 364 249]
 [ 32 273 158 ... 170 413 358]
 [457 506 465 ... 167 479   9]]


In [307]:
import tensorflow as tf
y_true = np.array(labels)
y_true = tf.identity(y_true)
y_pred = np.array(final_predictions)
y_pred = tf.identity(y_pred)
print(y_pred.shape,y_true.shape)
k = 8
recall, update_recall = tf.compat.v1.metrics.recall_at_top_k(y_true, y_pred, 20)
precision, update_precision = tf.compat.v1.metrics.precision_at_top_k(y_true, y_pred, 20)

tmp_rank = tf.math.top_k(y_pred, 20,sorted=False)
stream_vars = [i for i in tf.local_variables()]

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    print("precision",sess.run(update_precision))
    # print("precision",sess.run(precision))

    print("update_recall: ",sess.run(update_recall ))
    print("recall",sess.run(recall))

    print("STREAM_VARS: ",(sess.run(stream_vars)))
    print("TMP_RANK: ",sess.run(y_pred))

(1047, 20) (1047,)
precision 0.023686723973256926
update_recall:  0.47373447946513847
recall 0.47373447946513847
STREAM_VARS:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 496.0, 551.0, 496.0, 20444.0]
TMP_RANK:  [[162 245 107 ... 342 255 334]
 [127 313 298 ... 297 222  87]
 [  3 397 346 ... 405 368 167]
 ...
 [385 413 256 ...  40 384 287]
 [406 278 126 ... 185 287  92]
 [457 506 473 ... 437 274 148]]


In [321]:
def get_cleaned_taxonomy(taxonomy):
  cleaned_taxonomy = []
  for value in taxonomy:
      value = ' '.join(value.lower().split(">>"))
      # taxonomy_words = [inflection.singularize(val)  for token in value for val in token.split(" ") if val.isalpha()]
      cleaned_taxonomy.append( value )
  return cleaned_taxonomy
test_labels = list(set(test_data["hierarchy"].values))
test_emb_data = get_cleaned_taxonomy(test_labels)

In [322]:
label_input_ids = []
label_attention_masks = []
for sent in test_emb_data:

    label_encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 256,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        truncation=True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    label_input_ids.append(label_encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    label_attention_masks.append(label_encoded_dict['attention_mask'])

In [323]:
import numpy as np
taxonomy_vectors = []
for label_input_id,label_att_mask in zip(label_input_ids,label_attention_masks):
    label_input_id = label_input_id.to(device)
    label_att_mask = label_att_mask.to(device)
    with torch.no_grad():
      outputs = model(label_input_id.reshape(1,-1),label_att_mask.reshape(1,-1))
    taxonomy_vectors.append(torch.cat((outputs[1][-2][0][0], outputs[1][-3][0][0],outputs[1][-4][0][0])).cpu().numpy())
taxonomy_vectors = np.vstack(taxonomy_vectors)
taxonomy_vectors.shape


(416, 2304)

In [324]:
labels

array([335, 357,   3, ..., 537, 395, 450])

In [325]:
test_labels = np.array(test_labels)

In [326]:
print('Predicting labels for {:,} test sentences...'.format(len(test_input_ids)))

# Put model in evaluation mode
model.eval()
cos = torch.nn.CosineSimilarity(dim=-1, eps=1e-6)

test_input_ids = test_input_ids.to('cuda')
test_attention_masks = test_attention_masks.to('cuda')
taxonomy_vectors = torch.tensor(taxonomy_vectors,dtype=torch.float).to('cuda')
# Tracking variables1
predictions , true_labels = [], []
for input_id,attention_mask in zip(test_input_ids, test_attention_masks):
  with torch.no_grad():
    outputs = model(input_id.reshape(1,-1),attention_mask.reshape(1,-1))
  # print(torch.mean(outputs[1][0].squeeze(),dim=0).shape)
  distances = cos(torch.cat((outputs[1][-1][0][0], outputs[1][-2][0][0],outputs[1][-3][0][0])),taxonomy_vectors)
  distances,indices = torch.topk(distances,20,largest=True)
  predictions.append(test_labels[indices.cpu().numpy()])
print(len(predictions))

Predicting labels for 1,047 test sentences...
1047


In [327]:
final_predictions = []
for prediction in predictions:
  final_predictions.append(LE.transform(prediction))


In [122]:
import tensorflow as tf
y_true = np.array(labels)
y_true = tf.identity(y_true)
y_pred = np.array(final_predictions)
y_pred = tf.identity(y_pred)
print(y_pred.shape,y_true.shape)
k = 8
recall, update_recall = tf.compat.v1.metrics.recall_at_top_k(y_true, y_pred, 5)
precision, update_precision = tf.compat.v1.metrics.precision_at_top_k(y_true, y_pred, 5)

tmp_rank = tf.nn.top_k(y_pred, 5)
stream_vars = [i for i in tf.local_variables()]

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    print("precision",sess.run(update_precision))
    # print("precision",sess.run(precision))

    print("update_recall: ",sess.run(update_recall ))
    print("recall",sess.run(recall))

    print("STREAM_VARS: ",(sess.run(stream_vars)))
    print("TMP_RANK: ",sess.run(tmp_rank))

(1047, 5) (1047,)
precision 0.01585482330468004
update_recall:  0.07927411652340019
recall 0.07927411652340019
STREAM_VARS:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 83.0, 964.0, 83.0, 5152.0]
TMP_RANK:  TopKV2(values=array([[297, 294, 276, 202, 182],
       [344, 335, 331, 124, 115],
       [ 21,  16,  15,  14,   8],
       ...,
       [344, 339, 297, 294, 202],
       [294, 211, 210, 202, 182],
       [513, 459, 457, 450, 330]]), indices=array([[0, 2, 3, 1, 4],
       [0, 3, 1, 2, 4],
       [2, 1, 0, 3, 4],
       ...,
       [1, 3, 4, 0, 2],
       [1, 3, 2, 4, 0],
       [2, 3, 1, 0, 4]], dtype=int32))


In [None]:
import tensorflow as tf
y_true = np.array(labels)
y_true = tf.identity(y_true)
y_pred = np.array(final_predictions)
y_pred = tf.identity(y_pred)
print(y_pred.shape,y_true.shape)
k = 8
recall, update_recall = tf.compat.v1.metrics.recall_at_top_k(y_true, y_pred, 10)
precision, update_precision = tf.compat.v1.metrics.precision_at_top_k(y_true, y_pred, 10)

tmp_rank = tf.nn.top_k(y_pred, 10)
stream_vars = [i for i in tf.local_variables()]

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    print("precision",sess.run(update_precision))
    # print("precision",sess.run(precision))

    print("update_recall: ",sess.run(update_recall ))
    print("recall",sess.run(recall))

    print("STREAM_VARS: ",(sess.run(stream_vars)))
    print("TMP_RANK: ",sess.run(tmp_rank))

(1047, 10) (1047,)
precision 0.013467048710601719
update_recall:  0.1346704871060172
recall 0.1346704871060172
STREAM_VARS:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 141.0, 906.0, 141.0, 10329.0]
TMP_RANK:  TopKV2(values=array([[297, 294, 289, ..., 210, 202, 182],
       [348, 346, 344, ..., 117, 116, 115],
       [ 23,  21,  16, ...,   6,   5,   2],
       ...,
       [344, 342, 339, ..., 290, 202,  74],
       [410, 407, 400, ..., 210, 202, 182],
       [513, 459, 458, ..., 336, 330, 284]]), indices=array([[0, 2, 5, ..., 9, 1, 4],
       [5, 7, 0, ..., 8, 6, 4],
       [8, 2, 1, ..., 6, 9, 5],
       ...,
       [1, 5, 3, ..., 8, 2, 9],
       [5, 9, 7, ..., 2, 4, 0],
       [2, 3, 7, ..., 8, 4, 9]], dtype=int32))


In [None]:
import tensorflow as tf
y_true = np.array(labels)
y_true = tf.identity(y_true)
y_pred = np.array(final_predictions)
y_pred = tf.identity(y_pred)
print(y_pred.shape,y_true.shape)
k = 8
recall, update_recall = tf.compat.v1.metrics.recall_at_top_k(y_true, y_pred, 15)
precision, update_precision = tf.compat.v1.metrics.precision_at_top_k(y_true, y_pred, 15)

tmp_rank = tf.nn.top_k(y_pred, 15)
stream_vars = [i for i in tf.local_variables()]

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    print("precision",sess.run(update_precision))
    # print("precision",sess.run(precision))

    print("update_recall: ",sess.run(update_recall ))
    print("recall",sess.run(recall))

    print("STREAM_VARS: ",(sess.run(stream_vars)))
    print("TMP_RANK: ",sess.run(tmp_rank))

(1047, 15) (1047,)
precision 0.011397644062400509
update_recall:  0.17096466093600765
recall 0.17096466093600765
STREAM_VARS:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 179.0, 868.0, 179.0, 15526.0]
TMP_RANK:  TopKV2(values=array([[297, 294, 289, ..., 185, 182,  74],
       [348, 346, 345, ..., 117, 116, 115],
       [ 23,  21,  18, ...,   3,   2,   0],
       ...,
       [344, 342, 339, ..., 261, 202,  74],
       [410, 407, 400, ..., 202, 195, 182],
       [513, 461, 459, ..., 284, 281, 276]]), indices=array([[ 0,  2,  5, ..., 13,  4, 11],
       [ 5,  7, 12, ...,  8,  6,  4],
       [ 8,  2, 14, ..., 12,  5, 11],
       ...,
       [ 1,  5,  3, ..., 14,  2,  9],
       [ 5,  9,  7, ...,  4, 14,  0],
       [ 2, 10,  3, ...,  9, 

In [329]:
import tensorflow as tf
y_true = np.array(labels)
y_true = tf.identity(y_true)
y_pred = np.array(final_predictions)
y_pred = tf.identity(y_pred)
print(y_pred.shape,y_true.shape)
k = 8
recall, update_recall = tf.compat.v1.metrics.recall_at_top_k(y_true, y_pred, 20)
precision, update_precision = tf.compat.v1.metrics.precision_at_top_k(y_true, y_pred, 20)

tmp_rank = tf.math.top_k(y_pred, 20,sorted=False)
stream_vars = [i for i in tf.local_variables()]

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    print("precision",sess.run(update_precision))
    # print("precision",sess.run(precision))

    print("update_recall: ",sess.run(update_recall ))
    print("recall",sess.run(recall))

    print("STREAM_VARS: ",(sess.run(stream_vars)))
    print("TMP_RANK: ",sess.run(tmp_rank))

(1047, 20) (1047,)
precision 0.010076408787010506
update_recall:  0.20152817574021012
recall 0.20152817574021012
STREAM_VARS:  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 211.0, 836.0, 211.0, 20729.0]
TMP_RANK:  TopKV2(values=array([[297, 294, 290, ..., 182,  74,   8],
       [348, 346, 345, ..., 117, 116, 115],
       [410,  23,  21, ...,   3,   2,   0],
       ...,
       [344, 342, 339, ..., 202, 182,  74],
       [410, 407, 400, ..., 194, 182,  74],
       [513, 461, 459, ..

In [None]:
def mrr_metric(labels, predictions, weights=None,
              metrics_collections=None,
              updates_collections=None,
              name=None):
    
    with tf.name_scope(name, 'mrr_metric', [predictions, labels, weights]) as scope:

    
        k = 20 #predictions.get_shape().as_list()[-1]
        print(predictions.get_shape())

        get_ranked_indicies = tf.expand_dims(tf.where(tf.equal(tf.cast(predictions,tf.int64),labels[:,None]))[:,1],1)
        rr = 1/(get_ranked_indicies+1)
        m_rr =  tf.reduce_sum(rr)/tf.cast(labels.get_shape().as_list()[0],dtype=tf.float64)

        if metrics_collections:
            tf.add_to_collection(metrics_collections, m_rr)

        if updates_collections:
            tf.add_to_collections(updates_collections, update_mrr_op)

        return m_rr,m_rr,rr

In [None]:
mrr, update_mrr,rr = mrr_metric(y_true,y_pred)
with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    print("update_mrr",sess.run(update_mrr),sess.run(rr).shape)
    # print("precision",sess.run(precision))

    # print("update_recall: ",sess.run(update_recall ))
    # print("recall",sess.run(recall))

    # print("STREAM_VARS: ",(sess.run(stream_vars)))
    # print("TMP_RANK: ",sess.run(tmp_rank))

(1047, 20)
update_mrr 0.05334960867320016 (211, 1)
