# Loading the Data

In [None]:
# Load the dataset and keep a small slice of it for the rest of the notebook.

import json
from datasets import load_dataset

# Build the dataset from the loading script 'dataset-download.py' (presumably a
# local file next to this notebook — TODO confirm). trust_remote_code=True is
# passed so that the loading script is allowed to run.
dataset = load_dataset('dataset-download.py', trust_remote_code=True) 

# Slice the first 10 rows of the 'train' split. The slice is wrapped in a
# one-element list, so doc_list[0] is the whole 10-row batch: a mapping from
# column name to a list of 10 values (see the printed output below).
doc_list = [dataset['train'][:10]]

print(doc_list)  # Prints the entire 10-row batch, not a single example

# Captions of the 10 rows, one list of caption strings per video
# (last expression of the cell, so it is shown as the cell output).
doc_list[0]['en_captions']


[{'video_id': ['v_QOlSCBRmfWY', 'v_ehGHCYKzyZ8', 'v_nwznKOuZM7w', 'v_ogQozSI5V8U', 'v_nHE7u40plD0', 'v_69IsHpmRyfk', 'v_D18b2IZpxk0', 'v_pizl41xmw7k', 'v_oP77DgsbhKQ', 'v_fzp5ooc727c'], 'video_path': ['https://www.youtube.com/watch?v=QOlSCBRmfWY', 'https://www.youtube.com/watch?v=ehGHCYKzyZ8', 'https://www.youtube.com/watch?v=nwznKOuZM7w', 'https://www.youtube.com/watch?v=ogQozSI5V8U', 'https://www.youtube.com/watch?v=nHE7u40plD0', 'https://www.youtube.com/watch?v=69IsHpmRyfk', 'https://www.youtube.com/watch?v=D18b2IZpxk0', 'https://www.youtube.com/watch?v=pizl41xmw7k', 'https://www.youtube.com/watch?v=oP77DgsbhKQ', 'https://www.youtube.com/watch?v=fzp5ooc727c'], 'duration': [82.7300033569336, 61.720001220703125, 31.649999618530273, 36.54999923706055, 145.55999755859375, 94.72000122070312, 95.66999816894531, 172.0800018310547, 214.60000610351562, 148.32000732421875], 'captions_starts': [[0.8299999833106995, 17.3700008392334, 56.2599983215332], [0.0, 3.0899999141693115, 15.4300003051757

[['A young woman is seen standing in a room and leads into her dancing.',
  ' The girl dances around the room while the camera captures her movements.',
  ' She continues dancing around the room and ends by laying on the floor.'],
 ['The video starts with a title logo sequence.',
  ' A man and woman are in a living room demonstrating exercises.',
  ' The woman lays on the ground.',
  " The man starts pointing to different areas of the woman's body as she does an exercise.",
  ' The woman begins to do small sit ups.',
  ' The woman ends with a final title logo sequence.'],
 ['Two people are seen moving around a kitchen quickly performing various tasks and sitting down.',
  ' They then wax down a ski in the kitchen while continuing to move around.'],
 ['We see a hallway with a wooden floor.',
  ' A dog in socks walks slowly out onto the floor as a lady films him.',
  ' The dog turns around and goes back to the other room.'],
 ['A woman and a man are sitting on the sidewalk playing music.

# Transformer Encoder

This notebook covers our understanding of the Transformer architecture, as required in Section 2.5.


In [None]:
# Third-party stack used by the notebook: NumPy, scikit-learn (t-SNE / PCA),
# PyTorch, Hugging Face transformers and bertviz.
# NOTE(review): the output saved under this cell is
#   ImportError: cannot import name 'display' from 'IPython.core.display'
# so one of these imports fails in the recorded environment and no later cell
# can run there. Presumably it is raised inside a third-party package that
# still imports `display` from that module (bertviz is a likely candidate) —
# TODO confirm and pin compatible versions.
import numpy as np
import pprint
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import torch
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from bertviz import model_view, head_view

# Get the interactive Tools for Matplotlib
#%matplotlib notebook
#%matplotlib inline
import matplotlib.pyplot as plt
# Notebook-wide plot style; note that display_scatterplot later switches the
# global style to 'default'.
plt.style.use('ggplot')

ImportError: cannot import name 'display' from 'IPython.core.display' (c:\Users\rafae\Desktop\MPDW-Project\.venv\Lib\site-packages\IPython\core\display.py)

In [None]:
# Checkpoint loaded by the tokenizer/config/model cells below.
# An alternative cross-encoder checkpoint is
#   'cross-encoder/ms-marco-MiniLM-L-12-v2'
# It used to be assigned to model_path on the line above and was immediately
# overwritten, so only the checkpoint below was ever in effect. To switch
# models, change this single assignment.
model_path = 'nboost/pt-bert-base-uncased-msmarco'

# Literal marker strings (presumably matching the tokenizer's classification
# and separator special tokens — TODO confirm against tokenizer.cls_token /
# tokenizer.sep_token).
CLS_token = "[CLS]"
SEP_token = "[SEP]"

## Loading the Models


In [None]:
# Keep the transformers library's own logging at warning level.
transformers.logging.set_verbosity_warning()

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Request hidden states and attention maps in the model output; the later
# cells read outputs['hidden_states'].
config = AutoConfig.from_pretrained(
    model_path,
    output_attentions=True,
    output_hidden_states=True,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    config=config,
)

# Displaying the model shows its architecture. Typically a model is composed
# of an embedding layer, the self-attention layers and the output layers; the
# output layer is always task specific.
model

## Tokenization

See here for details: https://huggingface.co/docs/transformers/tokenizer_summary

In [None]:
# Query/passage example pair, kept for reference:
#sentence_a = ["How many people live in Berlin?", "How many people live in Berlin?"]
#sentence_b =  ["Berlin has a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.", "New York City is famous for the Metropolitan Museum of Art."]

# doc_list[0]['en_captions'] holds one list of caption strings per video.
# The tokenizer takes a flat batch of strings (nested lists are only accepted
# as pre-tokenized input with is_split_into_words=True), so flatten to one
# caption per batch row.
captions = [
    caption
    for video_captions in doc_list[0]['en_captions']
    for caption in video_captions
]

# Pad to the longest caption in the batch and truncate anything beyond 512 tokens.
inputs = tokenizer(
    captions,
    return_tensors='pt',
    add_special_tokens=True,
    max_length=512,
    padding=True,
    truncation=True,
)

# `print` is the builtin function and has no `.pprint`; the pretty-printer
# lives in the `pprint` module imported at the top of the notebook.
pprint.pprint(inputs)

In [None]:
# Decode batch rows 0 and 1 from token ids back to text, one row per line.
for batch_idx in (0, 1):
    row_ids = inputs["input_ids"][batch_idx].tolist()
    print(tokenizer.decode(row_ids))

In [None]:
# Token-id tensor for the whole batch, taken from the tokenizer output.
input_ids = inputs['input_ids']
input_id_list = input_ids[0].tolist() # Ids of batch item 0 as a plain Python list
pprint.pprint(input_id_list)

In [None]:
# Map the ids of batch item 0 back to their token strings and show them.
input_tokens_list = tokenizer.convert_ids_to_tokens(input_id_list)
pprint.pprint(input_tokens_list)

In [None]:
# Query/passage pairs fed to the sequence-classification model. They are
# defined here because elsewhere in the notebook they only exist as
# commented-out code, which made this cell fail with a NameError on a fresh
# kernel.
sentence_a = [
    "How many people live in Berlin?",
    "How many people live in Berlin?",
]
sentence_b = [
    "Berlin has a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.",
    "New York City is famous for the Metropolitan Museum of Art.",
]

# Encode each (sentence_a[i], sentence_b[i]) pair as one batch row.
inputs = tokenizer(
    sentence_a,
    sentence_b,
    return_tensors='pt',
    add_special_tokens=True,
    max_length=512,
    padding=True,
    truncation=True,
)

# `inputs` now describes a different batch than the one the id/token lists
# were built from, so rebuild them for batch item 0. The following cells
# (id/token table, model forward pass, scatter-plot labels) then all refer to
# the same sequence.
input_ids = inputs['input_ids']
input_id_list = input_ids[0].tolist()
input_tokens_list = tokenizer.convert_ids_to_tokens(input_id_list)

inputs

In [None]:
# Two-column listing: token id, then the matching token string, one per line.
res = "\n".join(
    f"{token_id} \t {token}"
    for token_id, token in zip(input_id_list, input_tokens_list)
)
print(res)

In [None]:
# Run the model on the tokenized batch. Gradient tracking is switched off
# because the outputs are only inspected, never back-propagated through.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# List the fields available on the model output.
outputs.keys()

## Hidden Layer Embeddings

In [None]:
# Number of entries in outputs['hidden_states'].
# (Presumably one for the embedding output plus one per encoder layer —
# TODO confirm against the model architecture shown earlier.)
len(outputs['hidden_states'])

In [None]:
# Indexing convention used in this notebook:
#   outputs['hidden_states'][layer_m][0][token_n]
# where the middle index 0 selects the first item of the batch.
layer_m = 12
token_n = 1  # not referenced by any of the cells shown here
# All token embeddings of layer `layer_m` for batch item 0:
output_embeddings = outputs['hidden_states'][layer_m][0]
# Displayed as the cell output.
output_embeddings.shape

In [None]:
# Token positions within batch item 0.
# NOTE(review): the names suggest the words "throat" and "cancer", which do not
# occur in the example sentences of this notebook — presumably left over from a
# different example; TODO confirm which tokens sit at positions 2 and 3 and
# rename. token_cancer is not used by any of the cells shown here.
token_throat = 2
token_cancer = 3

# Embedding of one particular token (position token_throat) in layer layer_m
throat_output_embedding = outputs['hidden_states'][layer_m][0][token_throat]
throat_output_embedding.shape

In [None]:
# Shape of the per-token embedding matrix for batch item 0 (same value that the
# layer-selection cell already displayed).
output_embeddings.shape

In [None]:
def display_scatterplot(data, words):
    """Scatter-plot row vectors in 2-D and label every point with its word.

    Args:
        data: 2-D NumPy array with one row per item. If it has exactly two
            columns it is plotted as-is; otherwise it is projected onto its
            first two principal components.
        words: Labels paired positionally with the rows of ``data``. Pairing
            uses ``zip``, so surplus labels or rows are silently left
            unlabelled.

    Returns:
        None. The plot is drawn on a newly created matplotlib figure.
    """
    if data.shape[1] == 2:
        twodim = data
    else:
        # Fit the projection on the array that was passed in. The previous
        # version fitted on the notebook global `output_embeddings`, which
        # tied the function to that one variable and produced a wrong (or
        # failing) projection for any other input.
        pca = PCA()
        pca.fit(data)
        twodim = pca.transform(data)[:, :2]

    # Note: this switches the global matplotlib style for the whole session.
    plt.style.use('default') # https://matplotlib.org/3.5.1/gallery/style_sheets/style_sheets_reference.html
    plt.figure(figsize=(6, 6))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        # Small offset so the label does not sit on top of its marker.
        plt.text(x + 0.05, y + 0.05, word)


# Plot the selected layer's token embeddings of batch item 0, labelled with
# the token strings.
display_scatterplot(output_embeddings.detach().numpy(), input_tokens_list)