# Generating Embeddings for Cosine Similarity
Using BERT to convert dataset of ads into embeddings to measure topic intensity of ads. 

### 0: Installing Packages and Importing Dependencies
This Jupyter notebook makes use of the `transformers` package from HuggingFace and uses PyTorch for tensor operations.

In [None]:
%pip install transformers

In [13]:
import torch
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt

# Loading the pre-trained model's tokenizer (a.k.a dictionary)
model_names = {'base uncased': 'bert-base-uncased', 'large uncased': 'bert-large-uncased', 
               'base cased': 'bert-base-cased', 'large cased': 'bert-large-cased'}
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

### 1: Generating Encodings
Converting the raw text from the dataset into encodings which captures the contextual meaning of words in the input.

In [14]:
from Scripts import createTokensFromAd
from Scripts import data_wrapper
#Map of Identifier -> Marked Text
ads_marked_text = createTokensFromAd.main()
#Map of Identifier -> Encoded text
encoded_ads = {}
max_len=0
for identifier, text in ads_marked_text.items():  
    encoding= tokenizer.encode_plus(text,add_special_tokens=True,return_token_type_ids=True, return_attention_mask=True)
    max_len = max(max_len, len(encoding['input_ids']))
    encoded_ads[identifier] = encoding
#Adding padding to max length of the input ids.
for identifier, encoding in encoded_ads.items():
    encoding['input_ids'] += [0] * (max_len - len(encoding['input_ids']))
    encoding['token_type_ids'] += [0] * (max_len - len(encoding['token_type_ids']))
    encoding['attention_mask'] += [0] * (max_len - len(encoding['attention_mask']))



Token indices sequence length is longer than the specified maximum sequence length for this model (1454 > 512). Running this sequence through the model will result in indexing errors


### 2: Converting encoding into PyTorch Tensors
A useful step before extracting the embeddings is standardizing the format of input into tensors of identical shape.

In [15]:
#Extract Embeddings

encoded_ads_tensors = {}
for identifier,encoding in encoded_ads.items():
    token_tensor=torch.tensor([encoding['input_ids']])
    segment_tensor = torch.tensor([encoding['token_type_ids']])
    attention_tensor = torch.tensor([encoding['attention_mask']])
    encoded_ads_tensors[identifier] = (token_tensor, segment_tensor, attention_tensor)
    
    
    
#Loading the Bert Model
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states)
)
model.eval()    

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

### 3: Generating Embeddings
Running the BERT model on the input tensors.

In [16]:
embeddings = {}
output=[]
for identifier, tensors in encoded_ads_tensors.items():
    with torch.no_grad():
        embeddings[identifier] = model(tensors[0],attention_mask=tensors[2])
        output.append(embeddings[identifier].last_hidden_state)

RuntimeError: The expanded size of the tensor (2242) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 2242].  Tensor sizes: [1, 512]

In [None]:
#Now implementing Cosine Similarity between the two embeddings
import torch.nn.functional as F

def cosine_similarity(a,b):
    return F.cosine_similarity(a,b,dim=1)
print("Cosine Similarity between the two embeddings: ",cosine_similarity(output[0][-1].unsqueeze(0),output[1][-1].unsqueeze(0))[0][0])