## Bert

In [None]:
 !pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
import pickle
import pandas as pd
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import tensorflow as tf
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from keras.utils import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


In [None]:
# Mount Google Drive at /content/drive so the files stored there can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the pre-cleaned corpus on the mounted Drive; keep exactly one `path` line active per user.
#path = '/content/drive/MyDrive/Group 1/data_clean.pkl' #TK's path
path = '/content/drive/MyDrive/Group 1 - Text Analytics/Class/drive-download-20230324T134613Z-001 (1).zip (Unzipped Files)/data_clean.pkl' #VA's path
# NOTE(review): pickle.load can execute arbitrary code, so only open pickles that come from a trusted source.
with open(path, 'rb') as file:
    data = pickle.load(file)

# print 1st row of dataset
print(data[0])

car wondering anyone could enlighten car saw day door sports car looked late early called bricklin doors really small addition front bumper separate rest body know anyone tellme model name engine specs years production car made history whatever info funky looking car please mail thanks


In [None]:
# Check which container type the unpickled corpus is
type(data)

list

### BERT Transformation


> BERT consists of a trained Transformer encoder stack (12 encoders in BERT-base) i.e. no decoder. The output from the last encoder (size 768 per token) can then be used for downstream tasks such as classification.

The text data is prepared by:
1. Adding CLS and SEP tokens to the beginning and the end of each text
2. Tokenizing the data with the BERT tokenizer (remember Text Preprocessing!). The tokens are derived with WordPiece using BERT‘s training.
3. Padding or truncating to the maximum sequence length (no more than 512).
4. Converting the tokens to the vocabulary ids, e.g. “car” → 2482
5. Converting the ids for each document to a tensor.

In [None]:
# Wrap every document in BERT's special tokens: '[CLS] ' in front and ' [SEP]' behind
data_clean_special = [f'[CLS] {text} [SEP]' for text in data]

# Show the first wrapped document
print(data_clean_special[0])

[CLS] car wondering anyone could enlighten car saw day door sports car looked late early called bricklin doors really small addition front bumper separate rest body know anyone tellme model name engine specs years production car made history whatever info funky looking car please mail thanks [SEP]


In [None]:
# Tokenize the prepared documents with the BERT tokenizer (WordPiece, using the vocabulary from BERT's training).

# Load the tokenizer that belongs to the pretrained "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Split every document into tokens, one token list per document
tokenized_texts = list(map(tokenizer.tokenize, data_clean_special))

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Examine the first document
# Words that are not in the vocabulary are split into sub-word pieces (continuation pieces carry a '##' prefix)
# until every piece matches a vocabulary entry
print(tokenized_texts[0])

['[CLS]', 'car', 'wondering', 'anyone', 'could', 'en', '##light', '##en', 'car', 'saw', 'day', 'door', 'sports', 'car', 'looked', 'late', 'early', 'called', 'brick', '##lin', 'doors', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tell', '##me', 'model', 'name', 'engine', 'spec', '##s', 'years', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', '[SEP]']


In [None]:
# Determine the maximum sequence length.
# Use the 85th percentile of the per-document token counts instead of the longest document, to save resources.
length = [len(tokens) for tokens in tokenized_texts]
seq_length = np.quantile(length, 0.85)

In [None]:
# Show the chosen sequence length (np.quantile returns a float, hence the int() casts where it is used)
print(seq_length)

248.0


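The chosen sequence length of 248 tokens (the 85th percentile of the document lengths) is well below BERT's maximum sequence length of 512; documents longer than 248 tokens are truncated to it. Text data often consists of sequences of varying lengths, while BERT performs better when the input data is in a consistent format. By setting a maximum sequence length, every document is padded or truncated to the same size, so the model can process the data in batches, which significantly speeds up training and inference.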

The parameters of the **pad_sequences** method are:



*   **sequences**: List of tokenized texts.
*   **dtype**: Data type of the output sequences.
* **maxlen**: Maximum length of each sequence.
* **value**: Padding value, in this case, the '[PAD]' token.
* **truncating**: If a text is longer than the maximum length, remove tokens from the beginning ("pre") or the end ("post").
* **padding**: If a text is shorter than the maximum length, add padding at the beginning ("pre") or the end ("post").


In [None]:
# Padding or truncating to the maximum sequence length (no more than 248).
max_len = int(seq_length)
# Plain "post" truncation would cut off the closing '[SEP]' of every document longer than max_len.
# Trim such documents to max_len - 1 tokens and re-append '[SEP]', so every sequence still ends with it.
tokenized_capped = [
    tokens if len(tokens) <= max_len else list(tokens[:max_len - 1]) + ['[SEP]']
    for tokens in tokenized_texts
]
# All inputs are now at most max_len long, so pad_sequences only has to pad the shorter ones with '[PAD]'.
sentences_padded = pad_sequences(tokenized_capped, dtype=object, maxlen=max_len, value='[PAD]', truncating="post", padding="post")

In [None]:
# Examine the first document: its tokens are followed by '[PAD]' entries up to the fixed length
print(sentences_padded[0])

['[CLS]' 'car' 'wondering' 'anyone' 'could' 'en' '##light' '##en' 'car'
 'saw' 'day' 'door' 'sports' 'car' 'looked' 'late' 'early' 'called'
 'brick' '##lin' 'doors' 'really' 'small' 'addition' 'front' 'bumper'
 'separate' 'rest' 'body' 'know' 'anyone' 'tell' '##me' 'model' 'name'
 'engine' 'spec' '##s' 'years' 'production' 'car' 'made' 'history'
 'whatever' 'info' 'funky' 'looking' 'car' 'please' 'mail' 'thanks'
 '[SEP]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]'
 '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]'
 '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]'
 '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]'
 '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]'
 '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]'
 '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]'
 '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]' '[PAD]'


In [None]:
# Spot-check another document (index 7): it should have the common padded length
print(len(sentences_padded[7]))

248


In [None]:
# Replace the tokens of every padded document with their ids from the tokenizer vocabulary
sentences_converted = list(map(tokenizer.convert_tokens_to_ids, sentences_padded))

In [None]:
# Examine the first document: vocabulary ids now stand in for the tokens
print(sentences_converted[0])

[101, 2482, 6603, 3087, 2071, 4372, 7138, 2368, 2482, 2387, 2154, 2341, 2998, 2482, 2246, 2397, 2220, 2170, 5318, 4115, 4303, 2428, 2235, 2804, 2392, 21519, 3584, 2717, 2303, 2113, 3087, 2425, 4168, 2944, 2171, 3194, 28699, 2015, 2086, 2537, 2482, 2081, 2381, 3649, 18558, 24151, 2559, 2482, 3531, 5653, 4283, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


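Question to e.: Why do we have to convert tokens to ids? Is it for calculating similarity?

- The ids are used for the embeddings: each id is the row index of that token in BERT's word-embedding matrix (30522 rows × 768 columns), so the model looks up a token's vector by its id.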

In [None]:
# Shortcut: tokenizer.encode handles the special tokens, tokenization, padding/truncation and id lookup in one call.
# It is fed the raw `data` (not data_clean_special) because add_special_tokens=True inserts [CLS] and [SEP] itself.
# Each document's ids are wrapped in a one-element list, so the result is nested as [document][0][token].
sentences_converted_quick = [
    [
        tokenizer.encode(
            doc,
            add_special_tokens=True,
            padding='max_length',
            truncation='longest_first',
            max_length=int(seq_length),
        )
    ]
    for doc in data
]

In [None]:
# For reference only: the triple-quoted block below is a string literal, so none of it is executed;
# as the cell's last expression it is merely echoed back as the cell output.
'''
# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define the maximum length
MAX_LEN = 248

# Encode the corpus with CLS and SEP tokens
encoded_corpus = tokenizer(data, add_special_tokens=True, max_length=MAX_LEN, padding=True, truncation=True)

# Print the result
print(encoded_corpus)
'''

'\n# Load the tokenizer\ntokenizer = BertTokenizer.from_pretrained("bert-base-uncased")\n\n# Define the maximum length\nMAX_LEN = 248\n\n# Encode the corpus with CLS and SEP tokens\nencoded_corpus = tokenizer(data, add_special_tokens=True, max_length=MAX_LEN, padding=True, truncation=True)\n\n# Print the result\nprint(encoded_corpus)\n'

### BERT Model Interpretation

**a. word_embeddings**: In the BERT base model, the word embeddings size is 768. The numbers come from the pre-trained BERT model itself, which is trained on a large corpus of text data. This size is a design choice made by the creators of BERT, and it represents the dimensions of the dense vectors used to represent each token in the input sequence. 30522 is the size of the BERT vocabulary.

**b. position_embeddings**: The Embedding layer (512, 768) indicates that the position embedding is a matrix with 512 rows (one for each position in the input sequence) and 768 columns (the same dimension as the word embeddings). 512 is the maximum sequence length.

**c. Number of encoder layers**: 12

**d. BertLayer:**

1.   **BertAttention**:
*   **BertSelfAttention**: computes contextualized representation of word embeddings and position embeddings
*   **BertSelfOutput**: applies a dense layer, dropout, and layer normalization to the BertSelfAttention outputs.

2.   **BertIntermediate:** A feed-forward neural network that transforms the output from the BertAttention.

3. **BertOutput:** Another feed-forward network that maps the 3,072-dimensional intermediate representation back to a 768-dimensional output. Layer normalization and dropout are applied to the output.

4. **BertPooler:** A pooling layer that takes the hidden state of the first token ([CLS]) from the final encoder layer and applies a linear transformation followed by a Tanh activation function. This output can be used as a final fixed-size representation for classification tasks.


In [None]:
# Turn the nested id lists into a single tensor.
# NOTE(review): each document was wrapped in its own one-element list, so this is presumably (n_docs, 1, seq_len) — confirm.
inputs = torch.tensor(sentences_converted_quick)
# Load the pretrained BERT-base encoder (the weights are downloaded on first use)
model = BertModel.from_pretrained('bert-base-uncased') 
# Bare expression: display the model architecture as the cell output
model

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [None]:
# Embed the first document and keep the vector of its first token ([CLS]).
# NOTE(review): inputs[0] is presumably a (1, seq_len) batch holding one document — confirm.
first_doc_ids = inputs[0]
# Without an attention mask BERT attends to the padding positions as if they were real tokens,
# which distorts the embeddings; mark only the non-padding ids as attendable.
attention_mask = (first_doc_ids != model.config.pad_token_id).long()
with torch.no_grad():  # inference only, no gradients needed
  outputs = model(first_doc_ids, attention_mask=attention_mask)
  # [0, 0] -> first sequence in the batch, first token position
  embeddings = outputs.last_hidden_state[0, 0].numpy()

In [None]:
# Length of the extracted embedding vector (the hidden size of the model)
len(embeddings)

768