## Actual Workings

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive: the input and output paths used later
# in this notebook are rooted at '/content/drive/MyDrive/...', so mounting
# anywhere else (e.g. '/gdrive') leaves those paths unresolved.
drive.mount('/content/drive')

In [None]:
!pip install transformers
!pip install pytorch_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 57.8 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 64.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[K     |█████████████████████████

In [None]:
import pandas as pd
import numpy as np

import torch

from transformers import BertTokenizer
from transformers import BertModel

from tqdm.notebook import tqdm
# Register tqdm with pandas so that `progress_apply` becomes available on
# Series/DataFrame objects (used below to embed the corpus with a progress bar).
tqdm.pandas()

In [None]:
# Location of the raw corpus file on the mounted Google Drive.
data_uri = '/content/drive/MyDrive/Colab Notebooks/deeplearning_news_clustering/data/allData-19-08-27.ta'

# The file has no header row; every parsed row lands in a single `news` column.
df = pd.read_csv(data_uri, names=['news'], header=None)
df.shape[0]

1727888

In [None]:
# Trim leading/trailing whitespace from every entry, then preview the first
# rows together with the row count.
df = df.assign(news=df['news'].str.strip())
df.head(), len(df)

(                                                news
 0  அமைச்சின் பணிகளை முன்னெடுப்பதற்கு கௌர அமைச்சர்...
 1  இவ்வமைச்சு இல 40  புத்கமுவ வீதி  இராஜகிரிய எனு...
 2  தேசிய மொழிகள் மற்றும் சமூக ஒருமைப்பாட்டு அமைச்...
 3  தேசிய மொழிகள்  சமூக ஒருமைப்பாடு தொடர்பான கொள்க...
 4  குறித்த தேசிய நோக்கங்களை எய்தும் பொருட்டு தேசி..., 1727888)

In [None]:
## Load the pretrained tokenizer and encoder from the same checkpoint
MODEL_NAME = 'bert-base-multilingual-uncased'

# do_lower_case=True: lower-case the text before tokenizing (uncased checkpoint).
# NOTE(review): with lower-casing enabled the BERT tokenizer also strips
# combining marks by default, which may alter Tamil text -- confirm this is
# acceptable or consider the cased multilingual checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)

# output_hidden_states=True: the forward pass also returns the hidden states
# of every layer, which get_vectors reads.
model = BertModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Put the model in evaluation mode: it is only used for inference here, and
# PyTorch disables dropout in this mode so repeated runs give the same output.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(105879, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [None]:
def get_vectors(news, max_length=512, padding="max_length", layer=0):
  """Return one layer of BERT hidden states for a single piece of text.

  Relies on the module-level `tokenizer` and `model` loaded earlier in the
  notebook.

  Args:
    news: Text to embed. `None`/NaN (e.g. a missing value in the DataFrame)
      is treated as an empty string; any other non-string value is converted
      with `str`.
    max_length: Maximum number of tokens, special tokens included. Longer
      inputs are truncated.
    padding: Padding strategy forwarded to the tokenizer. The default
      "max_length" pads every input up to `max_length`; pass `False` to keep
      only the real tokens (much smaller results).
    layer: Index into the tuple of hidden states returned by the model.
      0 (default) is the output of the embedding layer, -1 is the output of
      the last encoder layer.

  Returns:
    A CPU tensor of shape (1, sequence_length, hidden_size) holding the
    per-token vectors of the requested layer. With the default padding,
    sequence_length equals `max_length`.
  """
  # Missing values reach this function as None or float NaN when applied over
  # a pandas column; the tokenizer only accepts strings.
  if news is None or (isinstance(news, float) and news != news):
    news = ""
  elif not isinstance(news, str):
    news = str(news)

  # Calling the tokenizer directly returns the input ids together with the
  # attention mask, so padded positions are masked out inside the encoder.
  encoded = tokenizer(news,
                      add_special_tokens=True,
                      max_length=max_length,
                      truncation=True,
                      padding=padding,
                      return_tensors="pt")
  # Keep the inputs on whatever device the model lives on.
  encoded = encoded.to(model.device)

  # Inference only: no gradients are needed.
  with torch.no_grad():
    # Request the hidden states explicitly so this does not depend on how the
    # model was configured at load time.
    outputs = model(**encoded, output_hidden_states=True)

  # Without attention outputs, the third item holds the hidden states of all
  # layers: index 0 is the embedding output, followed by one entry per
  # encoder layer.
  hidden_states = outputs[2]

  # Return on the CPU so results collected over many rows never pile up in
  # accelerator memory.
  return hidden_states[layer].cpu()

In [None]:
# Embed every news entry, showing a progress bar.
# NOTE(review): get_vectors pads each text to 512 tokens and returns the full
# per-token tensor (hidden size 768 per the model summary above). Assuming
# float32 that is about 512 * 768 * 4 B = 1.5 MiB per row, i.e. roughly 2.7 TB
# for the 1,727,888 rows loaded here -- far more than fits in memory.
# TODO: pool each text to a single vector and/or process in batches that are
# written to disk incrementally.
df['vec'] = df['news'].progress_apply(get_vectors)

  0%|          | 0/1727888 [00:00<?, ?it/s]

In [None]:
# Alternative output path for a local (Windows) run, kept for reference:
# base_url = r'D:\repo\R&D\deeplearning_news_clustering\data\vec.csv'
# Output path on the mounted Google Drive.
base_url = '/content/drive/MyDrive/Colab Notebooks/deeplearning_news_clustering/vectors/vec.csv'
# NOTE(review): the `vec` column holds torch tensors; CSV export presumably
# writes each one as its string representation, which torch abbreviates with
# "..." for large tensors, so the vectors could not be recovered from this
# file -- confirm, and consider a binary format (e.g. torch.save / np.save).
df.to_csv(base_url, encoding='utf-8')