In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel


In [13]:

from tqdm import tqdm  # Progress bar


In [3]:
df = pd.read_csv("Cleaned_Data.csv")


In [17]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")


In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [25]:
def get_bert_embedding(text):
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    
    # Move to GPU if available
    input_ids = tokens["input_ids"].to(device)
    attention_mask = tokens["attention_mask"].to(device)
    
    # Get hidden states from BERT
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    # Extract the last hidden state (CLS token representation)
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    print(cls_embedding.shape)
    return cls_embedding

In [27]:
tqdm.pandas()  # Enable progress bar
df["bert_vector"] = df["text"].progress_apply(get_bert_embedding)

# Save vectorized dataset
df.to_pickle("bert_vectorized.pkl")

  0%|          | 4/1600000 [00:02<188:58:45,  2.35it/s]

(768,)
(768,)
(768,)
(768,)


  0%|          | 18/1600000 [00:02<26:21:56, 16.86it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 44/1600000 [00:02<9:12:55, 48.23it/s] 

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 71/1600000 [00:02<5:40:35, 78.29it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 99/1600000 [00:02<4:25:30, 100.43it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 125/1600000 [00:03<4:05:55, 108.43it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 152/1600000 [00:03<3:45:41, 118.14it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 179/1600000 [00:03<3:37:57, 122.34it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 206/1600000 [00:03<3:33:04, 125.13it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 233/1600000 [00:03<3:29:33, 127.24it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 259/1600000 [00:04<4:01:30, 110.40it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 271/1600000 [00:04<4:12:16, 105.69it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 293/1600000 [00:04<4:17:12, 103.66it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 315/1600000 [00:04<4:16:49, 103.81it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 338/1600000 [00:04<4:18:04, 103.31it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 360/1600000 [00:05<4:35:48, 96.67it/s] 

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 381/1600000 [00:05<4:32:06, 97.98it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 391/1600000 [00:06<21:40:06, 20.51it/s]

(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 399/1600000 [00:08<35:16:22, 12.60it/s]

(768,)
(768,)
(768,)
(768,)
(768,)


  0%|          | 405/1600000 [00:09<40:14:42, 11.04it/s]

(768,)
(768,)
(768,)
(768,)


  0%|          | 409/1600000 [00:09<43:21:52, 10.25it/s]

(768,)
(768,)


  0%|          | 410/1600000 [00:09<10:42:11, 41.51it/s]

(768,)




KeyboardInterrupt



In [9]:
df.head(5)


Unnamed: 0,target,id,date,user,text
0,0,1467810369,2009-04-06 22:19:45,_TheSpecialOne_,switchfoot awww thats a bummer you shoulda ...
1,0,1467810672,2009-04-06 22:19:49,scotthamilton,is upset that he cant update his facebook by t...
2,0,1467810917,2009-04-06 22:19:53,mattycus,kenichan i dived many times for the ball manag...
3,0,1467811184,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,2009-04-06 22:19:57,Karoli,nationwideclass no its not behaving at all im ...
