In [12]:
!pip install transformers



In [13]:
import pandas as pd
import torch
from transformers import AutoTokenizer, RobertaTokenizer, RobertaModel
import numpy as np

# --- Run configuration -------------------------------------------------------
# Number of examples per batch, shared by tokenization and model inference.
batch_size = 100
# Raw text input and the CSV file the sampled lines are written to.
input_text_file = 'assignment4-dataset.txt'
output_csv_file = 'assignment4-dataset.csv'
# Identifier of the pretrained checkpoint passed to `from_pretrained`.
model_name = 'FacebookAI/roberta-base'

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [16]:

import random

# Fraction of the input lines kept as the working sample (0.01 %).
sample_fraction = 0.0001

# Read the whole input file; every element of `lines` keeps its trailing newline.
with open(input_text_file, 'r', encoding='utf-8') as file:
    lines = file.readlines()

total_lines = len(lines)
if total_lines == 0:
    # The sample size is floored at 1, so an empty file would otherwise fail
    # inside random.sample with an opaque "Sample larger than population" error.
    raise ValueError(f"{input_text_file} contains no lines to sample from")
sample_size = max(1, int(total_lines * sample_fraction))  # At least one line

# Seed the global RNG so the same lines are drawn on every run.
random.seed(42)
sampled_lines = random.sample(lines, sample_size)

# Persist the sample as a one-column CSV ('text') for the later cells to load.
df = pd.DataFrame(sampled_lines, columns=['text'])
df.to_csv(output_csv_file, index=False)




In [18]:
# Read the sampled lines back from the CSV file into a DataFrame.
# NOTE(review): `df` is not referenced again in the cells visible here — confirm
# whether this reload is still needed.
df = pd.read_csv(output_csv_file)

In [19]:
# `use_fast` is an AutoTokenizer option; passing it to the concrete
# RobertaTokenizer class does not select the fast (Rust-backed) implementation.
# AutoTokenizer resolves the checkpoint to its fast tokenizer when one exists.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Load the pretrained encoder and move its weights to the selected device.
# The load warning reports the pooler weights as newly initialized, so only
# `last_hidden_state` should be relied on, not the pooled output.
model = RobertaModel.from_pretrained(model_name).to(device)

# Switch to evaluation mode (turns off the Dropout layers); as the last
# expression of the cell this also displays the module structure.
model.eval()

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [20]:
!pip install datasets
from datasets import load_dataset



In [21]:
# Load the sampled CSV through the generic 'csv' builder. The printed result is
# a DatasetDict with a single 'train' split exposing the 'text' column.
dataset = load_dataset(path='csv', data_files=output_csv_file)
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 446
    })
})


In [22]:
def tokenize_function(batch):
    """Tokenize the 'text' entries of a batch with the notebook's tokenizer.

    Every example is truncated and padded to exactly 512 tokens, so all
    encoded rows have the same length.

    Args:
        batch: Mapping with a 'text' key holding the texts to encode.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts.
    """
    texts = batch['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    return encoded

In [23]:

# Apply the tokenizer to every split in batches of `batch_size` examples.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=batch_size,
)

Map:   0%|          | 0/446 [00:00<?, ? examples/s]

In [24]:
from collections import defaultdict
from tqdm import tqdm

# Token string -> list of contextual embedding vectors (NumPy arrays), one per
# non-padding occurrence of that token in the sampled texts.
token_embeddings = defaultdict(list)

# `device` is the one the model was moved to in the setup cells; it is reused
# here rather than being recomputed.
for batch in tqdm(tokenized_dataset["train"].batch(batch_size)):
    input_ids = torch.tensor(batch["input_ids"]).to(device)
    attention_mask = torch.tensor(batch["attention_mask"]).to(device)

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state

    # Every row is padded to the maximum length, so most positions are padding.
    # Keep only positions the attention mask marks as real tokens; otherwise
    # padding vectors dominate memory and yield a meaningless '<pad>' average.
    real_positions = attention_mask.bool()

    # Boolean indexing flattens row by row, so ids and vectors stay aligned.
    # Move the kept data to the CPU once per batch instead of once per token.
    kept_ids = input_ids[real_positions].cpu().tolist()
    kept_embeddings = embeddings[real_positions].cpu().numpy()

    kept_tokens = tokenizer.convert_ids_to_tokens(kept_ids)
    for token, embedding in zip(kept_tokens, kept_embeddings):
        token_embeddings[token].append(embedding)

# Token string -> mean of all embedding vectors collected for that token.
average_embeddings = {
    token: np.mean(np.array(embeds), axis=0)
    for token, embeds in token_embeddings.items()
}




Batching examples:   0%|          | 0/446 [00:00<?, ? examples/s]

100%|██████████| 5/5 [00:17<00:00,  3.51s/it]
