In [6]:
from transformers import RobertaTokenizer, RobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Step 1: Load the text from a .txt file
file_path = 'wood_corpus.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    
    text = file.read()

In [9]:
# Step 2: Load pre-trained RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
model = RobertaModel.from_pretrained('roberta-large')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Step 3: Tokenize the text
inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

#see tokens:
input_ids = inputs['input_ids'][0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)

print("Tokens:", tokens)


Tokens: ['<s>', 'Introduction', 'Ġto', 'ĠHeart', 'wood', 'Ġand', 'ĠSap', 'wood', 'Ġin', 'ĠWood', 'ĠAnat', 'omy', 'Ċ', 'Ċ', 'In', 'Ġthe', 'Ġanatomy', 'Ġof', 'Ġa', 'Ġtree', ',', 'Ġheart', 'wood', 'Ġand', 'Ġsap', 'wood', 'Ġare', 'Ġdistinct', 'Ġregions', 'Ġwithin', 'Ġthe', 'Ġtrunk', ',', 'Ġeach', 'Ġplaying', 'Ġunique', 'Ġroles', 'Ġin', 'Ġthe', 'Ġtree', "'s", 'Ġstructure', 'Ġand', 'Ġfunction', '.', 'ĠThese', 'Ġzones', 'Ġform', 'Ġconcent', 'ric', 'Ġlayers', 'Ġas', 'Ġthe', 'Ġtree', 'Ġgrows', ',', 'Ġwith', 'Ġsap', 'wood', 'Ġlocated', 'Ġin', 'Ġthe', 'Ġouter', 'most', 'Ġrings', 'Ġjust', 'Ġbeneath', 'Ġthe', 'Ġbark', ',', 'Ġand', 'Ġheart', 'wood', 'Ġoccupying', 'Ġthe', 'Ġcentral', ',', 'Ġolder', 'Ġportion', '.', 'ĠThe', 'Ġdistinction', 'Ġarises', 'Ġduring', 'Ġthe', 'Ġnatural', 'Ġaging', 'Ġand', 'Ġdevelopment', 'Ġof', 'Ġthe', 'Ġtree', ',', 'Ġas', 'Ġcells', 'Ġin', 'Ġthe', 'Ġsap', 'wood', 'Ġtransition', 'Ġto', 'Ġheart', 'wood', 'Ġover', 'Ġtime', '.', 'ĠBoth', 'Ġregions', 'Ġare', 'Ġvital', 'Ġfor', 'Ġt

In [11]:
# Step 4: Extract embeddings
# Get the tokenized input IDs and attention mask
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Run the inputs through the model
with torch.no_grad():
    
    outputs = model(input_ids, attention_mask=attention_mask)
    # Get the last hidden states (embeddings for each token)
    embeddings = outputs.last_hidden_state

# Choose the two words you want to extract embeddings for
word_1 = "heartwood"
word_2 = "sapwood"

# Tokenize the individual words
word_1_id = tokenizer.encode(word_1, add_special_tokens=False)
word_2_id = tokenizer.encode(word_2, add_special_tokens=False)

# Find the indices of these words in the tokenized input
indices_word_1 = [i for i, id_ in enumerate(input_ids[0]) if id_ in word_1_id]
indices_word_2 = [i for i, id_ in enumerate(input_ids[0]) if id_ in word_2_id]

# Extract embeddings for the specific words
embedding_word_1 = embeddings[0, indices_word_1, :].mean(dim=0) if indices_word_1 else None
embedding_word_2 = embeddings[0, indices_word_2, :].mean(dim=0) if indices_word_2 else None

# Output the embeddings
print("Embedding for", word_1," :", embedding_word_1)
print("Embedding for", word_2," :", embedding_word_2)

Embedding for heartwood  : tensor([ 0.2567,  0.2757, -0.2582,  ...,  0.0249,  0.3811,  0.2583])
Embedding for sapwood  : tensor([ 0.2408,  0.2505, -0.2593,  ...,  0.0568,  0.3598,  0.2458])


In [12]:
similarity = cosine_similarity(embedding_word_1.unsqueeze(0), embedding_word_2.unsqueeze(0))
print("Cosine Similarity:", similarity)

Cosine Similarity: [[0.9996097]]
