In [None]:
import joblib
from transformers import RobertaTokenizer, RobertaModel
import torch
import re
import numpy as np
import pandas as pd

In [1]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
checkpoint_name = 'macedonizer/mk-roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint_name)
model = RobertaModel.from_pretrained(checkpoint_name)

Some weights of the model checkpoint at macedonizer/mk-roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at macedonizer/mk-roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to b

In [33]:
def remove_special_characters(text):
    """Return ``text`` with punctuation and symbol characters stripped out.

    Every occurrence of one of the characters
    ``? . , ! : ; @ # $ % ^ & * ( ) [ ] { } \\ / | + - _ =`` is deleted.
    All other characters (letters, digits, whitespace, quotes, ...) are
    left exactly as they are.
    """
    # Character class listing every symbol that must be dropped.
    unwanted = re.compile(r'[?.,!:;@#$%^&*()\[\]{}\\/|+\-_=]')
    return unwanted.sub('', text)

In [2]:
# Raw example sentence (still containing punctuation) used as input for the cells below.
sentence = 'Јас сум тој и тој сум бил, замисли да не беше баш се што е денес, дали би продолжил?'

In [36]:
# Strip punctuation/symbols from the sentence. This overwrites `sentence`; re-running the
# cell is harmless because the characters it removes are already gone after the first run.
sentence = remove_special_characters(sentence)

In [37]:

# Encode the sentence. 'sentence', 'tokenizer' and 'model' must already be defined by earlier cells.
inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

with torch.no_grad():
    outputs = model(**inputs)

# Sentence embedding: mean of the last hidden state over all token positions.
sentence_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy().tolist()

# One embedding row per token position of the single input sentence.
all_token_embeddings = outputs.last_hidden_state.squeeze().numpy()
token_ids = inputs['input_ids'][0].tolist()

words = sentence.replace(',', '').replace('?', '').split()  # Simple tokenization based on spaces and removing punctuation

# Align every whitespace-separated word with the run of sub-word tokens that spells it.
# A word is matched by growing a token span until the decoded span text equals the word,
# so multi-piece words are covered completely (checking each piece with
# `word.startswith(piece)` only accepts pieces that happen to be prefixes of the word).
aligned_words = []    # words for which a token span was found, in sentence order
word_embeddings = []  # word_embeddings[i] belongs to aligned_words[i]
token_idx = 1  # Start after the leading special token
last_token_idx = len(all_token_embeddings) - 1  # Position of the trailing special token (excluded)

for word in words:
    span_end = token_idx
    matched = False
    while span_end < last_token_idx:
        span_end += 1
        span_text = tokenizer.decode(token_ids[token_idx:span_end], clean_up_tokenization_spaces=True).strip()
        if span_text == word:
            matched = True
            break
        if len(span_text) > len(word):
            # The span already spells more than the word: no alignment exists from this position.
            break

    if matched:
        # Average the sub-word token embeddings to get the word embedding
        word_embedding = np.mean(all_token_embeddings[token_idx:span_end], axis=0).tolist()
        aligned_words.append(word)
        word_embeddings.append(word_embedding)
        token_idx = span_end
    # An unmatched word (e.g. cut off by truncation) is skipped without consuming tokens;
    # because word and embedding are recorded together, the remaining pairs stay aligned.

# Prepare embeddings data with sentence embedding included for each word
embeddings_data = [{
    'word': word,
    'sentence_embedding': sentence_embedding,  # Same for all words in the sentence
    'word_embedding': word_embedding
} for word, word_embedding in zip(aligned_words, word_embeddings)]

# Now, 'embeddings_data' contains the necessary information for each word along with the shared sentence context


In [38]:

# Tabulate the per-word records: one row per entry of `embeddings_data`, one column per dict key.
df = pd.DataFrame(data=embeddings_data)

In [41]:
# Spread the list-valued 'sentence_embedding' column over one scalar column per dimension.
sentence_vectors = df['sentence_embedding'].tolist()
sentence_dim_count = len(df['sentence_embedding'][0])
sentence_col_names = [f'sentence_embedding_{i}' for i in range(sentence_dim_count)]
embedding_cols = pd.DataFrame(sentence_vectors, columns=sentence_col_names)

# Append the expanded columns, then discard the original list-valued column.
df = pd.concat([df, embedding_cols], axis=1).drop(columns=['sentence_embedding'])

In [43]:
# Spread the list-valued 'word_embedding' column over one scalar column per dimension.
word_vectors = df['word_embedding'].tolist()
word_dim_count = len(df['word_embedding'][0])
word_col_names = [f'word_embedding_{i}' for i in range(word_dim_count)]
embedding_cols = pd.DataFrame(word_vectors, columns=word_col_names)

# Append the expanded columns, then discard the original list-valued 'word_embedding' column.
df = pd.concat([df, embedding_cols], axis=1).drop(columns=['word_embedding'])

In [45]:
# Load the previously trained classifier from disk (path is relative to the notebook's working directory).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code — only load trusted files.
# NOTE(review): presumably a scikit-learn logistic regression, judging by the file name — confirm.
log_model = joblib.load('../Models/logistic_regression_model.pkl')

In [51]:
# Collect the per-word vectors into a NumPy array (the name `word_embeddings` is rebound here).
word_embeddings = np.array([entry['word_embedding'] for entry in embeddings_data])

# Label list; the value returned by the classifier is used as a position in this list.
categories = ['0', 'adjective', 'adposition', 'adverb', 'conjuction', 'noun', 'numeral', 'particle', 'pronoun', 'residual', 'verb']  # Example categories

# Features are every column of `df` except the first one.
feature_frame = df.iloc[:, 1:]
predicted_categories_indices = log_model.predict(feature_frame)

# Translate each predicted position into its label.
predicted_categories = [categories[class_position] for class_position in predicted_categories_indices]

In [52]:
# Pair every analysed word occurrence with its predicted category, keeping sentence order
# and repeated words (the same word may receive different categories in different positions).
word_category_pairs = [(data['word'], category) for data, category in zip(embeddings_data, predicted_categories)]

# Dict view of the same pairs: a word that occurs more than once keeps only the
# category predicted for its last occurrence.
word_category_mappings = dict(word_category_pairs)

In [53]:
# Display the word -> predicted category dictionary as the cell's output.
word_category_mappings

{'Јас': 'particle',
 'сум': 'verb',
 'тој': 'pronoun',
 'и': 'conjuction',
 'бил': 'verb',
 'замисли': 'verb',
 'да': 'conjuction',
 'не': 'pronoun',
 'беше': 'verb',
 'баш': 'adjective',
 'се': 'adjective',
 'што': 'adjective',
 'е': 'verb',
 'денес': 'adverb',
 'дали': 'adverb',
 'би': 'particle',
 'продолжил': 'verb'}