In [1]:
import joblib
from transformers import RobertaTokenizer, RobertaModel
import torch
import re
import numpy as np
import pandas as pd

In [2]:
# Load the tokenizer and the encoder from the same pretrained checkpoint so
# that token ids and embedding rows are guaranteed to correspond.
MODEL_CHECKPOINT = 'macedonizer/mk-roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = RobertaModel.from_pretrained(MODEL_CHECKPOINT)

def remove_special_characters(text):
    """Return ``text`` with every listed punctuation/symbol character deleted.

    Matching characters are removed outright rather than replaced by a space,
    so two words separated only by punctuation (``'a,b'``) end up joined
    (``'ab'``). Characters outside the listed set, including quotes and all
    letters, are left untouched.

    :param text: The string to clean.
    :return: The cleaned string.
    """
    punctuation = re.compile(r'[?.,!:;@#$%^&*()\[\]{}\\/|+\-_=]')
    return punctuation.sub('', text)

def process_sentences(sentences):
    """Compute a sentence embedding and per-word embeddings for each sentence.

    Each sentence is cleaned with ``remove_special_characters``, encoded with
    the module-level ``tokenizer`` and run through the module-level ``model``.
    The sentence embedding is the mean of the last hidden state over every
    token position (special tokens included). Each whitespace-separated word
    of the cleaned sentence is then aligned to the run of sub-word tokens that
    spells it, and its embedding is the mean of those tokens' hidden states.

    Alignment decodes the growing token span as a whole and stops as soon as
    the decoded text equals the word. Compared with matching one decoded token
    at a time against ``word.startswith(token)``, this
    * keeps consuming continuation pieces of a multi-token word (a
      continuation piece is not a prefix of the word by itself), and
    * never swallows the first token of the next word when that token is
      identical to, or a prefix of, the current word.

    A word whose tokens cannot be matched at the current position is skipped
    without advancing, so words cut off by truncation produce no record.

    :param sentences: Iterable of raw sentence strings.
    :return: List of dicts with keys ``sentence_id`` (position of the sentence
        in ``sentences``), ``word``, ``sentence_embedding`` and
        ``word_embedding`` (both plain lists of floats).
    """
    embeddings_data = []

    for sentence_id, sentence in enumerate(sentences):
        cleaned_sentence = remove_special_characters(sentence)
        inputs = tokenizer(cleaned_sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

        with torch.no_grad():
            outputs = model(**inputs)

        # One sentence per forward pass, so batch index 0 is the whole result:
        # a (num_tokens, hidden_size) matrix.
        token_states = outputs.last_hidden_state[0]
        sentence_embedding = token_states.mean(dim=0).numpy().tolist()
        all_token_embeddings = token_states.numpy()
        token_ids = inputs['input_ids'][0].tolist()

        last_token_idx = len(token_ids) - 1  # position of the trailing special token (excluded)
        token_idx = 1  # position 0 is the leading special token

        for word in cleaned_sentence.split():
            span_end = token_idx  # exclusive end of the token span matched so far

            while span_end < last_token_idx:
                piece = tokenizer.decode(token_ids[token_idx:span_end + 1], clean_up_tokenization_spaces=True)
                # NOTE(review): a byte-level BPE piece that ends mid-character is
                # presumably decoded with a trailing U+FFFD; dropping it lets the
                # span keep growing until the character is complete - confirm.
                piece = piece.strip().rstrip('\ufffd')

                if not word.startswith(piece):
                    break
                span_end += 1
                if piece == word:
                    # Whole word spelled out: stop before touching the next word's tokens.
                    break

            if span_end > token_idx:
                word_embedding = all_token_embeddings[token_idx:span_end].mean(axis=0).tolist()
                embeddings_data.append({
                    'sentence_id': sentence_id,
                    'word': word,
                    'sentence_embedding': sentence_embedding,
                    'word_embedding': word_embedding
                })
                token_idx = span_end

    return embeddings_data

# Demo input: two sentences pushed through the embedding pipeline.
sentences = [
    'Јас сум тој и тој сум бил, замисли да не беше баш се што е денес, дали би продолжил?',
    'Втора реченица за пример.'
]
embeddings_data = process_sentences(sentences)

# One row per aligned word; the embedding columns still hold Python lists here.
df = pd.DataFrame(embeddings_data)
# The cells below expand those lists into numeric columns and run the classifier.


Some weights of the model checkpoint at macedonizer/mk-roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at macedonizer/mk-roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to b

In [3]:
# Inspect the raw records (one dict per aligned word, embeddings as float lists).
embeddings_data

[{'sentence_id': 0,
  'word': 'Јас',
  'sentence_embedding': [-0.1404009610414505,
   -0.33094486594200134,
   1.3437083959579468,
   0.8852136731147766,
   -0.38577476143836975,
   -0.9087980389595032,
   0.9380064606666565,
   0.895352840423584,
   0.3932926654815674,
   -0.4657720923423767,
   0.34640464186668396,
   -0.604483962059021,
   0.12050440162420273,
   0.2667635381221771,
   0.25184231996536255,
   -0.5110941529273987,
   -0.23545759916305542,
   -0.7583696246147156,
   -0.060181062668561935,
   -0.12982332706451416,
   0.6036368012428284,
   -1.0947465896606445,
   0.16814544796943665,
   0.21179307997226715,
   0.3282619118690491,
   0.42148613929748535,
   -0.3655751645565033,
   0.423129141330719,
   -0.7752925157546997,
   -0.4616692364215851,
   -0.20094580948352814,
   -0.060646381229162216,
   0.7131708264350891,
   0.9017381072044373,
   0.6246816515922546,
   -1.358420729637146,
   0.3966732919216156,
   0.010997317731380463,
   -1.1401231288909912,
   -0.454563

In [13]:
# Rebuild the per-word DataFrame from the records returned by process_sentences.
df = pd.DataFrame(embeddings_data)

In [14]:
# Display the per-word frame; both embedding columns are list-valued at this stage.
df

Unnamed: 0,sentence_id,word,sentence_embedding,word_embedding
0,0,Јас,"[-0.1404009610414505, -0.33094486594200134, 1....","[-0.5557374358177185, -0.5370658040046692, 0.5..."
1,0,сум,"[-0.1404009610414505, -0.33094486594200134, 1....","[-1.4281060695648193, 0.1723286360502243, 1.10..."
2,0,тој,"[-0.1404009610414505, -0.33094486594200134, 1....","[-0.13567139208316803, -0.5457574725151062, 0...."
3,0,и,"[-0.1404009610414505, -0.33094486594200134, 1....","[0.8628255128860474, -0.8517863750457764, 0.10..."
4,0,тој,"[-0.1404009610414505, -0.33094486594200134, 1....","[-0.09584172070026398, -0.2445765882730484, 1...."
5,0,сум,"[-0.1404009610414505, -0.33094486594200134, 1....","[-1.1216413974761963, 0.36443406343460083, 1.3..."
6,0,бил,"[-0.1404009610414505, -0.33094486594200134, 1....","[-0.1125146746635437, 0.3591093420982361, 2.25..."
7,0,замисли,"[-0.1404009610414505, -0.33094486594200134, 1....","[0.9079269170761108, 0.07648082822561264, 0.59..."
8,0,да,"[-0.1404009610414505, -0.33094486594200134, 1....","[-0.8650392889976501, -0.8034347891807556, 2.4..."
9,0,не,"[-0.1404009610414505, -0.33094486594200134, 1....","[-1.1020736694335938, -1.287554383277893, 1.77..."


In [42]:
def create_feature_embeddings(df, columns):
    """Expand list-valued embedding columns into one numeric column per dimension.

    For every name in ``columns`` the lists stored in ``df[name]`` are spread
    into new columns ``{name}_0 ... {name}_{d-1}``, where ``d`` is the length
    of the list in the first row. The new columns are appended to the right of
    the existing ones, in the order given by ``columns``. The input frame is
    not modified.

    The expanded columns reuse ``df``'s own index, so the rows stay aligned
    even when ``df`` does not carry the default ``0..n-1`` index (for example
    after filtering). An empty frame is returned as an unchanged copy because
    the embedding width cannot be determined without a row.

    :param df: DataFrame whose ``columns`` hold equal-length lists of numbers.
    :param columns: Names of the list-valued columns to expand.
    :return: A new DataFrame with the original columns plus the expanded ones.
    """
    if len(df) == 0:
        return df.copy()

    frames = [df]
    for column in columns:
        vectors = df[column].tolist()
        # Positional first row: a label lookup (df[column][0]) fails when the
        # index has no label 0.
        width = len(vectors[0])
        frames.append(
            pd.DataFrame(
                vectors,
                columns=[f'{column}_{i}' for i in range(width)],
                index=df.index,  # keep rows aligned with df during the concat
            )
        )

    # Single concat instead of growing the frame once per column.
    return pd.concat(frames, axis=1)

In [43]:
# Spread both list-valued embedding columns into one numeric column per dimension.
data = create_feature_embeddings(df, ['sentence_embedding', 'word_embedding'])

In [44]:
# Display the expanded frame (original columns followed by the per-dimension columns).
data

Unnamed: 0,sentence_id,word,sentence_embedding,word_embedding,sentence_embedding_0,sentence_embedding_1,sentence_embedding_2,sentence_embedding_3,sentence_embedding_4,sentence_embedding_5,...,word_embedding_758,word_embedding_759,word_embedding_760,word_embedding_761,word_embedding_762,word_embedding_763,word_embedding_764,word_embedding_765,word_embedding_766,word_embedding_767
0,0,Јас,"[-0.1404009610414505, -0.33094486594200134, 1....","[-0.5557374358177185, -0.5370658040046692, 0.5...",-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,...,0.123755,0.878346,-0.638973,-1.039611,-0.230183,-0.106538,-0.828146,0.84972,-0.433959,-1.140905
1,0,сум,"[-0.1404009610414505, -0.33094486594200134, 1....","[-1.4281060695648193, 0.1723286360502243, 1.10...",-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,...,0.75971,-1.760856,-0.048928,-1.248029,-0.624639,0.329493,-0.985875,0.295632,1.002164,0.374117
2,0,тој,"[-0.1404009610414505, -0.33094486594200134, 1....","[-0.13567139208316803, -0.5457574725151062, 0....",-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,...,0.479956,-1.127033,-0.871265,0.429548,-1.051022,-0.086075,-2.293173,0.728684,-1.784118,-0.555762
3,0,и,"[-0.1404009610414505, -0.33094486594200134, 1....","[0.8628255128860474, -0.8517863750457764, 0.10...",-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,...,0.252122,-0.511153,-0.059363,-0.718377,0.178286,-0.068777,-1.635254,-0.558813,-0.488259,-0.566882
4,0,тој,"[-0.1404009610414505, -0.33094486594200134, 1....","[-0.09584172070026398, -0.2445765882730484, 1....",-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,...,1.215836,-1.045205,-1.117911,1.004986,-1.716781,-0.019117,-1.480631,1.01364,-0.823914,-0.181509
5,0,сум,"[-0.1404009610414505, -0.33094486594200134, 1....","[-1.1216413974761963, 0.36443406343460083, 1.3...",-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,...,0.881869,-2.012005,-0.73971,0.216602,-0.743142,-0.321566,-0.326809,0.383635,1.634734,-0.363552
6,0,бил,"[-0.1404009610414505, -0.33094486594200134, 1....","[-0.1125146746635437, 0.3591093420982361, 2.25...",-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,...,-0.478209,-1.0779,-0.123418,0.064544,-1.189237,0.080105,-0.565171,0.791668,1.188609,-0.047791
7,0,замисли,"[-0.1404009610414505, -0.33094486594200134, 1....","[0.9079269170761108, 0.07648082822561264, 0.59...",-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,...,0.740563,-0.990182,-0.97248,-1.510459,-0.040884,0.188055,-1.88328,0.293684,-0.947339,0.601113
8,0,да,"[-0.1404009610414505, -0.33094486594200134, 1....","[-0.8650392889976501, -0.8034347891807556, 2.4...",-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,...,-0.091537,-0.976817,-1.292882,0.634442,-1.886335,0.080082,0.339828,0.214133,-0.069076,-0.334909
9,0,не,"[-0.1404009610414505, -0.33094486594200134, 1....","[-1.1020736694335938, -1.287554383277893, 1.77...",-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,...,1.118946,0.173948,-0.029013,-1.648056,-2.462029,-0.484853,-1.360387,1.132703,-0.168596,0.208399


In [45]:
# Remove the list-valued columns now that they are expanded. Reassigning with
# errors='ignore' (instead of inplace=True) keeps this cell safe to re-run:
# a second run no longer raises KeyError for the already-removed columns.
data = data.drop(columns=['sentence_embedding', 'word_embedding'], errors='ignore')

In [46]:
# Load the previously trained classifier from disk (path is relative to the
# notebook's working directory).
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code - only load model files that come from a trusted source.
log_model = joblib.load('../Models/logistic_regression_model.pkl')

In [47]:
# Feature matrix: everything except the identifier columns. A later cell adds a
# 'predicted_category' column to `data`; dropping it here when present keeps the
# feature matrix purely numeric if this cell is re-run after that one.
X = data.drop(columns=['sentence_id', 'word', 'predicted_category'], errors='ignore')

In [48]:
# Display the feature matrix passed to the classifier.
X

Unnamed: 0,sentence_embedding_0,sentence_embedding_1,sentence_embedding_2,sentence_embedding_3,sentence_embedding_4,sentence_embedding_5,sentence_embedding_6,sentence_embedding_7,sentence_embedding_8,sentence_embedding_9,...,word_embedding_758,word_embedding_759,word_embedding_760,word_embedding_761,word_embedding_762,word_embedding_763,word_embedding_764,word_embedding_765,word_embedding_766,word_embedding_767
0,-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,0.938006,0.895353,0.393293,-0.465772,...,0.123755,0.878346,-0.638973,-1.039611,-0.230183,-0.106538,-0.828146,0.84972,-0.433959,-1.140905
1,-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,0.938006,0.895353,0.393293,-0.465772,...,0.75971,-1.760856,-0.048928,-1.248029,-0.624639,0.329493,-0.985875,0.295632,1.002164,0.374117
2,-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,0.938006,0.895353,0.393293,-0.465772,...,0.479956,-1.127033,-0.871265,0.429548,-1.051022,-0.086075,-2.293173,0.728684,-1.784118,-0.555762
3,-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,0.938006,0.895353,0.393293,-0.465772,...,0.252122,-0.511153,-0.059363,-0.718377,0.178286,-0.068777,-1.635254,-0.558813,-0.488259,-0.566882
4,-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,0.938006,0.895353,0.393293,-0.465772,...,1.215836,-1.045205,-1.117911,1.004986,-1.716781,-0.019117,-1.480631,1.01364,-0.823914,-0.181509
5,-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,0.938006,0.895353,0.393293,-0.465772,...,0.881869,-2.012005,-0.73971,0.216602,-0.743142,-0.321566,-0.326809,0.383635,1.634734,-0.363552
6,-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,0.938006,0.895353,0.393293,-0.465772,...,-0.478209,-1.0779,-0.123418,0.064544,-1.189237,0.080105,-0.565171,0.791668,1.188609,-0.047791
7,-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,0.938006,0.895353,0.393293,-0.465772,...,0.740563,-0.990182,-0.97248,-1.510459,-0.040884,0.188055,-1.88328,0.293684,-0.947339,0.601113
8,-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,0.938006,0.895353,0.393293,-0.465772,...,-0.091537,-0.976817,-1.292882,0.634442,-1.886335,0.080082,0.339828,0.214133,-0.069076,-0.334909
9,-0.140401,-0.330945,1.343708,0.885214,-0.385775,-0.908798,0.938006,0.895353,0.393293,-0.465772,...,1.118946,0.173948,-0.029013,-1.648056,-2.462029,-0.484853,-1.360387,1.132703,-0.168596,0.208399


In [49]:
# Label names, looked up by the integer id the classifier predicts.
# NOTE(review): assumes log_model was trained with labels encoded as positions
# in exactly this list - confirm against the training notebook.
categories = ['0', 'adjective', 'adposition', 'adverb', 'conjuction', 'noun', 'numeral', 'particle', 'pronoun', 'residual', 'verb']  # Example categories

# One predicted label id per row of X, then translated to its label name.
predicted_categories_indices = log_model.predict(X)
predicted_categories = [categories[label_id] for label_id in predicted_categories_indices]

In [51]:
# Attach the predicted label to each word row.
data['predicted_category'] = predicted_categories

# Keep only the columns needed for the final mapping.
final_mapping_df = data[['sentence_id', 'word', 'predicted_category']]

# {sentence_id: {word: predicted_category}}. The groups are iterated directly
# rather than going through DataFrameGroupBy.apply, which warns in recent
# pandas when the applied frame still contains the grouping column.
# NOTE: a word that occurs more than once in a sentence appears only once in
# the inner dict and keeps the category of its last occurrence.
sentence_word_category_mapping = {
    sentence_id: dict(zip(group['word'], group['predicted_category']))
    for sentence_id, group in final_mapping_df.groupby('sentence_id')
}

print(sentence_word_category_mapping)

{0: {'Јас': 'particle', 'сум': 'verb', 'тој': 'pronoun', 'и': 'conjuction', 'бил': 'verb', 'замисли': 'verb', 'да': 'conjuction', 'не': 'pronoun', 'беше': 'verb', 'баш': 'adjective', 'се': 'adjective', 'што': 'adjective', 'е': 'verb', 'денес': 'adverb', 'дали': 'adverb', 'би': 'particle', 'продолжил': 'verb'}, 1: {'Втора': 'adjective', 'реченица': 'noun', 'за': 'adverb', 'пример': 'noun'}}


In [57]:
def find_sentences_with_keyword_and_category(keyword, category, sentence_word_category_mapping, sentences):
    """
    Return the sentences in which ``keyword`` was tagged with ``category``.

    Both the keyword and the category are compared case-insensitively. Each
    sentence is returned at most once, in the iteration order of the mapping.

    :param keyword: The word to look for.
    :param category: The category the word must have been assigned.
    :param sentence_word_category_mapping: Dict mapping a sentence ID to a dict of word -> category.
    :param sentences: Sequence of sentences, indexable by the sentence IDs used in the mapping.
    :return: List of the matching sentences.
    """
    wanted_word = keyword.lower()
    wanted_category = category.lower()

    def has_match(tagged_words):
        # The category is only examined for entries whose word already matched.
        return any(
            token.lower() == wanted_word and tag.lower() == wanted_category
            for token, tag in tagged_words.items()
        )

    return [
        sentences[sid]
        for sid, tagged_words in sentence_word_category_mapping.items()
        if has_match(tagged_words)
    ]

In [60]:
# The lookup is case-insensitive, so the capitalised keyword still matches the
# lower-case 'продолжил' stored in the mapping.
result = find_sentences_with_keyword_and_category('Продолжил', 'verb', sentence_word_category_mapping, sentences)

In [61]:
# Display the sentences in which the keyword was tagged with the requested category.
result

['Јас сум тој и тој сум бил, замисли да не беше баш се што е денес, дали би продолжил?']