In [26]:
import joblib
from transformers import RobertaTokenizer, RobertaModel
import torch
import re
import numpy as np
import pandas as pd

In [112]:
# Initialize tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('macedonizer/mk-roberta-base')
model = RobertaModel.from_pretrained('macedonizer/mk-roberta-base')

def remove_special_characters(text):
    pattern = r'[?.,!:;@#$%^&*()\[\]{}\\/|+\-_=]'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text


To modify your process_sentences function to handle subtokens properly, you need to adjust the logic that matches words in the sentence to the tokens (and subtokens) generated by the tokenizer. When a word is split into multiple subtokens by the tokenizer, you should aggregate the embeddings of these subtokens to represent the original word.

Below is an updated version of your function with adjustments to handle subtokens correctly. The key changes include:

Using the tokenizer's encode_plus method to get offsets that help in aligning subtokens to original words.
Aggregating subtoken embeddings by taking the mean to represent the embedding of the original word.
Here's how you can do it:

python
Copy code
import torch
import numpy as np
from transformers import RobertaTokenizer, RobertaModel

# Assume remove_special_characters function exists. If not, you might need to define it.
# Example: def remove_special_characters(sentence): return re.sub(r'\W+', ' ', sentence)

tokenizer = RobertaTokenizer.from_pretrained('macedonizer/mk-roberta-base')
model = RobertaModel.from_pretrained('macedonizer/mk-roberta-base')

def process_sentences(sentences):
    embeddings_data = []
    sentence_id = 0  # Initialize sentence ID

    for sentence in sentences:
        # Assuming 'remove_special_characters' is a function you have that cleans the sentence.
        cleaned_sentence = remove_special_characters(sentence)
        inputs = tokenizer.encode_plus(cleaned_sentence, return_tensors="pt", padding=True, truncation=True, max_length=512, return_offsets_mapping=True)
        offset_mapping = inputs["offset_mapping"].squeeze().tolist()  # Get offsets

        with torch.no_grad():
            outputs = model(**inputs)

        sentence_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy().tolist()
        all_token_embeddings = outputs.last_hidden_state.squeeze().numpy()

        # Initialize container for word embeddings
        word_embeddings = []
        words = cleaned_sentence.split()
        word_idx = 0
        current_word_embeddings = []

        for idx, (start, end) in enumerate(offset_mapping):
            # Skip special tokens
            if idx == 0 or idx == len(offset_mapping) - 1 or start == end:
                continue
            
            token_embedding = all_token_embeddings[idx]
            token_text = cleaned_sentence[start:end]
            if token_text == words[word_idx] or token_text in words[word_idx]:
                current_word_embeddings.append(token_embedding)
            else:
                if current_word_embeddings:
                    word_embedding = np.mean(current_word_embeddings, axis=0)
                    word_embeddings.append((word_idx, word_embedding))
                    current_word_embeddings = [token_embedding]
                    word_idx += 1

        # Handling the last word
        if current_word_embeddings:
            word_embedding = np.mean(current_word_embeddings, axis=0)
            word_embeddings.append((word_idx, word_embedding))

        for word_idx, word_embedding in word_embeddings:
            embeddings_data.append({
                'sentence_id': sentence_id,
                'word': words[word_idx],
                'sentence_embedding': sentence_embedding,
                'word_embedding': word_embedding.tolist()
            })

        sentence_id += 1  # Increment sentence ID for the next sentence

    return embeddings_data
# Example usage:
sentences = [
    "Првата реченица за пример.",
    "Втората реченица со клучен збор.",
    "Уште еден пример."
]
embeddings_data = process_sentences(sentences)

# Continue with your data preparation and model prediction as before
df = pd.DataFrame(embeddings_data)
# And so on, to prepare your data frame and make predictions


SyntaxError: unterminated string literal (detected at line 15) (670651212.py, line 15)

In [96]:
embeddings_data

[{'sentence_id': 0,
  'word': 'Првата',
  'sentence_embedding': [-0.009156584739685059,
   -0.15129134058952332,
   0.7499036192893982,
   1.521781325340271,
   0.14840342104434967,
   -0.021482041105628014,
   0.7353679537773132,
   0.9052767753601074,
   0.02132779359817505,
   -1.018346905708313,
   0.6084856986999512,
   0.21688765287399292,
   -0.21893388032913208,
   -0.8626759648323059,
   0.05956171080470085,
   0.04468899965286255,
   0.19877910614013672,
   -1.0776903629302979,
   1.161653995513916,
   -0.5063474774360657,
   1.1141902208328247,
   0.1433129906654358,
   0.2113131731748581,
   -0.5923874378204346,
   0.574842631816864,
   -0.43301209807395935,
   -0.23845700919628143,
   0.5221996903419495,
   0.19209326803684235,
   -0.6463501453399658,
   -0.14747744798660278,
   -0.6198680400848389,
   1.3514553308486938,
   -0.5815837383270264,
   0.9212337136268616,
   -0.589144229888916,
   0.030186666175723076,
   -0.5794903635978699,
   -0.6616466641426086,
   -0.8929

In [97]:
df = pd.DataFrame(embeddings_data)

In [98]:
df

Unnamed: 0,sentence_id,word,sentence_embedding,word_embedding
0,0,Првата,"[-0.009156584739685059, -0.15129134058952332, ...","[-0.24916261434555054, 0.8135042190551758, 0.9..."
1,0,реченица,"[-0.009156584739685059, -0.15129134058952332, ...","[-2.5628092288970947, -0.0662396252155304, 1.3..."
2,0,за,"[-0.009156584739685059, -0.15129134058952332, ...","[0.8072260022163391, -0.9919570684432983, 0.93..."
3,0,пример,"[-0.009156584739685059, -0.15129134058952332, ...","[0.3718527853488922, -0.14160950481891632, 1.0..."
4,1,Втората,"[-0.5984004139900208, -0.43612363934516907, 0....","[-0.8104681968688965, -0.2734103798866272, 1.9..."
5,1,реченица,"[-0.5984004139900208, -0.43612363934516907, 0....","[-2.314239501953125, -0.5015822649002075, 1.40..."
6,1,со,"[-0.5984004139900208, -0.43612363934516907, 0....","[-0.6076048016548157, -1.4616307020187378, 0.6..."
7,1,клучен,"[-0.5984004139900208, -0.43612363934516907, 0....","[-1.079825520515442, 0.4995218813419342, 2.578..."
8,1,збор,"[-0.5984004139900208, -0.43612363934516907, 0....","[-0.2656204402446747, 0.16564339399337769, -0...."
9,2,Уште,"[0.26477915048599243, -0.4897470474243164, 1.4...","[-1.0692380666732788, -1.2803955078125, 2.8142..."


In [99]:
def create_feature_embeddings(df, columns):
    for column in columns:
        embedding_cols = pd.DataFrame(df[column].tolist(), columns=[f'{column}_{i}' for i in range(len(df[column][0]))])

        # Concatenate the new columns with the original dataframe
        df = pd.concat([df, embedding_cols], axis=1)
        
    return df

In [100]:
data = create_feature_embeddings(df, ['sentence_embedding', 'word_embedding'])

In [101]:
data

Unnamed: 0,sentence_id,word,sentence_embedding,word_embedding,sentence_embedding_0,sentence_embedding_1,sentence_embedding_2,sentence_embedding_3,sentence_embedding_4,sentence_embedding_5,...,word_embedding_758,word_embedding_759,word_embedding_760,word_embedding_761,word_embedding_762,word_embedding_763,word_embedding_764,word_embedding_765,word_embedding_766,word_embedding_767
0,0,Првата,"[-0.009156584739685059, -0.15129134058952332, ...","[-0.24916261434555054, 0.8135042190551758, 0.9...",-0.009157,-0.151291,0.749904,1.521781,0.148403,-0.021482,...,0.150657,0.488571,-2.025501,0.747567,-0.855354,0.865665,0.617027,2.206092,0.730327,-0.231137
1,0,реченица,"[-0.009156584739685059, -0.15129134058952332, ...","[-2.5628092288970947, -0.0662396252155304, 1.3...",-0.009157,-0.151291,0.749904,1.521781,0.148403,-0.021482,...,0.5691,-0.692857,0.634519,-0.64417,-0.250187,-0.729749,-0.034769,-1.773796,0.210206,0.138445
2,0,за,"[-0.009156584739685059, -0.15129134058952332, ...","[0.8072260022163391, -0.9919570684432983, 0.93...",-0.009157,-0.151291,0.749904,1.521781,0.148403,-0.021482,...,-0.126134,-0.330276,1.010782,-0.253082,0.18469,1.030987,-0.077637,1.107909,0.792928,0.287311
3,0,пример,"[-0.009156584739685059, -0.15129134058952332, ...","[0.3718527853488922, -0.14160950481891632, 1.0...",-0.009157,-0.151291,0.749904,1.521781,0.148403,-0.021482,...,-1.421076,0.950702,-0.155192,0.578046,0.065505,-0.864249,-0.262437,-1.712991,2.266911,0.414163
4,1,Втората,"[-0.5984004139900208, -0.43612363934516907, 0....","[-0.8104681968688965, -0.2734103798866272, 1.9...",-0.5984,-0.436124,0.897956,1.061009,0.40599,-0.701733,...,0.602408,0.856115,-1.309602,1.294117,-0.452934,-0.650744,1.173459,1.680583,0.663294,0.361774
5,1,реченица,"[-0.5984004139900208, -0.43612363934516907, 0....","[-2.314239501953125, -0.5015822649002075, 1.40...",-0.5984,-0.436124,0.897956,1.061009,0.40599,-0.701733,...,0.382606,-1.350259,0.972904,0.163307,0.016757,-1.37403,-0.563008,-1.540951,0.009127,0.310579
6,1,со,"[-0.5984004139900208, -0.43612363934516907, 0....","[-0.6076048016548157, -1.4616307020187378, 0.6...",-0.5984,-0.436124,0.897956,1.061009,0.40599,-0.701733,...,0.06513,-0.62407,0.105841,-0.1187,1.267312,0.247946,0.882784,-0.185189,2.710305,-0.096825
7,1,клучен,"[-0.5984004139900208, -0.43612363934516907, 0....","[-1.079825520515442, 0.4995218813419342, 2.578...",-0.5984,-0.436124,0.897956,1.061009,0.40599,-0.701733,...,-1.409296,0.45181,-0.432733,1.176227,-0.494484,-0.050853,-0.837261,0.078496,0.78922,0.749815
8,1,збор,"[-0.5984004139900208, -0.43612363934516907, 0....","[-0.2656204402446747, 0.16564339399337769, -0....",-0.5984,-0.436124,0.897956,1.061009,0.40599,-0.701733,...,-0.593114,0.284471,0.935451,0.373874,0.024296,-0.632544,-0.702653,1.171047,0.832984,-0.069346
9,2,Уште,"[0.26477915048599243, -0.4897470474243164, 1.4...","[-1.0692380666732788, -1.2803955078125, 2.8142...",0.264779,-0.489747,1.428535,0.616716,-0.505975,-0.717448,...,-2.110084,1.05735,-0.108762,-0.815067,-0.168555,-0.0179,0.027578,-0.09177,1.489248,-0.621472


In [102]:
data.drop(columns=['sentence_embedding', 'word_embedding'], inplace=True)

In [103]:
log_model = joblib.load('../Models/logistic_regression_model.pkl')

In [104]:
X = data.drop(columns=['sentence_id', 'word'])

In [105]:
X

Unnamed: 0,sentence_embedding_0,sentence_embedding_1,sentence_embedding_2,sentence_embedding_3,sentence_embedding_4,sentence_embedding_5,sentence_embedding_6,sentence_embedding_7,sentence_embedding_8,sentence_embedding_9,...,word_embedding_758,word_embedding_759,word_embedding_760,word_embedding_761,word_embedding_762,word_embedding_763,word_embedding_764,word_embedding_765,word_embedding_766,word_embedding_767
0,-0.009157,-0.151291,0.749904,1.521781,0.148403,-0.021482,0.735368,0.905277,0.021328,-1.018347,...,0.150657,0.488571,-2.025501,0.747567,-0.855354,0.865665,0.617027,2.206092,0.730327,-0.231137
1,-0.009157,-0.151291,0.749904,1.521781,0.148403,-0.021482,0.735368,0.905277,0.021328,-1.018347,...,0.5691,-0.692857,0.634519,-0.64417,-0.250187,-0.729749,-0.034769,-1.773796,0.210206,0.138445
2,-0.009157,-0.151291,0.749904,1.521781,0.148403,-0.021482,0.735368,0.905277,0.021328,-1.018347,...,-0.126134,-0.330276,1.010782,-0.253082,0.18469,1.030987,-0.077637,1.107909,0.792928,0.287311
3,-0.009157,-0.151291,0.749904,1.521781,0.148403,-0.021482,0.735368,0.905277,0.021328,-1.018347,...,-1.421076,0.950702,-0.155192,0.578046,0.065505,-0.864249,-0.262437,-1.712991,2.266911,0.414163
4,-0.5984,-0.436124,0.897956,1.061009,0.40599,-0.701733,0.545752,0.419049,0.267834,-0.910041,...,0.602408,0.856115,-1.309602,1.294117,-0.452934,-0.650744,1.173459,1.680583,0.663294,0.361774
5,-0.5984,-0.436124,0.897956,1.061009,0.40599,-0.701733,0.545752,0.419049,0.267834,-0.910041,...,0.382606,-1.350259,0.972904,0.163307,0.016757,-1.37403,-0.563008,-1.540951,0.009127,0.310579
6,-0.5984,-0.436124,0.897956,1.061009,0.40599,-0.701733,0.545752,0.419049,0.267834,-0.910041,...,0.06513,-0.62407,0.105841,-0.1187,1.267312,0.247946,0.882784,-0.185189,2.710305,-0.096825
7,-0.5984,-0.436124,0.897956,1.061009,0.40599,-0.701733,0.545752,0.419049,0.267834,-0.910041,...,-1.409296,0.45181,-0.432733,1.176227,-0.494484,-0.050853,-0.837261,0.078496,0.78922,0.749815
8,-0.5984,-0.436124,0.897956,1.061009,0.40599,-0.701733,0.545752,0.419049,0.267834,-0.910041,...,-0.593114,0.284471,0.935451,0.373874,0.024296,-0.632544,-0.702653,1.171047,0.832984,-0.069346
9,0.264779,-0.489747,1.428535,0.616716,-0.505975,-0.717448,0.735878,0.193803,-0.513627,-1.311199,...,-2.110084,1.05735,-0.108762,-0.815067,-0.168555,-0.0179,0.027578,-0.09177,1.489248,-0.621472


In [106]:
# word_embeddings = np.array([data['word_embedding'] for data in embeddings_data])
predicted_categories_indices = log_model.predict(X)
categories = ['0', 'adjective', 'adposition', 'adverb', 'conjuction', 'noun', 'numeral', 'particle', 'pronoun', 'residual', 'verb']  # Example categories
predicted_categories = [categories[index] for index in predicted_categories_indices]

In [107]:
data['predicted_category'] = predicted_categories

# Select only the necessary columns for the final mapping
final_mapping_df = data[['sentence_id', 'word', 'predicted_category']]

# Convert the dataframe into a dictionary for easy access if needed
sentence_word_category_mapping = final_mapping_df.groupby('sentence_id').apply(lambda x: dict(zip(x.word, x.predicted_category))).to_dict()

print(sentence_word_category_mapping)

{0: {'Првата': 'adjective', 'реченица': 'noun', 'за': 'adverb', 'пример': 'noun'}, 1: {'Втората': 'adjective', 'реченица': 'noun', 'со': 'adverb', 'клучен': 'adjective', 'збор': 'noun'}, 2: {'Уште': 'particle', 'еден': 'adverb', 'пример': 'noun'}}


In [108]:
def find_sentences_with_keyword_and_category(keyword, category, sentence_word_category_mapping, sentences):
    """
    Find sentences where the specified keyword falls under the given category.

    :param keyword: The keyword to look for.
    :param category: The category the keyword should fall under.
    :param sentence_word_category_mapping: A dictionary mapping sentence IDs to dictionaries of word-category pairs.
    :param sentences: A list of sentences corresponding to the sentence IDs.
    :return: A list of sentences where the keyword is categorized under the specified category.
    """
    matching_sentences = []
    keyword_lower = keyword.lower()
    category_lower = category.lower()

    # Iterate over the sentence ID and word-category mapping
    for sentence_id, word_category_map in sentence_word_category_mapping.items():
        # Check if the keyword exists in the current sentence's word-category map and matches the category, case insensitively
        for word, word_category in word_category_map.items():
            if word.lower() == keyword_lower and word_category.lower() == category_lower:
                # If a match is found, append the corresponding sentence to the matching_sentences list
                matching_sentences.append(sentences[sentence_id])
                break  # Stop checking other words in this sentence once a match is found

    return matching_sentences

In [109]:
result = find_sentences_with_keyword_and_category('збор', 'noun', sentence_word_category_mapping, sentences)

In [110]:
result

['Втората реченица со клучен збор.']