Preprocessing the data:
1. Replace the 'Nan' placeholder strings (used for missing values) with zero
2. One-hot encode the categorical values of every column except 'word', 'lemma' and 'sentence_id'
3. Use a RoBERTa model (macedonizer) to compute embeddings for the words and lemmas.
4. Create a new feature which is the cosine similarity between two consecutive sentences in the dataset. (Optional)

In [2]:
import pandas as pd

In [3]:
# Load the annotated corpus: one row per token, with its sentence_id and
# morphological attributes in the remaining columns.
data = pd.read_csv(filepath_or_buffer='../../data.csv')

In [4]:
# Display the raw frame as loaded; missing attributes show up as the literal
# string 'Nan'.
data

Unnamed: 0,sentence_id,word,lemma,category,type,gender,number,case,definite,person,tense,aspect,negation,degree,formation,vform,form,voice
0,1,Беше,сум,verb,auxiliarry,-,singular,Nan,Nan,-,imperfect,progressive,Nan,Nan,Nan,indicative,Nan,Nan
1,1,јасен,јасен,adjective,qualificative,masculine,singular,Nan,-,Nan,Nan,Nan,Nan,positive,Nan,Nan,Nan,Nan
2,1,и,и,conjuction,coordinating,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,simple,Nan,Nan,Nan
3,1,студен,студен,adjective,qualificative,masculine,singular,Nan,-,Nan,Nan,Nan,Nan,positive,Nan,Nan,Nan,Nan
4,1,априлски,априлски,adjective,general,-,plural,Nan,-,Nan,Nan,Nan,Nan,-,Nan,Nan,Nan,Nan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96057,6790,како,како,conjuction,subordinating,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,compound,Nan,Nan,Nan
96058,6790,што,што,conjuction,subordinating,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,compound,Nan,Nan,Nan
96059,6790,е,сум,verb,auxiliarry,-,singular,Nan,Nan,third,present,progressive,Nan,Nan,Nan,indicative,Nan,Nan
96060,6790,2050,2050,numeral,cardinal,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,digit,Nan


In [5]:
# Missing attributes are stored as the literal string 'Nan'; swap each one for
# the integer 0, modifying the frame in place.
data.replace(to_replace='Nan', value=0, inplace=True)

In [6]:
# Display the frame again to check that the 'Nan' placeholders are now 0.
data

Unnamed: 0,sentence_id,word,lemma,category,type,gender,number,case,definite,person,tense,aspect,negation,degree,formation,vform,form,voice
0,1,Беше,сум,verb,auxiliarry,-,singular,0,0,-,imperfect,progressive,0,0,0,indicative,0,0
1,1,јасен,јасен,adjective,qualificative,masculine,singular,0,-,0,0,0,0,positive,0,0,0,0
2,1,и,и,conjuction,coordinating,0,0,0,0,0,0,0,0,0,simple,0,0,0
3,1,студен,студен,adjective,qualificative,masculine,singular,0,-,0,0,0,0,positive,0,0,0,0
4,1,априлски,априлски,adjective,general,-,plural,0,-,0,0,0,0,-,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96057,6790,како,како,conjuction,subordinating,0,0,0,0,0,0,0,0,0,compound,0,0,0
96058,6790,што,што,conjuction,subordinating,0,0,0,0,0,0,0,0,0,compound,0,0,0
96059,6790,е,сум,verb,auxiliarry,-,singular,0,0,third,present,progressive,0,0,0,indicative,0,0
96060,6790,2050,2050,numeral,cardinal,0,0,0,0,0,0,0,0,0,0,0,digit,0


In [7]:
from sklearn.preprocessing import OneHotEncoder

# Every column except the text columns and the sentence identifier is treated
# as categorical and one-hot encoded.
columns_to_encode = list(data.drop(columns=['word', 'lemma', 'sentence_id']).columns)

# Cast to str so the integer 0 placeholders and the string labels share one
# type; the placeholder then becomes its own '0' category (e.g. category_0).
data[columns_to_encode] = data[columns_to_encode].astype(str)

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Try the current name first and fall back to the old one so
# the cell runs on both old and new installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Dense 0/1 matrix with one column per (source column, category) pair.
encoded_data = encoder.fit_transform(data[columns_to_encode])

# Name the indicator columns '<column>_<category>'.
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(columns_to_encode))

# Put the untouched identifier/text columns next to the indicators; both sides
# are re-indexed from 0 so the rows align positionally.
final_data = pd.concat([data[['sentence_id', 'word', 'lemma']].reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

print(final_data)



       sentence_id      word     lemma  category_0  category_adjective  \
0                1      Беше       сум         0.0                 0.0   
1                1     јасен     јасен         0.0                 1.0   
2                1         и         и         0.0                 0.0   
3                1    студен    студен         0.0                 1.0   
4                1  априлски  априлски         0.0                 1.0   
...            ...       ...       ...         ...                 ...   
96057         6790      како      како         0.0                 0.0   
96058         6790       што       што         0.0                 0.0   
96059         6790         е       сум         0.0                 0.0   
96060         6790      2050      2050         0.0                 0.0   
96061         6790    година    година         0.0                 0.0   

       category_adposition  category_adverb  category_conjuction  \
0                      0.0              0.0

In [8]:
# Preview the first rows of the one-hot encoded frame.
final_data.head()

Unnamed: 0,sentence_id,word,lemma,category_0,category_adjective,category_adposition,category_adverb,category_conjuction,category_noun,category_numeral,...,formation_compound,formation_simple,vform_0,vform_imperative,vform_indicative,form_0,form_digit,form_letter,form_roman,voice_0
0,1,Беше,сум,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1,1,јасен,јасен,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1,и,и,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1,студен,студен,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1,априлски,априлски,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [9]:
# Key each token by its surface form plus its category, so the same spelling
# under two different categories gets two different ids.
data['word_category'] = data['word'] + '_' + data['category']

# Number the distinct keys 1, 2, 3, ... in order of first appearance.
distinct_keys = data['word_category'].unique()
word_category_to_id = dict(zip(distinct_keys, range(1, len(distinct_keys) + 1)))

# Look up every row's key in the table to get its integer id.
data['word_id'] = data['word_category'].map(word_category_to_id)

In [10]:
data

Unnamed: 0,sentence_id,word,lemma,category,type,gender,number,case,definite,person,tense,aspect,negation,degree,formation,vform,form,voice,word_category,word_id
0,1,Беше,сум,verb,auxiliarry,-,singular,0,0,-,imperfect,progressive,0,0,0,indicative,0,0,Беше_verb,1
1,1,јасен,јасен,adjective,qualificative,masculine,singular,0,-,0,0,0,0,positive,0,0,0,0,јасен_adjective,2
2,1,и,и,conjuction,coordinating,0,0,0,0,0,0,0,0,0,simple,0,0,0,и_conjuction,3
3,1,студен,студен,adjective,qualificative,masculine,singular,0,-,0,0,0,0,positive,0,0,0,0,студен_adjective,4
4,1,априлски,априлски,adjective,general,-,plural,0,-,0,0,0,0,-,0,0,0,0,априлски_adjective,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96057,6790,како,како,conjuction,subordinating,0,0,0,0,0,0,0,0,0,compound,0,0,0,како_conjuction,393
96058,6790,што,што,conjuction,subordinating,0,0,0,0,0,0,0,0,0,compound,0,0,0,што_conjuction,146
96059,6790,е,сум,verb,auxiliarry,-,singular,0,0,third,present,progressive,0,0,0,indicative,0,0,е_verb,418
96060,6790,2050,2050,numeral,cardinal,0,0,0,0,0,0,0,0,0,0,0,digit,0,2050_numeral,4723


In [11]:
# word_category was only a helper key for building word_id; remove it from
# the frame in place.

del data['word_category']

In [12]:
# Display the frame to check that word_category is gone and word_id remains.
data

Unnamed: 0,sentence_id,word,lemma,category,type,gender,number,case,definite,person,tense,aspect,negation,degree,formation,vform,form,voice,word_id
0,1,Беше,сум,verb,auxiliarry,-,singular,0,0,-,imperfect,progressive,0,0,0,indicative,0,0,1
1,1,јасен,јасен,adjective,qualificative,masculine,singular,0,-,0,0,0,0,positive,0,0,0,0,2
2,1,и,и,conjuction,coordinating,0,0,0,0,0,0,0,0,0,simple,0,0,0,3
3,1,студен,студен,adjective,qualificative,masculine,singular,0,-,0,0,0,0,positive,0,0,0,0,4
4,1,априлски,априлски,adjective,general,-,plural,0,-,0,0,0,0,-,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96057,6790,како,како,conjuction,subordinating,0,0,0,0,0,0,0,0,0,compound,0,0,0,393
96058,6790,што,што,conjuction,subordinating,0,0,0,0,0,0,0,0,0,compound,0,0,0,146
96059,6790,е,сум,verb,auxiliarry,-,singular,0,0,third,present,progressive,0,0,0,indicative,0,0,418
96060,6790,2050,2050,numeral,cardinal,0,0,0,0,0,0,0,0,0,0,0,digit,0,4723


In [14]:
# Write the frame (original columns plus word_id) without the row index.
# NOTE(review): this saves `data`, not the one-hot encoded `final_data` built
# earlier - confirm that the encoded frame is intentionally left out.
data.to_csv(path_or_buf='../../preprocessed_data.csv', index=False)