In [36]:
import pickle

import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection
import torch
from tqdm import tqdm
from transformers import XLMTokenizer, RobertaModel

In [41]:
# Load the (X, y) pair that was pickled to ../data/dataset.pickle.
# NOTE(review): pickle.load can execute arbitrary code, so only run this on a
# dataset file that comes from a trusted source.
with open('../data/dataset.pickle', 'rb') as dataset_file:
    X, y = pickle.load(dataset_file)
    

In [43]:
# Convert X to a NumPy array so it can be indexed positionally with the integer
# index arrays produced by the splitter further down.
# np.asarray is used instead of X.to_numpy() so the cell can be re-run safely:
# once X is already an ndarray it is returned unchanged, whereas a second
# X.to_numpy() call would raise AttributeError.
# NOTE(review): assumes X is loaded as a pandas object — confirm.
X = np.asarray(X)

### Convert y to one-hot encoding

In [45]:
# One-hot encode the labels: pd.get_dummies yields one indicator column per
# distinct label value.
y = pd.get_dummies(y)

# Preview the first five rows (head() defaults to n=5).
y.head()

Unnamed: 0,neutralne,oczekiwanie,podziw,radosc,smutek,strach,wstret,zaskoczenie,zlosc
0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0


## Split the dataset into train and test sets, preserving the label balance

In [64]:
# Single stratified shuffle split: 20% of the samples are held out for testing,
# and random_state=0 makes the split reproducible.
splitter = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# After pd.get_dummies, y is a DataFrame, and y[<integer array>] would be
# treated as a *column* lookup (KeyError) rather than a row selection.
# Indexing a NumPy view of the labels selects rows positionally, and also works
# if y is already an ndarray.
# NOTE(review): train_y / test_y are therefore ndarrays without the column
# names — confirm that is what the consumers of the pickles expect.
y_rows = np.asarray(y)

# n_splits=1, so the splitter yields exactly one (train, test) index pair.
train_index, test_index = next(splitter.split(X, y))
print(len(test_index))

train_X, test_X = X[train_index], X[test_index]
train_y, test_y = y_rows[train_index], y_rows[test_index]

843


## Get tokenizer and model

In [66]:
# Checkpoint identifiers for the pretrained tokenizer and encoder.
TOKENIZER_CHECKPOINT = "allegro/herbert-klej-cased-tokenizer-v1"
MODEL_CHECKPOINT = "allegro/herbert-klej-cased-v1"

# Instantiate both from their pretrained checkpoints.
tokenizer = XLMTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)
model = RobertaModel.from_pretrained(MODEL_CHECKPOINT)

## Parse datasets into input vectors

In [67]:
def encode_text(text):
    """Encode a single text into a fixed-size NumPy vector.

    The text is tokenized with the module-level ``tokenizer``, passed through
    the module-level ``model``, and the first row of the model's second output
    is returned as a NumPy array.

    Args:
        text: The text to encode.

    Returns:
        A NumPy array holding row 0 of ``outputs[1]``.
        NOTE(review): for ``RobertaModel`` this is presumably the pooled
        sentence representation — confirm against the transformers docs.
    """
    encoded_input = tokenizer.encode(text, return_tensors='pt')
    # Inference only: disable autograd so no computation graph is built (and
    # kept alive) for every forward pass.
    with torch.no_grad():
        outputs = model(encoded_input)
    # The tensor carries no gradient history here, so it converts directly.
    response = outputs[1].numpy()
    return response[0]

In [68]:
# Encode every training text, then every test text, into its vector form.
# Each pass is wrapped in tqdm to show a progress bar.
train_X_parsed = [encode_text(sample) for sample in tqdm(list(train_X))]

test_X_parsed = [encode_text(sample) for sample in tqdm(list(test_X))]

100%|██████████| 3368/3368 [03:45<00:00, 14.91it/s]
100%|██████████| 843/843 [00:58<00:00, 14.42it/s]


## Save the parsed train and test datasets so I don't have to parse them again

In [69]:
# Persist the encoded features together with their labels, one pickle per split.
for target_path, payload in (
    ('../data/train.pickle', (train_X_parsed, train_y)),
    ('../data/test.pickle', (test_X_parsed, test_y)),
):
    with open(target_path, 'wb') as out_file:
        pickle.dump(payload, out_file)

In [71]:
# Persist the original (un-encoded) inputs of each split.
for target_path, payload in (
    ('../data/train_org.pickle', train_X),
    ('../data/test_org.pickle', test_X),
):
    with open(target_path, 'wb') as out_file:
        pickle.dump(payload, out_file)