In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
import utils
import os

In [2]:
# Hugging Face checkpoint id; the lower-cased part after the "/" is reused
# by later cells as the name of the embeddings output folder.
model_name = "setu4993/LEALLA-large"

# Tokenizer and encoder are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [6]:
# Extract embeddings for every language and cache them on disk as .npy files.
# Per language this writes, under ./embeddings/<model>/<lang>/:
#   train_<layer>.npy  - one file per entry of `cls_tokens` (index 0 excluded)
#   train.npy          - the `embeddings` array for the track A train split
#   dev.npy            - the `embeddings` array for the track C dev split

# Use the GPU when one is present, otherwise fall back to the CPU so the cell
# still runs on machines without CUDA (the device used to be hard-coded).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for lang in ['eng', 'deu', 'esp', 'ron', 'ukr', 'hin']:
    # --- train split -------------------------------------------------------
    data_df = pd.read_csv(f'./data/track_a/train/{lang}.csv')
    cls_tokens, embeddings = utils.get_all_hidden_cls(model, tokenizer, device=device, texts=data_df.text.to_list())

    # Output folder is named after the checkpoint (text after the "/", lower-cased).
    path = f'./embeddings/{model_name.split("/")[-1].lower()}/{lang}/'
    os.makedirs(path, exist_ok=True)

    # Index 0 of `cls_tokens` is deliberately not saved.
    # NOTE(review): presumably index 0 is the embedding-layer output rather than
    # a transformer layer - confirm against utils.get_all_hidden_cls.
    for layer in range(1, len(cls_tokens)):
        layer_path = path + 'train_' + str(layer) + '.npy'
        np.save(layer_path, cls_tokens[layer])

    np.save(path + 'train.npy', embeddings)
    print(lang)
    print(embeddings.shape)

    # --- dev split ---------------------------------------------------------
    # Only the final embeddings are kept for dev; the per-layer output is discarded.
    data_df = pd.read_csv(f'./data/track_c/dev/{lang}.csv')
    _, embeddings = utils.get_all_hidden_cls(model, tokenizer, device=device, texts=data_df.text.to_list())
    np.save(path + 'dev.npy', embeddings)
    print(embeddings.shape)

eng
(2768, 256)
(116, 256)
deu
(2603, 256)
(200, 256)
esp
(1996, 256)
(184, 256)
ron
(1241, 256)
(123, 256)
ukr
(2466, 256)
(249, 256)
hin
(2556, 256)
(100, 256)


In [16]:
## Dataset splits (train + val): languages for which a split is created
LANGS = ["eng", "deu", "esp"]

In [33]:
from sklearn.model_selection import train_test_split


def _print_label_counts(label_frame):
    """Print each column name of ``label_frame`` followed by the counts of its unique values."""
    for c in label_frame.columns:
        print('\t', c)
        # np.unique(..., return_counts=True) returns (values, counts); only the
        # counts (ordered by sorted unique value) are printed.
        print('\t', np.unique(label_frame[c], return_counts=True)[1])


# For every language in LANGS: print the label distribution of the track A
# training file, draw a fixed-seed 85/15 split over row positions, store the
# two index arrays next to the cached embeddings, and print the label
# distribution of the validation part.
for lang in LANGS:
    data_df = pd.read_csv(f'./data/track_a/train/{lang}.csv')
    # Every column other than 'id' and 'text' is reported as a label column.
    label_columns = [c for c in data_df.columns if c != 'id' and c != 'text']

    print(lang)
    _print_label_counts(data_df[label_columns])

    # Split row positions (not the rows themselves) so the same indices can be
    # applied to any array that is aligned with the CSV rows.
    indices = np.arange(0, len(data_df))
    train_indices, val_indices = train_test_split(indices, test_size=0.15, random_state=1007)

    path = f'./embeddings/{model_name.split("/")[-1].lower()}/{lang}/'
    # np.save does not create missing directories; make sure the target exists
    # so this cell does not depend on the extraction cell having run first.
    os.makedirs(path, exist_ok=True)
    np.save(path + 'train_indices.npy', train_indices)
    np.save(path + 'val_indices.npy', val_indices)

    print('  val split')
    # val_indices are positional, hence .iloc.
    _print_label_counts(data_df[label_columns].iloc[val_indices])

eng
	 anger
	 [2435  333]
	 fear
	 [1157 1611]
	 joy
	 [2094  674]
	 sadness
	 [1890  878]
	 surprise
	 [1929  839]
  val split
	 anger
	 [378  38]
	 fear
	 [163 253]
	 joy
	 [303 113]
	 sadness
	 [279 137]
	 surprise
	 [274 142]
deu
	 anger
	 [1835  768]
	 disgust
	 [1771  832]
	 fear
	 [2364  239]
	 joy
	 [2062  541]
	 sadness
	 [2087  516]
	 surprise
	 [2444  159]
  val split
	 anger
	 [260 131]
	 disgust
	 [261 130]
	 fear
	 [360  31]
	 joy
	 [315  76]
	 sadness
	 [306  85]
	 surprise
	 [370  21]
esp
	 anger
	 [1504  492]
	 disgust
	 [1342  654]
	 fear
	 [1679  317]
	 joy
	 [1354  642]
	 sadness
	 [1687  309]
	 surprise
	 [1575  421]
  val split
	 anger
	 [223  77]
	 disgust
	 [207  93]
	 fear
	 [249  51]
	 joy
	 [202  98]
	 sadness
	 [250  50]
	 surprise
	 [234  66]


In [None]:
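## QWEN: time per language
# ENG - 25 min
# DEU - 60 min
# ESP - 10 min
# RON - 15 min
# UKR - 20 min
# HIN - 45 min

# 3h05 total (NOTE: the per-language figures above add up to 2h55 - confirm which is correct)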

In [None]:
## LEALLA fine-tuning time: 15 minutes
## LEALLA all-layer embedding extraction and linear classifier training: 15-20 minutes per language