In [4]:
import numpy as np

# Build the pooled multilingual training matrix by stacking the per-language
# QWEN training embeddings row-wise, in the order given by `langs`.
# NOTE(review): the row order must match ./data/track_a/train/deu-esp-eng.csv,
# which is not visible here -- confirm that file is concatenated deu, esp, eng.
langs = ['deu', 'esp', 'eng']
per_lang_embs = [np.load(f'./embeddings/qwen/train_{lang}.npy') for lang in langs]
all_embs = np.concatenate(per_lang_embs, axis=0)
np.save('./embeddings/qwen/train_deu-esp-eng.npy', all_embs)
# Last expression of the cell: display (n_samples, emb_dim) as a sanity check.
all_embs.shape

(7367, 3584)

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import os
import utils
import datasets_local
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Embedding width of each supported encoder, keyed by the lower-cased model id
# (the same string as `model_path_name` below).
EMB_DIM = {
    'lealla-large': 256,
    'qwen2.5-7b': 3584,
}
# One linear probe is trained per seed; the best one on validation is kept.
SEEDS = [1007, 1013, 1019]
# Hyper-parameters handed unchanged to utils.train_lp_balanced_class_loss.
# NOTE(review): the meaning of the string options ('adamw', 'bce_wll', 'bce')
# is defined in utils, which is not visible here.
train_kwargs = {
    'output_dim': 1,
    'optimizer': 'adamw',
    'loss_fn': 'bce_wll',
    'pred_fn': 'bce',
    'lr': 1e-3,
    'lr_min': 1e-5,
    'n_epochs': 50,
    'wd': 1e-3,
    'bs': 512,
}
model_name = "unsloth/QWEN2.5-7B"
# Directory-friendly model id: last path component, lower-cased -> 'qwen2.5-7b'.
model_path_name = model_name.lower().split('/')[-1]
# Use the GPU when one is present, otherwise fall back to the CPU so that the
# later `.to(device)` calls do not fail on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [2]:
# Languages the probes are trained on (in-distribution) and the languages that
# are only predicted on (out-of-distribution, track C dev sets).
ID_LANGS = ['eng', 'deu', 'esp', 'deu-esp-eng']
OOD_LANGS = ['ron', 'ukr', 'hin']

# Per-OOD-language lookups, filled in by the loop below:
#   lang_mtds       -> dev DataFrame without the 'text' column
#   lang_targets    -> list of label columns (everything except 'id'/'text')
#   lang_embeddings -> row-wise L2-normalised embedding tensor on `device`
lang_mtds = dict.fromkeys(OOD_LANGS)
lang_targets = dict.fromkeys(OOD_LANGS)
lang_embeddings = dict.fromkeys(OOD_LANGS)

for lang in OOD_LANGS:
    dev_frame = pd.read_csv(f'./data/track_c/dev/{lang}.csv').drop(columns=['text'])
    lang_mtds[lang] = dev_frame
    lang_targets[lang] = [col for col in dev_frame.columns if col not in ('id', 'text')]

    # QWEN embeddings are stored in fp16; upcast to fp32 before building the tensor.
    dev_emb_np = np.load(f'./embeddings/qwen/dev_c_{lang}.npy').astype(np.float32)
    dev_emb = torch.tensor(dev_emb_np).to(device)
    row_norms = dev_emb.norm(p=2, dim=1, keepdim=True)
    lang_embeddings[lang] = dev_emb / row_norms

In [3]:
# For every in-distribution language: train (or load from cache) one linear probe
# per emotion, report the validation F1 of the selected probe, and write the
# predictions for the OOD dev sets (track C) and, when embeddings are available,
# for the same-language dev set (track A).
for lang in ID_LANGS:  # train lp on this lang only, make predictions
    print(lang)

    path = f'./classifiers/{model_path_name}/{lang}/'
    output_path = f'./results/{model_path_name}/{lang}/'
    os.makedirs(output_path, exist_ok=True)
    predictions_path_c = f'./predictions/lp/qwen/{lang}/track_c/'
    os.makedirs(predictions_path_c, exist_ok=True)
    predictions_path_a = f'./predictions/lp/qwen/{lang}/track_a/'
    os.makedirs(predictions_path_a, exist_ok=True)

    mtd = pd.read_csv(f'./data/track_a/train/{lang}.csv')
    src_targets = [c for c in mtd.columns if c not in ['id', 'text']]
    target_labels = {c: mtd[c].to_numpy() for c in src_targets}

    ## mask targets for each lang
    for ood_lang in OOD_LANGS:
        for c in lang_targets[ood_lang]:
            if c not in src_targets:  # can't make predictions for this emotion; place zeros
                lang_mtds[ood_lang][c] = 0

    # NOTE(review): the split is looked up under the 'lealla-large' key although the
    # embeddings are QWEN -- presumably one split is shared across encoders; confirm.
    train_indices, val_indices = utils.load_split_indices('lealla-large', lang)
    train_labels = {c: target_labels[c][train_indices] for c in src_targets}
    val_labels = {c: target_labels[c][val_indices] for c in src_targets}

    # label -1 means there is no label for that sample for emotion c
    train_masks = {c: labels > -0.5 for c, labels in train_labels.items()}
    val_masks = {c: labels > -0.5 for c, labels in val_labels.items()}

    # Datasets are created without embeddings; they are attached per emotion
    # right before training (only when no cached classifier exists).
    train_datasets = {
        c: datasets_local.EmbeddingsDataset(None, train_labels[c][train_masks[c]]) for c in src_targets
    }
    val_datasets = {
        c: datasets_local.EmbeddingsDataset(None, val_labels[c][val_masks[c]]) for c in src_targets
    }

    path_layer = path + 'emb/'
    os.makedirs(path_layer, exist_ok=True)

    embeddings = np.load(f'./embeddings/qwen/train_{lang}.npy').astype(np.float32)  ## QWEN embs are in fp16
    train_embeddings = torch.tensor(embeddings[train_indices]).to(device)
    train_embeddings /= train_embeddings.norm(p=2, dim=1, keepdim=True)
    val_embeddings = torch.tensor(embeddings[val_indices]).to(device)
    val_embeddings /= val_embeddings.norm(p=2, dim=1, keepdim=True)

    # Same-language (track A) dev predictions are only produced when dev embeddings exist.
    dev_emb_path = f'./embeddings/qwen/dev_a_{lang}.npy'
    if os.path.exists(dev_emb_path):
        dev_embeddings = np.load(dev_emb_path).astype(np.float32)
        dev_embeddings = torch.tensor(dev_embeddings).to(device)
        dev_embeddings /= dev_embeddings.norm(p=2, dim=1, keepdim=True)
        same_lang_mtd = pd.read_csv(f'./data/track_a/dev/{lang}.csv').drop(columns=['text'])
    else:
        dev_embeddings = None
        same_lang_mtd = None

    same_lang_f1s = []
    for c in src_targets:
        st_path = path_layer + f'model_{c}.pt'
        if os.path.exists(st_path):
            # Cached classifier: reuse it together with its stored scores.
            # NOTE(review): the recorded output shows a warning raised from this
            # line -- presumably torch's FutureWarning about `weights_only`; confirm
            # before switching the flag, since the checkpoint also stores F1 values.
            best_net_st = torch.load(st_path, map_location='cpu')
            f1s = best_net_st['f1s']
            best_f1 = best_net_st['f1']  ## := max(f1s)
        else:
            train_datasets[c].embeddings = train_embeddings[train_masks[c]]
            val_datasets[c].embeddings = val_embeddings[val_masks[c]]
            f1s = []
            best_f1 = None
            best_net_st = None
            for seed in SEEDS:
                net_st, val_f1 = utils.train_lp_balanced_class_loss(
                    device, train_datasets[c], val_datasets[c], train_kwargs, seed, use_tqdm=False
                )
                f1s.append(val_f1)
                # Always accept the first seed so that a run in which every seed
                # scores F1 == 0 still yields a classifier; afterwards only a
                # strictly better seed replaces it (earliest seed wins ties).
                if best_net_st is None or val_f1 > best_f1:
                    best_f1 = val_f1
                    best_net_st = net_st

            best_net_st['f1'] = best_f1
            best_net_st['f1s'] = f1s
            torch.save(best_net_st, st_path)  # best classifier for this emotion
        # Report the F1 of the classifier that is actually used (the best seed),
        # identically for freshly trained and cached classifiers.
        same_lang_f1s.append(best_f1)

        ## make predictions on the OOD langs and save them to a folder
        weight, bias = best_net_st['weight'].to(device), best_net_st['bias'].to(device)
        for ood_lang in OOD_LANGS:
            if c in lang_targets[ood_lang]:
                scores_ = lang_embeddings[ood_lang] @ weight.T + bias
                predictions_ = utils.get_predictions_bce(scores_).cpu().numpy()
                lang_mtds[ood_lang][c] = predictions_
        ## id predictions for track a
        if dev_embeddings is not None:
            scores_ = dev_embeddings @ weight.T + bias
            predictions_ = utils.get_predictions_bce(scores_).cpu().numpy()
            same_lang_mtd[c] = predictions_

    # save mtd with the predictions
    for ood_lang in OOD_LANGS:
        lang_mtds[ood_lang].to_csv(predictions_path_c + f'pred_{ood_lang}.csv', index=False)

    if dev_embeddings is not None:
        same_lang_mtd.to_csv(predictions_path_a + f'pred_{lang}.csv', index=False)

    print('MACRO F1 id', np.mean(same_lang_f1s))
    print(same_lang_f1s)
    

eng


  best_net_st = torch.load(st_path, map_location='cpu')


MACRO F1 id 0.46012590297532263
[0.208, 0.7060810810810811, 0.4205607476635514, 0.462882096069869, 0.5031055900621118]
deu


  best_net_st = torch.load(st_path, map_location='cpu')


MACRO F1 id 0.40440231731396586
[0.6044776119402986, 0.5403508771929825, 0.20833333333333334, 0.44976076555023925, 0.4219409282700422, 0.20155038759689922]
esp


  best_net_st = torch.load(st_path, map_location='cpu')


MACRO F1 id 0.555840975566009
[0.580246913580247, 0.6979166666666667, 0.4827586206896552, 0.6305418719211824, 0.47619047619047616, 0.46739130434782605]
deu-esp-eng
MACRO F1 id 0.5203235635616007
[0.5271317829457364, 0.5776031434184675, 0.6437054631828978, 0.47965116279069775, 0.4432432432432432, 0.4506065857885615]


  best_net_st = torch.load(st_path, map_location='cpu')


In [None]:
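######################
# LogisticRegression #
######################
# The cells below repeat the per-emotion train/predict pipeline with
# scikit-learn's LogisticRegression; predictions go to ./predictions/lreg/.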

In [4]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import os
import utils
import datasets_local
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Configuration for the LogisticRegression section.
# Embedding width per encoder, keyed by the lower-cased model id.
EMB_DIM = {'lealla-large': 256, 'qwen2.5-7b': 3584}
# Only SEEDS[0] is used below, as the `random_state` of LogisticRegression.
SEEDS = [1007, 1013, 1019]
model_name = "unsloth/QWEN2.5-7B"
# Last path component of the model id, lower-cased -> 'qwen2.5-7b'.
model_path_name = model_name.rsplit('/', 1)[-1].lower()
device = torch.device('cuda')

In [5]:
# Training (in-distribution) languages and prediction-only (out-of-distribution) languages.
ID_LANGS = ['eng', 'deu', 'esp', 'deu-esp-eng']
OOD_LANGS = ['ron', 'ukr', 'hin']

# Per-OOD-language lookups, filled in by the loop below:
#   lang_mtds       -> dev DataFrame without the 'text' column
#   lang_targets    -> list of label columns (everything except 'id'/'text')
#   lang_embeddings -> row-wise L2-normalised float32 numpy embeddings
lang_mtds = dict.fromkeys(OOD_LANGS)
lang_targets = dict.fromkeys(OOD_LANGS)
lang_embeddings = dict.fromkeys(OOD_LANGS)

for lang in OOD_LANGS:
    dev_frame = pd.read_csv(f'./data/track_c/dev/{lang}.csv').drop(columns=['text'])
    lang_mtds[lang] = dev_frame
    lang_targets[lang] = [col for col in dev_frame.columns if col not in ('id', 'text')]

    # QWEN embeddings are stored in fp16; upcast to fp32 before normalising.
    dev_emb = np.load(f'./embeddings/qwen/dev_c_{lang}.npy').astype(np.float32)
    row_norms = np.sqrt(np.square(dev_emb).sum(axis=1, keepdims=True))
    lang_embeddings[lang] = dev_emb / row_norms

In [9]:
# Inverse regularisation strengths tried for every emotion; the value with the
# highest validation F1 is kept.
C_GRID = [1e-2, 1e-1, 1, 1e1, 1e2]

# For every in-distribution language: fit one LogisticRegression per emotion,
# report the validation F1 of the selected model, and write the predictions for
# the OOD dev sets (track C) and, when embeddings are available, for the
# same-language dev set (track A).
for lang in ID_LANGS:  # train on this lang only, make predictions
    print(lang)

    predictions_path_c = f'./predictions/lreg/qwen/{lang}/track_c/'
    os.makedirs(predictions_path_c, exist_ok=True)
    predictions_path_a = f'./predictions/lreg/qwen/{lang}/track_a/'
    os.makedirs(predictions_path_a, exist_ok=True)

    mtd = pd.read_csv(f'./data/track_a/train/{lang}.csv')
    src_targets = [c for c in mtd.columns if c not in ['id', 'text']]
    target_labels = {c: mtd[c].to_numpy() for c in src_targets}

    ## mask targets for each lang
    for ood_lang in OOD_LANGS:
        for c in lang_targets[ood_lang]:
            if c not in src_targets:  # can't make predictions for this emotion; place zeros
                lang_mtds[ood_lang][c] = 0

    # NOTE(review): the split is looked up under the 'lealla-large' key although the
    # embeddings are QWEN -- presumably one split is shared across encoders; confirm.
    train_indices, val_indices = utils.load_split_indices('lealla-large', lang)
    train_labels = {c: target_labels[c][train_indices] for c in src_targets}
    val_labels = {c: target_labels[c][val_indices] for c in src_targets}

    # label -1 means there is no label for that sample for emotion c
    train_masks = {c: labels > -0.5 for c, labels in train_labels.items()}
    val_masks = {c: labels > -0.5 for c, labels in val_labels.items()}

    embeddings = np.load(f'./embeddings/qwen/train_{lang}.npy').astype(np.float32)  ## QWEN embs are in fp16
    # Row-wise L2 normalisation of the train / validation embeddings.
    train_embeddings = embeddings[train_indices]
    train_embeddings /= np.sqrt(np.sum(np.square(train_embeddings), axis=1, keepdims=True))
    val_embeddings = embeddings[val_indices]
    val_embeddings /= np.sqrt(np.sum(np.square(val_embeddings), axis=1, keepdims=True))

    # Same-language (track A) dev predictions are only produced when dev embeddings exist.
    dev_emb_path = f'./embeddings/qwen/dev_a_{lang}.npy'
    if os.path.exists(dev_emb_path):
        dev_embeddings = np.load(dev_emb_path).astype(np.float32)
        dev_embeddings /= np.sqrt(np.sum(np.square(dev_embeddings), axis=1, keepdims=True))
        same_lang_mtd = pd.read_csv(f'./data/track_a/dev/{lang}.csv').drop(columns=['text'])
    else:
        dev_embeddings = None
        same_lang_mtd = None

    same_lang_f1s = []
    for c in src_targets:
        best_f1 = 0
        best_lreg = None
        # Keep only the samples that actually carry a label for this emotion.
        train_c_embeddings = train_embeddings[train_masks[c]]
        train_c_labels = train_labels[c][train_masks[c]]
        val_c_embeddings = val_embeddings[val_masks[c]]
        val_c_labels = val_labels[c][val_masks[c]]

        for reg_c in C_GRID:
            lreg = LogisticRegression(dual=True, C=reg_c, random_state=SEEDS[0], class_weight='balanced', max_iter=100, solver='liblinear')
            lreg.fit(train_c_embeddings, train_c_labels)
            val_predictions = lreg.predict(val_c_embeddings)
            f1 = f1_score(val_c_labels, val_predictions)
            # `>=` guarantees a model is selected even when every F1 is 0;
            # on ties the later (larger) C wins.
            if f1 >= best_f1:
                best_f1 = f1
                best_lreg = lreg

        same_lang_f1s.append(best_f1)

        ## make predictions on the OOD langs and save them to a folder
        for ood_lang in OOD_LANGS:
            if c in lang_targets[ood_lang]:
                predictions_ = best_lreg.predict(lang_embeddings[ood_lang])
                lang_mtds[ood_lang][c] = predictions_
        ## id predictions for track a
        if dev_embeddings is not None:
            predictions_ = best_lreg.predict(dev_embeddings)
            same_lang_mtd[c] = predictions_

    # save mtd with the predictions
    for ood_lang in OOD_LANGS:
        lang_mtds[ood_lang].to_csv(predictions_path_c + f'pred_{ood_lang}.csv', index=False)

    if dev_embeddings is not None:
        same_lang_mtd.to_csv(predictions_path_a + f'pred_{lang}.csv', index=False)

    print('MACRO F1 id', np.mean(same_lang_f1s))
    print(same_lang_f1s)
    

eng
MACRO F1 id 0.4161625301068976
[0.21097046413502107, 0.6532258064516129, 0.3169014084507042, 0.42244224422442245, 0.4772727272727273]
deu
MACRO F1 id 0.3753052585654582
[0.6131386861313869, 0.4883720930232558, 0.2033898305084746, 0.4245810055865922, 0.40740740740740744, 0.11494252873563218]
esp
MACRO F1 id 0.5313865472434555
[0.5609756097560975, 0.69, 0.4193548387096774, 0.5959595959595959, 0.47169811320754723, 0.45033112582781454]
deu-esp-eng
MACRO F1 id 0.5096822844157932
[0.5252854812398042, 0.5621181262729125, 0.6357615894039734, 0.44871794871794873, 0.43115438108484005, 0.4550561797752809]
