In [2]:
# import relevant packages

import pandas as pd
import os
import random
import numpy as np
import glob as glob

from ast import literal_eval

pd.set_option('display.max_colwidth', None)

## Load original train datasets that active learner was applied to

In [3]:
df_dict = dict()

PATH = "../0_data/main/1_clean"
    
for dataset in os.listdir(PATH):
    for f in glob.glob(f"{PATH}/{dataset}/train*.csv"):
        if "dyn21" not in f and "ipynb" not in f:
            print(dataset[:8])
            df_dict[dataset[:8]] = pd.read_csv(f)

has21_hi
has20_hi
for19_pt
ken20_en
fou18_en
ous19_fr
has19_hi
bas19_es
ous19_ar
san20_it


## Merge train datasets with prediction logits

In [4]:
PATH = "../0_data/main/2_active_learning"
AL_MODEL = "xlmt_dyn21_en_20000_rs1"

for dataset in os.listdir(PATH):
    print(dataset[:8])
    df_dict[dataset[:8]] = df_dict[dataset[:8]].merge(pd.read_csv(f"{PATH}/{dataset}/{AL_MODEL}.csv")[["prediction", "logits"]], left_index=True, right_index=True)

ous19_ar
for19_pt
san20_it
ous19_fr
bas19_es


## Create columns for selection

In [7]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

for dataset in df_dict:
    if "logits" in df_dict[dataset].columns:
        df_dict[dataset]["softmax_scores"] = df_dict[dataset].logits.apply(lambda x: softmax(literal_eval(x)))
        df_dict[dataset]["softmax_diff"] = df_dict[dataset].softmax_scores.apply(lambda x: abs(x[0]-x[1]))
        continue

## Select based on difference in softmax scores across classes

In [None]:
# create differently-sized train portions from rest of data

N_RANGE = [10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 1000, 2000]

for dataset in df_dict:
    print(dataset.upper())
    for n in N_RANGE:
        print(f"  saving n = {n} training set (selected by active learning)")
        if "softmax_diff" in df_dict[dataset].columns:
            export_dict = df_dict[dataset].sort_values("softmax_diff")[["text", "label"]][:n]
        for file in glob.glob(f"../0_data/main/2_active_learning/{dataset}*"):
            export_dict.to_csv(f"{file}/train/train_{n}_al.csv",index=False)
 
    print()

HAS21_HI
  saving n = 10 training set (selected by active learning)
  saving n = 20 training set (selected by active learning)
  saving n = 30 training set (selected by active learning)
  saving n = 40 training set (selected by active learning)
  saving n = 50 training set (selected by active learning)
  saving n = 100 training set (selected by active learning)
  saving n = 200 training set (selected by active learning)
  saving n = 300 training set (selected by active learning)
  saving n = 400 training set (selected by active learning)
  saving n = 500 training set (selected by active learning)
  saving n = 1000 training set (selected by active learning)
  saving n = 2000 training set (selected by active learning)

HAS20_HI
  saving n = 10 training set (selected by active learning)
  saving n = 20 training set (selected by active learning)
  saving n = 30 training set (selected by active learning)
  saving n = 40 training set (selected by active learning)
  saving n = 50 training set

In [10]:
df_dict["san20_it"]

Unnamed: 0,text,label,split,prediction,logits,softmax_scores,softmax_diff
0,@user L‚Äô Italia ne ha gi√† accolti fin troppo di migranti: Macron apra come si deve le sue frontiere ai migranti e non con il contagocceüò§ü§î,0,train,1,"(-0.88551676, 0.7890035)","[0.15782244096304723, 0.8421775590369528]",0.684355
1,"La sinistra vicina ai pi√π deboli , non operai e gente comune , ma immigrati d'ogni sorta, ritiene giusto violare la legge. E a noi ci fanno chiudere le piccole aziende per cavilli burocratici. Un grazie da chi non si √® suicidato ma quasi . #Riace #MimmoLucano #Saviano http",0,train,1,"(-0.9222206, 0.8876971)","[0.14064807272758129, 0.8593519272724188]",0.718704
2,Io sono #Desiree Sono stata stuprata da un branco di #Clandestini Sono stata lasciata da loro li a morire Sono stata stuprata anche da morta Sono #DesireeMariottini non ho pi√π un futuro per colpa di un gruppo di #immigrati e sono stata violentata anche da un #magistrato italiano http,1,train,1,"(-1.284241, 1.1302452)","[0.08207470049519884, 0.9179252995048012]",0.835851
3,Un altra conquista dei #PDüëÄ dare le case popolari in mano ai #Rom .üôà Risultato : #degrado e #sopprusi Si ringrazia il sindaco #Salah ü§¶‚Äç‚ôÇÔ∏è http,1,train,1,"(-0.16130793, 0.031392984)","[0.451973296804215, 0.548026703195785]",0.096053
4,[marioafrica]: RT filippobubbico: Il benvenuto dell'Italia a #migranti. Tra oggi e domani 400 arrivi per #corridoi‚Ä¶ http,0,train,0,"(0.9738463, -1.1099527)","[0.8893185264132982, 0.11068147358670168]",0.778637
...,...,...,...,...,...,...,...
5595,I PDIOTI PER RIEPIRE LA PIAZZA INGAGGIANO TUTTI I MIGRANTI AFRICANI CLANDESTINI IN ITALIA COSI PDIOTI +CLANDESTINI 69 PERSONE ü§£ü§£ü§£üòÇüòÇüòÇüòÅüòÅüòÅ AUGURI GRANDI CAZZARI COME IL BOSS CAZZARO RENZI WM5S W NUOVO GOVERNO,1,train,1,"(-0.50333995, 0.3105694)","[0.3070580631467035, 0.6929419368532965]",0.385884
5596,@user dica alla Fedeli che sistemati questi rom altri ne arriveranno a sfruttare la pieta' e non finiremo mai e per gl'italiani niente,0,train,1,"(-2.8639174, 2.5761027)","[0.004320647049271328, 0.9956793529507286]",0.991359
5597,Gli islamici devono adeguarsi alle tradizioni popolari e religiose dell'Italia!!! Non il contrario. http,1,train,0,"(1.1118305, -1.1924204)","[0.9092284853060056, 0.09077151469399444]",0.818457
5598,@user Pensi che questo √© quello che aspetta alle donne con la venuta dell'islam. Maometto era anche pedofilo. Perch√© la sinistra non da dei medievali a loro? Sinistra di gente accecata che corre verso un burrone.,1,test,1,"(-1.2798216, 1.0581983)","[0.08802273721974115, 0.9119772627802588]",0.823955


In [11]:
softmax(literal_eval(df_dict["san20_it"].logits[0]))

array([0.15782244, 0.84217756])

In [12]:
np.asarray(df_dict["san20_it"].logits[0])

array('(-0.88551676, 0.7890035)', dtype='<U24')

In [14]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

softmax(literal_eval(df_dict["san20_it"].logits[0]))

array([0.15782244, 0.84217756])

In [12]:
# assumes we have model predictions (pred_label) and uncertainty (pred_score) for each entry
# could also do ¬ßcross-entropy for uncertainty
# we only use the train set
# the test set remains completely held-out

In [15]:
# create dummy column for uncertainty while waiting for real results
for dataset in df_dict:
    df_dict[dataset]["pred_score"] = df_dict[dataset].label.apply(lambda x: random.uniform(0,1))

In [16]:
# select top-n entries based on active learning
# this is deterministic, so no need for multiple random seeds

N_RANGE = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000]

for dataset in df_dict:
    print(dataset)
    df_dict[dataset].sort_values(by="pred_score", inplace=True)
    for n in N_RANGE:
        if n<len(df_dict[dataset]):
            print(f"  saving n = {n} training set")
    print()

has21_hi
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

has20_hi
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

for19_pt
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

ken20_en
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set
  sav