# Data cleaning pipeline

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# ========= Helpers =========
def count_multilabel_rows(df: pd.DataFrame, label_cols):
    """Count rows carrying several active labels and rows carrying exactly one.

    Args:
        df: Frame holding the columns named in ``label_cols``.
        label_cols: Names of the columns whose values are added up per row.

    Returns:
        Tuple ``(n_multi, n_single)`` of plain ints: how many rows have a
        per-row sum greater than 1, and how many have a sum equal to 1.
    """
    per_row_total = df[label_cols].sum(axis=1)
    n_multi = per_row_total.gt(1).sum()
    n_single = per_row_total.eq(1).sum()
    return int(n_multi), int(n_single)

def to_monolabel(df: pd.DataFrame, label_cols):
    """Return a copy of the rows whose label columns add up to exactly 1.

    Args:
        df: Frame holding the columns named in ``label_cols``.
        label_cols: Names of the columns summed per row to decide membership.

    Returns:
        An independent copy of the selected rows; the original index labels
        are kept and ``df`` itself is not modified.
    """
    exactly_one = df[label_cols].sum(axis=1).eq(1)
    single_label_rows = df.loc[exactly_one]
    return single_label_rows.copy()

def reformat_emit_to_istat(df: pd.DataFrame):
    """Convert a one-column-per-label frame into a two-column (label, text) frame.

    Every column other than ``id`` and ``text`` is treated as a label column.
    For each input row, the first label column (in column order) whose value
    equals 1 becomes that row's label; rows with no such column are dropped.

    Args:
        df: Frame with a ``text`` column and any number of label columns.
            The input is not modified.

    Returns:
        A new frame with columns ``["label", "text"]`` and a fresh 0..n-1
        index, with ``text`` cast to ``str`` and stripped of surrounding
        whitespace. The two columns are present even when no row qualifies
        (the previous implementation raised ``KeyError`` in that case).
    """
    label_cols = [c for c in df.columns if c not in ["id", "text"]]

    # Compute the "== 1" test once for the whole frame instead of per cell.
    active = df[label_cols].eq(1).to_numpy()

    rows = []
    for text, row_flags in zip(df["text"], active):
        for lab, is_on in zip(label_cols, row_flags):
            if is_on:
                rows.append({"label": lab, "text": text})
                break  # only the first active label is kept

    # Naming the columns explicitly keeps the schema stable for an empty result.
    out = pd.DataFrame(rows, columns=["label", "text"])
    out["text"] = out["text"].astype(str).str.strip()
    return out

def load_istat_tsv(path: str):
    """Read a header-less, tab-separated file into a (label, text) frame.

    Args:
        path: Location of the TSV file; its two columns are named ``label``
            and ``text`` in that order.

    Returns:
        Frame with columns ``["label", "text"]`` where ``text`` has been cast
        to ``str`` and stripped of leading/trailing whitespace.
    """
    frame = pd.read_csv(path, sep="\t", header=None, names=["label", "text"])
    cleaned_text = frame["text"].astype(str).str.strip()
    return frame.assign(text=cleaned_text)


## EMit: quick stats on multilabel vs single-label

In [4]:
# Load the two EMit CSVs; both frames are reused by later cells.
emit_train = pd.read_csv("/content/emit_train_A.csv")
emit_test  = pd.read_csv("/content/emit_test.csv")

# Every column other than the id and the text is treated as a label column.
label_cols_train = [col for col in emit_train.columns if col not in ("id", "text")]
label_cols_test  = [col for col in emit_test.columns  if col not in ("id", "text")]

# Each helper call returns (rows with >1 active label, rows with exactly 1).
train_counts = count_multilabel_rows(emit_train, label_cols_train)
test_counts  = count_multilabel_rows(emit_test,  label_cols_test)
tr_multi, tr_single = train_counts
te_multi, te_single = test_counts

print(f"EMit train:  multilabel={tr_multi}, singlelabel={tr_single}")
print(f"EMit test:   multilabel={te_multi}, singlelabel={te_single}")


EMit train:  multilabel=1175, singlelabel=4791
EMit test:   multilabel=865, singlelabel=135
train-all shape: (1854, 2)
train-mix shape: (677, 2)
train-gbv shape: (366, 2)
valid-all shape: (327, 2)
valid-mix shape: (120, 2)
valid-gbv shape: (64, 2)
test-all shape: (546, 2)
test-mix shape: (200, 2)
test-gbv shape: (108, 2)

Intersections for split=train:
 all ∩ mix = 365
 all ∩ gbv = 248
 mix ∩ gbv = 285
 all ∩ mix ∩ gbv = 194

Intersections for split=valid:
 all ∩ mix = 13
 all ∩ gbv = 9
 mix ∩ gbv = 24
 all ∩ mix ∩ gbv = 3

Intersections for split=test:
 all ∩ mix = 46
 all ∩ gbv = 0
 mix ∩ gbv = 49
 all ∩ mix ∩ gbv = 0



## ISTAT: sizes + intersections

In [9]:
splits   = ["train", "valid", "test"]
variants = ["all", "mix", "gbv"]
istat_sets = {}

# Pass 1: load every split/variant file once, report its shape and remember
# the set of distinct texts it contains.
for split in splits:
    for var in variants:
        df = load_istat_tsv(f"/content/{split}-emotion-{var}.tsv")
        print(f"{split}-{var} shape:", df.shape)
        istat_sets[(split, var)] = set(df["text"])

# Pass 2: for each split, count the texts shared between the variants.
for split in splits:
    s_all, s_mix, s_gbv = (istat_sets[(split, v)] for v in ("all", "mix", "gbv"))
    in_all_and_mix = s_all.intersection(s_mix)
    print(f"\nIntersections for split={split}:")
    print(" all ∩ mix =", len(in_all_and_mix))
    print(" all ∩ gbv =", len(s_all.intersection(s_gbv)))
    print(" mix ∩ gbv =", len(s_mix.intersection(s_gbv)))
    print(" all ∩ mix ∩ gbv =", len(in_all_and_mix.intersection(s_gbv)))


train-all shape: (1854, 2)
train-mix shape: (677, 2)
train-gbv shape: (366, 2)
valid-all shape: (327, 2)
valid-mix shape: (120, 2)
valid-gbv shape: (64, 2)
test-all shape: (546, 2)
test-mix shape: (200, 2)
test-gbv shape: (108, 2)

Intersections for split=train:
 all ∩ mix = 365
 all ∩ gbv = 248
 mix ∩ gbv = 285
 all ∩ mix ∩ gbv = 194

Intersections for split=valid:
 all ∩ mix = 13
 all ∩ gbv = 9
 mix ∩ gbv = 24
 all ∩ mix ∩ gbv = 3

Intersections for split=test:
 all ∩ mix = 46
 all ∩ gbv = 0
 mix ∩ gbv = 49
 all ∩ mix ∩ gbv = 0


## Build general set and clean conflicts; save CSVs

In [5]:
def _labels_by_text(frame):
    """Map each text in `frame` to the set of labels attached to it (one pass)."""
    mapping = {}
    for label, text in zip(frame["label"], frame["text"]):
        mapping.setdefault(text, set()).add(label)
    return mapping


# For every split: merge the "all" and "mix" variants into a general set,
# remove whatever overlaps with the "gbv" variant, and write both to CSV.
for split in splits:
    df_all = load_istat_tsv(f"/content/{split}-emotion-all.tsv")
    df_mix = load_istat_tsv(f"/content/{split}-emotion-mix.tsv")
    df_gbv = load_istat_tsv(f"/content/{split}-emotion-gbv.tsv")

    # concat all+mix and drop exact dupes
    df_gen = pd.concat([df_all, df_mix], ignore_index=True)
    df_gen = df_gen.drop_duplicates(subset=["label", "text"])
    print(f"\n[{split}] after all+mix dedup:", df_gen.shape)

    # remove items that are exactly in gbv (same label+text).
    # Pairs are built column-wise with zip rather than row-wise apply(tuple),
    # which is slow and returns a DataFrame (not a Series) on an empty frame.
    gbv_pairs = set(zip(df_gbv["label"], df_gbv["text"]))
    mask = pd.Series(
        [pair in gbv_pairs for pair in zip(df_gen["label"], df_gen["text"])],
        index=df_gen.index,
        dtype=bool,
    )
    df_gen = df_gen.loc[~mask].copy()
    print(f"[{split}] after removing pairs present in gbv:", df_gen.shape)

    # remove conflicting texts (same text, different labels across gen vs gbv).
    # The text -> labels maps are built once per frame, so each shared text is
    # a dictionary lookup instead of a full re-filter of both frames.
    gen_labels = _labels_by_text(df_gen)
    gbv_labels = _labels_by_text(df_gbv)
    conflicted = {
        text for text in gen_labels.keys() & gbv_labels.keys()
        if gen_labels[text] != gbv_labels[text]
    }
    if conflicted:
        # Conflicting texts are dropped from BOTH sets, not just from gen.
        df_gen = df_gen.loc[~df_gen["text"].isin(conflicted)].copy()
        df_gbv = df_gbv.loc[~df_gbv["text"].isin(conflicted)].copy()
    print(f"[{split}] gen after removing conflicts:", df_gen.shape)
    print(f"[{split}] gbv after removing conflicts:", df_gbv.shape)

    # save — the general set goes to "*-emotion-geen.csv"; the cell that merges
    # EMit with ISTAT reads the files back under that same name.
    df_gen.to_csv(f"/content/{split}-emotion-geen.csv", index=False)
    df_gbv.to_csv(f"/content/{split}-emotion-gbv.csv", index=False)
    print(f"[{split}] saved: /content/{split}-emotion-geen.csv and /content/{split}-emotion-gbv.csv")


[train] after all+mix dedup: (2091, 2)
[train] after removing pairs present in gbv: (1752, 2)
[train] gen after removing conflicts: (1749, 2)
[train] gbv after removing conflicts: (363, 2)
[train] saved: /content/train-emotion-geen.csv and /content/train-emotion-gbv.csv

[valid] after all+mix dedup: (427, 2)
[valid] after removing pairs present in gbv: (397, 2)
[valid] gen after removing conflicts: (396, 2)
[valid] gbv after removing conflicts: (63, 2)
[valid] saved: /content/valid-emotion-geen.csv and /content/valid-emotion-gbv.csv

[test] after all+mix dedup: (697, 2)
[test] after removing pairs present in gbv: (647, 2)
[test] gen after removing conflicts: (647, 2)
[test] gbv after removing conflicts: (108, 2)
[test] saved: /content/test-emotion-geen.csv and /content/test-emotion-gbv.csv


## EMit → single-label CSVs, reformatted to the ISTAT layout, plus a 20% stratified validation split

In [6]:
def _fold_disgust_into_anger(frame):
    """Return `frame` with Disgust OR-ed into Anger and the Disgust column removed.

    Frames without a Disgust column are returned unchanged.
    """
    if "Disgust" not in frame.columns:
        return frame
    merged = frame.assign(Anger=frame["Anger"] | frame["Disgust"])
    return merged.drop(columns=["Disgust"])


# Keep only rows with exactly one active label, then merge Disgust into Anger.
# NOTE(review): the merge runs after the single-label filter, so rows that had
# both Anger and Disgust active were already dropped — confirm this is intended.
emit_train_m = _fold_disgust_into_anger(to_monolabel(emit_train, label_cols_train))
emit_test_m  = _fold_disgust_into_anger(to_monolabel(emit_test,  label_cols_test))

emit_train_m.to_csv("/content/emit_train_m.csv", index=False)
emit_test_m.to_csv("/content/emit_test_m.csv",   index=False)
print("\nSaved EMit monolabel:")
print(" /content/emit_train_m.csv")
print(" /content/emit_test_m.csv")

# Convert both frames to the two-column (label, text) layout.
df_train_fmt = reformat_emit_to_istat(emit_train_m)
df_test_fmt  = reformat_emit_to_istat(emit_test_m)

# Hold out 20% of the formatted training rows, stratified by label.
# NOTE(review): emit_train_m.csv was written before this split, so that file
# still contains the rows that end up in emit_valid_m.csv.
df_train_main, df_valid = train_test_split(
    df_train_fmt,
    test_size=0.2,
    random_state=42,
    stratify=df_train_fmt["label"],
)

df_valid.to_csv("/content/emit_valid_m.csv", index=False)
print("\nEMit formatted shapes:")
print("  train_formatted:", df_train_fmt.shape)
print("  test_formatted: ", df_test_fmt.shape)
print("  valid_from_train:", df_valid.shape)



Saved EMit monolabel:
 /content/emit_train_m.csv
 /content/emit_test_m.csv

EMit formatted shapes:
  train_formatted: (4791, 2)
  test_formatted:  (135, 2)
  valid_from_train: (959, 2)


## Merge EMit + ISTAT to create final gen splits

In [7]:
# Reload the ISTAT general sets written by the conflict-cleaning cell.
train_gen = pd.read_csv("/content/train-emotion-geen.csv")
valid_gen = pd.read_csv("/content/valid-emotion-geen.csv")
test_gen  = pd.read_csv("/content/test-emotion-geen.csv")

# Stack the EMit rows on top of the ISTAT rows for each split, then drop
# exact (label, text) repeats within that split.
dedup_keys = ["label", "text"]
train_def = (
    pd.concat([df_train_main, train_gen], ignore_index=True)
    .drop_duplicates(subset=dedup_keys)
)
valid_def = (
    pd.concat([df_valid, valid_gen], ignore_index=True)
    .drop_duplicates(subset=dedup_keys)
)
test_def = (
    pd.concat([df_test_fmt, test_gen], ignore_index=True)
    .drop_duplicates(subset=dedup_keys)
)

# Persist the merged splits under the final "gen" names.
train_def.to_csv("/content/train-emotion-gen.csv", index=False)
valid_def.to_csv("/content/valid-emotion-gen.csv", index=False)
test_def.to_csv("/content/test-emotion-gen.csv",  index=False)

print("\nFinal DEF shapes:")
print("  train-emotion-gen:", train_def.shape)
print("  valid-emotion-gen:", valid_def.shape)
print("  test-emotion-gen: ", test_def.shape)
print("\nSaved GEN CSVs in /content/.")


Final DEF shapes:
  train-emotion-gen: (5461, 2)
  valid-emotion-gen: (1347, 2)
  test-emotion-gen:  (781, 2)

Saved GEN CSVs in /content/.


## Map Italian labels → English for GEN and GBV splits

In [8]:
# Italian label -> English label. Values not listed here are left unchanged
# by Series.replace.
it2en = {
    "AMORE": "Love",
    "GIOIA": "Joy",
    "NEUTRA": "Neutral",
    "PAURA": "Fear",
    "RABBIA": "Anger",
    "SORPRESA": "Surprise",
    "TRISTEZZA": "Sadness",
}

# Rewrite each saved split in place with the translated labels.
for split in ["train", "valid", "test"]:
    for t in ["gen", "gbv"]:
        path = f"/content/{split}-emotion-{t}.csv"
        try:
            df = pd.read_csv(path)
        except FileNotFoundError:
            # Still best-effort, but say so: a silently skipped file would
            # keep its untranslated labels without anyone noticing.
            print(f"{split}-{t}: skipped, {path} not found")
            continue
        # key=str gives the same order for all-string labels and avoids a
        # TypeError if a missing (NaN) label is mixed in with the strings.
        before = sorted(df["label"].unique(), key=str)
        df["label"] = df["label"].replace(it2en)
        after = sorted(df["label"].unique(), key=str)
        df.to_csv(path, index=False)
        print(f"{split}-{t}: labels before={before}  after={after}")


train-gen: labels before=['AMORE', 'Anger', 'Anticipation', 'Fear', 'GIOIA', 'Joy', 'Love', 'NEUTRA', 'Neutral', 'PAURA', 'RABBIA', 'SORPRESA', 'Sadness', 'Surprise', 'TRISTEZZA', 'Trust']  after=['Anger', 'Anticipation', 'Fear', 'Joy', 'Love', 'Neutral', 'Sadness', 'Surprise', 'Trust']
train-gbv: labels before=['AMORE', 'GIOIA', 'NEUTRA', 'PAURA', 'RABBIA', 'SORPRESA', 'TRISTEZZA']  after=['Anger', 'Fear', 'Joy', 'Love', 'Neutral', 'Sadness', 'Surprise']
valid-gen: labels before=['AMORE', 'Anger', 'Anticipation', 'Fear', 'GIOIA', 'Joy', 'Love', 'NEUTRA', 'Neutral', 'PAURA', 'RABBIA', 'SORPRESA', 'Sadness', 'Surprise', 'TRISTEZZA', 'Trust']  after=['Anger', 'Anticipation', 'Fear', 'Joy', 'Love', 'Neutral', 'Sadness', 'Surprise', 'Trust']
valid-gbv: labels before=['AMORE', 'GIOIA', 'NEUTRA', 'PAURA', 'RABBIA', 'SORPRESA', 'TRISTEZZA']  after=['Anger', 'Fear', 'Joy', 'Love', 'Neutral', 'Sadness', 'Surprise']
test-gen: labels before=['AMORE', 'Anger', 'Anticipation', 'GIOIA', 'Joy', 'Love