In [None]:
import sys

import numpy as np
import pandas as pd

from utils.split import eval_splits, get_all_combs, get_valid_tests

## Split

1. Find combinations of hoaxes that reach 70%, 20% and 10% of the data.
2. Find which combination has the topic distribution most similar to that of the original data.

The splits are the following:
+ Train_val - 80%  = Train + val
    + Train - 70%
    + Val - 10%
+ Test - 20%

We obtain the following split:
+ Test: 'SP057', 'SP015', 'SP049', 'SP047', 'SP010', 'SP014', 'SP009', 'SP027', 'SP040', 'SP020', 'SP023', 'SP008', 'SP031'
+ Val: 'SP005', 'SP065', 'SP052', 'SP055', 'SP068'
+ Train: 'SP003', 'SP013', 'SP064', 'SP054', 'SP070', 'SP017', 'SP067', 'SP043', 'SP036', 'SP048'


In [None]:
# Load the gold-standard file and the unaggregated file from disk.
st = pd.read_csv("data/stereohoax/stereoHoax-ES_goldstandard.csv")
st_soft = pd.read_csv("data/stereohoax/stereohoax_unaggregated.csv")
# Left-join on `index`, bringing in only the three `stereo_a*` columns
# from the unaggregated file; every row of `st` is kept.
st = pd.merge(
    st,
    st_soft.loc[:, ["index", "stereo_a1", "stereo_a2", "stereo_a3"]],
    how="left",
    on="index",
)

# Category columns used when comparing candidate splits.
labels = [
    "xenophobia", "suffering", "economic", "migration", "culture",
    "benefits", "health", "security", "dehumanisation", "others",
]
# Categories plus the two extra columns shown in the distribution tables.
labels_groups = [*labels, "implicit", "contextual"]
# The `stereo` column followed by the category columns.
y_columns = ["stereo", *labels]
# st = st[["index", "rh_id", "conversation_id"] + labels]

In [None]:
# Distinct values of the `rh_id` column (one identifier per hoax).
hoaxes = st["rh_id"].unique()
print(f'The number of distinct hoaxes is {st["rh_id"].nunique()}')
hoaxes

In [None]:
# Raw value counts of every column in `labels_groups` over the whole dataset.
# The message has no placeholders, so it is a plain string rather than an
# f-string; the printed text is unchanged.
print("The category distribution of the data in general is: \n")
gen_dist = st[labels_groups].apply(pd.Series.value_counts)
gen_dist

In [None]:
# Express the counts as percentages of all rows in `st`.
# The result is stored under a new name instead of rescaling `gen_dist` in
# place, so re-running this cell always yields the same table.
gen_dist_pct = gen_dist * (100 / len(st))
gen_dist_pct.round(2)

1. Find combinations of hoaxes that reach 20% of the data.
2. Find which of these combinations has the most similar topic distribution.

To avoid enumerating over 4 million combinations, we exclude the 6 hoaxes with the fewest tweets (14 tweets in total) from this search.
These 14 tweets are added back at the end.

In [None]:
# Number of rows per hoax, sorted from the smallest hoax to the largest.
file_sz = (
    st.groupby("rh_id")
    .size()
    .sort_values()
)
# Show the 6 smallest hoaxes separately from the remaining ones.
file_sz.iloc[:6], file_sz.iloc[6:]

In [None]:
# Candidate test sets built from every hoax except the 6 smallest ones.
# NOTE(review): `eps` is presumably the tolerance around `test_ratio` —
# confirm against utils.split.get_all_combs.
valid_splits = get_all_combs(
    file_sz.iloc[6:],
    test_ratio=0.2,
    eps=0.01,
)

In [None]:
# Keep only the first element of each candidate (the hoax combination) and
# score every combination on the category columns.
valid_splits2 = [candidate[0] for candidate in valid_splits]
res = eval_splits(
    st,
    valid_splits2,
    "rh_id",
    labels,
)

Sort results by MSE and MAPE

In [None]:
# Five candidates with the smallest MSE.
res.sort_values("MSE").head(5)

In [None]:
# Five candidates with the smallest MAPE.
res.sort_values("MAPE").head(5)

We keep the split with the lowest MAPE.

In [None]:
# NOTE(review): 530 is a hard-coded position in `valid_splits`; presumably it
# is the index of the lowest-MAPE row of `res` — confirm it is still the
# right one whenever the data or the candidate enumeration changes.
fs, n_samples = valid_splits[530]
print(fs)
# Share of the whole dataset covered by the chosen test hoaxes.
test_share = n_samples / len(st) * 100
print(f"Test percentage= {test_share:.2f}")

In [None]:
# Rows whose hoax belongs to the chosen combination form the test set;
# all remaining rows form train_val.
test_split = list(fs)
test = st.loc[st["rh_id"].isin(test_split)]
train_val = st.loc[~st["rh_id"].isin(test_split)]

### Validation split

In [None]:
# Rows per hoax within train_val only, smallest first.
file_sz = (
    train_val.groupby("rh_id")
    .size()
    .sort_values()
)
# 0.125 of train_val (80% of the data) corresponds to 10% of the whole data.
valid_splits_val = get_all_combs(
    file_sz,
    test_ratio=0.125,
    eps=0.01,
)

In [None]:
# Keep only the first element of each validation candidate (the hoax
# combination) and score them against train_val.
valid_splits_val2 = [candidate[0] for candidate in valid_splits_val]
res_val = eval_splits(
    train_val,
    valid_splits_val2,
    "rh_id",
    labels,
)

In [None]:
# Five validation candidates with the smallest MSE.
res_val.sort_values("MSE").head(5)

In [None]:
# Five validation candidates with the smallest MAPE.
res_val.sort_values("MAPE").head(5)

In [None]:
# NOTE(review): 380 is a hard-coded position in `valid_splits_val`; presumably
# chosen from the sorted `res_val` tables — confirm it still matches.
fs_val, n_samples = valid_splits_val[380]
print(fs_val)
# Size of the validation set relative to the whole dataset and to train_val
# (the second message says "train" but the denominator is `train_val`).
val_share_all = n_samples / len(st) * 100
val_share_train_val = n_samples / len(train_val) * 100
print(f"Val percentage of whole data = {val_share_all:.1f}")
print(f"Val percentage of train = {val_share_train_val:.1f}")

In [None]:
# Validation rows come from the chosen validation hoaxes; training rows are
# everything that is in neither the test nor the validation hoaxes.
val_split = list(fs_val)
val = st.loc[st["rh_id"].isin(val_split)]
train = st.loc[~st["rh_id"].isin(test_split + val_split)]

In [None]:
# Write every split to its own CSV file (without the DataFrame index),
# in the same order as before: train_val, test, val, train.
split_files = {
    "data/stereohoax/train_val_split.csv": train_val,
    "data/stereohoax/test_split.csv": test,
    "data/stereohoax/val_split.csv": val,
    "data/stereohoax/train_split.csv": train,
}
for out_path, split_df in split_files.items():
    split_df.to_csv(out_path, index=False)

## Analysis

In [None]:
# Hoax identifiers of the test split, written out explicitly.
fs = {
    "SP057", "SP015", "SP049", "SP047", "SP010", "SP014", "SP009",
    "SP027", "SP040", "SP020", "SP023", "SP008", "SP031",
}

In [None]:
# Hoax identifiers of the validation split, written out explicitly.
fs_val = {
    "SP005",
    "SP065",
    "SP052",
    "SP055",
    "SP068",
}

In [None]:
# Category distribution of the test split, as a percentage of its rows.
test.loc[:, labels_groups].apply(pd.Series.value_counts).div(len(test)).mul(100)

In [None]:
# Category distribution of the train split, as a percentage of its rows.
train.loc[:, labels_groups].apply(pd.Series.value_counts).div(len(train)).mul(100)

In [None]:
# Category distribution of train_val, as a percentage of its own rows.
# The counts are normalised by len(train_val); dividing by len(train) would
# inflate every percentage because train is a strict subset of train_val.
train_val[labels_groups].apply(pd.Series.value_counts) / len(train_val) * 100

In [None]:
# Category distribution of the validation split, as a percentage of its own
# rows. The counts are normalised by len(val); dividing by len(train) would
# give percentages that do not sum to 100 within the validation split.
val[labels_groups].apply(pd.Series.value_counts) / len(val) * 100

In [None]:
# Sum of the `stereo` column divided by the number of validation rows.
val["stereo"].sum() / len(val)

In [None]:
# All hoaxes held out of training (test hoaxes plus validation hoaxes).
fs | fs_val

In [None]:
# Per-hoax share of the whole dataset, reported split by split.
file_sz = st.groupby("rh_id").size().sort_values()
fs_test = fs
# train_val is every hoax outside the test set; train additionally drops the
# validation hoaxes. (Previously `fs_train_val` was immediately overwritten
# with the train-only set, so its name did not match its content.)
fs_train_val = set(file_sz.keys()) - fs_test
fs_train = fs_train_val - fs_val

# Running total of rows over the three splits; it should equal file_sz.sum().
a = 0
for f_list in (fs_train, fs_val, fs_test):
    ds = file_sz[list(f_list)].copy()
    # Take the split total BEFORE appending the "sum" row; summing afterwards
    # would count every row of the split twice.
    split_total = ds.sum()
    a += split_total
    ds["sum"] = split_total
    # Convert row counts to percentages of the whole dataset.
    ds = ds / file_sz.sum() * 100
    print(list(ds.round(2).items()))
    print(ds.round(2))

print("final")
print(a, file_sz.sum())