In [None]:
import itertools
import os
import sys

import networkx as nx
import numpy as np
import pandas as pd

from utils.split import eval_splits, get_all_combs, get_valid_tests

1. Find combinations of comments that make up 85% and 15% of the train data ($\approx$ 60% and 10% of the full dataset).
2. Find which combination has the topic distribution most similar to that of the original train data.

In [None]:
# Load the training CSV; every missing cell is replaced by an empty string.
train_val_path = "data/detests/train_with_disagreement_context_soft.csv"
train_val = pd.read_csv(train_val_path).fillna("")
# train_val_no_fill = pd.read_csv("../data/detests/train_with_disagreement_context_no_fill_soft.csv").fillna("")
# Category columns of train_val whose distributions are compared between splits.
labels = [
    "xenophobia",
    "suffering",
    "economic",
    "migration",
    "culture",
    "benefits",
    "health",
    "security",
    "dehumanisation",
    "others",
]
# Same categories plus the "implicit" column, used when displaying distributions.
labels_groups = [*labels, "implicit"]
# "stereo" column followed by the per-category columns.
y_columns = ["stereo", *labels]

In [None]:
# Plain string: the message has no placeholders, so the f-prefix was superfluous.
print("The category distribution of the data in general is: \n")
# Count, for every category column, how many rows carry each distinct value.
gen_dist = train_val[labels_groups].apply(pd.Series.value_counts)
gen_dist

In [None]:
# Express the value counts as percentages of all rows.
# Recomputed from train_val instead of scaling gen_dist in place, so that
# re-running this cell does not rescale values that are already percentages.
gen_dist = train_val[labels_groups].apply(pd.Series.value_counts) * (100 / len(train_val))
gen_dist.round(2)

# Split with comment_id and threads

In [None]:
# Report how many different comment ids appear in the data.
n_comments = train_val["comment_id"].nunique()
print(f"The number of distinct comments is {n_comments}")

In [None]:
# Keep one (comment_id, reply_to) row per comment: the last occurrence in train_val.
comments = train_val[["comment_id", "reply_to"]].groupby("comment_id").tail(1)
# Undirected reply graph: every row contributes one edge comment_id -- reply_to.
G = nx.Graph()
G.add_edges_from(comments.to_numpy().tolist())
# Each connected component of the reply graph is treated as one thread.
# NOTE(review): train_val is loaded with .fillna(""), so any comment whose reply_to
# was missing gets an edge to the shared node "" and all such comments would end up
# in a single component — verify that reply_to is never empty in this file.
comps = list(nx.connected_components(G))
print(f"The number of distinct threads is {len(comps)}")
# Peek at the first three components only.
comps[:3]

In [None]:
# Number of nodes in each connected component (thread).
le = np.array(list(map(len, comps)))

In [None]:
# How many components consist of a single node.
np.sum(le == 1)

In [None]:
# Map every node of the reply graph to the position of its component in `comps`.
# Built once so that each lookup is O(1) instead of scanning all components per row.
thread_index = {node: idx for idx, component in enumerate(comps) for node in component}


def number_comp(comment_id):
    """Return the position in `comps` of the component that contains `comment_id`.

    Connected components are disjoint, so the position is unique.

    Raises:
        KeyError: if `comment_id` is not a node of any component (the previous
            linear scan ran past the end of `comps` with an IndexError instead).
    """
    return thread_index[comment_id]


# Tag every row with the index of the thread its comment belongs to.
train_val["thread"] = train_val["comment_id"].apply(number_comp)

1. Find combinations of threads that reach 15% of the data.
2. Find which of these combinations has the topic distribution most similar to that of the original train data.

+ Problem: there are too many possible combinations of threads.
+ Approach: group the threads into batches and combine whole batches instead.

In [None]:
def batch_files_sz(df, column, batch_size=10):
    """Shuffle the group sizes of `df[column]` and split them into batches.

    Args:
        df: DataFrame to group.
        column: name of the column whose distinct values define the groups.
        batch_size: maximum number of groups per batch (must be >= 1).

    Returns:
        A list of pandas Series, one per batch. Each Series is indexed by the
        group key and holds the number of rows of that group. The batches are
        as even as possible (earlier batches get the extra element) and the
        shuffle is fixed by ``random_state=42``. An empty `df` yields ``[]``.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    group_sizes = df.groupby(column).size()
    if group_sizes.empty:
        # np.array_split rejects 0 sections; there is simply nothing to batch.
        return []

    shuffled = group_sizes.sample(frac=1, random_state=42)
    n_batches = (len(shuffled) - 1) // batch_size + 1  # ceil(len / batch_size)

    # Split the *positions* with NumPy and slice the Series with .iloc, rather than
    # handing the Series itself to np.array_split (which goes through the deprecated
    # pandas `swapaxes` method). The partition boundaries are the same.
    position_chunks = np.array_split(np.arange(len(shuffled)), n_batches)
    return [shuffled.iloc[chunk] for chunk in position_chunks]


# Group the threads into shuffled batches of at most this many threads each.
threads_per_batch = 35
batches = batch_files_sz(train_val, "thread", batch_size=threads_per_batch)
# Total number of rows covered by each batch.
file_sz = pd.Series([thread_sizes.sum() for thread_sizes in batches])
# Thread ids contained in each batch, in the same order as file_sz.
keys = [thread_sizes.index.tolist() for thread_sizes in batches]

In [None]:
# NOTE(review): get_all_combs is a project helper; presumably it returns the batch
# combinations whose total size is within eps of test_ratio — confirm.
valid_splits = get_all_combs(file_sz, test_ratio=0.15, eps=0.01)
# The first element of every split holds batch positions; expand them into the
# set of thread ids contained in those batches.
valid_splits2 = [
    set(itertools.chain.from_iterable(keys[batch_pos] for batch_pos in split[0]))
    for split in valid_splits
]

In [None]:
# Score every candidate set of thread ids on the category columns in `labels`.
# NOTE(review): eval_splits is a project helper; the cells below read "MSE" and
# "MAPE" columns from its result — confirm what else it returns.
res = eval_splits(train_val, valid_splits2, "thread", labels)

# Split with news

In [None]:
# Report how many different news items (file ids) appear in the data.
n_news = train_val["file_id"].nunique()
print(f"The number of distinct news is {n_news}")

In [None]:
# Number of rows per news item, smallest first.
# Note: this reuses the name file_sz, replacing the per-batch thread sizes
# computed in the thread section above.
file_sz = train_val.groupby("file_id").size().sort_values()
file_sz

In [None]:
# Candidate news combinations for the validation set.
# Note: eps is 0.1 here, whereas the thread section calls get_all_combs with eps=0.01.
# This overwrites the valid_splits computed for threads.
valid_splits = get_all_combs(file_sz, test_ratio=0.15, eps=0.1)
valid_splits

In [None]:
# Keep only the first element of every split (the file ids, given how it is passed
# to eval_splits with "file_id" below) and score each candidate on `labels`.
# Note: valid_splits2 and res from the thread section are overwritten here, so the
# cells below operate on the news-based results when the notebook is run top to bottom.
valid_splits2 = [split[0] for split in valid_splits]
res = eval_splits(train_val, valid_splits2, "file_id", labels)

# Check results

Sort results by MSE and MAPE

In [None]:
# Five candidate splits with the smallest MSE.
res_by_mse = res.sort_values(by="MSE")
res_by_mse.head()

In [None]:
# Five candidate splits with the smallest MAPE.
res_by_mape = res.sort_values(by="MAPE")
res_by_mape.head()

We keep the split with the lowest MAPE.

In [None]:
# Column the split is based on; it must match the section whose valid_splits2 is in memory.
field = "file_id"
# field = "thread"
# NOTE(review): this takes the first candidate in valid_splits2, which is not tied to
# the MSE/MAPE ranking shown above — confirm that index 0 is the intended split.
val_split = valid_splits2[0]
print(val_split)
# Rows whose `field` value belongs to the chosen split form the validation set;
# all remaining rows form the training set.
in_val = train_val[field].isin(val_split)
val = train_val[in_val]
train = train_val[~in_val]

# val_no_fill = train_val_no_fill[train_val[field].isin(val_split)]
# train_no_fill = train_val_no_fill[~train_val[field].isin(val_split)]

In [None]:
# Persist both partitions as CSV files, without the DataFrame index column.
train.to_csv("data/detests/train_split_context_soft.csv", index=False)
val.to_csv("data/detests/val_split_context_soft.csv", index=False)

# train_no_fill.to_csv("data/detests/train_split_context_no_fill_soft.csv", index=False)
# val_no_fill.to_csv("data/detests/val_split_context_no_fill_soft.csv", index=False)

In [None]:
# % of train_val (original TRAIN dataset)
perc_train_val = np.array([len(train), len(val)])
perc_train_val = perc_train_val / len(train_val) * 100
perc_train_val

In [None]:
# % of whole dataset
perc_train_val * 0.70

In [None]:
# Category distribution of the training partition, as percentages of its rows.
train[labels_groups].apply(pd.Series.value_counts).div(len(train)).mul(100)

In [None]:
# Category distribution of the validation partition, as percentages of its rows.
val[labels_groups].apply(pd.Series.value_counts).div(len(val)).mul(100)