In [1]:
import pandas as pd
from pathlib import Path

import altair as alt

In [2]:
dataset_dir = Path.cwd().parent / "data" / "split"

In [4]:
data = []

for path in dataset_dir.iterdir():
    if "train" in path.name or "validate" in path.name:
        dataset = path.name.split("-")[0]
        split_method = path.name.split("-")[1]
        split_id = path.name.split("-")[2][:-4]
        df = pd.read_csv(path)

        neg_count = df[df['labels'] == 0]['labels'].count()
        pos_count = df[df['labels'] == 1]['labels'].count()
        total_count = neg_count + pos_count

        data.append([dataset, split_method, split_id, "Postive", pos_count, total_count])
        data.append([dataset, split_method, split_id, "Negative", neg_count, total_count])

df = pd.DataFrame(data, columns = ["dataset", "split_method", "split_id", "label", "label_count", "total_count"])
df['Population_Percent'] = df['label_count'] / df['total_count']

df.sample(10)
        
    


Unnamed: 0,dataset,split_method,split_id,label,label_count,total_count,Population_Percent
26,lipophilicity,cluster,validate,Postive,607,630,0.963492
22,HIV,random,validate,Postive,230,6170,0.037277
34,sol_del,cluster,validate,Postive,39,169,0.230769
1,bace,cluster,train,Negative,442,1287,0.343434
20,HIV,random,train,Postive,1213,34957,0.0347
10,clintox,cluster,validate,Postive,7,221,0.031674
36,sol_del,random,train,Postive,153,958,0.159708
24,lipophilicity,cluster,train,Postive,3448,3570,0.965826
17,HIV,cluster,train,Negative,33745,34958,0.965301
30,lipophilicity,random,validate,Postive,608,630,0.965079


In [11]:
single = alt.selection_single(on='mouseover', nearest=True)

random_chart = alt.Chart(df[df['split_method'] == "random"]).mark_bar().encode(
    x = alt.X("split_id:N", title = None),
    y = alt.Y("Population_Percent:Q", title = "Population Percent"),
    column = alt.Column("dataset:N", title = "Class Balance with Random Splitting", header=alt.Header(labelFontSize=10, labelFontWeight= "bold")),
    color = "label",
    tooltip= ["dataset" ,"label", "label_count", "total_count", "Population_Percent"]
).properties(height = 150).interactive()

cluster_chart = alt.Chart(df[df['split_method'] == "cluster"]).mark_bar().encode(
    x = alt.X("split_id:N", title = None),
    y = alt.Y("Population_Percent:Q", title = None),
    column = alt.Column("dataset:N", title = "Class Balance with Clustered Splitting", header=alt.Header(labelFontSize=10, labelFontWeight= "bold")),
    color = "label",
    tooltip= ["dataset" ,"label", "label_count", "total_count", "Population_Percent"]
).properties(height = 150).interactive()

(random_chart | cluster_chart).properties(title = "Class Balance Across Datasets and Split Methods").configure_title(fontSize=24)