In [None]:
from pathlib import Path

import pandas as pd


def get_train_dataset(
    song_info,
    train_concept,
    independent_concept,
    max_test_size=100,
    random_state=None,
):
    """Build one balanced binary training frame per class of ``train_concept``.

    For every train class, and inside every class of the independent concept,
    the rows are split into positives (train column equals the class) and
    negatives (every other row of that subset). When the smaller group holds
    more than ``max_test_size`` rows, ``min_count - max_test_size`` rows are
    sampled from each group, so at least ``max_test_size`` rows per group are
    left unused. Subsets that are too small contribute nothing.

    Parameters
    ----------
    song_info : pandas.DataFrame
        Must contain the columns named by ``train_concept[0]`` and
        ``independent_concept[0]``. It is not modified.
    train_concept : tuple
        ``(column_name, classes)`` of the concept to learn.
    independent_concept : tuple
        ``(column_name, classes)`` of the concept to balance within.
    max_test_size : int, default 100
        Number of rows per group deliberately left out of the training sample.
    random_state : optional
        Forwarded to ``DataFrame.sample``. ``None`` (the default) keeps the
        previous unseeded behaviour; pass an int for reproducible splits.

    Returns
    -------
    dict
        Maps every train class to a DataFrame with the columns of
        ``song_info`` plus ``"target"`` (1 for positives, 0 for negatives).
        A class for which no subset qualified maps to an empty frame with
        those columns instead of raising.
    """
    train_column, train_classes = train_concept[0], train_concept[1]
    independent_column = independent_concept[0]
    independent_classes = independent_concept[1]

    df_train = {}
    for train_class in train_classes:
        parts = []
        for independent_class in independent_classes:
            df_sub = song_info[song_info[independent_column] == independent_class]
            is_positive = df_sub[train_column] == train_class
            df_sub_positive = df_sub[is_positive]
            df_sub_negative = df_sub[~is_positive]

            minimum_counts = min(len(df_sub_positive), len(df_sub_negative))
            if minimum_counts <= max_test_size:
                # Not enough rows to train on while leaving max_test_size aside.
                continue

            n_train = minimum_counts - max_test_size
            # .assign returns a new frame, so no chained-assignment warning.
            parts.append(
                df_sub_positive.sample(n_train, random_state=random_state).assign(
                    target=1
                )
            )
            parts.append(
                df_sub_negative.sample(n_train, random_state=random_state).assign(
                    target=0
                )
            )

        if parts:
            df_train[train_class] = pd.concat(parts)
        else:
            # pd.concat([]) raises ValueError; return an empty frame that still
            # carries every column (including "target") downstream code reads.
            df_train[train_class] = song_info.iloc[:0].assign(target=0)
    return df_train


def get_test_dataset(
    song_info, train_dataset, train_concept, independent_concept, random_state=None
):
    """Build one balanced binary test frame per class of ``train_concept``.

    For every train class, rows whose ``song_id`` appears in that class's
    training frame are removed first. The remaining rows are then split,
    inside every class of the independent concept, into positives (train
    column equals the class) and negatives (every other row of that subset),
    and the same number of rows - the size of the smaller group - is sampled
    from each.

    Parameters
    ----------
    song_info : pandas.DataFrame
        Must contain ``song_id`` and the columns named by
        ``train_concept[0]`` and ``independent_concept[0]``. It is not
        modified.
    train_dataset : dict
        Maps every train class to a frame with a ``song_id`` column; those
        songs are excluded from the test frame of that class.
    train_concept : tuple
        ``(column_name, classes)`` of the concept to learn.
    independent_concept : tuple
        ``(column_name, classes)`` of the concept to balance within.
    random_state : optional
        Forwarded to ``DataFrame.sample``. ``None`` (the default) keeps the
        previous unseeded behaviour; pass an int for reproducible splits.

    Returns
    -------
    dict
        Maps every train class to a DataFrame with the columns of
        ``song_info`` plus ``"target"`` (1 for positives, 0 for negatives).
    """
    train_column, train_classes = train_concept[0], train_concept[1]
    independent_column = independent_concept[0]
    independent_classes = independent_concept[1]

    df_test = {}
    for train_class in train_classes:
        used_song_ids = train_dataset[train_class].song_id
        df_not_train = song_info[~song_info.song_id.isin(used_song_ids)]

        parts = []
        for independent_class in independent_classes:
            df_sub = df_not_train[df_not_train[independent_column] == independent_class]
            is_positive = df_sub[train_column] == train_class
            df_sub_positive = df_sub[is_positive]
            df_sub_negative = df_sub[~is_positive]

            minimum_counts = min(len(df_sub_positive), len(df_sub_negative))

            # .assign returns a new frame, so no chained-assignment warning.
            parts.append(
                df_sub_positive.sample(
                    minimum_counts, random_state=random_state
                ).assign(target=1)
            )
            parts.append(
                df_sub_negative.sample(
                    minimum_counts, random_state=random_state
                ).assign(target=0)
            )

        if parts:
            df_test[train_class] = pd.concat(parts)
        else:
            # No independent classes at all: pd.concat([]) would raise.
            df_test[train_class] = df_not_train.iloc[:0].assign(target=0)

    return df_test


def get_genre_balanced_datasets(song_info, train_concept, max_test_size=100):
    """Create genre-balanced train and test datasets for one concept.

    Rows whose train-concept column is null are dropped, then the train and
    test frames are built with ``"genre"`` as the independent concept.

    Parameters
    ----------
    song_info : pandas.DataFrame
        Must contain a ``genre`` column and the column named by
        ``train_concept[0]``.
    train_concept : tuple
        ``(column_name, classes)`` of the concept to learn.
    max_test_size : int, default 100
        Forwarded to ``get_train_dataset``.

    Returns
    -------
    tuple
        ``(train_dataset, test_dataset)`` as returned by
        ``get_train_dataset`` and ``get_test_dataset``.
    """
    song_info_without_nan = song_info[song_info[train_concept[0]].notnull()]

    # unique() keeps NaN as a value, but `column == NaN` never matches a row,
    # so a NaN genre would only add an empty iteration downstream.
    genres = list(song_info.genre.dropna().unique())
    independent_concept = ("genre", genres)

    train_dataset = get_train_dataset(
        song_info=song_info_without_nan,
        train_concept=train_concept,
        independent_concept=independent_concept,
        max_test_size=max_test_size,
    )

    test_dataset = get_test_dataset(
        song_info=song_info_without_nan,
        train_dataset=train_dataset,
        train_concept=train_concept,
        independent_concept=independent_concept,
    )

    return train_dataset, test_dataset


In [None]:
# Join songs to their artists and to the artists' metadata. No join keys are
# given, so each merge uses pandas' default of joining on shared column names.
# Exact duplicate rows are then dropped and the index renumbered.
song_info = (
    pd.read_csv("data/song_info.csv")
    .merge(pd.read_csv("data/song_artist.csv"))
    .merge(pd.read_csv("data/artist_info.csv"))
    .drop_duplicates(keep="first")
    .reset_index(drop=True)
)


def _singer_age_at_release(row):
    """Return release year minus birth year, or None for an unusable birth year."""
    # NOTE(review): a float birth_year such as 1980.0 stringifies to "1980.0"
    # (6 characters) and would be rejected here - confirm the column dtype.
    if len(str(row.birth_year)) != 4:
        return None
    release_year = int(row.release_date.split("-")[0])
    return release_year - row.birth_year


def _floor_to_decade(age):
    """Truncate an age to its decade (e.g. 37 -> 30); NaN maps to None."""
    if age != age:  # NaN is the only value that is not equal to itself
        return None
    return int(age / 10) * 10


song_info["singer_age"] = song_info.apply(_singer_age_at_release, axis=1)
song_info["singer_age_decade"] = song_info["singer_age"].apply(_floor_to_decade)


#### Create genre-balanced datasets


In [None]:
# Each concept is a (column name, classes) pair; every class gets its own
# genre-balanced train/test split.
train_concept_language = ("language", ["en", "fr", "it", "pt", "ja", "es", "de"])
train_dataset_language, test_dataset_language = get_genre_balanced_datasets(
    song_info=song_info,
    train_concept=train_concept_language,
)

train_concept_gender = ("gender", ["male", "female"])
train_dataset_gender, test_dataset_gender = get_genre_balanced_datasets(
    song_info=song_info,
    train_concept=train_concept_gender,
)

# Age decades 10, 20, ..., 80.
train_concept_singer_age_decade = ("singer_age_decade", list(range(10, 90, 10)))
train_dataset_singer_age_decade, test_dataset_singer_age_decade = (
    get_genre_balanced_datasets(
        song_info=song_info,
        train_concept=train_concept_singer_age_decade,
    )
)


#### Save datasets


In [None]:
# Write every per-class train/test frame to
# datasets/{train,test}_dataset_<concept>_<class>.csv.
OUTPUT_DIR = Path("datasets")
# to_csv does not create missing directories, so make sure the target exists;
# otherwise a fresh checkout fails on the first write.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

datasets_by_concept = {
    "language": (train_dataset_language, test_dataset_language),
    "gender": (train_dataset_gender, test_dataset_gender),
    "singer_age_decade": (
        train_dataset_singer_age_decade,
        test_dataset_singer_age_decade,
    ),
}

for concept_name, (train_by_class, test_by_class) in datasets_by_concept.items():
    for key in train_by_class:
        train_by_class[key].to_csv(
            OUTPUT_DIR / f"train_dataset_{concept_name}_{key}.csv", index=False
        )
        test_by_class[key].to_csv(
            OUTPUT_DIR / f"test_dataset_{concept_name}_{key}.csv", index=False
        )