In [46]:
import sys
from typing import Optional

import numpy as np
import pandas as pd

# imports src
sys.path.append("../")
from src import utils

In [51]:
def shuffle_by_labels(
    profile: pd.DataFrame, label_col: str, seed: Optional[int] = 1
) -> pd.DataFrame:
    """Return a copy of `profile` with the values of one column shuffled.

    Only the values of `label_col` are permuted; every other column keeps its
    original row order. The input dataframe is left unmodified.

    Parameters
    ----------
    profile : pd.DataFrame
        image-based profile with metadata
    label_col : str
        name of the column that contains the labels to shuffle
    seed : Optional[int], optional
        seed used for the permutation so that results are reproducible.
        Passing None draws a fresh, non-reproducible permutation. By default 1

    Returns
    -------
    pd.DataFrame
        new dataframe whose `label_col` values are shuffled

    Raises
    ------
    TypeError
        raised if `profile` is not a dataframe or `label_col` is not a string
    KeyError
        raised if `label_col` is not a column of `profile`
    """

    # type checking
    if not isinstance(profile, pd.DataFrame):
        raise TypeError(f"`profile` must be a dataframe not {type(profile)}")
    if not isinstance(label_col, str):
        raise TypeError(f"`label_col` must be a string type not {type(label_col)}")

    # A private RandomState draws the same permutation that seeding the global
    # generator with `seed` would, but leaves numpy's global random state alone.
    rng = np.random.RandomState(seed)
    shuffled_labels = rng.permutation(profile[label_col].to_numpy())

    # write into a copy so the caller's dataframe keeps its original labels
    shuffled_profile = profile.copy()
    shuffled_profile[label_col] = shuffled_labels

    return shuffled_profile


def shuffle_feature_space(
    profile: pd.DataFrame, col_idx_split: int, seed: Optional[int] = 1
) -> pd.DataFrame:
    """Shuffle the values of every feature column of a profile independently.

    Columns before `col_idx_split` are treated as metadata and returned
    unchanged. Columns from `col_idx_split` onward are treated as features:
    they are cast to float and the values within each column are permuted
    independently of the other columns. The input dataframe is not modified
    and the returned dataframe keeps the input's index and column names.

    Parameters
    ----------
    profile : pd.DataFrame
        profile whose leading columns are metadata and whose remaining
        columns are numerical features
    col_idx_split : int
        column integer where to split the metadata and extracted features
    seed : Optional[int], optional
        seed used for the shuffling in order to maintain reproducibility.
        Passing None produces a non-reproducible shuffle. By default 1

    Returns
    -------
    pd.DataFrame
        feature space shuffled data

    Raises
    ------
    TypeError
        raised if `profile` is not a dataframe, or if the columns selected as
        features cannot be converted to float
    """

    # type checker
    if not isinstance(profile, pd.DataFrame):
        raise TypeError(f"`profile` must be a dataframe not {type(profile)}")

    # split the profile by column position into metadata and feature space
    metadata = profile.iloc[:, :col_idx_split]
    try:
        feature_vals = profile.iloc[:, col_idx_split:].astype(float)
    except (TypeError, ValueError) as err:
        raise TypeError(
            "The selected index splitter captures non-numerical data"
        ) from err

    # request an explicit copy so the array is always writable and detached
    # from any dataframe
    feature_mat = feature_vals.to_numpy(copy=True)

    # shuffle each feature column in place; rows of the transpose are views
    # onto the columns of `feature_mat`
    rng = np.random.RandomState(seed)
    for col in feature_mat.T:
        rng.shuffle(col)

    # rebuild the feature frame on the profile's own index so that the
    # column-wise concat lines rows up instead of aligning on a new RangeIndex
    shuffled_features = pd.DataFrame(
        data=feature_mat, index=profile.index, columns=feature_vals.columns
    )

    # concat metadata with shuffled feature space
    return pd.concat([metadata, shuffled_features], axis=1)

In [77]:
# Toy profile used by the tests below: three metadata columns followed by
# five integer feature columns.
np.random.seed(10)  # fixed seed so the random feature values are reproducible
characters = ["A", "B", "C", "D", "E"]

# metadata columns
data = {
    "Character": characters,
    "Gender": ["Male", "Female", "Male", "Female", "Male"],
    "Role": ["Protagonist", "Antagonist", "Supporting", "Protagonist", "Supporting"],
}

# feature columns NumericData1..NumericData5, drawn in that order
data.update(
    {
        f"NumericData{feature_num}": np.random.randint(1, 100, 5)
        for feature_num in range(1, 6)
    }
)

control_df = pd.DataFrame(data)

# Display the DataFrame
control_df

Unnamed: 0,Character,Gender,Role,NumericData1,NumericData2,NumericData3,NumericData4,NumericData5
0,A,Male,Protagonist,10,94,41,89,50
1,B,Female,Antagonist,16,30,37,63,52
2,C,Male,Supporting,65,9,17,34,55
3,D,Female,Protagonist,29,74,12,73,78
4,E,Male,Supporting,90,1,55,79,70


In [89]:
# testing shuffled feature space
test_shuffled_feat = shuffle_feature_space(control_df, col_idx_split=3)

# split both frames into their feature space (columns after the metadata)
control_feat_space = control_df[control_df.columns[3:]]
test_shuffled_feat_space = test_shuffled_feat[test_shuffled_feat.columns[3:]]

# check if they are the same size and shape
assert control_feat_space.shape == test_shuffled_feat_space.shape

# metadata columns must come back untouched
pd.testing.assert_frame_equal(
    control_df[control_df.columns[:3]],
    test_shuffled_feat[test_shuffled_feat.columns[:3]],
)

# check that the feature values were actually reordered; compare raw values
# because `DataFrame.equals` is also False on a mere int -> float dtype change
assert not np.array_equal(
    control_feat_space.to_numpy(), test_shuffled_feat_space.to_numpy()
), "feature space was not shuffled"

# check that each column still holds exactly the same values
for col_name in control_feat_space.columns:
    sel_control = sorted(control_feat_space[col_name].tolist())
    sel_test = sorted(test_shuffled_feat_space[col_name].tolist())

    assert sel_control == sel_test, f"values of column {col_name} changed"