This notebook contains code to rebalance CSV label files so that there's not a huge imbalance between labels. (In the raw data, the "null" labels far outnumber the labeled data, etc.)

In [1]:
import numpy as np
import pandas as pd

In [2]:
def create_duplicate_rows(df, labels, frac_of_nulls=1, non_isl_mult=1, sample_frac=None):
    """
    Takes a label dataframe, and duplicates certain rows until the desired class balance is obtained.
    This function assumes a multilabel label set.
    Arguments:
    df: Label dataframe
    labels: List of labels
    frac_of_nulls: The fraction of your "null" data you want your labeled data to be.
    For example, "1" means an equal amount of null and labeled data; 0.5 means the labeled data
    will be half the amount of the null data, etc. Default is 1.
    non_isl_mult: The higher this is, the larger the number of non-ISL data there will be relative
    to the ISL data. Default is 1, which means the ratio of ISL to non-ISL rows will remain the same.
    sample_frac: If not None, a random sample equal to sample_frac will be used.
    For example, if sample_frac is 0.1, the total dataset will be reduced by 90%. Default None.
    """

    yes_labels = 0

    for i in range(len(df)):
        for j in range(len(labels)):
            if df.loc[i, str(j)] == 1:
                # Find number of rows with at least one label
                yes_labels += 1
                break

    no_labels = len(df) - yes_labels

    label_count = {}

    for label in labels:
        label_count[label] = 0

    for i in range(len(df)):
        for j in range(len(labels)):
            if df.loc[i, str(j)] == 1:
                # Find the total number of each label
                lbl = labels[j]
                label_count[lbl] += 1

    non_isl_count = 0

    for lbl in labels:
        if lbl != 'ISL':
            # Find the total number of non-ISL labels
            non_isl_count += label_count[lbl]

    # We will be duplicating each non-ISL row a total of "non_isl_tot_prod" times.
    non_isl_tot_prod = ((no_labels * frac_of_nulls) // non_isl_count) * non_isl_mult

    # We will be duplicating each ISL row a total of "isl_prod" times.
    isl_prod = (no_labels * frac_of_nulls) // label_count['ISL']

    # Start by just pulling out the null labels.
    only_nulls = df[
        (df['0'] == 0) & (df['1'] == 0) & (df['2'] == 0) & (df['3'] == 0) & (df['4'] == 0)
    ]

    df2 = only_nulls.copy()

    for i in range(len(labels)):
        # Pull out all the rows that have a particular label
        only_that_label = df[df[str(i)] == 1]

        if labels[i] == 'ISL':
            mult = isl_prod
        else:
            mult = non_isl_tot_prod

        # Duplicate all the rows in the dataframe using the appropriate multiplier
        dupes = pd.DataFrame(np.repeat(only_that_label.values, mult, axis=0), columns=df.columns)

        df2 = df2.append(dupes)
    
    df2 = df2.reset_index(drop=True)
    
    if sample_frac:
        df2 = df2.sample(frac=sample_frac)
        df2 = df2.reset_index(drop=True)
    
    return df2

In [7]:
def create_duplicate_rows_multiclass(df, labels, frac_of_nulls=1, non_isl_mult=1, sample_frac=None):
    """
    The same function as above, just assuming a multiclass dataset.
    This function only works if the null class is named "null", all lowercase.
    """

    label_count = {}

    for label in labels:
        label_count[label] = 0

    for i in range(len(df)):
        for j in range(len(labels)):
            if df.loc[i, str(j)] == 1:
                lbl = labels[j]
                label_count[lbl] += 1

    non_isl_count = 0

    for lbl in labels:
        if lbl != 'ISL' and lbl != 'null':
            non_isl_count += label_count[lbl]
            
    no_labels = label_count['null']

    non_isl_tot_prod = ((no_labels * frac_of_nulls) // non_isl_count) * non_isl_mult

    isl_prod = (no_labels * frac_of_nulls) // label_count['ISL']

    null_col = str(len(labels) - 1)
    
    only_nulls = df[
        (df[null_col] == 1)
    ]

    df2 = only_nulls.copy()

    for i in range(len(labels) - 1):
        only_that_label = df[df[str(i)] == 1]

        if labels[i] == 'ISL':
            mult = isl_prod
        else:
            mult = non_isl_tot_prod

        dupes = pd.DataFrame(np.repeat(only_that_label.values, mult, axis=0), columns=df.columns)

        df2 = df2.append(dupes)
    
    df2 = df2.reset_index(drop=True)
    
    if sample_frac:
        df2 = df2.sample(frac=sample_frac)
        
    df2 = df2.reset_index(drop=True)
    
    return df2

In [10]:
# Demonstration of how to use the function

df = pd.read_csv('DRC_labels_multiclass_v2_train.csv')

labels = ['ISL', 'SAB', 'industrial_agriculture', 'null']

df2 = create_duplicate_rows_multiclass(df, labels, frac_of_nulls=1, non_isl_mult=2, sample_frac=1)

In [11]:
df2.shape

(35368, 5)

In [12]:
df2.to_csv('DRC_labels_multiclass_v3_train.csv')

In [20]:
# Here I needed to balance the number of SAB and null rows
# after deciding to limit the new model to purely predicting SAB.

df = pd.read_csv('DRC_labels_SAB_base_v1.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,paths
0,0,0,1,ISL/ISL-100-100-100_1000_1100-2020-12-15-2021-...
1,1,0,1,ISL/ISL-100-100-100_1000_1500-2020-12-15-2021-...
2,2,0,1,ISL/ISL-100-100-100_1100_1000-2020-12-15-2021-...
3,3,0,1,ISL/ISL-100-100-100_1100_1100-2020-12-15-2021-...
4,4,0,1,ISL/ISL-100-100-100_1100_900-2020-12-15-2021-0...


In [21]:
df.columns

Index(['Unnamed: 0', '0', '1', 'paths'], dtype='object')

In [22]:
# There's an extra column because I forgot to drop the index when I saved to CSV
df = df.drop(['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,0,1,paths
0,0,1,ISL/ISL-100-100-100_1000_1100-2020-12-15-2021-...
1,0,1,ISL/ISL-100-100-100_1000_1500-2020-12-15-2021-...
2,0,1,ISL/ISL-100-100-100_1100_1000-2020-12-15-2021-...
3,0,1,ISL/ISL-100-100-100_1100_1100-2020-12-15-2021-...
4,0,1,ISL/ISL-100-100-100_1100_900-2020-12-15-2021-0...


In [23]:
df.shape

(19819, 3)

In [24]:
# Creating train, val, and test datasets with a 60%/20%/20% ratio.
# First, calculate 60% and 20% of the total rows.
19819 * 0.6

11891.4

In [25]:
19819 * 0.2

3963.8

In [28]:
# Randomize the dataset.
df = df.sample(frac=1)
df = df.reset_index(drop=True)

In [29]:
# Divide into train, val, and test.
df_train = df.loc[0:11891]
df_val = df.loc[11892:11892+3963]
df_test = df.loc[11892+3963:]

print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(11892, 3)
(3964, 3)
(3964, 3)


In [30]:
# Now I want to achieve class balance on the train dataset.
# I don't class balance the val or test datasets because the actual data
# we're looking to predict on won't be balanced.
df_train['0'].value_counts()

0    9865
1    2027
Name: 0, dtype: int64

In [31]:
# Adding in three sets of duplicate SAB rows will roughly balance the dataset.

only_sab = df_train[df_train['0'] == 1]

dupes = pd.DataFrame(np.repeat(only_sab.values, 3, axis=0), columns=df_train.columns)

dupes.head()

Unnamed: 0,0,1,paths
0,1,0,SAB/Shifting_cultivation-100-49-49_1600_3000-2...
1,1,0,SAB/Shifting_cultivation-100-49-49_1600_3000-2...
2,1,0,SAB/Shifting_cultivation-100-49-49_1600_3000-2...
3,1,0,SAB/Shifting_cultivation-100-81-81_5800_9800-2...
4,1,0,SAB/Shifting_cultivation-100-81-81_5800_9800-2...


In [32]:
dupes.shape

(6081, 3)

In [33]:
dupes['0'].value_counts()

1    6081
Name: 0, dtype: int64

In [34]:
df_train = df_train.append(dupes)

df_train.shape

(17973, 3)

In [35]:
# Confirm that the dataset is roughly balanced

df_train['0'].value_counts()

0    9865
1    8108
Name: 0, dtype: int64

In [36]:
# Randomize the train set so that all the duplicate rows aren't at the bottom.
df_train = df_train.sample(frac=1)
df_train.head()

Unnamed: 0,0,1,paths
3138,0,1,null/100_700_0.tif
3232,0,1,null/67_3500_5600.tif
9486,0,1,null/67_300_6300.tif
10315,1,0,SAB/Shifting_cultivation-100-52-52_900_1400-20...
5753,1,0,SAB/Shifting_cultivation-100-81-81_8200_10900-...


In [10]:
df2 = df2.reset_index(drop=True)

# Now to divide into train/val/test, with a 60%/20%/20% distribution.
df2.shape[0] * 0.2

6611.0

In [11]:
df2.shape[0] * 0.6

19833.0

In [15]:
df2_train = df2.loc[0:19832]
df2_train.shape

(19833, 3)

In [16]:
df2_train['0'].value_counts()

1    9984
0    9849
Name: 0, dtype: int64

In [19]:
df2_val = df2.loc[19833:19833+6611]
df2_test = df2.loc[19833+6611:]
print(df2_val.shape)
print(df2_test.shape)

(6612, 3)
(6611, 3)


In [37]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [38]:
df_train.to_csv('DRC_labels_SAB_train_v1.csv', index=None, header=True)
df_val.to_csv('DRC_labels_SAB_val_v1.csv', index=None, header=True)
df_test.to_csv('DRC_labels_SAB_test_v1.csv', index=None, header=True)