In [6]:
import pandas as pd
import numpy as np
import gc
from collections import Counter
import csv
import pathlib
import math

from nltk.tokenize.casual import TweetTokenizer

In [7]:
# Resolve the dataset folder relative to the notebook's working directory:
# the repository root is taken to be the parent of the current directory.
cwd = pathlib.Path.cwd()
repo_dir = cwd.parent
dataset_dir = repo_dir.joinpath("datasets", "amazon_movies")

# Load the raw reviews table that the split cells read from.
raw_csv_path = dataset_dir.joinpath("raw", "en_amazon_movies_0Mto0p5M.csv")
df = pd.read_csv(raw_csv_path)

In [12]:
def my_round(x, base=.05, prec=2):
    """
    Round `x` to the nearest multiple of `base`.

    :param x: value to round (anything accepted by float())
    :param base: the step whose nearest multiple is returned
    :param prec: number of decimals kept in the final result

    :return: the nearest multiple of `base`, rounded to `prec` decimals
    """
    # How many whole steps of `base` fit into x (Python's round -> ties go to even)
    n_steps = round(float(x) / base)
    nearest_multiple = base * n_steps
    return round(nearest_multiple, prec)


def split_dataset_into_even_class_distributions(X_data, Y_data, mini_batch_size=32, train_size=.7, test_size=.2):
    """
    Split a dataset into train, validate, and test sets that follow the class distribution of the original data.

    For every class, the number of train and test records is the class size multiplied by
    `train_size` / `test_size`, rounded to the nearest multiple of `mini_batch_size`.
    Whatever remains of the class goes to the validate set, so only the train and test
    portions are rounded to the mini batch size.

    The inputs are not modified; the split is drawn with numpy's global random state.

    :param X_data: pandas Series or DataFrame, each row is a record
    :param Y_data: pandas Series, the categorical class labels (same length and order as X_data)
    :param mini_batch_size: Int, per-class train/test counts are rounded to a multiple of this value
    :param train_size: float 0-1, quantity of the original dataset to be in the train set
    :param test_size: float 0-1, quantity of the original dataset to be in the test set
                      (val is created from the residual)

    :return: X_train, Y_train, X_validate, Y_validate, X_test, Y_test
    :raises ValueError: if the proportions are negative or sum to more than 1
    """
    # Small tolerance so float sums such as .7 + .3 are not rejected
    if train_size < 0 or test_size < 0 or train_size + test_size > 1 + 1e-9:
        raise ValueError("train_size and test_size must be non-negative and sum to at most 1")

    # Re-index copies to 0..n-1 so labels and positions coincide, without
    # touching the caller's objects (the original reset them in place).
    Y_data = Y_data.reset_index(drop=True)
    X_data = X_data.reset_index(drop=True)

    dist = Counter(val for val in Y_data)
    print('Total class distribution:', dict(dist))

    train_ix = []
    validate_ix = []
    test_ix = []
    for cls in dist.keys():
        cls_ixs = Y_data.index[Y_data == cls]
        cls_size = len(cls_ixs)

        # Rounding to the batch size can overshoot the class size, so clamp both counts
        num_train = min(int(my_round(cls_size * train_size, base=mini_batch_size)), cls_size)
        num_test = min(int(my_round(cls_size * test_size, base=mini_batch_size)), cls_size - num_train)

        # One shuffle per class, then cut it into train / test / remainder
        shuffled = np.random.permutation(cls_ixs)
        train_ix += list(shuffled[:num_train])
        test_ix += list(shuffled[num_train:num_train + num_test])
        # Keep the residual in its original row order
        validate_ix += np.sort(shuffled[num_train + num_test:]).tolist()

    # .iloc selects rows for both Series and DataFrame inputs
    X_train = X_data.iloc[train_ix]
    Y_train = Y_data.iloc[train_ix].reset_index(drop=True)

    X_validate = X_data.iloc[validate_ix]
    Y_validate = Y_data.iloc[validate_ix].reset_index(drop=True)

    X_test = X_data.iloc[test_ix]
    Y_test = Y_data.iloc[test_ix].reset_index(drop=True)

    print('Train class distribution:', dict(Counter(val for val in Y_train)))
    print('Validate class distribution:', dict(Counter(val for val in Y_validate)))
    print('Test class distribution:', dict(Counter(val for val in Y_test)))

    return X_train, Y_train, X_validate, Y_validate, X_test, Y_test


def parse_df_into_sets_by_tertiary_grouping(df, ter="ID", y="Score", x="text", train_prop=.7, test_prop=.2):
    """
    Split a dataframe into train, validate, and test sets so that all rows sharing the
    same value in column `ter` end up in exactly one of the three sets.

    Groups are visited in random order (numpy's global random state). Whole groups are
    added to train, then to test, until each set is within one average group size of its
    target row count; every group left over forms the validate set. If the groups run out
    early, the remaining sets simply stay smaller (or empty) instead of raising.

    :param df: pandas DataFrame holding the columns `ter`, `y` and `x`
    :param ter: name of the grouping column
    :param y: name of the label column
    :param x: name of the feature/text column
    :param train_prop: float 0-1, target share of rows for the train set
    :param test_prop: float 0-1, target share of rows for the test set (val gets the residual)

    :return: X_train, Y_train, X_validate, Y_validate, X_test, Y_test as pandas Series
    """

    ideal_train_size = math.floor(len(df)*train_prop)
    ideal_test_size = math.floor(len(df)*test_prop)
    ideal_val_size = len(df) - (ideal_train_size + ideal_test_size)

    # Rows per group, counted once instead of re-filtering the frame for every drawn group
    group_sizes = df[ter].value_counts()
    title_avg_count = group_sizes.mean()
    size_of = group_sizes.to_dict()

    # Walking a shuffled list is equivalent to repeatedly drawing without replacement
    titles = df[ter].unique().tolist()
    np.random.shuffle(titles)
    cursor = 0

    train_titles = []
    train_count = 0

    while cursor < len(titles) and train_count <= ideal_train_size - title_avg_count:
        title = titles[cursor]
        cursor += 1
        train_titles.append(title)
        train_count += size_of.get(title, 0)

    test_titles = []
    test_count = 0

    while cursor < len(titles) and test_count <= ideal_test_size - title_avg_count:
        title = titles[cursor]
        cursor += 1
        test_titles.append(title)
        test_count += size_of.get(title, 0)

    # Every group not consumed above belongs to the validate set
    val_titles = titles[cursor:]

    df_trn = df[df[ter].isin(train_titles)]
    df_val = df[df[ter].isin(val_titles)]
    df_tst = df[df[ter].isin(test_titles)]
    val_count = len(df_val)

    print("Ideal train size is:", ideal_train_size, "whearas the actual is:", train_count)
    print("Ideal val size is:", ideal_val_size, "whearas the actual is:", val_count)
    print("Ideal test size is:", ideal_test_size, "whearas the actual is:", test_count)

    print("Train class distribution:", df_trn[y].value_counts())
    print("Val class distribution:", df_val[y].value_counts())
    print("Test class distribution:", df_tst[y].value_counts())

    return df_trn[x], df_trn[y], df_val[x], df_val[y], df_tst[x], df_tst[y]

def balance_classes(X_data, Y_data, min_class_value=None):
    """
    Down-sample every class to the same number of records.

    Records are drawn without replacement using numpy's global random state. Prints the
    resulting class distribution.

    :param X_data: pandas Series or DataFrame of records, indexed like Y_data
    :param Y_data: pandas Series of class labels
    :param min_class_value: Int, records to keep per class; when omitted (or 0) the size
                            of the smallest class is used

    :return: X_data, Y_data restricted to the sampled rows (classes ordered most- to least-frequent)
    :raises ValueError: raised by numpy when a class has fewer than `min_class_value` records
    """
    class_counts = Y_data.value_counts()

    if not min_class_value:
        # value_counts sorts descending, so the last entry is the rarest class;
        # an empty label series has no classes and therefore nothing to sample.
        min_class_value = class_counts.values[-1] if len(class_counts) else 0

    all_ix = []

    for ke in class_counts.index.tolist():
        ke_indexes = Y_data.index[Y_data == ke].tolist()
        if len(ke_indexes) == min_class_value:
            # Class is already at the target size: keep it whole, in order
            all_ix += ke_indexes
        else:
            all_ix += list(np.random.choice(ke_indexes, size=min_class_value, replace=False))

    # all_ix holds index labels taken from Y_data, so select rows by label explicitly
    X_data = X_data.loc[all_ix]
    Y_data = Y_data.loc[all_ix]

    print("Y class distribution:", Y_data.value_counts())

    return X_data, Y_data

In [11]:
# Preview the first rows of the raw reviews table
df.head()

Unnamed: 0,ID,User ID,User Name,Helpfulness,Score,Time,Header,Text,Cleaned,Hashtags,At Mentions,Extracted URLs,Stemmed,Preprocessed
0,B003AI2VGA,A141HP4LYPWMSR,"Brian E. Erland ""Rainbow Sphinx""",7/7,3.0,1182729600,"""There Is So Much Darkness Now ~ Come For The ...","Synopsis: On the daily trek from Juarez, Mexic...",synopsis on the daily trek from juarez mexico ...,[],[],[],synopsi on the daili trek from juarez mexico t...,synopsi daili trek juarez mexico el paso texa ...
1,B003AI2VGA,A328S9RN3U5M68,Grady Harp,4/4,3.0,1181952000,Worthwhile and Important Story Hampered by Poo...,THE VIRGIN OF JUAREZ is based on true events s...,the virgin of juarez is based on true events s...,[],[],[],the virgin of juarez is base on true event sur...,virgin juarez base true event surround crime p...
2,B003AI2VGA,A1I7QGUDP043DG,"Chrissy K. McVay ""Writer""",8/10,5.0,1164844800,This movie needed to be made.,The scenes in this film can be very disquietin...,the scenes in this film can be very disquietin...,[],[],[],the scene in this film can be veri disquiet du...,scene film veri disquiet due graphic enact rea...
3,B003AI2VGA,A1M5405JH9THP9,golgotha.gov,1/1,3.0,1197158400,distantly based on a real tragedy,THE VIRGIN OF JUAREZ (2006)<br />directed by K...,the virgin of juarez 2006 br directed by kevin...,[],[],[],the virgin of juarez 2006 br direct by kevin j...,virgin juarez 2006 br direct kevin jame dobson...
4,B003AI2VGA,ATXL536YX71TR,"KerrLines ""&#34;Movies,Music,Theatre&#34;""",1/1,3.0,1188345600,"""What's going on down in Juarez and shining a ...","Informationally, this SHOWTIME original is ess...",informationally this showtime original is esse...,[],[],[],inform this showtim origin is essenti view for...,inform showtim origin essenti view enlighten a...


# "Unfair" - same movie mixed in train and test

In [None]:
# Unbalanced: per-class random split, so reviews of one movie may land in several sets
X_train, Y_train, X_validate, Y_validate, X_test, Y_test = split_dataset_into_even_class_distributions(
    X_data=df['Text'],
    Y_data=df['Score'],
    train_size=.7,
    test_size=.2,
    mini_batch_size=1)

# Build the frames column by column so labels keep their numeric dtype
# (stacking labels and text into one array turns everything into objects)
df_trn = pd.DataFrame({'label': Y_train.values, 'text': X_train.values})
df_val = pd.DataFrame({'label': Y_validate.values, 'text': X_validate.values})
df_tst = pd.DataFrame({'label': Y_test.values, 'text': X_test.values})

# Write next to the raw data via dataset_dir (same location as "../datasets/amazon_movies"
# when the notebook runs from its own folder) and make sure the folder exists
out_dir = dataset_dir / "unbalanced"
out_dir.mkdir(parents=True, exist_ok=True)

df_trn.to_csv(out_dir / "train.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
df_val.to_csv(out_dir / "validate.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
df_tst.to_csv(out_dir / "test.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')

In [None]:
# Balanced: down-sample every score to the size of the rarest one, then split per class

lowest_class_num = df["Score"].value_counts().min()
print("Smallest class size: {}".format(lowest_class_num))
df_bl = df.groupby("Score").apply(lambda x: x.sample(n=lowest_class_num, replace=False)).reset_index(drop=True).loc[:, ["Text", "Score"]]
# Shuffle so the rows are no longer grouped by score
df_bl = df_bl.sample(frac=1).reset_index(drop=True)

X_train, Y_train, X_validate, Y_validate, X_test, Y_test = split_dataset_into_even_class_distributions(
    X_data=df_bl['Text'],
    Y_data=df_bl['Score'],
    train_size=.7,
    test_size=.2,
    mini_batch_size=1)

# Build the frames column by column so labels keep their numeric dtype
df_trn = pd.DataFrame({'label': Y_train.values, 'text': X_train.values})
df_val = pd.DataFrame({'label': Y_validate.values, 'text': X_validate.values})
df_tst = pd.DataFrame({'label': Y_test.values, 'text': X_test.values})

# Write via dataset_dir (same location as "../datasets/amazon_movies" when the
# notebook runs from its own folder) and make sure the folder exists
out_dir = dataset_dir / "balanced"
out_dir.mkdir(parents=True, exist_ok=True)

df_trn.to_csv(out_dir / "train.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
df_val.to_csv(out_dir / "validate.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
df_tst.to_csv(out_dir / "test.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')

# Fair - all reviews of a given movie are kept in a single split

In [None]:
# Unbalanced: split by movie ID so every movie's reviews stay in a single set
X_train, Y_train, X_validate, Y_validate, X_test, Y_test = parse_df_into_sets_by_tertiary_grouping(
    df=df,
    ter="ID",
    y="Score",
    x="Text",
    train_prop=.7,
    test_prop=.2)

# Build the frames column by column so labels keep their numeric dtype
df_trn = pd.DataFrame({'label': Y_train.values, 'text': X_train.values})
df_val = pd.DataFrame({'label': Y_validate.values, 'text': X_validate.values})
df_tst = pd.DataFrame({'label': Y_test.values, 'text': X_test.values})

# Write via dataset_dir (same location as "../datasets/amazon_movies" when the
# notebook runs from its own folder) and make sure the folder exists
out_dir = dataset_dir / "fair_unbalanced"
out_dir.mkdir(parents=True, exist_ok=True)

df_trn.to_csv(out_dir / "train.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
df_val.to_csv(out_dir / "validate.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
df_tst.to_csv(out_dir / "test.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')

In [None]:
# Balanced: split by movie ID, then down-sample each set to equal class sizes
X_train, Y_train, X_validate, Y_validate, X_test, Y_test = parse_df_into_sets_by_tertiary_grouping(
    df=df,
    ter="ID",
    y="Score",
    x="Text",
    train_prop=.7,
    test_prop=.2)

# Each set is balanced on its own, down to the size of its rarest class
X_train, Y_train = balance_classes(X_data=X_train, Y_data=Y_train)
X_validate, Y_validate = balance_classes(X_data=X_validate, Y_data=Y_validate)
X_test, Y_test = balance_classes(X_data=X_test, Y_data=Y_test)

# Build the frames column by column so labels keep their numeric dtype
df_trn = pd.DataFrame({'label': Y_train.values, 'text': X_train.values})
df_val = pd.DataFrame({'label': Y_validate.values, 'text': X_validate.values})
df_tst = pd.DataFrame({'label': Y_test.values, 'text': X_test.values})

# Write via dataset_dir (same location as "../datasets/amazon_movies" when the
# notebook runs from its own folder) and make sure the folder exists
out_dir = dataset_dir / "fair_balanced"
out_dir.mkdir(parents=True, exist_ok=True)

df_trn.to_csv(out_dir / "train.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
df_val.to_csv(out_dir / "validate.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
df_tst.to_csv(out_dir / "test.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')

# Fair split for Low Shot paper

In [14]:
# Unbalanced: split by movie ID, then write small training samples of several sizes

X_train, Y_train, X_validate, Y_validate, X_test, Y_test = parse_df_into_sets_by_tertiary_grouping(
    df=df,
    ter="ID",
    y="Score",
    x="Text",
    train_prop=.4,
    test_prop=.4)

N_CLASSES = len(Y_train.value_counts())
print(N_CLASSES, "classes")

# Build the frames column by column so labels keep their numeric dtype
df_trn = pd.DataFrame({'label': Y_train.values, 'text': X_train.values})
df_val = pd.DataFrame({'label': Y_validate.values, 'text': X_validate.values})
df_tst = pd.DataFrame({'label': Y_test.values, 'text': X_test.values})

# Cap the evaluation sets; only sample when a set is actually larger than the cap,
# otherwise sample(n=...) raises for a set with fewer rows
MAX_EVAL_ROWS = 25000

if len(df_val) > MAX_EVAL_ROWS:
    df_val = df_val.sample(n=MAX_EVAL_ROWS)

if len(df_tst) > MAX_EVAL_ROWS:
    df_tst = df_tst.sample(n=MAX_EVAL_ROWS)

for size in [100, 300, 1000]:
    out_dir = dataset_dir / ("fair_unbalanced_"+str(size))
    out_dir.mkdir(parents=True, exist_ok=True)

    # The training sample holds size * N_CLASSES rows drawn at random (classes stay unbalanced);
    # the same validate and test frames are written into every size folder
    df_trn.sample(n=size*N_CLASSES).to_csv(out_dir / "train.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
    df_val.to_csv(out_dir / "validate.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
    df_tst.to_csv(out_dir / "test.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')

Ideal train size is: 200000 whearas the actual is: 199971
Ideal val size is: 100000 whearas the actual is: 100019
Ideal test size is: 200000 whearas the actual is: 200010
Train class distribution: 5.0    108800
4.0     41601
3.0     20198
1.0     17263
2.0     12109
Name: Score, dtype: int64
Val class distribution: 5.0    54982
4.0    21283
3.0    10126
1.0     7805
2.0     5823
Name: Score, dtype: int64
Test class distribution: 5.0    111157
4.0     41631
3.0     19950
1.0     15758
2.0     11514
Name: Score, dtype: int64
5 classes


In [15]:
# Balanced: split by movie ID, then write class-balanced samples of several sizes
X_train, Y_train, X_validate, Y_validate, X_test, Y_test = parse_df_into_sets_by_tertiary_grouping(
    df=df,
    ter="ID",
    y="Score",
    x="Text",
    train_prop=.4,
    test_prop=.4)

# Evaluation sets keep 5000 records per class
X_validate, Y_validate = balance_classes(X_data=X_validate, Y_data=Y_validate, min_class_value=5000)
X_test, Y_test = balance_classes(X_data=X_test, Y_data=Y_test, min_class_value=5000)

# Build the frames column by column so labels keep their numeric dtype
df_val = pd.DataFrame({'label': Y_validate.values, 'text': X_validate.values})
df_tst = pd.DataFrame({'label': Y_test.values, 'text': X_test.values})

for size in [100, 300, 1000]:
    out_dir = dataset_dir / ("fair_balanced_"+str(size))
    out_dir.mkdir(parents=True, exist_ok=True)

    # `size` records per class for training; the same validate and test frames
    # are written into every size folder
    X_train_crnt, Y_train_crnt = balance_classes(X_data=X_train, Y_data=Y_train, min_class_value=size)
    df_trn = pd.DataFrame({'label': Y_train_crnt.values, 'text': X_train_crnt.values})
    df_trn.to_csv(out_dir / "train.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
    df_val.to_csv(out_dir / "validate.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
    df_tst.to_csv(out_dir / "test.csv", index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')

Ideal train size is: 200000 whearas the actual is: 199970
Ideal val size is: 100000 whearas the actual is: 100043
Ideal test size is: 200000 whearas the actual is: 199987
Train class distribution: 5.0    108737
4.0     42130
3.0     20294
1.0     16895
2.0     11914
Name: Score, dtype: int64
Val class distribution: 5.0    55981
4.0    20364
3.0     9767
1.0     8075
2.0     5856
Name: Score, dtype: int64
Test class distribution: 5.0    110221
4.0     42021
3.0     20213
1.0     15856
2.0     11676
Name: Score, dtype: int64
Y class distribution: 2.0    5000
1.0    5000
3.0    5000
4.0    5000
5.0    5000
Name: Score, dtype: int64
Y class distribution: 2.0    5000
1.0    5000
3.0    5000
4.0    5000
5.0    5000
Name: Score, dtype: int64
Y class distribution: 2.0    100
1.0    100
3.0    100
4.0    100
5.0    100
Name: Score, dtype: int64
Y class distribution: 2.0    300
1.0    300
3.0    300
4.0    300
5.0    300
Name: Score, dtype: int64
Y class distribution: 2.0    1000
1.0    1000
3.0

In [16]:
# Sanity check: reload the 100-per-class training file and confirm the classes are balanced.
# A dedicated name is used so the raw `df` is not overwritten, which would break
# re-running any of the split cells afterwards.
df_fair_balanced_100 = pd.read_csv(dataset_dir / ("fair_balanced_"+str(100)) / "train.csv")
df_fair_balanced_100['label'].value_counts()

2.0    100
1.0    100
3.0    100
4.0    100
5.0    100
Name: label, dtype: int64