In [1]:
# Notebook parameters.
# filename_raw: path of the raw CSV that is loaded with pd.read_csv.
filename_raw = "raw_data"
# filename_processed: prefix of the three output files
# (<prefix>_train.csv, <prefix>_validation.csv, <prefix>_test.csv).
# The value below looks like a timestamp placeholder -- presumably it is
# overridden when the notebook is run; TODO confirm.
filename_processed = "XXXX-XX-XX_XX:XX:XX"
# Fractions of the cleaned dataset held out for validation and for testing;
# the remainder (1 - validation_size - test_size) is used for training.
validation_size = 0.25
test_size = 0.25

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Column names in the raw data. These are hard-coded for now and are
# intended to become optional notebook inputs later.
feature_columns = ["item_name", "vendor_name"]  # text columns combined into one feature
target_column = "category_name"  # column holding the label

In [4]:
# Fixed column names used in the processed output, independent of how the
# raw dataset names its columns.
TEXT_FEATURE = "text_feature"
TARGET_COLUMN = "target"

In [5]:
# Load the raw data with every column read as text, so no value is
# reinterpreted as a number or date.
df = pd.read_csv(filepath_or_buffer=filename_raw, dtype=str)

Fill missing values in the feature columns with empty strings

In [6]:
# Replace missing values in the feature columns with '' so they contribute
# nothing when the columns are concatenated.
df[feature_columns] = df[feature_columns].fillna('')

Concatenate all text features to a single column, trimming whitespace

Select only text feature and target column

In [7]:
# Build the single text input from the feature columns ONLY. Joining every
# column of the frame would also pull in the target column (leaking the label
# into the text) and any unrelated columns, whose missing values would be
# rendered as the literal string "nan".
# Values are joined with a single space and the result is stripped, so a row
# whose feature columns are all empty yields '' and can be dropped later.
df[TEXT_FEATURE] = df[feature_columns].apply(
    lambda row: " ".join(str(value) for value in row).strip(),
    axis=1,
)

# Give the label column its fixed, dataset-independent name.
df = df.rename(columns={target_column: TARGET_COLUMN})

# Keep only the combined text feature and the target.
df = df[[TEXT_FEATURE, TARGET_COLUMN]]

Drop rows with a missing target

In [8]:
# Rows without a label cannot be used for training or evaluation.
df = df.dropna(subset=[TARGET_COLUMN])

Drop rows whose text feature is empty

In [9]:
# Keep only rows whose combined text is non-empty.
has_text = df[TEXT_FEATURE].ne('')
df = df[has_text]

Remove rows with a duplicate text feature, keeping the first occurrence

In [10]:
# Keep the first row for each distinct text. The result is rebound rather
# than using inplace=True: `df` is itself the product of boolean filtering,
# and mutating such a frame in place can raise SettingWithCopyWarning.
df = df.drop_duplicates(subset=TEXT_FEATURE)

Drop every target class with fewer than `n_min` entries

In [11]:
# Minimum number of rows a target class must have to be kept.
n_min = 10

# Number of rows per target class (a Series indexed by target value).
counts = df.groupby(TARGET_COLUMN).size()

# Keep classes with at least n_min rows, i.e. drop those with fewer than
# n_min. The threshold comes from n_min instead of a repeated literal, and the
# comparison is inclusive so a class with exactly n_min rows is retained.
counts = counts[counts >= n_min]
targets_above_threshold = counts.index

# Filter the data down to rows belonging to the retained classes.
df = df.loc[df[TARGET_COLUMN].isin(targets_above_threshold)]

In [12]:
# Two-stage stratified split: first carve off the combined
# validation + test hold-out, then divide that hold-out between the two.
holdout_size = validation_size + test_size
train_size = 1 - holdout_size

df_train, df_test_val = train_test_split(
    df,
    test_size=holdout_size,
    random_state=42,
    stratify=df[TARGET_COLUMN],
)

# Share of the hold-out that goes to the test set.
test_fraction = test_size / holdout_size

df_validation, df_test = train_test_split(
    df_test_val,
    test_size=test_fraction,
    random_state=42,
    stratify=df_test_val[TARGET_COLUMN],
)

In [13]:
# Write each split to "<filename_processed><suffix>" without the index column.
for suffix, split_frame in (
    ("_train.csv", df_train),
    ("_validation.csv", df_validation),
    ("_test.csv", df_test),
):
    split_frame.to_csv(filename_processed + suffix, index=False)