In [48]:
from setup_general import *
from prep_helpers import *

def get_data(feat_percent_cut, feat_freq_cut):
    lang = 'est'
    if lang == 'en':
        data = combined_data_fully_translated.copy()
    if lang == 'est':
        data = combined_data.copy()
    # Feature specific engineering
    ## units - sizes -values
    # Finish unit translation/ unification &  values to float
    data['value'] = data['value'].apply(lambda x: float(x.replace(',', '.')) if type(x) == str else x)

    # unify units
    data['unit'] = data['unit'].replace('10 x 15 cm','100 x 150 mm')

    # mm to cm
    data['value'] = data.apply(lambda item: item['value'] / 10 if item['unit'] == 'mm' else item['value'], axis=1)
    data['unit'] = data['unit'].replace('mm','cm')
    data['value'] = pd.to_numeric(data['value'])    

    data['unit'] = data['unit'].replace(np.nan,'*')
    data['parameter'] = data['parameter'].replace(np.nan,'*')
    data['unit'] = data['unit'].apply(lambda x: get_squared(x))
    # execution order is important
    data['value'] = data.apply(lambda item: extract_width_height_from_unit_to_value(item[['unit','value']])[1], axis=1)
    data['unit'] = data.apply(lambda item: extract_width_height_from_unit_to_value(item[['unit','value']])[0], axis=1)
    data['parameter_and_unit'] = data['parameter'] + ' IN ' + data['unit']

    # parameter_and_units as single features with respective values
    # parameter_and_unit turned into one hot encoded features
    data = pd.get_dummies(data, columns=['parameter_and_unit'], prefix='', prefix_sep='')

    #  for all new "parameter with unit" columns put the value in the column where a 1 is - others are 0 and remain 0
    for column in data.columns:
        if ' IN ' in column and '*' not in column:
            data[column] = data.apply(lambda item: extract_value(item['value'], item[column]), axis=1)            

    for column in data.columns:
        # all the parameter with unit columns that contain arrays that are represeted as strings
        if (' IN ' in column) and (data[column].dtype == object):
            data[column + '_height'] = data.apply(lambda item: extract_height_width(item[column])[0], axis=1)
            data[column + '_width'] = data.apply(lambda item: extract_height_width(item[column])[1], axis=1)
            pd.to_numeric(data[column + '_height'])
            pd.to_numeric(data[column + '_width'])
            data = data.drop(column, axis=1)

    for column in data.columns:
        if (' IN ' in column):
            data[column] = data[column].replace(np.nan,0)
    

    data['country_and_unit'] = data.apply(lambda x: empty_to_nan(x['country_and_unit']), axis=1)
    data['technique'] = data['technique'].apply(lambda x: x.strip() if (type(x) == str) else x)
    ## country_unit - material - technique - location (splitting for features including multiple information)

    

    data['city_municipality'] = data.apply(lambda item: extract_city_country(item['country_and_unit'])[0], axis=1)
    data['country'] = data.apply(lambda item: extract_city_country(item['country_and_unit'])[1], axis=1)

    # material
    # to make the following work even for nan values
    data['material'] = data['material'].replace(np.nan, 'nan')
    # prepare single values to be distinguishable
    data['material'] = data['material'].apply(lambda x: x.split('>'))

    # https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list

    mlb = MultiLabelBinarizer()
    data = data.join(pd.DataFrame(mlb.fit_transform(data.pop('material')),
                            columns='material_' + mlb.classes_,
                            index=data.index))

    # technique
    # to make the following work even for nan values
    data['technique'] = data['technique'].replace(np.nan, 'nan')

    # prepare single values to be distinguishable
    data['technique'] = data['technique'].apply(lambda x: x.split('>'))

    data = data.join(pd.DataFrame(mlb.fit_transform(data.pop('technique')),
                            columns='technique_' + mlb.classes_,
                            index=data.index), rsuffix='')

    # location
    data['location_city'] = data['location'].apply(lambda x: 1 if (type(x) == str) and ('linn ' in x) else 0)
    data['location_building'] = data['location'].apply(lambda x: 1 if (type(x) == str) and ('hoone ' in x) else 0)
    data['location_street'] = data['location'].apply(lambda x: 1 if (type(x) == str) and ('tänav ' in x) else 0)
    data['location_country'] = data['location'].apply(lambda x: 1 if (type(x) == str) and ('riik ' in x) else 0)
    data['location_address'] = data['location'].apply(lambda x: 1 if (type(x) == str) and ('aadress ' in x) else 0)
    # start - end (formatting)

    data['start'] = data[['name', 'start']].apply(extract_year_from_name, axis=1)    
        
    #grouping applied to the dataframe
    data['startYear'] = data['start'].apply(year_Grouping)
    data['startMonth'] = data['start'].apply(month_Grouping)
    data['startDay'] = data['start'].apply(day_Grouping)
            
    data['endYear'] = data['end'].apply(year_Grouping)
    data['endMonth'] = data['end'].apply(month_Grouping)
    data['endDay'] = data['end'].apply(day_Grouping)

    #if there is no start year, but an end year, then the start year is set to the end year
    for i in range(1,len(data)):
        if data['startYear'].iloc[i] == 0 and data['startDay'].iloc[i] != 0:
            data['startYear'].iloc[i] = data['endYear'].iloc[i]


    #original columns are dropped as they are no longer needed
    data.drop(['start', 'end'], axis=1, inplace=True)
    ## event_type (brackets)

    data['event_type'] = data['event_type'].apply(strip_brackets)
    ## color (grouping)
    #Grouping colours by their base colour - to avoid too many extra cloumns when hot encoding -> could always reverse this step
    #by using  something like data['color'] = combined_data_translated['color'] ?

    #The base colours: red, blue, green, grey, yellow, patterned, orange, brown, white, black , pink
    #The most common/distingtive stay unchanged

    #apply colour_grouping to the dataset
    data['color'] = data['color'].apply(colour_grouping)
    ## technique - material - sizes (threshold previously encoded)

    # best found combination (local optimum on 500 estimators)
    perc = feat_percent_cut/100
    threshold_sum = len(data) * perc
    min_freq = feat_freq_cut

    tech = helpers.col_collection(data, 'technique_')
    mat = helpers.col_collection(data, 'material_')
    size = data.columns[data.columns.str.contains('IN')]

    features = [tech,mat,size]

    for feat in features:
        frequencies = {}
        for col in feat:
            frequencies[col] = data[col].sum()
        frequencies = dict(sorted(frequencies.items(), key=lambda item: item[1], reverse=True))
        instance_sum = 0
        for col in frequencies:
            frequency = frequencies[col]
            #if instance_sum > threshold_sum or frequency < min_freq:
            if frequency < min_freq:
                data.drop(columns=[col], inplace=True)
            instance_sum += frequency

            
    ## hot encoding & thresholding
    # categorical columns
    # already encoded
    # material, technique, unit, size, value

    cols = ['musealia_additional_nr', 'collection_mark', 'musealia_mark', 'museum_abbr', 'before_Christ', 'is_original', 'class', 'parish', 'state',  'event_type', 'participants_role', 'parish', 'color', 'collection_additional_nr', 'damages', 'participant', 'location', 'name', 'commentary', 'text', 'legend', 'initial_info', 'additional_text', 'country', 'city_municipality']

    text_features = ['name', 'commentary', 'text', 'legend', 'initial_info', 'additional_text']
    for col in cols:
        data[col] = data[col].fillna('nan')
        instance_sum = 0
        val_counts = data[col].value_counts()
        values_to_group = []
        for idx, name in enumerate(val_counts.index):
            frequency = val_counts[idx]
            if instance_sum > threshold_sum or frequency < min_freq:
                values_to_group.append(name)

            instance_sum += frequency
        data[col] = data[col].apply(lambda x: 'uncommon' if (x in values_to_group) else x)

    # one hot encoding
    data = pd.get_dummies(data, columns=cols)
        
    ## Delete unneeded features


    data.drop(columns=['full_nr','country_and_unit','parameter','unit','value'], inplace=True)

    ## continous numeric features (nan -> 0)
    data = data.replace(np.nan, 0)
    ## rename for xgboost (cant deal with <>[] in feature names)
    for i in data.columns:
        if '>' in i:
            data.rename(columns={i:i.replace('>','')}, inplace=True)
        if '<' in i:
            data.rename(columns={i:i.replace('<','')}, inplace=True)
        if ']' in i:
            data.rename(columns={i:i.replace(']','')}, inplace=True)
        if '[' in i:
            data.rename(columns={i:i.replace('[','')}, inplace=True)

    # resplit test/train
    train = data.loc[data['source']=='train'].drop('source',axis=1)

    # modify types
    train['type'] = train['type'].replace('fotonegatiiv, fotonegatiiv', 'fotonegatiiv')
    

    # resplit test/train
    train, val = train_test_split(train, test_size=0.3, random_state=0)
    test = data.loc[data['source']=='test'].drop('source',axis=1)

    return train, val, test

#function to have resamplers resample to specific number of samples per class
def by_num(y, min_samples):
    b = Counter(y).values()
    a = Counter(y).keys()
    a = list(a)
    b = list(b)

    if min_samples > max(b):
        min_samples = max(b)

    for i in range(len(a)):
        if b[i] < min_samples :
            b[i] = min_samples
    return dict(zip(a, b))

#function to have resamplers resample to specific number of samples per class
def by_perc(y, increase_perc):
    a = Counter(y).keys()
    b = Counter(y).values()
    a = list(a)
    b = list(b)

    max_samples = max(b)

    for i in range(len(b)):
        new_samples = int(b[i] * (1 + increase_perc/100))
        if new_samples > max_samples:
            b[i] = max_samples
        else:
            b[i] = new_samples
    return dict(zip(a, b))


def rebalancing(X, y, reb_method, strategy, by_value):

    if strategy == 'perc':
        sampling_strategy = by_perc
    else:
        sampling_strategy = by_num
    
    if reb_method == 'smote':
        balancer = SMOTE(sampling_strategy=sampling_strategy(y,by_value), random_state=0)
    elif reb_method == 'ros':
        balancer = RandomOverSampler(sampling_strategy=sampling_strategy(y,by_value), random_state=0)
    else:
        return X, y

    X_res, y_res = balancer.fit_resample(X, y)

    return X_res, y_res


In [45]:
 # note that we define values from `wandb.config` instead 
# of defining hard values 
min_samples_split = 5
max_depth = 100
min_samples_leaf = 1
n_estimators = 100
max_features = 'sqrt'
criterion = 'gini'
feat_percent_cut = 84
feat_freq_cut = 15
reb_method = 'ros'
rebalance = ('num',100)
class_weight = None

# -------------------------- data prep code  -------------------------------------

print('data prep')
train, val, test = get_data(feat_percent_cut=feat_percent_cut, feat_freq_cut=feat_freq_cut)

print('balancing')
print(rebalance)
strategy, by_value = rebalance
print(strategy, by_value)

#val = val_est_prepared.copy()

X_train = train.drop('type', axis=1)
y_train = train.type

""""
X_val = val.drop('type', axis=1)
y_val = val.type
"""



data prep


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['startYear'].iloc[i] = data['endYear'].iloc[i]


105 columns found that start with technique_
111 columns found that start with material_
balancing
('num', 100)
num 100


'"\nX_val = val.drop(\'type\', axis=1)\ny_val = val.type\n'

In [46]:

label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(y_train)

y_train = label_encoder.transform(y_train)
#y_val = label_encoder.transform(y_val)

In [49]:

# -------------------------- usual training code starts here  -------------------------------------
print('training')

rfc = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_leaf=min_samples_leaf,\
        max_features=max_features, min_samples_split=min_samples_split, class_weight=class_weight, random_state=0)


skf = StratifiedKFold(n_splits=4)

val_acc = []
val_f1_macro = []

for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    print('fold', i)
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

    # replace uncommon types
    unique, counts = np.unique(y_train_fold, return_counts=True)
    # 6 to have 5 samples per class left for standard knn in smote
    # -> uncommon classes become 100
    for i in np.argwhere(counts < 6):
        y_train_fold[y_train_fold == i[0]] = 100

    X_train_fold, y_train_fold = rebalancing(X_train_fold, y_train_fold, reb_method=reb_method, strategy=strategy, by_value=by_value)

    rfc.fit(X_train_fold, y_train_fold)

    y_pred = rfc.predict(X_test_fold)
    val_acc.append(accuracy_score(y_test_fold, y_pred))
    val_f1_macro.append(f1_score(y_test_fold, y_pred, average='macro'))

crossval_acc = np.mean(val_acc)
crossval_f1_macro = np.mean(val_f1_macro)
print('crossval_acc', crossval_acc)
print('crossval_f1_macro', crossval_f1_macro)

training
fold 0
fold 1
fold 2
fold 3
crossval_acc 0.8860204081632652
crossval_f1_macro 0.6292118932638248
