In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


import xgboost as xgb

In [None]:
data = pd.read_csv('../input/glass/glass.csv')

In [None]:
data.info()

In [None]:
data['Type'].value_counts()

In [None]:
data.isnull().sum()

In [None]:
# so we have a data set that's clean already and we want to build a model that predicts the glass type
X, y = data.drop(columns = 'Type'), data['Type']

In [None]:
label_cnts = y.value_counts().to_dict()
label_cnts

In [None]:
new_y_label_for_xgb = {label: idx for idx, label in enumerate(label_cnts.keys())}
new_y_label_for_xgb

In [None]:
# Re-encode the original glass types as the 0..k-1 ids built in new_y_label_for_xgb.
# Series.map returns a new Series, so `y` itself is left untouched.
y_2 = y.map(new_y_label_for_xgb)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y_2, 
                                                    train_size=0.7, 
                                                    stratify = y,
                                                    shuffle=True, 
                                                    random_state=1)

In [None]:
y_train.value_counts() / len(y_train)

In [None]:
y_test.value_counts() / len(y_test)

In [None]:
# Because the data set is so small, it is worth using xgboost's built-in cross-validation to choose the number of boosting rounds

### approaches that I will be testing

#### multi-classification approaches
- simple model
- upsampling all observations to the class with the highest number of observations
- downsampling all observations to the class with the lower number of observations
- both approaches, based on the median difference in observations between all combinations of classes -> so some classes will be downsampled and others upsampled
- using a loss function that accounts for the error -> is it possible to use the loss function? try and see!

#### OVR
- simple model for each class -> prediction with the highest prob is the class to predict
- upsampling -> each model -> and then see predictive accuracy
- downsampling -> each model -> and then see predictive accuracy
- both upsampling and downsampling -> and then seeing predictive accuracy
- using a loss function 

In [None]:
train_d_matrix = xgb.DMatrix(X_train, label = y_train)
test_d_matrix = xgb.DMatrix(X_test, label = y_test)

In [None]:
xgb_params = {'nfolds' : 5, 'num_boost_round' : 100,'early_stopping_rounds' : 5,}
xgb_gen_params = {'objective' : 'multi:softprob',
                  'num_class' : 6,
                  'eval_metric' : 'mlogloss'}

In [None]:
def carry_out_training(train_d_matrix, params=None, cv_params=None):
    """Choose the number of boosting rounds by cross-validation, then fit on all of the data.

    Parameters
    ----------
    train_d_matrix : xgb.DMatrix
        Labelled training data; used both for the cross-validation and for the final fit.
    params : dict, optional
        Booster parameters forwarded to ``xgb.cv`` and ``xgb.train``. When omitted, the
        notebook-level ``xgb_gen_params`` is used as it is bound at call time.
    cv_params : dict, optional
        Must contain ``'nfolds'``, ``'num_boost_round'`` and ``'early_stopping_rounds'``.
        When omitted, the notebook-level ``xgb_params`` is used as it is bound at call time.

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    # Both globals are rebound to binary-classification settings further down the
    # notebook; pass the dictionaries explicitly when re-running cells out of order.
    booster_params = xgb_gen_params if params is None else params
    cv_settings = xgb_params if cv_params is None else cv_params

    cvresult = xgb.cv(booster_params,
                      train_d_matrix,
                      num_boost_round=cv_settings['num_boost_round'],
                      nfold=cv_settings['nfolds'],
                      verbose_eval=True,
                      early_stopping_rounds=cv_settings['early_stopping_rounds'])

    # One row per boosting round kept by xgb.cv, so the row count is the round budget
    # used for the final model.
    num_boost_round = cvresult.shape[0]

    model = xgb.train(dtrain=train_d_matrix,
                      params=booster_params,
                      num_boost_round=num_boost_round,
                      verbose_eval=True)
    return model

In [None]:
model = carry_out_training(train_d_matrix)

In [None]:
def _score_predictions(y_true, y_pred, class_ids):
    """Return macro-F1, accuracy and a confusion matrix with a per-class recall column."""
    # labels= pins the matrix to len(class_ids) x len(class_ids) even when a class never
    # shows up in y_true or y_pred; without it the column assignment below can fail.
    counts = confusion_matrix(y_true, y_pred, labels=class_ids)
    conf_matrix = pd.DataFrame(counts, columns=class_ids)

    # recall = diagonal / row total; rows with no true samples stay NaN instead of 0/0.
    row_totals = counts.sum(axis=1)
    recall = np.full(len(class_ids), np.nan)
    np.divide(counts.diagonal(), row_totals, out=recall, where=row_totals > 0)
    conf_matrix['recall'] = recall.round(4)

    return {'f1': f1_score(y_true, y_pred, average='macro'),
            'acc': accuracy_score(y_true, y_pred),
            'conf': conf_matrix}


def get_performance_metrics(model, test_d_matrix, y_train):
    """Score a multi-class booster on ``test_d_matrix`` under two decision rules.

    Parameters
    ----------
    model
        Object whose ``predict(test_d_matrix)`` returns one probability column per class.
    test_d_matrix
        Test data; ``get_label()`` must return the integer-coded true classes.
    y_train : pd.Series
        Labels the model was trained on. Their class frequencies give the prior that the
        second decision rule subtracts (on the logit scale). Every class id must occur in
        it, otherwise the prior lookup raises ``KeyError``.

    Returns
    -------
    (perf, perf_logit_class) : tuple of dict
        Each dict has ``'f1'`` (macro F1), ``'acc'`` (accuracy) and ``'conf'`` (confusion
        matrix DataFrame with an extra ``'recall'`` column). ``perf`` predicts the class
        with the highest probability; ``perf_logit_class`` predicts the class whose logit
        exceeds its training-prior logit by the largest amount.
    """
    class_probs = model.predict(test_d_matrix)
    # Take the class ids from the prediction width rather than hard-coding six classes.
    dep_vars = list(range(class_probs.shape[1]))

    preds = pd.DataFrame(class_probs, columns=dep_vars)
    # get_label() hands back floats; cast so targets and predictions share one integer type.
    preds['TARGET'] = test_d_matrix.get_label().astype(int)

    # Rule 1: plain arg-max over the class probabilities.
    preds['pred_max_score'] = preds.loc[:, dep_vars].max(axis=1)
    preds['PRED'] = preds.loc[:, dep_vars].idxmax(axis=1)
    perf = _score_predictions(preds['TARGET'], preds['PRED'], dep_vars)

    # Rule 2: compare each class logit with the logit of its training frequency.
    unconditional_probs = y_train.value_counts() / len(y_train)
    unconditional_odds = unconditional_probs / (1 - unconditional_probs)
    unconditional_logits = np.log(unconditional_odds)

    class_prob_frame = preds[dep_vars]
    logits = np.log(class_prob_frame / (1 - class_prob_frame))
    # .loc[dep_vars] orders the priors like the probability columns before broadcasting.
    logit_vs_uncond = logits - unconditional_logits.loc[dep_vars].to_numpy()
    preds['PRED_2'] = np.argmax(logit_vs_uncond.to_numpy(), axis=1)
    perf_logit_class = _score_predictions(preds['TARGET'], preds['PRED_2'], dep_vars)

    return perf, perf_logit_class

In [None]:
perf, perf_logit_class = get_performance_metrics(model, test_d_matrix, y_train)

In [None]:
perf

In [None]:
results = {}
results['baseline'] = perf
results['baseline-logit-approach'] = perf_logit_class
results

## now for upsampling all to have the same number of records as the class with the highest obs

In [None]:
# Rows per class in the training split. rename_axis/reset_index(name=...) names both
# columns explicitly, so the result is ['target', 'n'] regardless of how the installed
# pandas version names the output of value_counts().reset_index().
class_labels = y_train.value_counts().rename_axis('target').reset_index(name='n')
class_labels

In [None]:
class_labels = class_labels.sort_values('n',ascending = False).reset_index(drop=True)
class_labels

In [None]:
(target_max, n_max) = class_labels.loc[0,['target', 'n']]
class_labels['samples_to_add'] = n_max - class_labels['n'] 
class_labels

In [None]:
X_train_new_data = []
y_train_new_data = []

# For every class keep all of its rows and, where it is short of the largest class,
# top it up with rows drawn with replacement from that same class.
for target, samples_to_add in zip(class_labels['target'], class_labels['samples_to_add']):
    class_rows = y_train[y_train == target]
    row_labels = class_rows.index.to_list()
    if samples_to_add > 0:
        row_labels = row_labels + class_rows.sample(samples_to_add,
                                                    replace=True,
                                                    random_state=10).index.to_list()
    X_train_new_data.append(X_train.loc[row_labels].copy())
    y_train_new_data.append(y_train.loc[row_labels].copy())

In [None]:
list(map(lambda x: (x.shape[0], x.shape[1]), X_train_new_data))

In [None]:
list(map(len, y_train_new_data))

In [None]:
X_train_new_data = pd.concat(X_train_new_data, axis = 0)
print(X_train_new_data.shape)

In [None]:
y_train_new_data = pd.concat(y_train_new_data, axis = 0)
y_train_new_data.value_counts()

In [None]:
print(len(y_train_new_data))
print(len(X_train_new_data))

In [None]:
X_train_new_data.shape

In [None]:
y_train_new_data.shape

In [None]:
train_d_matrix = xgb.DMatrix(X_train_new_data, label = y_train_new_data)

In [None]:
model = carry_out_training(train_d_matrix)

In [None]:
perf, perf_logit_class = get_performance_metrics(model, test_d_matrix, y_train_new_data)

In [None]:
perf

In [None]:
perf_logit_class

In [None]:
results['upsampling'] = perf
results['upsampling-logit-approach'] = perf_logit_class
results

In [None]:
# interesting how the upsampling logit approach now matches the plain upsampling approach

## now for downsampling every class to the size of the class with the lowest number of observations! this will be bad

### approach find class with the lowest number

In [None]:
class_labels = class_labels.sort_values('n',ascending = True).reset_index(drop=True)
class_labels

In [None]:
(target_min, n_min) = class_labels.loc[0,['target', 'n']]
class_labels

In [None]:
X_train_new_data, y_train_new_data = [], []

# Keep exactly n_min rows of every class, drawn without replacement.
for target in class_labels['target']:
    class_rows = y_train[y_train == target]
    to_select = class_rows.sample(n_min, replace=False, random_state=10).index.to_list()
    X_train_new_data.append(X_train.loc[to_select].copy())
    y_train_new_data.append(y_train.loc[to_select].copy())

In [None]:
list(map(lambda x: (x.shape[0], x.shape[1]), X_train_new_data))

In [None]:
list(map(len, y_train_new_data))

In [None]:
X_train_new_data = pd.concat(X_train_new_data, axis = 0)
print(X_train_new_data.shape)
y_train_new_data = pd.concat(y_train_new_data, axis = 0)
print(y_train_new_data.value_counts())

train_d_matrix = xgb.DMatrix(X_train_new_data, label = y_train_new_data)

In [None]:
model = carry_out_training(train_d_matrix)
perf, perf_logit_class = get_performance_metrics(model, test_d_matrix, y_train_new_data)
results['downsampling'] = perf
results['downsampling-logit-approach'] = perf_logit_class
results

## now for both find median between the all cases

In [None]:
class_labels = class_labels.sort_values('n',ascending = False).reset_index(drop=True)
class_labels

In [None]:
differences = {}
for i in range(1, 6):
        differences[i] = np.floor((class_labels.loc[0, 'n'] -  class_labels.loc[i, 'n']) / 2)

In [None]:
differences

In [None]:
to_dec_big_target = int(np.min([val for targ, val in differences.items()]))
to_dec_big_target

In [None]:
target_biggest, target_big_n = class_labels.loc[0, ['target', 'n']]

In [None]:
target_big_n

In [None]:
# i.e. increase everything by 16 and decrease the major class by 16 only once!

In [None]:
X_train_new_data = []
y_train_new_data = []

# Upsample every class except the largest by half of its gap to the largest class.
# Classes are taken by label from class_labels: a row position in that frame is not
# guaranteed to equal the class label, and the largest class is not guaranteed to be 0.
for target, n_rows in zip(class_labels['target'], class_labels['n']):
    if target == target_biggest:
        # the largest class is downsampled separately
        continue
    to_inc = int((target_big_n - n_rows) // 2)
    class_rows = y_train[y_train == target]
    target_existing = class_rows.index.to_list()
    target_to_add = class_rows.sample(to_inc, replace=True, random_state=10).index.to_list()

    y_train_selected = y_train.loc[target_existing + target_to_add].copy()
    X_train_selected = X_train.loc[target_existing + target_to_add].copy()
    X_train_new_data.append(X_train_selected)
    y_train_new_data.append(y_train_selected)

In [None]:
target_to_keep = y_train[y_train == target_biggest].sample(target_big_n - to_dec_big_target, replace=False, random_state = 10).index.to_list()
len(target_to_keep)

In [None]:
y_train_selected = y_train.loc[target_to_keep].copy()
X_train_selected = X_train.loc[target_to_keep].copy()
X_train_new_data.append(X_train_selected)
y_train_new_data.append(y_train_selected)

In [None]:
list(map(lambda x: (x.shape[0], x.shape[1]), X_train_new_data))

In [None]:
X_train_new_data = pd.concat(X_train_new_data, axis = 0)
print(X_train_new_data.shape)
y_train_new_data = pd.concat(y_train_new_data, axis = 0)
print(y_train_new_data.value_counts())

In [None]:
train_d_matrix = xgb.DMatrix(X_train_new_data, label = y_train_new_data)

In [None]:
model = carry_out_training(train_d_matrix)
perf, perf_logit_class = get_performance_metrics(model, test_d_matrix, y_train_new_data)
results['both'] = perf
results['both-logit-approach'] = perf_logit_class
results

### so now do both but do it repeatedly until all classes have the same observations

i.e. in round one, find the class with the highest number of records -> then, for each of the other classes, find the midpoint between it and that largest class - that midpoint gap is the number of observations to add to that minority class

for the majority class, reduce only once, by the midpoint gap between it and the second-largest class


then repeat the above until all classes have the same number of observations

In [None]:
# Rows per class, largest first. rename_axis/reset_index(name=...) names both columns
# explicitly, so the frame is ['target', 'n'] regardless of how the installed pandas
# version names the output of value_counts().reset_index().
class_labels = y_train.value_counts().rename_axis('target').reset_index(name='n')
class_labels = class_labels.sort_values('n', ascending=False).reset_index(drop=True)
class_labels

In [None]:
target_biggest, target_big_n = class_labels.loc[0, ['target', 'n']]
rount_nbr = 0
# Address rows by their position in class_labels (sorted largest first), not by class
# label: .loc with a label that is not an existing row label would add a new row.
for row in range(1, len(class_labels)):
    class_labels.loc[row, f'round__{rount_nbr}__inc'] = int((target_big_n - class_labels.loc[row, 'n'])/2)

# The largest class shrinks by exactly what the runner-up gains.
class_labels.loc[0, f'round__{rount_nbr}__inc'] = -class_labels.loc[1, f'round__{rount_nbr}__inc']
class_labels[f'round__{rount_nbr}'] = class_labels['n'] + class_labels[f'round__{rount_nbr}__inc']
class_labels

In [None]:
class_labels_round__0 = class_labels.groupby(['round__0'])['target'].apply(list).reset_index().sort_values('round__0', ascending=False).reset_index(drop=True)
class_labels_round__0 = class_labels_round__0[['target', 'round__0']]
class_labels_round__0

In [None]:
def do_both_get_numbers(df, current_nbr, rount_nbr):
    """Add one balancing round to ``df`` in place and return the same frame.

    Row 0 is treated as the largest group. Every other row gets an increment of half its
    gap to row 0 (truncated to an integer); row 0 gets the negated increment of row 1.

    :param df: frame with a ``'target'`` column and the column named by ``current_nbr``;
        rows are addressed by the labels 0..len(df)-1
    :param current_nbr: name of the column holding the current group sizes
    :param rount_nbr: round number used to name the new columns
        ``round__<n>__inc`` (the increment) and ``round__<n>`` (the size after the round)
    :return: ``df`` itself, with the two new columns added
    """
    inc_col = f'round__{rount_nbr}__inc'
    size_col = f'round__{rount_nbr}'

    _, largest_size = df.loc[0, ['target', current_nbr]]

    for row_label in range(1, len(df)):
        half_gap = (largest_size - df.loc[row_label, current_nbr]) / 2
        df.loc[row_label, inc_col] = int(half_gap)

    # the largest group gives up what the second row gains
    df.loc[0, inc_col] = -df.loc[1, inc_col]
    df[size_col] = df[current_nbr] + df[inc_col]
    return df

In [None]:
round_1 = do_both_get_numbers(class_labels_round__0, current_nbr='round__0', rount_nbr=1)
round_1

In [None]:
# Collapse groups that ended round 1 with the same size into a single row.
# NOTE(review): 'target' already holds lists at this point, so apply(list) produces
# lists of lists rather than a flat list of class labels.
class_labels_round__1 = round_1.groupby(['round__1'])['target'].apply(list).reset_index().sort_values('round__1', ascending=False).reset_index(drop=True)
# NOTE(review): this selects from class_labels_round__0 (which do_both_get_numbers
# extended in place with 'round__1'), so the groupby result assigned above is discarded.
# Possibly a typo for class_labels_round__1 - confirm the intent.
class_labels_round__1 = class_labels_round__0[['target', 'round__1']]
class_labels_round__1

In [None]:
# Merge groups that reached the same size in round 1: one output row per distinct size,
# with 'target' holding the flattened list of class labels in that group.
round__1_rows = []
# Group by a scalar key (not a one-element list) so `group` is the plain size value
# rather than a 1-tuple on newer pandas versions.
for group, df in round_1.groupby('round__1')['target']:
    # each element of `df` is itself a list of labels, so flatten one level
    round__1 = [var for vars2 in df.to_list() for var in vars2]
    print(round__1)
    round__1_rows.append({'round__1': group, 'target': round__1})

# DataFrame.append no longer exists in pandas 2.x; build the frame once from the records.
class_labels_round__1 = pd.DataFrame(round__1_rows, columns=['round__1', 'target'])

In [None]:
class_labels_round__1
class_labels_round__1 = class_labels_round__1.sort_values('round__1', ascending=False).reset_index(drop=True)
class_labels_round__1 = class_labels_round__1[['target', 'round__1']]
class_labels_round__1


In [None]:
# the above you repeat - this is quite a lot of work and doesn't achieve much I don't think!
round_2 = do_both_get_numbers(class_labels_round__1, current_nbr='round__1', rount_nbr=2)
round_2

In [None]:
# Merge groups that reached the same size in round 2: one output row per distinct size,
# with 'target' holding the flattened list of class labels in that group.
round__2_rows = []
# Group by a scalar key (not a one-element list) so `group` is the plain size value
# rather than a 1-tuple on newer pandas versions.
for group, df in round_2.groupby('round__2')['target']:
    # each element of `df` is itself a list of labels, so flatten one level
    round__2 = [var for vars2 in df.to_list() for var in vars2]
    round__2_rows.append({'round__2': group, 'target': round__2})

# DataFrame.append no longer exists in pandas 2.x; build the frame once from the records.
class_labels_round__2 = pd.DataFrame(round__2_rows, columns=['round__2', 'target'])

# Largest group first, with a clean 0..k-1 index for the next round.
class_labels_round__2 = class_labels_round__2.sort_values('round__2', ascending=False).reset_index(drop=True)
class_labels_round__2 = class_labels_round__2[['target', 'round__2']]
class_labels_round__2
    

In [None]:
# the above you repeat - this is quite a lot of work and doesn't achieve much I don't think!
round_3 = do_both_get_numbers(class_labels_round__2, current_nbr='round__2', rount_nbr=3)
round_3

In [None]:
# so I don't like the approach that I've set it up as! I think this is too complicated!

In [None]:

def up_plus_down_sampling_herlper(df):
    """Run one combined up/down-sampling pass over ``df`` and return the resampled frame.

    For every class in ``df['TARGET']`` the gap to the largest class is halved (floor).
    Classes whose halved gap is zero (the largest class, and any class within one row of
    it) are downsampled without replacement by the smallest positive halved gap. All other
    classes keep their rows and gain ``halved gap`` extra rows drawn with replacement.
    If every class already has a zero halved gap, no class changes size.

    :param df: DataFrame with a ``'TARGET'`` column; labels may be of any hashable type
    :return: new DataFrame with a fresh 0..n-1 index; ``df`` is not modified
    """
    # rename_axis/reset_index(name=...) names both columns explicitly, so the frame is
    # ['target', 'n'] whatever the installed pandas calls the value_counts() output.
    class_labels = (df['TARGET'].value_counts()
                    .rename_axis('target')
                    .reset_index(name='n')
                    .sort_values('n', ascending=False)
                    .reset_index(drop=True))
    if class_labels.empty:
        # no labelled rows, hence nothing to balance
        return df.reset_index(drop=True)

    largest_n = int(class_labels.loc[0, 'n'])
    # Plain dict lookup instead of a quoted query string, which only matched string labels.
    class_sizes = {target: int(n) for target, n in zip(class_labels['target'], class_labels['n'])}
    differences = {target: (largest_n - n) // 2 for target, n in class_sizes.items()}

    print(differences)

    # Amount removed from the big class(es); 0 when the classes are already level.
    positive_gaps = [val for val in differences.values() if val > 0]
    to_dec_big_target = min(positive_gaps) if positive_gaps else 0

    df_mod_new = []
    for target, n in class_sizes.items():
        class_rows = df[df['TARGET'] == target]
        to_inc = differences[target]
        if to_inc == 0:
            # this group is downsampled
            df_mod_new_selection = class_rows.sample(n - to_dec_big_target,
                                                     replace=False,
                                                     random_state=10)
            print(f'target = {target} downsampled')
        else:
            # this group is upsampled: all existing rows plus to_inc resampled ones
            extra_rows = class_rows.sample(to_inc, replace=True, random_state=10)
            df_mod_new_selection = pd.concat([class_rows, extra_rows], axis=0)
            print(f'target = {target} upsampled')
        df_mod_new.append(df_mod_new_selection)

    print(list(map(lambda x: (x.shape[0], x.shape[1]), df_mod_new)))
    df_mod_new = pd.concat(df_mod_new, axis=0).reset_index(drop=True)
    return df_mod_new

#df_mod_pass_1 = up_plus_down_sampling_herlper(df_mod_2)
#df_mod_pass_2 = up_plus_down_sampling_herlper(df_mod_pass_1)


### using weighting on the objective function

In [None]:
class Weight_Binary_Cross_Entropy:
    """Binary cross-entropy objective whose per-row weight is ``imbalance_alpha ** label``."""

    def __init__(self, imbalance_alpha):
        """
        :param imbalance_alpha: base of the per-row weight; a row labelled 1 is weighted
            by ``imbalance_alpha`` and a row labelled 0 by 1
        """
        self.imbalance_alpha = imbalance_alpha

    def weighted_binary_cross_entropy(self, pred, dtrain):
        """Return the gradient and hessian of the weighted loss.

        :param pred: raw scores; they are passed through a sigmoid here
        :param dtrain: object whose ``get_label()`` returns the 0/1 labels
        :return: ``(grad, hess)``
        """
        y_true = dtrain.get_label()
        prob = 1.0 / (1.0 + np.exp(-pred))

        # per-row weight, shared by both derivatives
        row_weight = self.imbalance_alpha ** y_true

        grad = -row_weight * (y_true - prob)
        hess = row_weight * prob * (1.0 - prob)
        return grad, hess

    
def evalerror(preds, dtrain):
    """Custom evaluation metric: share of rows whose thresholded score disagrees with the label.

    :param preds: scores, thresholded at 0 (treated as margins before the logistic
        transformation)
    :param dtrain: object whose ``get_label()`` returns the true labels
    :return: the pair ``('error', error_rate)``
    """
    labels = dtrain.get_label()
    # a score above 0 counts as a positive prediction
    predicted_positive = preds > 0.0
    n_wrong = np.count_nonzero(labels != predicted_positive)
    return 'error', float(n_wrong) / len(labels)

In [None]:
# 0/1 indicator for class 1. The parentheses matter: `[mask] * 1` only repeats a
# one-element list and leaves the booleans unconverted.
(y_train == 1) * 1
    

In [None]:
# One-vs-rest target for class 0: 1 where the row belongs to class 0, otherwise 0.
y_train_2 = np.asarray(y_train == 0).astype(int)
y_train_2

In [None]:
train_d_matrix = xgb.DMatrix(X_train, label = y_train_2)

weighted_loss_obj = Weight_Binary_Cross_Entropy(imbalance_alpha=1)
    
    
xgb_params = {'nfolds' : 5, 
              'num_boost_round' : 100,
              'early_stopping_rounds' : 5}

xgb_gen_params = {'eval_metric' : 'logloss',
                  'objective': 'binary:logitraw'}


In [None]:
cvresult = xgb.cv(xgb_gen_params, 
                      train_d_matrix, 
                      num_boost_round=xgb_params['num_boost_round'],
                      nfold=xgb_params['nfolds'],
                      obj = weighted_loss_obj.weighted_binary_cross_entropy,
                      feval=evalerror,
                      verbose_eval = True, 
                      early_stopping_rounds=xgb_params['early_stopping_rounds'])


In [None]:
def carry_out_training_vers2(X_train, y_train, predicted_class, imbalance_alpha):
    """Fit a one-vs-rest booster for ``predicted_class`` with the weighted cross-entropy objective.

    The number of boosting rounds is chosen with 5-fold ``xgb.cv`` (at most 100 rounds,
    early stopping after 5 rounds); the final model is then trained on all of the data.

    :param X_train: training features
    :param y_train: multi-class training labels
    :param predicted_class: the class treated as positive (1); every other class becomes 0
    :param imbalance_alpha: weight handed to ``Weight_Binary_Cross_Entropy``
    :return: the booster returned by ``xgb.train``; it was fitted on raw scores, so
        callers apply the sigmoid themselves
    """
    # 1 where the row belongs to predicted_class, otherwise 0
    y_train_2 = np.asarray(y_train == predicted_class).astype(int)
    print(y_train_2.sum())
    train_d_matrix = xgb.DMatrix(X_train, label=y_train_2)

    objective_fn = Weight_Binary_Cross_Entropy(imbalance_alpha=imbalance_alpha).weighted_binary_cross_entropy

    booster_params = {'eval_metric': 'logloss',
                      'objective': 'binary:logitraw'}

    # keyword arguments shared by the cross-validation and the final fit
    shared_kwargs = {'obj': objective_fn,
                     'feval': evalerror,
                     'verbose_eval': True}

    cvresult = xgb.cv(booster_params,
                      train_d_matrix,
                      num_boost_round=100,
                      nfold=5,
                      early_stopping_rounds=5,
                      **shared_kwargs)

    # one row per boosting round kept by the cross-validation
    model = xgb.train(dtrain=train_d_matrix,
                      params=booster_params,
                      num_boost_round=cvresult.shape[0],
                      **shared_kwargs)
    return model

In [None]:
# Train one one-vs-rest booster per class, stored under the key 'model__<class id>'.
all_models = {}
imbalance_alpha = 1
for predicted_class in range(6):

    # Pass the variable rather than a literal so that changing imbalance_alpha above
    # actually reaches the training call.
    all_models[f'model__{predicted_class}'] = carry_out_training_vers2(X_train,
                                                                       y_train,
                                                                       predicted_class=predicted_class,
                                                                       imbalance_alpha=imbalance_alpha)

In [None]:
pred_probs = []

for predicted_class in range(6):
    pred = all_models[f'model__{predicted_class}'].predict(test_d_matrix)
    pred = 1. / (1. + np.exp(-pred))
    pred_probs.append(pd.Series(pred))

In [None]:
pred_probs_df = pd.concat(pred_probs, axis=1)

In [None]:
pred_probs_df

In [None]:
pred_probs_df[list(range(6))].to_numpy().shape

In [None]:
pred_probs_df['PRED'] = np.argmax(pred_probs_df[list(range(6))].to_numpy(), axis=1)
pred_probs_df['PRED'].value_counts()

In [None]:
model_f1 = f1_score(y_test, pred_probs_df['PRED'], average = 'macro')
model_f1

In [None]:
model_acc = accuracy_score(y_test, pred_probs_df['PRED'])
model_acc

In [None]:
# labels= pins the matrix to 6 x 6 even if some class is never predicted or never
# appears in y_test; without it the six column names below may not fit.
conf_matrix = pd.DataFrame(confusion_matrix(y_test, pred_probs_df['PRED'], labels=[0, 1, 2, 3, 4, 5]),
                           columns = [0, 1, 2, 3, 4, 5])
conf_matrix

In [None]:
for idx in range(6):
    conf_matrix.loc[idx, 'recall'] = conf_matrix.loc[idx, idx] / conf_matrix.loc[idx, ].sum()
conf_matrix['recall'] = conf_matrix['recall'].round(4)

{'f1' : model_f1, 'acc' : model_acc, 'conf' : conf_matrix}


### focal loss using lightgbm

In [None]:
import lightgbm

In [None]:
# https://maxhalford.github.io/blog/lightgbm-focal-loss/#first-order-derivative