# COMP47590: Advanced Machine Learning
# Assignment 1: Multi-label Classification

Name(s): Li Weijing

Student Number(s): 19204246

## Import Packages Etc

In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn import metrics
import joblib
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

from imblearn.under_sampling import RandomUnderSampler
import random
import warnings
warnings.filterwarnings('ignore')
# import other useful packages

## Task 0: Load the Yeast Dataset

In [105]:
# load data from csv
dataset = pd.read_csv('yeast.csv')
dataset.head()

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0.004168,-0.170975,-0.156748,-0.142151,0.058781,0.026851,0.197719,0.04185,0.066938,-0.056617,...,0,0,1,1,0,0,0,1,1,0
1,-0.103956,0.011879,-0.098986,-0.054501,-0.00797,0.049113,-0.03058,-0.077933,-0.080529,-0.016267,...,0,0,0,0,0,0,0,0,0,0
2,0.509949,0.401709,0.293799,0.087714,0.011686,-0.006411,-0.006255,0.013646,-0.040666,-0.024447,...,0,0,0,0,0,0,0,1,1,0
3,0.119092,0.004412,-0.002262,0.072254,0.044512,-0.051467,0.074686,-0.00767,0.079438,0.062184,...,0,0,0,0,0,0,0,0,0,0
4,0.042037,0.007054,-0.069483,0.081015,-0.048207,0.089446,-0.004947,0.064456,-0.133387,0.068878,...,1,1,0,0,0,0,0,0,0,0


In [106]:
X = dataset.iloc[:,:-14]
y = dataset.iloc[:,-14:]
y.head()

Unnamed: 0,Class1,Class2,Class3,Class4,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0,0,0,0,0,0,1,1,0,0,0,1,1,0
1,0,0,1,1,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0,0,1,1,0
3,0,0,1,1,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,1,1,0,0,0,0,0,0,0,0


In [107]:
X_train, X_test, y_train, y_test \
    = train_test_split(X,y,random_state=0,train_size = 0.7)

## Task 1: Implement the Binary Relevance Algorithm

In [108]:
# Write your code here
class Binary_Relevance():
    def __init__(self, BaseEstimator="decision tree",undersampling = False,print_ = True):
        """Binary Relevance for multi-label classification
        Use train_fit to train model, and use predict_model to predict
        Parameters
        ----------
        BaseEstimator : 'decision tree' or 'logistic regression' or 'SVM'.
        undersampling : True or False
        print_ : print process or not
        -------
        """
        self.BaseEstimator = BaseEstimator
        self.feature_num = 0
        self.undersampling = undersampling
        self.print_ = print_
        
    def train_fit(self,X,y):
        if self.print_==True:
            print("Use %s to train"%self.BaseEstimator)
        self.feature_num = len(y.iloc[0,:])
        for i in range(0,self.feature_num):
            if self.undersampling==False:
                y_slice = y.iloc[:,i]
                X_slice = X
                #print(y_slice.value_counts())
            elif self.undersampling==True:
                X_slice,y_slice=RandomUnderSampler(random_state=0).fit_resample(X, y.iloc[:,i])
                #print(y_slice.value_counts())
            if self.BaseEstimator == 'SVM':
                model = BaggingClassifier(base_estimator=SVC()).fit(X_slice,y_slice)
                joblib.dump(model, "model_store/task1_%s.m"%(str(i)))
            elif self.BaseEstimator == 'logistic regression':
                model = BaggingClassifier(base_estimator=LogisticRegression()).fit(X_slice,y_slice)
                joblib.dump(model, "model_store/task1_%s.m"%(str(i)))
            elif self.BaseEstimator == 'decision tree':
                model = RandomForestClassifier().fit(X_slice,y_slice)
                joblib.dump(model, "model_store/task1_%s.m"%(str(i)))
        return self
    def predict_model(self,X):
        y_output = pd.DataFrame()
        for i in range(0,self.feature_num):
            model = joblib.load("model_store/task1_%s.m"%(str(i)))
            y_ptr = pd.DataFrame(model.predict(X))
            y_output = pd.concat([y_output,y_ptr],axis = 1)
        if self.print_==True:
            print("Successfully predict")
        return y_output

In [109]:
"""
Use Binary_Relevance() to train and predit with imbalanced training data, and
show the accuracy of each feature of different BaseEstimator
"""
def binary_relevance_train_imbalanced():
    base_estimator_list=['decision tree', 'logistic regression', 'SVM']
    accuracy_list_for_imbalanced = pd.DataFrame(columns=base_estimator_list\
                             ,index=['Class'+str(i) for i in range(1,15)])
    for j in range(0,len(base_estimator_list)):
        binary_ = Binary_Relevance(base_estimator_list[j])
        binary_.train_fit(X_train,y_train)
        y_pred = binary_.predict_model(X_test)
        accuracy_n = []
        for i in range(0,len(y_test.iloc[0,:])):
            accuracy_ptr =metrics.accuracy_score(y_test.iloc[:,i],y_pred.iloc[:,i])
            accuracy_n.append(accuracy_ptr)
        accuracy_n.reverse()
        accuracy_list_for_imbalanced.iloc[:,j]=accuracy_n
    return accuracy_list_for_imbalanced
accuracy_list_for_imbalanced = binary_relevance_train_imbalanced()
print(accuracy_list_for_imbalanced)

Use decision tree to train
Successfully predict
Use logistic regression to train
Successfully predict
Use SVM to train
Successfully predict
         decision tree  logistic regression       SVM
Class1        0.986226             0.986226  0.986226
Class2        0.723140             0.710744  0.707989
Class3        0.725895             0.716253  0.732782
Class4        0.888430             0.888430  0.892562
Class5        0.896694             0.896694  0.895317
Class6        0.920110             0.920110  0.920110
Class7        0.814050             0.807163  0.812672
Class8        0.852617             0.841598  0.852617
Class9        0.778237             0.761708  0.776860
Class10       0.794766             0.754821  0.798898
Class11       0.742424             0.742424  0.746556
Class12       0.730028             0.684573  0.721763
Class13       0.633609             0.611570  0.640496
Class14       0.787879             0.783747  0.794766


## Task 2: Implement the Binary Relevance Algorithm with Under-Sampling

In [110]:
"""
Use Binary_Relevance() to train and predit with balanced training data, and 
show the accuracy of each feature of different BaseEstimator
"""
def binary_relevance_train_balanced():
    base_estimator_list=['decision tree', 'logistic regression', 'SVM']
    accuracy_list_for_under_sample = pd.DataFrame(columns=base_estimator_list\
                             ,index=['Class'+str(i) for i in range(1,15)])
    for j in range(0,len(base_estimator_list)):
        binary_ = Binary_Relevance(base_estimator_list[j],True)
        binary_.train_fit(X_train,y_train)
        y_pred = binary_.predict_model(X_test)
        accuracy_n = []
        for i in range(0,len(y_test.iloc[0,:])):
            accuracy_ptr =metrics.accuracy_score(y_test.iloc[:,i],y_pred.iloc[:,i])
            accuracy_n.append(accuracy_ptr)
        accuracy_n.reverse()
        accuracy_list_for_under_sample.iloc[:,j]=accuracy_n
    return accuracy_list_for_under_sample
accuracy_list_for_under_sample = binary_relevance_train_balanced()
print(accuracy_list_for_under_sample)

Use decision tree to train
Successfully predict
Use logistic regression to train
Successfully predict
Use SVM to train
Successfully predict
         decision tree  logistic regression       SVM
Class1        0.549587             0.466942  0.512397
Class2        0.573003             0.557851  0.570248
Class3        0.555096             0.567493  0.574380
Class4        0.630854             0.630854  0.670799
Class5        0.636364             0.646006  0.698347
Class6        0.556474             0.545455  0.522039
Class7        0.644628             0.611570  0.612948
Class8        0.679063             0.603306  0.652893
Class9        0.655647             0.625344  0.665289
Class10       0.741047             0.673554  0.753444
Class11       0.721763             0.683196  0.713499
Class12       0.723140             0.684573  0.702479
Class13       0.625344             0.595041  0.644628
Class14       0.768595             0.734160  0.769972


## Task 3: Compare the Performance of Different Binary Relevance Approaches

In [111]:
"""
Evaluate use Hamming loss and macro-averaged f-score with k-fold cross validation.
"""
base_estimator_list=['decision tree', 'logistic regression', 'SVM']   

accuracy_list1 = pd.DataFrame(columns=base_estimator_list\
                             ,index=['imbalanced','balanced'])
accuracy_list2 = pd.DataFrame(columns=base_estimator_list\
                             ,index=['imbalanced','balanced'])

def evaluation(train_dataset,base_estimator_list,accuracy_list1,accuracy_list2,k_fold_num):
    def random_split(full_list,k_fold_num1):
        offset = int(len(full_list)/k_fold_num1)
        full_list = full_list.sample(frac=1).reset_index(drop=True)
        split_point = []
        for i in range(0,k_fold_num1):
            split_point.append(offset*i)
        split_point.append(len(full_list))
        return split_point
    split_ptr = random_split(train_dataset,k_fold_num)
    print("evaluating imbalanced data")
    for j in range(0,len(base_estimator_list)):
        accuracy_aver1,accuracy_aver2 = 0,0
        for i in tqdm(range(0,k_fold_num)):
            X_train = pd.concat([train_dataset.iloc[:split_ptr[i],:-14],train_dataset.iloc[split_ptr[i+1]:,:-14]],axis=0)
            y_train = pd.concat([train_dataset.iloc[:split_ptr[i],-14:],train_dataset.iloc[split_ptr[i+1]:,-14:]],axis=0)
            X_valid = train_dataset.iloc[split_ptr[i]:split_ptr[i+1],:-14]
            y_valid = train_dataset.iloc[split_ptr[i]:split_ptr[i+1],-14:]
            binary_ = Binary_Relevance(base_estimator_list[j],False,False)
            binary_.train_fit(X_train,y_train)
            y_pred = binary_.predict_model(X_valid)
            accuracy_aver1 += metrics.hamming_loss(y_valid, y_pred)
            accuracy_aver2 += metrics.f1_score(y_valid, y_pred, average='macro')
        accuracy_list1.iloc[0,j]=accuracy_aver1/k_fold_num
        accuracy_list2.iloc[0,j]=accuracy_aver2/k_fold_num  
    print("evaluating balanced data")
    for j in range(0,len(base_estimator_list)):
        accuracy_aver1,accuracy_aver2 = 0,0
        for i in tqdm(range(0,k_fold_num)):
            X_train = pd.concat([train_dataset.iloc[:split_ptr[i],:-14],train_dataset.iloc[split_ptr[i+1]:,:-14]],axis=0)
            y_train = pd.concat([train_dataset.iloc[:split_ptr[i],-14:],train_dataset.iloc[split_ptr[i+1]:,-14:]],axis=0)
            X_valid = train_dataset.iloc[split_ptr[i]:split_ptr[i+1],:-14]
            y_valid = train_dataset.iloc[split_ptr[i]:split_ptr[i+1],-14:]
            binary_ = Binary_Relevance(base_estimator_list[j],True,False)
            binary_.train_fit(X_train,y_train)
            y_pred = binary_.predict_model(X_valid)
            accuracy_aver1 += metrics.hamming_loss(y_valid, y_pred)
            accuracy_aver2 += metrics.f1_score(y_valid, y_pred, average='macro')
        accuracy_list1.iloc[1,j]=accuracy_aver1/k_fold_num
        accuracy_list2.iloc[1,j]=accuracy_aver2/k_fold_num
    return (accuracy_list1,accuracy_list2)
    
(accuracy_list1,accuracy_list2) = evaluation(dataset,base_estimator_list,accuracy_list1,accuracy_list2,10)
print("Hamming loss")
print(accuracy_list1)
print("macro-averaged f-score")
print(accuracy_list2)

evaluating imbalanced data


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:17<00:00, 19.76s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.69s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:59<00:00, 29.95s/it]


evaluating balanced data


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:16<00:00,  7.62s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:16<00:00,  1.64s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:52<00:00, 11.21s/it]


Hamming loss
           decision tree logistic regression       SVM
imbalanced      0.188791            0.199282  0.186407
balanced        0.347453            0.365083  0.340572
macro-averaged f-score
           decision tree logistic regression       SVM
imbalanced      0.365289            0.348632  0.383072
balanced        0.466935            0.454504  0.469602


## Task 4: Implement the Classifier Chains Algorithm

In [112]:
# Write your code here
class Classifier_Chains():
    def __init__(self, BaseEstimator="decision tree",undersampling = False,print_ = True):
        """Classifier Chains for multi-label classification
        Use train_fit to train model, and use predict_model to predict
        Parameters
        ----------
        BaseEstimator : 'decision tree' or 'logistic regression' or 'SVM'.
        undersampling : True or False
        print_ : print process or not
        -------
        """
        self.BaseEstimator = BaseEstimator
        self.feature_num = 0
        self.undersampling = undersampling
        self.print_ = print_
        
    def get_class_order(self,X,y):
        accuracy_n = []
        binary_ = Binary_Relevance(self.BaseEstimator,False,False)
        X_train, X_test, y_train, y_test = train_test_split(X,y,\
                                random_state=0,train_size = 0.7)
        binary_.train_fit(X_train,y_train)
        y_pred = binary_.predict_model(X_test)
        for i in range(0,len(y_test.iloc[0,:])):        
            accuracy_ptr =metrics.accuracy_score(y_test.iloc[:,i],y_pred.iloc[:,i])
            accuracy_n.append(accuracy_ptr)
        accuracy_n=np.array(accuracy_n)
        self.class_order = np.argsort(accuracy_n)
        return self
        
    
    def train_fit(self,X,y):
        if self.print_==True:
            print("Use %s to train"%self.BaseEstimator)
        self.feature_num = len(y.iloc[0,:])
        self.get_class_order(X,y)
        if self.print_==True:
            print(str("Class order"),end="")
            print(self.class_order)
        for i in self.class_order:
            if self.undersampling==False:
                y_slice = y.iloc[:,i]
                X_slice = X
            elif self.undersampling==True:
                X_slice,y_slice=RandomUnderSampler(random_state=0).fit_resample(X, y.iloc[:,i])
            if self.BaseEstimator == 'SVM':
                model = BaggingClassifier(base_estimator=SVC()).fit(X_slice,y_slice)
                joblib.dump(model, "model_store/task4_%s.m"%(str(i)))
            elif self.BaseEstimator == 'logistic regression':
                model = BaggingClassifier(base_estimator=LogisticRegression()).fit(X_slice,y_slice)
                joblib.dump(model, "model_store/task4_%s.m"%(str(i)))
            elif self.BaseEstimator == 'decision tree':
                model = RandomForestClassifier().fit(X_slice,y_slice)
                joblib.dump(model, "model_store/task4_%s.m"%(str(i)))
            X = pd.concat([X,y.iloc[:,i]],axis = 1)
        return self
    
    def predict_model(self,X):
        y_output = pd.DataFrame() 
        X.reset_index(drop=True, inplace=True)
        for i in self.class_order:
            model = joblib.load("model_store/task4_%s.m"%(str(i)))
            y_ptr = pd.DataFrame(model.predict(X))
            y_ptr.columns=['Class'+str(i)]
            y_output = pd.concat([y_output,y_ptr],axis = 1)
            X = pd.concat([X,y_ptr],axis = 1)
            
        y_output=y_output[['Class'+str(i) for i in range(0,14)]]
        if self.print_==True:
            print("Successfully predict")
        return y_output

In [113]:
"""
Use Classifier_Chains() to train and predit with imbalanced training data, and
show the accuracy of each feature of different BaseEstimator
"""
def classifier_chains_train_imbalanced():
    base_estimator_list=['decision tree', 'logistic regression', 'SVM']
    accuracy_list_for_imbalanced = pd.DataFrame(columns=base_estimator_list\
                             ,index=['Class'+str(i) for i in range(1,15)])
    for j in range(0,len(base_estimator_list)):
        chains_ = Classifier_Chains(base_estimator_list[j],False,True)
        chains_.train_fit(X_train,y_train)
        y_pred = chains_.predict_model(X_test)
        accuracy_n = []
        for i in range(0,len(y_test.iloc[0,:])):
            accuracy_ptr =metrics.accuracy_score(y_test.iloc[:,i],y_pred.iloc[:,i])
            accuracy_n.append(accuracy_ptr)
        accuracy_n.reverse()
        accuracy_list_for_imbalanced.iloc[:,j]=accuracy_n
    return accuracy_list_for_imbalanced
accuracy_list_for_imbalanced = classifier_chains_train_imbalanced()
print(accuracy_list_for_imbalanced)

Use decision tree to train
Class order[ 1  3  2 12 11  4  0  7  5  6 10  9  8 13]
Successfully predict
Use logistic regression to train
Class order[ 1  2 12  3 11  4  0  5  7  6 10  9  8 13]
Successfully predict
Use SVM to train
Class order[ 1 12  3  2 11  0  7  4  5  6 10  9  8 13]
Successfully predict
         decision tree  logistic regression       SVM
Class1        0.986226             0.986226  0.986226
Class2        0.720386             0.709366  0.719008
Class3        0.727273             0.716253  0.725895
Class4        0.891185             0.877410  0.877410
Class5        0.896694             0.887052  0.884298
Class6        0.920110             0.920110  0.920110
Class7        0.814050             0.782369  0.790634
Class8        0.849862             0.809917  0.807163
Class9        0.786501             0.758953  0.757576
Class10       0.790634             0.747934  0.765840
Class11       0.747934             0.695592  0.705234
Class12       0.691460             0.687328  0.

In [114]:
"""
Use Classifier_Chains() to train and predit with imbalanced training data, and
show the accuracy of each feature of different BaseEstimator
"""
def classifier_chains_train_imbalanced():
    base_estimator_list=['decision tree', 'logistic regression', 'SVM']
    accuracy_list_for_imbalanced = pd.DataFrame(columns=base_estimator_list\
                             ,index=['Class'+str(i) for i in range(1,15)])
    for j in range(0,len(base_estimator_list)):
        chains_ = Classifier_Chains(base_estimator_list[j],True,True)
        chains_.train_fit(X_train,y_train)
        y_pred = chains_.predict_model(X_test)
        accuracy_n = []
        for i in range(0,len(y_test.iloc[0,:])):
            accuracy_ptr =metrics.accuracy_score(y_test.iloc[:,i],y_pred.iloc[:,i])
            accuracy_n.append(accuracy_ptr)
        accuracy_n.reverse()
        accuracy_list_for_imbalanced.iloc[:,j]=accuracy_n
    return accuracy_list_for_imbalanced
accuracy_list_for_imbalanced = classifier_chains_train_imbalanced()
print(accuracy_list_for_imbalanced)

Use decision tree to train
Class order[ 1  3  2 12 11  4  0  5  7  6 10  9  8 13]
Successfully predict
Use logistic regression to train
Class order[ 1  2  3 12 11  4  0  5  7  6 10  9  8 13]
Successfully predict
Use SVM to train
Class order[ 1  2 12  3 11  0  5  4  7  6 10  9  8 13]
Successfully predict
         decision tree  logistic regression       SVM
Class1        0.717631             0.742424  0.772727
Class2        0.578512             0.563361  0.589532
Class3        0.582645             0.564738  0.590909
Class4        0.680441             0.713499  0.679063
Class5        0.684573             0.724518  0.687328
Class6        0.564738             0.672176  0.559229
Class7        0.652893             0.709366  0.690083
Class8        0.681818             0.721763  0.706612
Class9        0.757576             0.703857  0.701102
Class10       0.772727             0.695592  0.699725
Class11       0.709366             0.666667  0.741047
Class12       0.688705             0.681818  0.

## Task 5: Evaluate the Performance of the Classifier Chains Algorithm

In [115]:
"""
Evaluate use Hamming loss and macro-averaged f-score with k-fold cross validation.
"""
base_estimator_list=['decision tree', 'logistic regression', 'SVM']   

accuracy_list1 = pd.DataFrame(columns=base_estimator_list\
                             ,index=['imbalanced','balanced'])
accuracy_list2 = pd.DataFrame(columns=base_estimator_list\
                             ,index=['imbalanced','balanced'])

def evaluation(train_dataset,base_estimator_list,accuracy_list1,accuracy_list2,k_fold_num):
    def random_split(full_list,k_fold_num1):
        offset = int(len(full_list)/k_fold_num1)
        full_list = full_list.sample(frac=1).reset_index(drop=True)
        split_point = []
        for i in range(0,k_fold_num1):
            split_point.append(offset*i)
        split_point.append(len(full_list))
        return split_point
    split_ptr = random_split(train_dataset,k_fold_num)
    print("evaluating imbalanced data")
    for j in range(0,len(base_estimator_list)):
        accuracy_aver1,accuracy_aver2 = 0,0
        for i in tqdm(range(0,k_fold_num)):
            X_train = pd.concat([train_dataset.iloc[:split_ptr[i],:-14],train_dataset.iloc[split_ptr[i+1]:,:-14]],axis=0)
            y_train = pd.concat([train_dataset.iloc[:split_ptr[i],-14:],train_dataset.iloc[split_ptr[i+1]:,-14:]],axis=0)
            X_valid = train_dataset.iloc[split_ptr[i]:split_ptr[i+1],:-14]
            y_valid = train_dataset.iloc[split_ptr[i]:split_ptr[i+1],-14:]
            chain_ = Classifier_Chains(base_estimator_list[j],False,False)
            chain_.train_fit(X_train,y_train)
            y_pred = chain_.predict_model(X_valid)
            accuracy_aver1 += metrics.hamming_loss(y_valid, y_pred)
            accuracy_aver2 += metrics.f1_score(y_valid, y_pred, average='macro')
        accuracy_list1.iloc[0,j]=accuracy_aver1/k_fold_num
        accuracy_list2.iloc[0,j]=accuracy_aver2/k_fold_num  
    print("evaluating balanced data")
    for j in range(0,len(base_estimator_list)):
        accuracy_aver1,accuracy_aver2 = 0,0
        for i in tqdm(range(0,k_fold_num)):
            X_train = pd.concat([train_dataset.iloc[:split_ptr[i],:-14],train_dataset.iloc[split_ptr[i+1]:,:-14]],axis=0)
            y_train = pd.concat([train_dataset.iloc[:split_ptr[i],-14:],train_dataset.iloc[split_ptr[i+1]:,-14:]],axis=0)
            X_valid = train_dataset.iloc[split_ptr[i]:split_ptr[i+1],:-14]
            y_valid = train_dataset.iloc[split_ptr[i]:split_ptr[i+1],-14:]
            chain_ = Classifier_Chains(base_estimator_list[j],True,False)
            chain_.train_fit(X_train,y_train)
            y_pred = chain_.predict_model(X_valid)
            accuracy_aver1 += metrics.hamming_loss(y_valid, y_pred)
            accuracy_aver2 += metrics.f1_score(y_valid, y_pred, average='macro')
        accuracy_list1.iloc[1,j]=accuracy_aver1/k_fold_num
        accuracy_list2.iloc[1,j]=accuracy_aver2/k_fold_num
    return (accuracy_list1,accuracy_list2)
    
(accuracy_list1,accuracy_list2) = evaluation(dataset,base_estimator_list,accuracy_list1,accuracy_list2,10)
print("Hamming loss")
print(accuracy_list1)
print("macro-averaged f-score")
print(accuracy_list2)

evaluating imbalanced data


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [05:07<00:00, 30.71s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:51<00:00,  5.15s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [06:51<00:00, 41.11s/it]


evaluating balanced data


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:23<00:00, 20.37s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:37<00:00,  3.79s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:52<00:00, 29.22s/it]


Hamming loss
           decision tree logistic regression       SVM
imbalanced      0.191286            0.212943  0.212105
balanced        0.324185            0.311368  0.304586
macro-averaged f-score
           decision tree logistic regression       SVM
imbalanced       0.38904            0.383174  0.411222
balanced        0.463796            0.427854  0.427047


## Task 6: Reflect on the Performance of the Different Models Evaluated

### <center>Hamming loss</center>
| Binary Relevance | decision tree | logistic regression | SVM |
| :----: | :----:  | :----:  | :----: |
| imbalanced | 0.189 | 0.199 | 0.186 |
| balanced | 0.347 | 0.365 | 0.341 |

| Classifier Chain | decision tree | logistic regression | SVM |
|  :----:   | :----:  | :----:  | :----:  |
| imbalanced | 0.191 | 0.213 | 0.212 |
| balanced | 0.324 | 0.311 | 0.305 |

### <center>Macro-Averaged F-Score</center>
| Binary Relevance | decision tree | logistic regression | SVM |
| :----: | :----:  | :----:  | :----: |
| imbalanced | 0.365 | 0.349 | 0.383 | 
| balanced | 0.467 | 0.455 | 0.470 | 

| Classifier Chain | decision tree | logistic regression | SVM |
|  :----:   | :----:  | :----:  | :----:  |
| imbalanced | 0.389 | 0.383 | 0.411 |
| balanced | 0.464 | 0.428 | 0.427 |

<script type="text/x-mathjax-config">
MathJax.Hub.Config({
tex2jax: {
inlineMath: [['$','$'], ['\\(','\\)']],
processEscapes: true},
jax: ["input/TeX","input/MathML","input/AsciiMath","output/CommonHTML"],
extensions: ["tex2jax.js","mml2jax.js","asciimath2jax.js","MathMenu.js","MathZoom.js","AssistiveMML.js", "[Contrib]/a11y/accessibility-menu.js"],
TeX: {
extensions: ["AMSmath.js","AMSsymbols.js","noErrors.js","noUndefined.js"],
equationNumbers: {
autoNumber: "AMS"
}
}
});
</script>

#### Evaluation results:    
1. The performance of imbalanced data is better than balanced data, because undersampling delete part of the origin train dataset to balance data.  
2. Classifier Chain achieves better performance than Binary Relevance with balanced data while perform a little worse with biased data.  

#### Complexity Analysis:    
1. The computational complexity of binary relevance is $\mathcal{O}(L\times f(d,N))$,while the complexity of classifier chain is $\mathcal{O}(L\times f(d+L,N))$, where L denotes the number of functional classes, $f(d,N)$ denotes the complexity of base estimator with $N$ examples and $d$ attributes.  

