In [2]:
import skmultilearn
import numpy as np
import pandas as pd
from util import label_anomoly, feature_transform, label_transform
from time import time
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import label_ranking_average_precision_score as LARP

In [3]:
train_set = pd.read_csv('dataset/train.csv')

In [4]:
test_set = pd.read_csv('dataset/dev.csv')

In [5]:
train_lab_anomoly = label_anomoly(train_set['labels'])

In [6]:
print('Anomoly label indexes below')
print(' '.join(str(i) for i in train_lab_anomoly))

Anomoly label indexes below
93 252 509 1527 1939 1953 4029 4426 4643 4727 5231 5762 6295 6333 6703 7083 9477 9675 9999 10488 10736 10910 11674 12280 12599 13147 14167 14722


In [7]:
# delete train_set rows with anomoly labels
train_set = train_set.drop(train_lab_anomoly)
# reindex
train_set = train_set.reindex([i for i in range(len(train_set))])
label_anomoly(train_set)

[]

In [8]:
train_feature = feature_transform(train_set['features'], 5000)
train_label_dict = label_transform(train_set['labels'])

In [9]:
# label matrix 
train_label = np.zeros((len(train_feature),3993))
for key in train_label_dict:
    for idx in train_label_dict[key]:
        train_label[idx][key] = 1

In [10]:
train_feature.shape

(15511, 5000)

In [11]:
# clean test data 
test_lab_anomoly = label_anomoly(test_set['labels'])
print('Anomoly label indexes below')
print(' '.join(str(i) for i in test_lab_anomoly))
test_set = test_set.drop(test_lab_anomoly)
test_set = test_set.reindex([i for i in range(len(test_set))])
label_anomoly(test_set)

# test data transformation 
test_feature = feature_transform(test_set['features'], 5000)
test_label = label_transform(test_set['labels'])

# ground_truth label 
test_truth = np.zeros((len(test_feature),3993))
for key in test_label:
    for idx in test_label[key]:
        test_truth[idx][key] = 1

Anomoly label indexes below
193 414


In [11]:
import numpy as np
from numpy import linalg as la
from sklearn.linear_model import Ridge
from sklearn.preprocessing import minmax_scale

class PLST():
    def __init__(self, m, alpha=0.1):
        '''init

         Parameters
        ----------
        m:      label space compressed dimension, less than label number
        alpha:  linear regression regular coefficient
        '''
        self.m = m
        self.alpha = alpha
        self.models = []

    def fit(self, X, y):
        '''fit model

         Parameters
        ----------
        X:  numpy.ndarray
            train input feature
        y:  numpy.ndarray {0,1}
            train output
        '''
        y_new = np.copy(y)
        y_new[y_new == 0] = -1
        z, self.Um = self.encode(y_new)
        new_X = np.c_[np.ones(X.shape[0]), X]
        # regress x on z 
        z = z.T
        for i in range(len(z)):
            current_y = z[i]
            #print(current_y)
            #print(current_y.shape)
            linear_regress = Ridge()
            linear_regress.fit(new_X,current_y)
            self.models += [linear_regress]
        print('train complete')
        return self

    def encode(self, y):
        '''encode y use svd

         Parameters
        ----------
        y:  numpy.ndarray {0,1}
            train output of shape :code:`(n_samples, n_target)`

        Returns
        -------
        z:      numpy.ndarray
                dimensionality reduction matrix of y shape :code:`(n_samples, m)`
        Vm:     numpy.ndarray
                top mright singular matrix after svd shape :code:`(n_features, m)`
        shift:  numpy.ndarray
                mean of y by col shape :code:`(1, n_features)`
        '''
        y = y.T
        #shift = np.mean(y, axis=0)
        #y_shift = y - shift
        U, var, _= la.svd(y)
        # u is shape n_sample * m
        this_var = sum(var[i] for i in range(self.m))
        print('variance accounted for m = {} is {}'.format(self.m,this_var/ sum(var)))
        Um = U[:,0:self.m]
        #print(y_shift)
        #print(Vm)
        y = y.T
        z = np.dot(y,Um)
        return z, Um

    def predict(self, X):
        '''encode y use svd

        Parameters
       ----------
       X:   numpy.ndarray
            train input feature :code:`(n_samples, n_features)`

       Returns
       -------
       y_pred:      numpy.ndarray {0, 1}
                    predict of y shape :code:`(n_samples, n_traget)`
       y_pred_prob: numpy.ndarray [0, 1]
                    predict probility of y  shape :code:`(n_features, n_traget)`
        '''
        
        new_X = np.c_[np.ones(X.shape[0]), X]
        result = np.zeros((X.shape[0],self.Um.shape[0]))
        for idx, x in enumerate(new_X):
            this_result = np.zeros(self.Um.shape[0])
            for i,model in enumerate(self.models):
                Um = self.Um.T
                current_U = Um[i]
                pred = model.predict(x.reshape(1,-1))
                this_result += pred * current_U
            result[idx] = this_result
            
        y_pred_prob = minmax_scale(result, axis=1)
        y_pred = np.zeros(result.shape)
        y_pred[result > 0] = 1
        y_pred[result <= 0] = 0
        
        '''
        z_pred = self.w * np.c_(np.ones(X.shape(0)), X)
        y_real = z_pred * self.Vm.T + self.shift
        y_pred = np.zeros(y_real.shape)
        y_pred[y_real > 0] = 1
        y_pred[y_real <= 0] = 0
        y_pred_prob = minmax_scale(y_real, axis=1)
        '''
        return y_pred, y_pred_prob



In [18]:
ms = [10,1500,2000]
for m in ms:
    st_train = time()
    clf = PLST(m = m)
    clf.fit(train_feature,train_label)
    print('train time for m = {} is {}'.format(m,time()-st_train))
    st = time()
    res,res_prob = clf.predict(test_feature)
    print('predict time {}'.format(time() - st))
    
    print('accuracy for test set is {}'.format(LARP(test_truth,res_prob)))
    

variance accounted for m = 10 is 0.2628339564754551
train complete
train time for m = 10 is 295.0827462673187
predict time 2.2367608547210693
accuracy for test set is 0.15129850982623216
variance accounted for m = 1500 is 0.8038219009352727
train complete
train time for m = 1500 is 13828.745360136032
predict time 196.4737401008606
accuracy for test set is 0.49648710470553004
variance accounted for m = 2000 is 0.8748395188364513
train complete
train time for m = 2000 is 20633.943397045135
predict time 340.26476287841797
accuracy for test set is 0.5028034295556337


In [12]:
import numpy as np
from numpy import linalg as la
from sklearn.linear_model import Ridge
from sklearn.preprocessing import minmax_scale

class PLST_tree():
    def __init__(self, m, alpha=0.1):
        '''init

         Parameters
        ----------
        m:      label space compressed dimension, less than label number
        alpha:  linear regression regular coefficient
        '''
        self.m = m
        self.alpha = alpha
        self.models = []

    def fit(self, X, y):
        '''fit model

         Parameters
        ----------
        X:  numpy.ndarray
            train input feature
        y:  numpy.ndarray {0,1}
            train output
        '''
        y_new = np.copy(y)
        y_new[y_new == 0] = -1
        z, self.Um = self.encode(y_new)
        new_X = np.c_[np.ones(X.shape[0]), X]
        # regress x on z 
        z = z.T
        for i in range(len(z)):
            current_y = z[i]
            #print(current_y)
            #print(current_y.shape)
            tree_regress = DecisionTreeRegressor()
            tree_regress.fit(new_X,current_y)
            self.models += [tree_regress]
        print('train complete')
        return self

    def encode(self, y):
        '''encode y use svd

         Parameters
        ----------
        y:  numpy.ndarray {0,1}
            train output of shape :code:`(n_samples, n_target)`

        Returns
        -------
        z:      numpy.ndarray
                dimensionality reduction matrix of y shape :code:`(n_samples, m)`
        Vm:     numpy.ndarray
                top mright singular matrix after svd shape :code:`(n_features, m)`
        shift:  numpy.ndarray
                mean of y by col shape :code:`(1, n_features)`
        '''
        y = y.T
        #shift = np.mean(y, axis=0)
        #y_shift = y - shift
        U, var, _= la.svd(y)
        # u is shape n_sample * m
        this_var = sum(var[i] for i in range(self.m))
        print('variance accounted for m = {} is {}'.format(self.m,this_var/ sum(var)))
        Um = U[:,0:self.m]
        #print(y_shift)
        #print(Vm)
        y = y.T
        z = np.dot(y,Um)
        return z, Um

    def predict(self, X):
        '''encode y use svd

        Parameters
       ----------
       X:   numpy.ndarray
            train input feature :code:`(n_samples, n_features)`

       Returns
       -------
       y_pred:      numpy.ndarray {0, 1}
                    predict of y shape :code:`(n_samples, n_traget)`
       y_pred_prob: numpy.ndarray [0, 1]
                    predict probility of y  shape :code:`(n_features, n_traget)`
        '''
        
        new_X = np.c_[np.ones(X.shape[0]), X]
        result = np.zeros((X.shape[0],self.Um.shape[0]))
        for idx, x in enumerate(new_X):
            this_result = np.zeros(self.Um.shape[0])
            for i,model in enumerate(self.models):
                Um = self.Um.T
                current_U = Um[i]
                pred = model.predict(x.reshape(1,-1))
                this_result += pred * current_U
            result[idx] = this_result
            
        y_pred_prob = minmax_scale(result, axis=1)
        y_pred = np.zeros(result.shape)
        y_pred[result > 0] = 1
        y_pred[result <= 0] = 0
        
        '''
        z_pred = self.w * np.c_(np.ones(X.shape(0)), X)
        y_real = z_pred * self.Vm.T + self.shift
        y_pred = np.zeros(y_real.shape)
        y_pred[y_real > 0] = 1
        y_pred[y_real <= 0] = 0
        y_pred_prob = minmax_scale(y_real, axis=1)
        '''
        return y_pred, y_pred_prob



In [21]:
ms = [10,50,100]
for m in ms:
    st_train = time()
    clf = PLST_tree(m = m)
    clf.fit(train_feature,train_label)
    print('train time for m = {} is {}'.format(m,time()-st_train))
    st = time()
    res,res_prob = clf.predict(test_feature)
    print('predict time {}'.format(time() - st))
    
    print('accuracy for test set is {}'.format(LARP(test_truth,res_prob)))
    

variance accounted for m = 10 is 0.2628339564754551
train complete
train time for m = 10 is 472.3658459186554
predict time 1.4657750129699707
accuracy for test set is 0.1586367620998991
variance accounted for m = 50 is 0.31016787702420723
train complete
train time for m = 50 is 2304.288167953491
predict time 6.4213879108428955
accuracy for test set is 0.27484142496045905
variance accounted for m = 100 is 0.3544222940323518
train complete
train time for m = 100 is 5381.154091835022
predict time 12.86738395690918
accuracy for test set is 0.3391016102630934
