In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
cd /content/drive/My\ Drive/1003\ Machine Learning/1003\ Project/Data

/content/drive/.shortcut-targets-by-id/1cXJSX-Wb546Od-de-PxfpA-IOtl_oV-Q/1003 Project/Data


# Load data

Code in this part credit to Man Jin: mj1637@nyu.edu


In [0]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

In [0]:
def load_data(file_name):

    # load data from csv
    data = pd.read_csv(file_name, usecols=['labels', 'features'])

    # remove rows without proper label
    rows_to_remove = [i for i in range(len(data)) if ':' in data.loc[i,'labels']]
    data.drop(rows_to_remove, inplace=True)
    data.reset_index(drop=True, inplace=True)

    # extract features from sparse representation
    feature = np.zeros((len(data), 5000))
    for i in range(len(data)):
        for j in data.loc[i,'features'].replace('\n','').split():
            ft, val = j.split(':')
            feature[i,int(ft)] = float(val)
    X = pd.DataFrame(feature)

    # extract labels
    y = data['labels'].map(lambda x: tuple([int(i) for i in x.replace(' ','').split(',')]))
    
    return X, y

In [0]:
X_train, y_train = load_data("train.csv")
X_val, y_val = load_data('test.csv')

In [0]:
# Let's examine multi-labelness.
m,n = X_train.shape
q = max([label for y_i in y_train for label in y_i ])
lCard = sum([len(y_train[i]) for i in range(m)])/m
lDen = lCard/q

print("m,n=", (m,n))
print("q=|y|=", q)
print("label diversity:", len(np.unique(y_train)))
print("label cardinality:", lCard)
print("label density:", lDen)

m,n= (15511, 5000)
q=|y|= 3992
label diversity: 13543
label cardinality: 5.320740119914899
label density: 0.001332850731441608


In [0]:
# # if needed
binarizer = MultiLabelBinarizer(classes=np.arange(3993))
binary_y_train = binarizer.fit_transform(y_train)
binary_y_val = binarizer.fit_transform(y_val)

In [0]:
X_train_small, binary_y_train_small = X_train[:1000], binary_y_train[:1000]
X_val_small, binary_y_val_small = X_val[:250], binary_y_val[:250]

# LEML

http://proceedings.mlr.press/v32/yu14.pdf

In [0]:
# !git clone https://github.com/AnthonyMRios/leml.git

Cloning into 'leml'...
remote: Enumerating objects: 149, done.[K
remote: Total 149 (delta 0), reused 0 (delta 0), pack-reused 149[K
Receiving objects: 100% (149/149), 1.64 MiB | 1.71 MiB/s, done.
Resolving deltas: 100% (57/57), done.


In [0]:
cd leml/

/content/drive/.shortcut-targets-by-id/1cXJSX-Wb546Od-de-PxfpA-IOtl_oV-Q/1003 Project/Data/leml


In [0]:
!python2.7 setup.py install

running install
running bdist_egg
running egg_info
writing leml.egg-info/PKG-INFO
writing top-level names to leml.egg-info/top_level.txt
writing dependency_links to leml.egg-info/dependency_links.txt
writing manifest file 'leml.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build/lib.linux-x86_64-2.7
creating build/lib.linux-x86_64-2.7/pyleml
copying pyleml/LEML.py -> build/lib.linux-x86_64-2.7/pyleml
copying pyleml/LEML_parallel.py -> build/lib.linux-x86_64-2.7/pyleml
copying pyleml/LEML_single.py -> build/lib.linux-x86_64-2.7/pyleml
copying pyleml/__init__.py -> build/lib.linux-x86_64-2.7/pyleml
running build_ext
skipping './pyleml/mul_sparse.c' Cython extension (up-to-date)
building 'mul_sparse' extension
creating build/temp.linux-x86_64-2.7
creating build/temp.linux-x86_64-2.7/pyleml
x86_64-linux-gnu-gcc -pthread -fno-strict-aliasing -Wdate-time -D_FORTIFY_SOURCE=2 -g -fdebug-prefix-map=/build/python2.7-UK

In [0]:
from sklearn.model_selection import KFold
from sklearn.datasets import make_multilabel_classification
from scipy.sparse import csr_matrix
from sklearn.metrics import precision_score
# from pyleml import LEML

In [0]:
import numpy as np
import scipy.sparse as scipy_sp

class LEMLs:
    def __init__(self, num_factors = 128, num_iterations = 25, reg_param = 1.,
                 stopping_criteria = 1e-3, cg_max_iter = 25, cg_gtol = 1e-3, verbose = False):
        self.num_factors = num_factors
        self.num_iterations = num_iterations
        self.reg_param = reg_param
        self.cg_max_iter = cg_max_iter
        self.cg_gtol = cg_gtol
        self.verbose = verbose

    def fit(self, train_data, train_labels):
        self.W = np.random.random((train_data.shape[1], self.num_factors))
        self.H = np.random.random((train_labels.shape[1], self.num_factors))

        prev_loss = None
        for iteration in range(self.num_iterations):
            self.fit_H(train_data, train_labels)
            num_cg_iters = self.fit_W(train_data, train_labels)
            if self.verbose:
                print('Iteration {} done'.format(iteration+1))

    def predict(self, test_data):
        return test_data.dot(self.W).dot(self.H.T)>0.5
    
    def predict_proba(self, test_data):
        return test_data.dot(self.W).dot(self.H.T)

    def fit_H(self, train_data, train_labels):
        X = train_data.dot(self.W)
        X2 = X.T.dot(X)
        eye_reg_param = np.eye(X2.shape[0])*self.reg_param
        X2 = X2 + eye_reg_param
        inv = np.linalg.inv(X2)
        missing = train_labels.T.dot(X)
        for j in range(train_labels.shape[1]):
            self.H[j,:] =  inv.dot(missing[j,:].flatten()).flatten()

    def fit_W(self, train_data, train_labels):
        def vec(A):
            return A.flatten('F')

        def dloss(w, X, Y, H, reg_param):
            W = self.W
            A = X.dot(W)
            B = Y.dot(H)
            M = H.T.dot(H)
            return vec(X.T.dot(A.dot(M)-B)) + reg_param*w

        self.M = np.dot(self.H.T, self.H)
        def Hs(s, X, reg_param):
            S = s.reshape((X.shape[1],self.H.shape[1]), order='F')
            A = X.dot(S)
            AdM = A.dot(self.M)
            XdAdM = X.T.dot(AdM)
            v = vec(XdAdM)
            return v + reg_param*s
            #return vec((X.T.dot(A.dot(self.M)))) + reg_param*s

        wt = vec(self.W)
        rt = -dloss(wt, train_data, train_labels, self.H, self.reg_param)
        dt = rt
        total_iters = 0
        for i in range(self.cg_max_iter):
            if np.linalg.norm(rt) < self.cg_gtol:
                break
            total_iters += 1
            hst = Hs(dt, train_data, self.reg_param)
            rtdot = rt.T.dot(rt)
            at = rtdot/(dt.T.dot(hst))
            wt = wt + at*dt
            rtp1 = rt - at*hst
            bt = rtp1.T.dot(rtp1)/(rtdot)
            rt = rtp1
            dt = rt + bt*dt

        self.W = wt.reshape((self.W.shape[0], self.W.shape[1]), order='F')

        return total_iters



import sys
from time import time

import numpy as np
from sklearn.externals import joblib
from sklearn.metrics import precision_score 
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier

# from pyleml import LEML

# def main():
#     print('Loading data')
#     sys.stdout.flush()
#     X = joblib.load('../example/test_data/bibtex-train.pkl')
#     labels = joblib.load('../example/test_data/bibtex-Y-train.pkl')
#     X_test = joblib.load('../example/test_data/bibtex-test.pkl')
#     labels_test = joblib.load('../example/test_data/bibtex-Y-test.pkl')
#     print(X.shape, labels.shape, X.getformat(), labels.getformat())

#     print('Training LEML')
#     sys.stdout.flush()
#     #for l in [1e-3, 1e-2, 1e-1, 1., 10]:
#     t0 = time()
#     #leml = LEML.get_instance('single', num_factors=200, num_iterations=25, reg_param=1., verbose=True)
#     leml = LEMLs(num_factors=200, num_iterations=5, reg_param=1., verbose=True)
#     leml.fit(X.tocsr(), labels.tocsr())
#     print('Train time', time() - t0, 'seconds')
#     sys.stdout.flush()
#     preds = leml.predict_proba(X_test)
#     preds_top_k = preds.argsort()[:,::-1]
#     preds_top_k = preds_top_k[:,:1]
#     new_preds = np.zeros((preds.shape[0], preds.shape[1]))
#     new_preds[np.arange(preds.shape[0]).repeat(1),preds_top_k.flatten()] = 1
#     print('Precision @ 1:', precision_score(labels_test.toarray(), new_preds, average='samples'))

#     preds_top_k = preds.argsort()[:,::-1]
#     preds_top_k = preds_top_k[:,:3]
#     new_preds = np.zeros((preds.shape[0], preds.shape[1]))
#     new_preds[np.arange(preds.shape[0]).repeat(3),preds_top_k.flatten()] = 1
#     print('Precision @ 3:', precision_score(labels_test.toarray(), new_preds, average='samples'))

#     preds_top_k = preds.argsort()[:,::-1]
#     preds_top_k = preds_top_k[:,:5]
#     new_preds = np.zeros((preds.shape[0], preds.shape[1]))
#     new_preds[np.arange(preds.shape[0]).repeat(5),preds_top_k.flatten()] = 1
#     print('Precision @ 5:', precision_score(labels_test.toarray(), new_preds, average='samples'))


# if __name__ == '__main__':
#     main()

In [0]:
type(X_train_small), type(binary_y_train_small)

(pandas.core.frame.DataFrame, numpy.ndarray)

## NUM_FACTORS = 500

In [0]:
leml = LEMLs(num_factors=500, num_iterations=20, reg_param=0.1, verbose=True)
leml.fit(train_data = X_train_small.values, train_labels = binary_y_train_small)

Iteration 1 done
Iteration 2 done
Iteration 3 done
Iteration 4 done
Iteration 5 done
Iteration 6 done
Iteration 7 done
Iteration 8 done
Iteration 9 done
Iteration 10 done
Iteration 11 done
Iteration 12 done
Iteration 13 done
Iteration 14 done
Iteration 15 done
Iteration 16 done
Iteration 17 done
Iteration 18 done
Iteration 19 done
Iteration 20 done


In [0]:
predictions = leml.predict(X_val_small)

In [0]:
# calculate lrap
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

label_ranking_average_precision_score(binary_y_val_small, np.multiply(predictions.values, 1))

0.10429081661901443

## NUM_FACTORS = 1000

In [0]:
leml = LEMLs(num_factors=1000, num_iterations=20, reg_param=0.1, verbose=True)
leml.fit(train_data = X_train_small.values, train_labels = binary_y_train_small)

Iteration 1 done
Iteration 2 done
Iteration 3 done
Iteration 4 done
Iteration 5 done
Iteration 6 done
Iteration 7 done
Iteration 8 done
Iteration 9 done
Iteration 10 done
Iteration 11 done
Iteration 12 done
Iteration 13 done
Iteration 14 done
Iteration 15 done
Iteration 16 done
Iteration 17 done
Iteration 18 done
Iteration 19 done
Iteration 20 done


In [0]:
predictions = leml.predict(X_val_small)

In [0]:
# calculate lrap
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

label_ranking_average_precision_score(binary_y_val_small, np.multiply(predictions.values, 1))

0.05863427773243243

## NUM_FACTORS = 5000

In [0]:
leml = LEMLs(num_factors=5000, num_iterations=20, reg_param=0.1, verbose=True)
leml.fit(train_data = X_train_small.values, train_labels = binary_y_train_small)

Iteration 1 done
Iteration 2 done
Iteration 3 done
Iteration 4 done
Iteration 5 done
Iteration 6 done
Iteration 7 done
Iteration 8 done
Iteration 9 done
Iteration 10 done
Iteration 11 done
Iteration 12 done
Iteration 13 done
Iteration 14 done
Iteration 15 done
Iteration 16 done
Iteration 17 done
Iteration 18 done
Iteration 19 done
Iteration 20 done


In [0]:
predictions = leml.predict(X_val_small)

In [0]:
# calculate lrap
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

label_ranking_average_precision_score(binary_y_val_small, np.multiply(predictions.values, 1))

0.10182233545337331