In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
cd /content/drive/My\ Drive/1003\ Machine Learning/1003\ Project/Data

/content/drive/.shortcut-targets-by-id/1cXJSX-Wb546Od-de-PxfpA-IOtl_oV-Q/1003 Project/Data


# Load data

Code in this part credit to Man Jin: mj1637@nyu.edu


In [0]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

In [0]:
def load_data(file_name):

    # load data from csv
    data = pd.read_csv(file_name, usecols=['labels', 'features'])

    # remove rows without proper label
    rows_to_remove = [i for i in range(len(data)) if ':' in data.loc[i,'labels']]
    data.drop(rows_to_remove, inplace=True)
    data.reset_index(drop=True, inplace=True)

    # extract features from sparse representation
    feature = np.zeros((len(data), 5000))
    for i in range(len(data)):
        for j in data.loc[i,'features'].replace('\n','').split():
            ft, val = j.split(':')
            feature[i,int(ft)] = float(val)
    X = pd.DataFrame(feature)

    # extract labels
    y = data['labels'].map(lambda x: tuple([int(i) for i in x.replace(' ','').split(',')]))
    
    return X, y

In [0]:
X_train, y_train = load_data("train.csv")
X_val, y_val = load_data('test.csv')

In [0]:
# Let's examine multi-labelness.
m,n = X_train.shape
q = max([label for y_i in y_train for label in y_i ])
lCard = sum([len(y_train[i]) for i in range(m)])/m
lDen = lCard/q

print("m,n=", (m,n))
print("q=|y|=", q)
print("label diversity:", len(np.unique(y_train)))
print("label cardinality:", lCard)
print("label density:", lDen)

m,n= (15511, 5000)
q=|y|= 3992
label diversity: 13543
label cardinality: 5.320740119914899
label density: 0.001332850731441608


In [0]:
# # if needed
binarizer = MultiLabelBinarizer(classes=np.arange(3993))
binary_y_train = binarizer.fit_transform(y_train)
binary_y_val = binarizer.fit_transform(y_val)

In [0]:
X_train_small, binary_y_train_small = X_train[:1000], binary_y_train[:1000]
X_val_small, binary_y_val_small = X_val[:250], binary_y_val[:250]

# Multi-label Decision Tree

In [0]:
from sklearn.tree import DecisionTreeClassifier
import time
start_time = time.time()
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, binary_y_train)
print(time.time() - start_time)

In [0]:
binarizer.inverse_transform(clf.predict(X_val)[0:5])

In [0]:
y_val[0:5]

# Multi-label KNN

In [0]:
!pip install scikit-multilearn

Collecting scikit-multilearn
[?25l  Downloading https://files.pythonhosted.org/packages/bb/1f/e6ff649c72a1cdf2c7a1d31eb21705110ce1c5d3e7e26b2cc300e1637272/scikit_multilearn-0.2.0-py3-none-any.whl (89kB)
[K     |███▊                            | 10kB 20.1MB/s eta 0:00:01[K     |███████▍                        | 20kB 5.5MB/s eta 0:00:01[K     |███████████                     | 30kB 5.4MB/s eta 0:00:01[K     |██████████████▊                 | 40kB 6.3MB/s eta 0:00:01[K     |██████████████████▍             | 51kB 6.1MB/s eta 0:00:01[K     |██████████████████████          | 61kB 7.0MB/s eta 0:00:01[K     |█████████████████████████▊      | 71kB 6.3MB/s eta 0:00:01[K     |█████████████████████████████▍  | 81kB 6.1MB/s eta 0:00:01[K     |████████████████████████████████| 92kB 4.2MB/s 
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [0]:
from skmultilearn.adapt import MLkNN
from sklearn.model_selection import GridSearchCV
import time 

s_time = time.time()
knn_clf = MLkNN(k=30, s=1.0, ignore_first_neighbours=0)
knn_clf.fit(X_train_small, binary_y_train_small)
print(time.time()-s_time)

130.86086583137512


In [0]:
# binary_y_val_small = binarizer.transform(y_val_small)
from sklearn.metrics import label_ranking_average_precision_score as LRAP

y_val_pred = knn_clf.predict_proba(X_val_small)
print('LRAP: ', LRAP(binary_y_val_small, y_val_pred.toarray()))

LRAP:  0.07931521277677434


# Rank SVM

Here we use a twin SVM adapted from this paper in 2016:
https://www.sciencedirect.com/science/article/abs/pii/S0031320315003751 

In [0]:
type(X_train_small.values)

numpy.ndarray

In [0]:
# from skmultilearn.adapt import MLTSVM
import time 

s_time = time.time()
svm_clf = MLTSVM(c_k = 2**-1)
svm_clf.fit(X_train_small.values, binary_y_train_small)
print(time.time()-s_time)

In [0]:
# pred
y_pred = svm_clf.predict(X_val)
y_pred_ = binarizer.inverse_transform(y_pred)

In [0]:
binarizer.fit_transform(y_val).shape

(2489, 3993)

In [0]:
import sklearn.metrics as metrics

print(metrics.label_ranking_average_precision_score(binarizer.fit_transform(y_val), 
                                                    binarizer.fit_transform(y_pred_)))

0.052771879688835586


# Label Space Embedding: LNEMLC

Reference: https://arxiv.org/abs/1812.02956 


Most approaches on multi-label classification focus on effective adaptation or transformation of existing binary and multi-class learning approaches but fail in modelling the joint probability of labels or do not preserve generalization abilities for unseen label combinations. To address these issues we propose a new multi-label classification scheme, LNEMLC - Label Network Embedding for Multi-Label Classification, that embeds the label network and uses it to extend input space in learning and inference of any base multi-label classifier. The approach allows capturing of labels' joint probability at low computational complexity providing results comparable to the best methods reported in the literature. We demonstrate how the method reveals statistically significant improvements over the simple kNN baseline classifier. We also provide hints for selecting the robust configuration that works satisfactorily across data domains.

In [0]:
!pip install scikit-multilearn



In [0]:
# # install openne
# !git clone https://github.com/thunlp/OpenNE.git
# !pip install -r requirements.txt

fatal: destination path 'OpenNE' already exists and is not an empty directory.
[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m


In [0]:
cd OpenNE/src

/content/drive/.shortcut-targets-by-id/1cXJSX-Wb546Od-de-PxfpA-IOtl_oV-Q/1003 Project/Data/OpenNE/src


In [0]:
!python setup.py install

running install
running bdist_egg
running egg_info
writing openne.egg-info/PKG-INFO
writing dependency_links to openne.egg-info/dependency_links.txt
writing top-level names to openne.egg-info/top_level.txt
writing manifest file 'openne.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build/bdist.linux-x86_64/egg
creating build/bdist.linux-x86_64/egg/openne
copying build/lib/openne/hope.py -> build/bdist.linux-x86_64/egg/openne
copying build/lib/openne/classify.py -> build/bdist.linux-x86_64/egg/openne
copying build/lib/openne/__init__.py -> build/bdist.linux-x86_64/egg/openne
copying build/lib/openne/walker.py -> build/bdist.linux-x86_64/egg/openne
copying build/lib/openne/sdne.py -> build/bdist.linux-x86_64/egg/openne
copying build/lib/openne/lap.py -> build/bdist.linux-x86_64/egg/openne
copying build/lib/openne/grarep.py -> build/bdist.linux-x86_64/egg/openne
copying build/lib/openne/node2vec.py -> build/bdist

In [0]:
from skmultilearn.embedding import SKLearnEmbedder, EmbeddingClassifier
from sklearn.manifold import SpectralEmbedding
from sklearn.ensemble import RandomForestRegressor
from skmultilearn.adapt import MLkNN
import time

clf = EmbeddingClassifier(
    SKLearnEmbedder(SpectralEmbedding(n_components = 100)),
    RandomForestRegressor(n_estimators=10),
    MLkNN(k=10)
)

start_time = time.time()
clf.fit(X_train, binary_y_train)
print("Training time is {}".format(time.time()-start_time))
predictions = clf.predict_proba(X_val)

# calculate lrap
import numpy as np
from sklearn.metrics import label_ranking_average_precision_score

label_ranking_average_precision_score(binary_y_val, predictions.toarray())

Training time is 7213.777302265167


0.2009070404066415

0.09628422715943351

In [0]:
# fine tuning

n_components_vals = [10, 50, 100, 150, 200]
n_estimators_vals = [10, 50, 100, 150, 200]
knn_vals = [5, 10, 20, 30 ,50]

for a in n_components_vals:
    for b in n_estimators_vals:
        for c in knn_vals:
            clf = EmbeddingClassifier(SKLearnEmbedder(SpectralEmbedding(n_components = 100)),
                                      RandomForestRegressor(n_estimators=100),
                                      MLkNN(k=10))
            start_time = time.time()
            clf.fit(X_train_small, binary_y_train_small)
            print("n_component for embedding:{}, n_trees_in_rf:{}, knn_k:{}".format(a,b,c))
            print("Training time: {}".format(time.time()-start_time))
            predictions = clf.predict(X_val_small)
            print("Evaluating lrap: {}".format(label_ranking_average_precision_score(binary_y_val_small, predictions.toarray())))

n_component for embedding:10, n_trees_in_rf:10, knn_k:5
Training time: 383.17239594459534
Evaluating lrap: 0.05401250879515343
n_component for embedding:10, n_trees_in_rf:10, knn_k:10
Training time: 378.3155310153961
Evaluating lrap: 0.05401250879515343
n_component for embedding:10, n_trees_in_rf:10, knn_k:20
Training time: 383.9575204849243
Evaluating lrap: 0.05401250879515343
n_component for embedding:10, n_trees_in_rf:10, knn_k:30
Training time: 380.64761781692505
Evaluating lrap: 0.05401250879515343
n_component for embedding:10, n_trees_in_rf:10, knn_k:50
Training time: 381.0272285938263
Evaluating lrap: 0.05401250879515343
n_component for embedding:10, n_trees_in_rf:50, knn_k:5
Training time: 388.5618441104889
Evaluating lrap: 0.05401250879515343
n_component for embedding:10, n_trees_in_rf:50, knn_k:10
Training time: 388.84656524658203
Evaluating lrap: 0.05401250879515343
n_component for embedding:10, n_trees_in_rf:50, knn_k:20
Training time: 390.4758560657501
Evaluating lrap: 0.0