In [0]:
!pip install scikit-multilearn

Collecting scikit-multilearn
[?25l  Downloading https://files.pythonhosted.org/packages/bb/1f/e6ff649c72a1cdf2c7a1d31eb21705110ce1c5d3e7e26b2cc300e1637272/scikit_multilearn-0.2.0-py3-none-any.whl (89kB)
[K     |████████████████████████████████| 92kB 2.8MB/s 
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [0]:
cd drive/My\ Drive/1003\ Project

/content/drive/My Drive/1003 Project


# Load data

In [0]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import label_ranking_average_precision_score as LRAP

In [0]:
def load_data(file_name):

    # load data from csv
    data = pd.read_csv(file_name, usecols=['labels', 'features'])

    # remove rows without proper label
    rows_to_remove = [i for i in range(len(data)) if ':' in data.loc[i,'labels']]
    data.drop(rows_to_remove, inplace=True)
    data.reset_index(drop=True, inplace=True)

    # extract features from sparse representation
    feature = np.zeros((len(data), 5000))
    for i in range(len(data)):
        for j in data.loc[i,'features'].replace('\n','').split():
            ft, val = j.split(':')
            feature[i,int(ft)] = float(val)
    X = pd.DataFrame(feature)

    # extract labels
    y = data['labels'].map(lambda x: tuple([int(i) for i in x.replace(' ','').split(',')]))
    
    return X, y

In [0]:
X_train, y_train = load_data("train.csv")
X_val, y_val = load_data('dev.csv')

In [0]:
X_train.shape

(15511, 5000)

In [0]:
binarizer = MultiLabelBinarizer()
binary_y_train = binarizer.fit_transform(y_train)

In [0]:
binary_y_train.shape

(15511, 3786)

## Classifier Chain with RF

In [0]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier
import time
start_time = time.time()
clf = ClassifierChain(RandomForestClassifier())
clf.fit(X_train, binary_y_train)
end_time = time.time()

In [0]:
print('Training time:', end_time - start_time)

Training time: 11963.349047899246


In [0]:
binary_y_val = binarizer.transform(y_val)
y_val_pred = clf.predict_proba(X_val)
print('LRAP: ', LRAP(binary_y_val, y_val_pred.toarray()))

  .format(sorted(unknown, key=str)))


LRAP:  0.5551620623956518


In [0]:
from joblib import dump, load
dump(clf, 'CC_RF.joblib') 

['CC_RF.joblib']

In [0]:
# clf = load('CC_RF.joblib') 

In [0]:
# binary_y_val = binarizer.transform(y_val)
# y_val_pred = clf.predict_proba(X_val)
# print('LRAP: ', LRAP(binary_y_val, y_val_pred.toarray()))

  .format(sorted(unknown, key=str)))


LRAP:  0.5551620623956518
