In [1]:
!pip install scikit-multilearn



In [2]:
cd drive/My\ Drive/1003\ Project/Data

/content/drive/My Drive/1003 Project/Data


# Load data

In [0]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import label_ranking_average_precision_score as LRAP

In [0]:
def load_data(file_name, n_features=5000):
    """Load a multi-label dataset stored as a CSV with sparse feature strings.

    The CSV must contain a ``labels`` column (comma-separated integer label
    ids, e.g. ``"3, 17"``) and a ``features`` column (whitespace-separated
    ``index:value`` pairs, e.g. ``"0:1.5 42:0.25"``).

    Rows whose ``labels`` field is missing or contains a ``:`` (i.e. holds a
    feature pair instead of label ids) are dropped.

    Parameters
    ----------
    file_name : str or path-like
        Path of the CSV file to read.
    n_features : int, default 5000
        Width of the dense feature matrix; every feature index must be
        smaller than this value.

    Returns
    -------
    X : pandas.DataFrame
        Dense float matrix of shape ``(n_kept_rows, n_features)``.
    y : pandas.Series
        One tuple of integer label ids per kept row, indexed ``0..n-1``.

    Raises
    ------
    ValueError
        If a feature token is not of the form ``index:value`` or a label is
        not an integer.
    IndexError
        If a feature index is ``>= n_features``.
    """
    # Read both columns as text. Without this, pandas turns a labels column
    # in which every row has a single numeric label into integers (or floats
    # when some are missing), and the string handling below raises TypeError.
    data = pd.read_csv(file_name, usecols=['labels', 'features'], dtype=str)

    # remove rows without proper label (missing, or holding an index:value pair)
    has_label = data['labels'].map(lambda s: isinstance(s, str) and ':' not in s)
    data = data.loc[has_label.astype(bool)].reset_index(drop=True)

    # extract features from sparse representation
    feature = np.zeros((len(data), n_features))
    for i, sparse_row in enumerate(data['features']):
        if not isinstance(sparse_row, str):
            # An empty features field is an all-zero sparse vector.
            continue
        for pair in sparse_row.replace('\n', '').split():
            ft, val = pair.split(':')
            feature[i, int(ft)] = float(val)
    X = pd.DataFrame(feature)

    # extract labels
    y = data['labels'].map(
        lambda x: tuple(int(i) for i in x.replace(' ', '').split(','))
    )

    return X, y

In [0]:
# Parse the training split first, then the validation (dev) split, each into a
# dense feature frame and a series of label tuples.
(X_train, y_train), (X_val, y_val) = (
    load_data(csv_name) for csv_name in ('train.csv', 'dev.csv')
)

In [6]:
# Sanity check: (number of kept training rows, number of feature columns).
X_train.shape

(15511, 5000)

In [0]:
# Learn the label vocabulary from the training labels only, then encode each
# training label tuple as a 0/1 indicator row over that vocabulary.
binarizer = MultiLabelBinarizer().fit(y_train)
binary_y_train = binarizer.transform(y_train)

In [9]:
# Sanity check: (number of training rows, number of distinct labels in y_train).
binary_y_train.shape

(15511, 3786)

## MLP

In [10]:
from sklearn.neural_network import MLPClassifier
import time

# Fixed seed so weight initialisation and mini-batch sampling are repeatable;
# without it every run trains a different model and reports a different score.
SEED = 0

# Single hidden layer of 4300 tanh units, trained on the multi-label indicator
# matrix. perf_counter() is monotonic, so the measured duration cannot be
# distorted by system clock adjustments during the long fit.
start_time = time.perf_counter()
clf = MLPClassifier(hidden_layer_sizes=(4300,), activation='tanh', random_state=SEED)
clf.fit(X_train, binary_y_train)
end_time = time.perf_counter()



In [11]:
# Report how long the fit took, in seconds.
elapsed_seconds = end_time - start_time
print('Training time:', elapsed_seconds)

Training time: 2040.0393915176392


In [12]:
# Encode the validation labels with the vocabulary learned on the training set,
# score every label for every validation row, and report the label ranking
# average precision of those scores.
binary_y_val = binarizer.transform(y_val)
y_val_pred = clf.predict_proba(X_val)
val_lrap = LRAP(binary_y_val, y_val_pred)
print('LRAP: ', val_lrap)

  .format(sorted(unknown, key=str)))


LRAP:  0.6160757445141217


In [0]:
# Optional: uncomment to save the fitted classifier to disk with joblib.
# from joblib import dump, load
# dump(clf, 'MLP_4300_tanh_es.joblib') 