In [3]:
import numpy as np
import warnings
import sys
import pandas as pd
import scipy
from scipy.io import arff
if not sys.warnoptions:
    warnings.simplefilter("ignore")


In [4]:
# Read the train and test splits of the scene dataset from their ARFF files.
# loadarff hands back a (records, metadata) pair; `meta` ends up holding the
# metadata of the test file because it is assigned last.
data_train, meta = arff.loadarff('scene-train.arff')
data_test, meta = arff.loadarff('scene-test.arff')

# Wrap the raw records in DataFrames for column-wise slicing.
df_train = pd.DataFrame(data_train)
df_test = pd.DataFrame(data_test)

# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,Att1,Att2,Att3,Att4,Att5,Att6,Att7,Att8,Att9,Att10,...,Att291,Att292,Att293,Att294,Beach,Sunset,FallFoliage,Field,Mountain,Urban
0,0.983225,0.981488,0.978542,0.975908,0.972962,0.968367,0.9659,0.938807,0.888701,0.862884,...,0.002494,0.012225,0.045861,0.094455,b'1',b'0',b'0',b'0',b'1',b'0'
1,0.01816,0.074481,0.3695,0.454558,0.423218,0.37996,0.661803,0.653965,0.66601,0.671838,...,0.005567,0.014649,0.025994,0.024701,b'1',b'0',b'0',b'0',b'0',b'0'
2,0.7628,0.79123,0.799371,0.795679,0.779932,0.756109,0.719042,0.877675,0.907015,0.919592,...,0.040818,0.051593,0.057795,0.057418,b'1',b'0',b'0',b'0',b'0',b'0'
3,0.758518,0.738699,0.709982,0.686717,0.694709,0.687867,0.461537,0.852653,0.83134,0.798184,...,0.599021,0.068253,0.061698,0.053979,b'1',b'0',b'0',b'0',b'1',b'0'
4,0.948445,0.969889,0.965691,0.969223,0.969579,0.966511,0.949118,0.823672,0.853659,0.857751,...,0.001599,0.000875,0.000813,0.003186,b'1',b'0',b'0',b'0',b'0',b'0'


In [5]:
# The trailing six columns (Beach ... Urban) are the label indicators; every
# column before them is a numeric feature.
n_labels = 6

X_train = df_train.iloc[:, :-n_labels].values
X_test = df_test.iloc[:, :-n_labels].values

# Label columns are loaded as byte strings (b'0' / b'1'); cast them to integers.
y_train = df_train.iloc[:, -n_labels:].values.astype(int)
y_test = df_test.iloc[:, -n_labels:].values.astype(int)

In [6]:
# Report the (rows, columns) shape of each split as a quick sanity check.
for caption, split in (
    ("Train_X: ", X_train),
    ("Train_Y: ", y_train),
    ("Test_X: ", X_test),
    ("Test_Y: ", y_test),
):
    print(caption, split.shape)

Train_X:  (1211, 294)
Train_Y:  (1211, 6)
Test_X:  (1196, 294)
Test_Y:  (1196, 6)


## Accuracy Score


In [7]:
def hamming_accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    http://stackoverflow.com/q/32239577/395857

    For every sample the score is |true ∩ pred| / |true ∪ pred|, where the sets
    contain the indices of the non-zero labels. A sample with no true and no
    predicted labels scores 1.

    Parameters
    ----------
    y_true, y_pred : 2-D array-like or sparse matrix, shape (n_samples, n_labels)
        Binary label indicators; any non-zero entry counts as "label present".
    normalize : bool, default True
        If True return the (weighted) mean of the per-sample scores, otherwise
        return their (weighted) sum.
    sample_weight : 1-D array-like of length n_samples, optional
        Per-sample weights. None weights every sample equally.

    Returns
    -------
    float
        Mean (or sum) of the per-sample scores.

    Raises
    ------
    ValueError
        If the inputs are not 2-D, their shapes differ, or sample_weight does
        not have one entry per sample.
    '''
    def _as_mask(labels):
        # Sparse matrices expose .toarray(); everything else goes through asarray.
        if hasattr(labels, 'toarray'):
            labels = labels.toarray()
        return np.asarray(labels).astype(bool)

    true_mask = _as_mask(y_true)
    pred_mask = _as_mask(y_pred)
    if true_mask.ndim != 2 or true_mask.shape != pred_mask.shape:
        raise ValueError(
            'y_true and y_pred must be 2-D with identical shapes, got {0} and {1}'
            .format(true_mask.shape, pred_mask.shape))

    intersection = np.logical_and(true_mask, pred_mask).sum(axis=1)
    union = np.logical_or(true_mask, pred_mask).sum(axis=1)

    # Samples whose union is empty (nothing true, nothing predicted) keep score 1.
    scores = np.ones(union.shape[0], dtype=float)
    has_labels = union > 0
    scores[has_labels] = intersection[has_labels] / union[has_labels].astype(float)

    if sample_weight is None:
        return np.mean(scores) if normalize else np.sum(scores)

    weights = np.asarray(sample_weight, dtype=float)
    if weights.shape != scores.shape:
        raise ValueError(
            'sample_weight must have shape {0}, got {1}'
            .format(scores.shape, weights.shape))
    if normalize:
        return np.average(scores, weights=weights)
    return np.dot(scores, weights)

## Model 1: Classifier Chain of Linear SVMs

In [8]:
from sklearn.metrics import hamming_loss, log_loss, f1_score, accuracy_score

In [10]:
# Model 1: a classifier chain (scikit-multilearn) built on a linear-kernel SVC.
from skmultilearn.problem_transform import ClassifierChain
from sklearn.svm import SVC
# initialize classifier chains multi-label classifier with a linear SVM as the base estimator
classifier = ClassifierChain(classifier= SVC(kernel='linear'))

# Train the chain of linear SVMs on the training data
# (the base estimator is SVC, not logistic regression)
classifier.fit(X_train, y_train)

# predict label indicators for the test set; the evaluation cell calls
# .toarray() on the result, so it is presumably a sparse matrix
predictions = classifier.predict(X_test)

In [11]:
# accuracy
# Densify the prediction matrix once and reuse it for every metric instead of
# calling .toarray() per metric (and passing the sparse form to hamming_loss only).
predictions_dense = predictions.toarray()
print("Accuracy = ",accuracy_score(y_test,predictions_dense))
print("Hamming accuracy = ",hamming_accuracy_score(y_test,predictions_dense))
print("Hamming loss = ",hamming_loss(y_test,predictions_dense))
# NOTE(review): log_loss is fed hard 0/1 label predictions here rather than
# probabilities, so read this value with care.
print("Log loss = ",log_loss(y_test,predictions_dense))
print("F1 score = ",f1_score(y_test,predictions_dense,average='macro'))

Accuracy =  0.611204013378
Hamming accuracy =  0.659420289855
Hamming loss =  0.11580267558528429
Log loss =  12.7459829616
F1 score =  0.682575521853


## Model 2: Multi-label Lazy Learning (ML-kNN)

In [12]:
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
classifier_new = MLkNN(k=10)
# Note that this classifier can throw up errors when handling sparse matrices,
# so build explicit dense arrays and actually hand those to fit/predict.
# (Previously the dense copies were created but X_train / X_test were passed.)
x_train = lil_matrix(X_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(X_test).toarray()
# train
classifier_new.fit(x_train, y_train)
# predict
predictions = classifier_new.predict(x_test)

In [13]:
# accuracy
# Densify the prediction matrix once and reuse it for every metric instead of
# calling .toarray() per metric (and passing the sparse form to hamming_loss only).
predictions_dense = predictions.toarray()
print("Accuracy = ",accuracy_score(y_test,predictions_dense))
print("Hamming accuracy = ",hamming_accuracy_score(y_test,predictions_dense))
print("Hamming loss = ",hamming_loss(y_test,predictions_dense))
# NOTE(review): log_loss is fed hard 0/1 label predictions here rather than
# probabilities, so read this value with care.
print("Log loss = ",log_loss(y_test,predictions_dense))
print("F1 score = ",f1_score(y_test,predictions_dense,average='macro'))

Accuracy =  0.605351170569
Hamming accuracy =  0.651477146042
Hamming loss =  0.09002229654403568
Log loss =  6.76991680741
F1 score =  0.72405849133


## Model 3: Classic ANN

In [53]:
import os
# Hide every CUDA device so the network trains on the CPU. The variable has to
# be set before Keras/TensorFlow is imported. It was previously misspelled
# 'CUDA_VISIBLE_DIVICES', which set an unrelated variable and hid nothing.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.backend.tensorflow_backend import clear_session
# Drop any graph/state left over from earlier runs of this cell.
clear_session()
# Fully connected network: features -> 512 -> 256 -> one sigmoid unit per label.
model = Sequential()
model.add(Dense(512,activation='relu',input_shape=(X_train.shape[1],),kernel_initializer='glorot_uniform'))
model.add(Dropout(0.4))
model.add(Dense(256,activation='relu',kernel_initializer='glorot_uniform'))
model.add(Dropout(0.4))
# Sigmoid (not softmax) so each label gets an independent probability.
model.add(Dense(y_train.shape[1],activation='sigmoid',kernel_initializer='glorot_uniform'))
# Binary cross-entropy treats every label as its own yes/no problem.
model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               151040    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 1542      
Total params: 283,910
Trainable params: 283,910
Non-trainable params: 0
_________________________________________________________________


In [65]:
# Train for 5 epochs in batches of 50. X_test is used directly rather than the
# lowercase x_test copy, which only exists if the ML-kNN cell has been run.
# NOTE: the test split doubles as validation data here.
model.fit(x=X_train,y=y_train,epochs=5,batch_size=50,validation_data=(X_test,y_test))

Train on 1211 samples, validate on 1196 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1ea9f690eb8>

In [66]:
#Predictions
# Predict on X_test directly rather than the lowercase x_test copy, which only
# exists if the ML-kNN cell has been run.
probabilities = model.predict(X_test)
# Round each per-label output to the nearest integer (0 or 1) in one
# vectorised call instead of a per-row Python loop.
predictions = np.round(probabilities).astype(int)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("Hamming loss = ",hamming_loss(y_test,predictions))
# NOTE(review): log_loss is computed on the rounded 0/1 predictions, matching
# the other models; `probabilities` holds the un-rounded outputs if a
# probability-based loss is wanted.
print("Log loss = ",log_loss(y_test,predictions))
print("F1 score = ",f1_score(y_test,predictions,average='macro'))

Accuracy =  0.663740245262
Hamming loss =  0.1020066889632107
Log loss =  9.42201292523
F1 score =  0.715475620991
