In [0]:
import scipy.io
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score,hamming_loss,label_ranking_average_precision_score,zero_one_loss,auc,coverage_error

In [0]:
data=scipy.io.loadmat('emotions.mat')

In [0]:
data

{'X': <391x72 sparse matrix of type '<class 'numpy.float64'>'
 	with 27995 stored elements in Compressed Sparse Column format>,
 'Xt': <202x72 sparse matrix of type '<class 'numpy.float64'>'
 	with 14487 stored elements in Compressed Sparse Column format>,
 'Y': <391x6 sparse matrix of type '<class 'numpy.float64'>'
 	with 709 stored elements in Compressed Sparse Column format>,
 'Yt': <202x6 sparse matrix of type '<class 'numpy.float64'>'
 	with 399 stored elements in Compressed Sparse Column format>,
 '__globals__': [],
 '__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Wed May 04 19:29:54 2016',
 '__version__': '1.0'}

In [0]:
X=pd.DataFrame(data['X'].toarray())
Y=pd.DataFrame(data['Y'].toarray())
Xt=pd.DataFrame(data['Xt'].toarray())
Yt=pd.DataFrame(data['Yt'].toarray())

# LIFT stage-1

In [0]:
def batchkmeans_cluster(D,clust_num):
    batch_kmeans = MiniBatchKMeans(n_clusters=clust_num,max_iter=100,batch_size=D.shape[0]*10)
    batch_kmeans.fit(D)
    centroids=batch_kmeans.cluster_centers_
    return centroids

#calculates distances of each instance from a centroid
def centroid_distance(data,centroid):
    dist=[]
    for instance in range(len(data)):
        dist.append(np.linalg.norm(data.iloc[instance,:].values-centroid))
    return dist

#calculates distances of each instance from all centroid
def dist_of_rows_from_centroids(X,centroids):
    distance_dataframe=pd.DataFrame()
    i=0
    for centre in centroids:
        distance_dataframe[i]=centroid_distance(X,centre) #this will loop for all centroids
        i=i+1
    return distance_dataframe

def SGD_svm(X,Y):
    clf=SGDClassifier(n_jobs=-1)
    clf.fit((X),Y)
    return clf

In [0]:
def LIFT(X,Y,Xt,Yt,ratio):
    #step-1
    classifiers_for_label={} #this will store all classifier functions
    centroids_per_label={}
    for label in range(Y.shape[1]):
        positive_instances=X[Y.iloc[:,label]==1]
        negative_instances=X[Y.iloc[:,label]!=1]
        clust_num=int(ratio*(min(len(positive_instances),len(negative_instances)))) #calculates the number of clusters
        centroids=[] #will stores all the centroids
        centroids.extend(batchkmeans_cluster(positive_instances,clust_num))
        centroids.extend(batchkmeans_cluster(negative_instances,clust_num))
        centroids_per_label[str(label)]=centroids
        distance_dataframe=dist_of_rows_from_centroids(X,centroids)#it saves distance from instances to each centroids
        #step-2
        classifiers_for_label[str(label)]=SGD_svm(distance_dataframe,Y.iloc[:,label]) #classifier is trained label wise from the distance matrix and label
    #step-3
    results =pd.DataFrame()
    for label_2b_pred in range(Y.shape[1]):
        Xt_dist_for_label=dist_of_rows_from_centroids(Xt,centroids_per_label[str(label_2b_pred)])
        results[str(label_2b_pred)]=classifiers_for_label[str(label_2b_pred)].predict(Xt_dist_for_label) #this transforms test set to the distance form upon which all classifiers will act to give labelset
    print('Hamming loss : {}'.format(hamming_loss(Yt.values,results.values)))
    print('zero_one_loss : {}'.format(zero_one_loss(Yt.values,results.values)))
    print('coverage_error : {}'.format(coverage_error(Yt.values,results.values)))
    print('label_ranking_average_precision_score : {}'.format(label_ranking_average_precision_score(Yt.values,results.values)))

In [0]:
LIFT(X,Y,Xt,Yt,0.1)

Hamming loss : 0.3316831683168317
zero_one_loss : 0.8613861386138614
coverage_error : 4.891089108910891
label_ranking_average_precision_score : 0.5134763476347634
