In [None]:
import scipy.io
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score,hamming_loss,label_ranking_average_precision_score,zero_one_loss,auc,coverage_error,mean_squared_error

In [None]:
def batchkmeans_cluster(D,clust_num):
    """Cluster the rows of ``D`` with mini-batch k-means and return the centres.

    Parameters
    ----------
    D : object exposing ``shape`` that ``MiniBatchKMeans.fit`` accepts
        Instances to cluster, one per row.
    clust_num : int
        Number of clusters requested from ``MiniBatchKMeans``.

    Returns
    -------
    The fitted estimator's ``cluster_centers_`` attribute.
    """
    # The requested batch size is ten times the number of rows in D.
    fitted=MiniBatchKMeans(
        n_clusters=clust_num,
        max_iter=100,
        batch_size=10*D.shape[0],
    ).fit(D)
    return fitted.cluster_centers_

#calculates distances of each instance from a centroid
def centroid_distance(data,centroid):
    """Return the Euclidean distance of every row of ``data`` from ``centroid``.

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array-like
        One instance per row.
    centroid : 1-D array-like
        A point with one coordinate per column of ``data``.

    Returns
    -------
    list of float
        ``len(data)`` distances, in row order. Empty when ``data`` has no rows.
    """
    # Nothing to measure; returning early also avoids broadcasting a frame
    # without columns against the centroid.
    if len(data)==0:
        return []
    # One vectorised norm over all rows replaces a Python-level loop that did
    # a positional ``iloc`` lookup for every single instance.
    offsets=np.asarray(data,dtype=float)-np.asarray(centroid,dtype=float)
    return np.linalg.norm(offsets,axis=1).tolist()

#calculates the distance of each instance from every centroid
def dist_of_rows_from_centroids(X,centroids):
    """Build the instance-to-centroid Euclidean distance matrix.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        One instance per row.
    centroids : iterable of 1-D array-like
        Points with one coordinate per column of ``X``.

    Returns
    -------
    pandas.DataFrame
        One row per instance of ``X`` (default integer index) and one column
        per centroid, labelled ``0 .. k-1`` in iteration order; entry
        ``[r, j]`` is the distance from row ``r`` to centroid ``j``.
        An empty frame when ``centroids`` is empty.
    """
    centres=[np.asarray(centre,dtype=float) for centre in centroids]
    if not centres:
        return pd.DataFrame()
    rows=np.asarray(X,dtype=float)
    if rows.shape[0]==0:
        # No instances: keep one (empty) column per centroid.
        return pd.DataFrame(columns=range(len(centres)),dtype=float)
    # Each column is a single vectorised norm; the frame is assembled once at
    # the end because inserting columns one at a time fragments the frame
    # (pandas warns about this once enough columns have been added).
    columns=[np.linalg.norm(rows-centre,axis=1) for centre in centres]
    return pd.DataFrame(np.column_stack(columns))
# any classifier can be used
def SGD_svm(X,Y):
    """Fit an ``SGDClassifier`` (library defaults, ``n_jobs=-1``) on ``X``/``Y``.

    Returns the fitted estimator.
    """
    model=SGDClassifier(n_jobs=-1)
    # ``fit`` hands back the estimator itself, so it can be returned directly.
    return model.fit(X,Y)

In [None]:
def LIFT(X,Y,Xt,Yt,ratio):
    """Train one label-specific classifier per column of ``Y`` and evaluate on the test split.

    For every label the training instances are split into positives
    (``Y == 1``) and negatives, each group is clustered, every instance is
    re-expressed as its distances to the resulting centroids, and a classifier
    is trained on that distance representation. The test set is mapped through
    the same centroids and predicted label by label.

    Parameters
    ----------
    X, Xt : pandas.DataFrame
        Training / test features, one instance per row.
    Y, Yt : pandas.DataFrame
        Training / test label indicators; column ``j`` of ``Yt`` must
        correspond to column ``j`` of ``Y``.
    ratio : float
        Fraction of the smaller of the positive/negative groups used as the
        number of clusters per group.

    Returns
    -------
    results : pandas.DataFrame
        Predicted labels for ``Xt``; columns are ``'0', '1', ...`` (the label
        position in ``Y`` as a string).
    centroids_per_label : dict
        Maps the same string keys to the list of centroids (positive-group
        centroids first, then negative-group centroids).

    Side effects
    ------------
    Prints Hamming loss, zero-one loss, coverage error and label-ranking
    average precision for the test split.
    """
    label_keys=[str(label) for label in range(Y.shape[1])]
    classifiers_for_label={} #this will store all classifier functions
    centroids_per_label={}
    #step-1
    for label,key in enumerate(label_keys):
        target=Y.iloc[:,label]
        # Positional mask: selection must not depend on X and Y sharing an index.
        is_positive=(target==1).values
        positive_instances=X[is_positive]
        negative_instances=X[~is_positive]
        # A small ratio or a rare label truncates to 0 clusters, which
        # k-means rejects; always ask for at least one cluster per group.
        clust_num=max(1,int(ratio*min(len(positive_instances),len(negative_instances))))
        centroids=[] #will store all the centroids
        centroids.extend(batchkmeans_cluster(positive_instances,clust_num))
        centroids.extend(batchkmeans_cluster(negative_instances,clust_num))
        centroids_per_label[key]=centroids
        #step-2
        # The classifier is trained label-wise on the distance representation.
        distance_dataframe=dist_of_rows_from_centroids(X,centroids)
        classifiers_for_label[key]=SGD_svm(distance_dataframe,target)
    #step-3
    predictions={}
    scores={}
    for key in label_keys:
        # Map the test set through this label's centroids before predicting.
        Xt_dist_for_label=dist_of_rows_from_centroids(Xt,centroids_per_label[key])
        clf=classifiers_for_label[key]
        predictions[key]=clf.predict(Xt_dist_for_label)
        # Ranking metrics need continuous scores; fall back to the hard
        # predictions only if the classifier exposes no decision function.
        score_fn=getattr(clf,'decision_function',None)
        scores[key]=score_fn(Xt_dist_for_label) if score_fn is not None else predictions[key]
    # Assemble each frame once instead of inserting one column at a time.
    results=pd.DataFrame(predictions)
    score_values=pd.DataFrame(scores).values
    print('Hamming loss : {}'.format(hamming_loss(Yt.values,results.values)))
    print('zero_one_loss : {}'.format(zero_one_loss(Yt.values,results.values)))
    print('coverage_error : {}'.format(coverage_error(Yt.values,score_values)))
    print('label_ranking_average_precision_score : {}'.format(label_ranking_average_precision_score(Yt.values,score_values)))
    return results,centroids_per_label

In [None]:
# Mediamill

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset file
# can be read from it (requires interactive authorisation).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Load the Mediamill MATLAB file from the mounted drive; the result is a dict
# keyed by the variable names stored in the file.
data=scipy.io.loadmat('/content/drive/My Drive/Code/mediamill.mat')

In [None]:
# Inspect the loaded variables (keys and matrix shapes).
data

{'X': <35125x120 sparse matrix of type '<class 'numpy.float64'>'
 	with 4215000 stored elements in Compressed Sparse Column format>,
 'Xt': <8782x120 sparse matrix of type '<class 'numpy.float64'>'
 	with 1053840 stored elements in Compressed Sparse Column format>,
 'Y': <35125x101 sparse matrix of type '<class 'numpy.float64'>'
 	with 153669 stored elements in Compressed Sparse Column format>,
 'Yt': <8782x101 sparse matrix of type '<class 'numpy.float64'>'
 	with 38449 stored elements in Compressed Sparse Column format>,
 '__globals__': [],
 '__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on: Sun Mar 18 10:33:29 2018',
 '__version__': '1.0'}

In [None]:
# Densify each matrix loaded from the .mat file and wrap it in a DataFrame:
# X/Y are the training features/labels, Xt/Yt the test features/labels.
X,Y,Xt,Yt=(pd.DataFrame(data[name].toarray()) for name in ('X','Y','Xt','Yt'))

In [None]:
def minlabelindices(Y):
    """Find the minority labels of a label-indicator frame.

    The imbalance ratio of a label is the column total of the most frequent
    label divided by that label's own column total. A label is reported as a
    minority label when its ratio is above the mean ratio over all labels.

    Parameters
    ----------
    Y : pandas.DataFrame
        Label indicators, one column per label.

    Returns
    -------
    minlabidx : numpy.ndarray
        1-D array holding the column labels of ``Y`` that are minority labels.
    IRlabel : pandas.Series
        Imbalance ratio of every label, indexed by ``Y.columns``.
    """
    # Ask for per-column totals explicitly rather than relying on how
    # ``np.sum`` forwards its ``axis`` argument to pandas.
    labcount=Y.sum(axis=0)
    IRlabel=labcount.max()/labcount
    # Select by position: a label-aligned boolean Series cannot be applied to
    # a helper frame with a 0..n-1 index unless Y's columns happen to be
    # exactly 0..n-1, so the mask is converted to a plain array first.
    is_minority=(IRlabel>IRlabel.mean()).values
    minlabidx=Y.columns[is_minority].values
    return minlabidx,IRlabel

In [None]:
# Minority-label column labels and the per-label imbalance ratios of the training labels.
indices,IR=minlabelindices(Y)

In [None]:
# Label whose imbalance ratio is the largest.
IR.idxmax()

100

In [None]:
# Run plain LIFT on label columns 89 and 99 only, with a cluster ratio of 0.1.
q,w = LIFT(X,Y.iloc[:,[89,99]],Xt,Yt.iloc[:,[89,99]],0.1) #case - 1 : without smote

Hamming loss : 0.0007970849464814393
zero_one_loss : 0.0015941698929629045
coverage_error : 0.0031883397859257573
label_ranking_average_precision_score : 0.9992029150535185


In [None]:
# LIFT with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
def LIFT_smote(X,Y,Xt,Yt,ratio):
    """LIFT with per-label SMOTE oversampling of the training data.

    For every label the training set is first resampled with SMOTE against
    that label alone. The resampled instances are then split into positives
    (``== 1``) and negatives, each group is clustered, every resampled
    instance is re-expressed as its distances to the resulting centroids, and
    a classifier is trained on that representation. The test set is mapped
    through the same centroids and predicted label by label.

    Parameters
    ----------
    X, Xt : pandas.DataFrame
        Training / test features, one instance per row.
    Y, Yt : pandas.DataFrame
        Training / test label indicators; column ``j`` of ``Yt`` must
        correspond to column ``j`` of ``Y``.
    ratio : float
        Fraction of the smaller of the positive/negative groups (after
        resampling) used as the number of clusters per group.

    Returns
    -------
    results : pandas.DataFrame
        Predicted labels for ``Xt``; columns are ``'0', '1', ...`` (the label
        position in ``Y`` as a string).
    centroids_per_label : dict
        Maps the same string keys to the list of centroids (positive-group
        centroids first, then negative-group centroids).

    Side effects
    ------------
    Prints Hamming loss, zero-one loss, coverage error and label-ranking
    average precision for the test split.
    """
    label_keys=[str(label) for label in range(Y.shape[1])]
    classifiers_for_label={} #this will store all classifier functions
    centroids_per_label={}
    #step-1
    for label,key in enumerate(label_keys):
        #applied smote to balance the data
        # NOTE(review): ``n_jobs`` is not passed because, as far as I recall,
        # recent imbalanced-learn releases deprecate it on SMOTE -- confirm
        # against the installed version if parallel neighbour search matters.
        sampler=SMOTE()
        X_res,y_res=sampler.fit_resample(X,Y.iloc[:,label])
        X_res=pd.DataFrame(X_res)
        # The resampled target holds this one label only, so keep it as a
        # 1-D Series; indexing it by the label position would run out of
        # bounds for every label after the first.
        y_res=pd.Series(np.asarray(y_res).ravel())
        is_positive=(y_res==1).values
        positive_instances=X_res[is_positive]
        negative_instances=X_res[~is_positive]
        # A small ratio truncates to 0 clusters, which k-means rejects;
        # always ask for at least one cluster per group.
        clust_num=max(1,int(ratio*min(len(positive_instances),len(negative_instances))))
        centroids=[] #will store all the centroids
        centroids.extend(batchkmeans_cluster(positive_instances,clust_num))
        centroids.extend(batchkmeans_cluster(negative_instances,clust_num))
        centroids_per_label[key]=centroids
        #step-2
        # The classifier is trained label-wise on the distance representation
        # of the resampled instances.
        distance_dataframe=dist_of_rows_from_centroids(X_res,centroids)
        classifiers_for_label[key]=SGD_svm(distance_dataframe,y_res)
    #step-3
    predictions={}
    scores={}
    for key in label_keys:
        # Map the test set through this label's centroids before predicting.
        Xt_dist_for_label=dist_of_rows_from_centroids(Xt,centroids_per_label[key])
        clf=classifiers_for_label[key]
        predictions[key]=clf.predict(Xt_dist_for_label)
        # Ranking metrics need continuous scores; fall back to the hard
        # predictions only if the classifier exposes no decision function.
        score_fn=getattr(clf,'decision_function',None)
        scores[key]=score_fn(Xt_dist_for_label) if score_fn is not None else predictions[key]
    # Assemble each frame once instead of inserting one column at a time.
    results=pd.DataFrame(predictions)
    score_values=pd.DataFrame(scores).values
    print('Hamming loss : {}'.format(hamming_loss(Yt.values,results.values)))
    print('zero_one_loss : {}'.format(zero_one_loss(Yt.values,results.values)))
    print('coverage_error : {}'.format(coverage_error(Yt.values,score_values)))
    print('label_ranking_average_precision_score : {}'.format(label_ranking_average_precision_score(Yt.values,score_values)))
    return results,centroids_per_label

In [None]:
# Run LIFT with SMOTE on the same two label columns (89 and 99), with a cluster ratio of 0.01.
q1,w1=LIFT_smote(X,Y.iloc[:,[89,99]],Xt,Yt.iloc[:,[89,99]],0.01)



IndexError: ignored