# Extreme Multi-Label Machine Model

### Baseline Machine Gaussian Naive Bayes Learning Model

In [None]:
import hickle as hkl
X_train = hkl.load( 'X_train.hkl' )
X_test = hkl.load( 'X_test.hkl' )
y_test = hkl.load( 'y_test.hkl' )
y_train = hkl.load( 'y_train.hkl' )

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.naive_bayes import GaussianNB

classif_list = []
train_score_list, test_score_list = [], []

# Training
print("Training")
for i in range(len(labels_list)):
    y_train_for_this_ub04 = y_train[:,i]
    new_classifier = GaussianNB()
    new_classifier.fit(X_train, y_train_for_this_ub04)
    classif_list.append(new_classifier)

# Test & display results
print("Test and display results")
for i in range(len(labels_list)):
    classif = classif_list[i]
    train_score = float('{0:.3f}'.format(classif.score(X_train, y_train[:,i])))
    test_score = float('{0:.3f}'.format(classif.score(X_test, y_test[:,i])))
    train_score_list.append(train_score)
    test_score_list.append(test_score)
    print('Detecting {} with {}% accuracy (training {}%)'.format(labels_list[i], 100*test_score, 100*train_score))

predict_train = np.zeros_like(y_train)
predict_test = np.zeros_like(y_test)
for i in range(len(labels_list)):
    classif = classif_list[i]
    predict_train[:,i] = classif.predict(X_train)
    predict_test[:,i] = classif.predict(X_test)
acc_train = 1 - np.sum(np.abs(predict_train - y_train))/(y_train.shape[0]*y_train.shape[1])
acc_test = 1 - np.sum(np.abs(predict_test - y_test))/(y_test.shape[0]*y_test.shape[1])
print('###')
print('Global accuracy: testing {}, training {}'.format(acc_test, acc_train))

well_labeled = 0
for i in range(len(y_train)):
    if np.sum(np.abs(y_train[i,:] - predict_train[i,:])) == 0:
        well_labeled +=1
print('Overall {} out of the {} training samples were well labeled'.format(well_labeled,len(y_train)))

well_labeled = 0
for i in range(len(y_test)):
    if np.sum(np.abs(y_test[i,:] - predict_test[i,:])) == 0:
        well_labeled +=1
print('Overall {} out of the {} testing samples were well labeled'.format(well_labeled,len(y_test)))

# Evidemment, en plus d'être mathématiquement désapprouvée, cette méthode est encombrante:
# imaginez avoir à construire un million de classificateurs ! Ce n'est pas du tout extensible à de la classification extrême

### Deep Extreme Multi-label Learning by Zhang et al
Follwing XML methods in the paper [Deep Extreme Multi-label Learning by Zhang et al](https://arxiv.org/pdf/1704.03718.pdf) and [eXtreme Multilabel Classification Notebook](https://github.com/therhappy/xml-tuto/blob/master/eXtreme%20Multilabel%20Classification%20Notebook%20-%20EN.ipynb). 



The goal in this section is build an embedded vector of the UB-04, ICD-10, CPT and Modifier labels. This embedded vector will be used to predict the correct label for EHR procedures.


In [2]:
import numpy as np
%matplotlib inline

In [99]:
import hickle as hkl
y_train = hkl.load( 'y_train.hkl' )
labels_list = hkl.load('labels_list.hkl' )

In [100]:
labels_list[0:5]

['10022', '10060', '10120', '10160', '10180']

In [101]:
y_train[0:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [102]:
print("lables names: ", len(labels_list))

lables names:  8630


In [103]:
#y_train = y_train[0:1000]
print(y_train.shape[0])
print(y_train.shape[1])

67000
8630


In [104]:
edges = []
matrix = np.zeros((y_train.shape[0],y_train.shape[1]))
print(matrix.shape[0])
print(matrix.shape[1])

67000
8630


In [105]:
%%time
for row in y_train:
    act = list(np.where(row == 1))
    act = [list(i) for i in act ][0]
    for i in range(len(act)):
        for j in range(len(act)):
            matrix[i,act[j]] +=1
            
            #if ([act[i],act[j]] not in edges) and ([act[j],act[i]] not in edges): #and not i==j:
            #if [act[j],act[i]] not in edges and not i==j:
            #if not i==j and i <:
            if i < j:
                edges.append([act[i],act[j]])
        

Wall time: 16.6 s


In [106]:
matrix[0][1914]

1692.0

In [107]:
matrix

array([[  1.,  10.,   1., ..., 267., 613.,  47.],
       [  1.,  10.,   1., ..., 267., 613.,  47.],
       [  1.,  10.,   1., ..., 267., 613.,  47.],
       ...,
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.]])

In [108]:
len(edges)

3612991

In [109]:
edges2 = [[x,y] for x,y in (set(tuple(x) for x in edges))]

In [110]:
edges2[:5]

[[292, 8167], [2538, 2974], [959, 3912], [1559, 1957], [3897, 4036]]

In [111]:
label_dict = {}
for i in range(len(labels_list)):
    label_dict[i] = labels_list[i]

In [95]:
'''
for i in range(len(edges2)):
    val1 = edges2[i][0]
    val2 = edges2[i][1]
    edges2[i][0] = label_dict[val1]
    edges2[i][1] = label_dict[val2]
'''

'\nfor i in range(len(edges2)):\n    val1 = edges2[i][0]\n    val2 = edges2[i][1]\n    edges2[i][0] = label_dict[val1]\n    edges2[i][1] = label_dict[val2]\n'

In [112]:
edges2[0:10]

[[292, 8167],
 [2538, 2974],
 [959, 3912],
 [1559, 1957],
 [3897, 4036],
 [328, 8279],
 [6208, 6330],
 [1978, 7968],
 [3462, 6113],
 [1568, 3416]]

In [53]:
from sklearn.preprocessing import normalize
norm_matrix = normalize(matrix, axis=1, norm='l1')

In [113]:
G=nx.Graph()
for edge in edges2:
        G.add_edge(edge[0], edge[1])

In [54]:
norm_matrix

array([[1.93574488e-06, 1.93574488e-05, 1.93574488e-06, ...,
        5.16843884e-04, 1.18661161e-03, 9.09800096e-05],
       [1.93913445e-06, 1.93913445e-05, 1.93913445e-06, ...,
        5.17748898e-04, 1.18868942e-03, 9.11393191e-05],
       [2.07942575e-06, 2.07942575e-05, 2.07942575e-06, ...,
        5.55206674e-04, 1.27468798e-03, 9.77330101e-05],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [40]:
import networkx as nx #For those among you that use Anaconda dist, you already got this

def draw_graph(edges, weights_matrix=None, threshold=0, figsize=(20,20)):
    
    '''
    edges : liste des connected labels
    weights_matrix : labels proximity matrix
    threshold : number of occurence required to draw a connection
    figsize : size of the graph to display
    '''
    
    edges = [edge for edge in edges if weights_matrix[edge[0],edge[1]] > threshold]
    
    # additional settings (you can mess around here)
    node_size = 1600
    node_color = 'blue'
    node_alpha = 0.2
    node_text_size = 12
    edge_color = 'blue'
    edge_alpha= 0.3
    edge_tickness = 1
    edge_text_pos = 0.3
    text_font = 'sans-serif'

    plt.figure(figsize=figsize)
    # create networkx graph
    G=nx.Graph()

    # add edges
    for edge in edges:
        G.add_edge(edge[0], edge[1])

    # select shell autolocation
    graph_pos=nx.shell_layout(G)

    # draw graph
    nx.draw_networkx_nodes(G,
                           graph_pos,
                           node_size=node_size, 
                           alpha=node_alpha,
                           node_color=node_color)
    nx.draw_networkx_edges(G, graph_pos,width=edge_tickness, alpha=edge_alpha,edge_color=edge_color)
    nx.draw_networkx_labels(G, graph_pos,font_size=node_text_size, font_family=text_font)
    
    # construct weights dict
    weights={}
    for i in range(len(edges)):
        weights[tuple(edges[i])] = weights_matrix.astype(int)[edges[i][0],edges[i][1]]
    
    # draw weights
    edge_labels = weights
    nx.draw_networkx_edge_labels(G, graph_pos, edge_labels=edge_labels, 
                                 label_pos=edge_text_pos)

    # show graph
    plt.title('Proximity graph between labels with a proximity threshold at {} occurences'.format(threshold))
    plt.show()

In [None]:
import networkx as nx
num_nodes = norm_matrix.shape[0] + norm_matrix.shape[1]
rows, cols = np.where(norm_matrix == 1)
edges = list(zip(rows.tolist(), (cols + norm_matrix.shape[0]).tolist()))
print("X:", norm_matrix)
print("U nodes:", np.arange(norm_matrix.shape[0]))
print("V nodes:", np.arange(norm_matrix.shape[1]) + norm_matrix.shape[0])
print("edges")
print(edges[0:50])

In [None]:
graph = nx.draw_networkx(b, pos=pos, node_color=(['c'] * norm_matrix.shape[0]) + (['y'] * norm_matrix.shape[1]))

In [None]:
#adjacency list
#nx.write_edgelist(b, "test.edgelist.txt", delimiter='\t', data = False)

In [114]:
# save as edge list file
with open('./test.edgelist', 'w') as f:
    for edge in G.edges():
        f.write("{} {}\n".format(edge[0] ,edge[1]))

In [None]:
nx.write_gexf(b, "test.gexf")

### Analyze the graph in GraphCrunch 2
+ [GraphCrunch 2](http://www0.cs.ucl.ac.uk/staff/natasa/graphcrunch2/index.html)
+ [Paper](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-24)

### Choose How to Convert to Vec
There are four methods to convert a network graph each node into a network as a low-dimensional feature vector.

1. [Deepwalk](https://github.com/phanein/deepwalk)
2. [Node2Vec](https://snap.stanford.edu/node2vec/)
3. [Struct2Vec](https://github.com/leoribeiro/struc2vec)
4. Graphlets

Due a paper from Shawn Gu and Tijana Milenković called [*Graphlets versus node2vec and struc2vec in the task of network alignment*](https://www.groundai.com/project/graphlets-versus-node2vec-and-struc2vec-in-the-task-of-network-alignment/), in certain situations a graphlet could outperform Node2Vec or Struct2Vec to quantify node similarities.

In this section two embedded graphs will be made. One, using Node2Vec and the second using Graphlets. 

*Note* :To use Node2Vec, you must use a Python 2.7 environment at the time of writing this

### Node2Vec
To run NodeVec, you first must download the repository and run the line below

In [None]:
python src/main.py --input graph/test.edgelist --output emb/test.emd