In [64]:
import math

import numpy as np
import pandas as pd

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Read the parsed DIDA / Orphanet table; the first CSV column (DIDAid) is used as the row index.
df_data = pd.read_csv(
    filepath_or_buffer='parsed_file_dida2_orphanet.csv',
    index_col=0,
)
# Preview the first rows to check that the columns were parsed as expected.
df_data.head()

Unnamed: 0_level_0,DEO1,DEO2,Rec1,EssA,DEO3,DEO4,RecB,EssB,Distance,Pathway,DE,Orphanet,Name,Combination
DIDAid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
dd001,0.99,0.0,0.34,0.0,1.0,0.0,0.3,0.392,1.11,1,CO,768,Familial long QT syndrome,KCNH2/KCNQ1
dd002,0.39,0.0,0.16,0.392,1.0,0.0,0.51,0.392,1.25,1,TD,87884,Non-syndromic genetic deafness,GJB2/GJB3
dd003,0.4,0.0,0.16,0.392,1.0,0.0,0.51,0.392,1.25,1,TD,87884,Non-syndromic genetic deafness,GJB2/GJB3
dd004,0.4,0.0,0.16,0.392,1.0,0.0,0.51,0.392,1.25,1,TD,87884,Non-syndromic genetic deafness,GJB2/GJB3
dd005,0.12,0.0,0.2,0.392,0.21,0.0,0.44,0.0,1.25,0,TD,87884,Non-syndromic genetic deafness,FOXI1/SLC26A4


In [4]:
# Design matrix: the first ten columns of the table, cast to float.
X = df_data.iloc[:, :10].values.astype(float)
# Class vector: the eleventh column, with its string codes mapped to integers
# (UK -> 0, TD -> 1, CO -> -1).
y = df_data.iloc[:, 10].replace({'UK': 0, 'TD': 1, 'CO': -1}).values.astype(int)

In [200]:
class GraphEnergyClassifier:
    
    def __init__(self, similarity='default', loss='default', eps='default', precision='default'):
        
        # Weights between vertices, dict of (i, j)
        self.weights = {}
        
        # Convex loss function parameters
        self.similarity, self.loss, self.eps, self.precision = (
            lambda x_1, x_2: 1 / (1 + sum(abs(x_1 - x_2))) if similarity == 'default' else similarity,
            (lambda i, j: (i - j)**2) if loss == 'default' else loss,
            0.005 if eps == 'default' else eps,
            5e-4 if precision == 'default' else precision
        )
        
        # Learning values
        self.X_fit, self.y_fit = None, None
        self.predictions = None
        
        # Amount of labeled/unlabeled data
        self.l, self.u = 0, 0
        self.selector_l = None
        
    def __len__(self):
        """ __len(self)__ -> number of samples """
        assert self.l + self.u > 0, "Model must be fit."
        
        return self.l + self.u
        
    def _weight_(self, i, j):
        """ _weight_(int, int) -> float """
        assert self.X_fit is not None and self.y_fit is not None, "Model must be fit."
        
        i, j = min(i, j), max(i, j)
        if (i, j) not in self.weights:
            self.weights[(i, j)] = self.similarity( self.X_fit[i], self.y_fit[j] )
        return self.weights[(i, j)]
    
    def _label_(self, i):
        """ _label_(i) -> {-1, 0, 1} """
        assert self.y_fit is not None, "Model must be fit."
        assert i < len(self.y_fit), "Index out of range : " + str(i)
        
        return self.y_fit[i]
    
    def _energy_(self, f):
        """ _energy_(self, f) -> energy necessary to affectation f """
        return sum(
            sum(
                (f[i] - f[j])**2 * self._weight_(i, j)
                for j in range(len(self))
            )
            for i in range(len(self))
        )
    
    def _gradient_(self, f):
        """ _gradient_(self, f) -> partial derivatives of energy at point f """
        
        def subgradient(i):
            if i <= self.l:
                return 0
            else:
                return 2 * (
                    sum( (f[i] - f[j])*self._weight_(i, j) for j in range(self.l, len(self)) ) +
                    sum( (f[i] - f[j])*self._weight_(i, j) for j in range(1, len(self)) )
                )
        return array([
            subgradient(i) for i in range(len(self))
        ])
    
    def fit(self, X, y, learning_rate='default', max_iter='default', starting_point='default'):
        assert len(X) == len(y), "Design matrix and class vector must have same length."
        
        # Data loading, putting unlabeled at the end
        labeled_selector = y != 0
        self.X_fit = concatenate( (X[labeled_selector], X[labeled_selector == False]) )
        self.y_fit = concatenate( (y[labeled_selector], y[labeled_selector == False]) )
        
        # Labeled and unlabeled data size
        self.l, self.u = sum(labeled_selector), len(labeled_selector) - sum(labeled_selector)
        self.selector_l = diag(array([i < self.l for i in range(self.l + self.u)]))
        
        # Gradient descent parameters
        # x : initial value of x
        learning_rate, max_iter, y = (
            5e-3 if learning_rate == 'default' else learning_rate,
            30 if max_iter == 'default' else max_iter,
            array(self.y_fit, copy=True).astype(float) if starting_point == 'default' else starting_point
        )
        
        print('#'*20)
        print('Beginning of fitting.')
        
        # From https://github.com/GRYE/Nesterov-accelerated-gradient-descent/blob/master/nesterov_method.py
        
        lambda_prev = 0
        lambda_curr = 1
        gamma = 1
        y_prev = y
        
        # Gradient descent loop
        for i in range(max_iter):
            
            gradient = self._gradient_(y)
            
            if linalg.norm(gradient) < self.eps:
                break
            
            y_curr = y - learning_rate * gradient
            y = (1 - gamma) * y_curr + gamma * y_prev
            y_prev = y_curr
            
            lambda_tmp = lambda_curr
            lambda_curr = (1 + math.sqrt(1 + 4 * lambda_prev * lambda_prev)) / 2
            lambda_prev = lambda_tmp
            
            gamma = (1 - lambda_prev) / lambda_curr
            
            print("Round :", i, "Magnitude :", linalg.norm(gradient), "Energy :", self._energy_(y))
            
        return y
    
    def leaveOneGroupOutValidation(self, X, y, groups, learning_rate='default', max_iter='default', starting_point='default'):
        """ Categories : label array descriptor """
        
        def make_selector(group_name):
            return (groups == group_name) *  (y != 0)
        
        errors = 0
        
        for group in set(groups):
            
            selector = make_selector(group)
            group_size = sum(selector)
            
            if group_size == 0:
                continue
                
            X_tmp, y_tmp = array(X, copy=True), array(y, copy=True)
            y_tmp[selector] = 0.
            
            X_in = concatenate( (X_tmp[y_tmp != 0], X_tmp[(y_tmp == 0) * (selector == False)], X_tmp[selector]) )
            y_in = concatenate( (y_tmp[y_tmp != 0], y_tmp[(y_tmp == 0) * (selector == False)], y_tmp[selector]) )
                
            print("#"*20)
            print("Beginning group", group, "of size", group_size)
                
            y_pred = self.fit(X_in, y_in, learning_rate, max_iter, starting_point)
            step_errors = sum(y[selector] != ((y_pred[-group_size:] >= 0) * 2 - 1))
            
            print(step_errors, "error(s) at this step.")
            print("Truth :", y[selector])
            print("Prediction :", y_pred[-group_size:])
            
        print("Global error :", errors/len(self))
        
        

In [204]:
# One group per gene pair: the last column of the table holds the pair identifier.
gene_pairs = df_data.iloc[:, -1].values

# Stricter stopping threshold than the class default, then leave-one-gene-pair-out validation.
g = GraphEnergyClassifier(eps=1e-5)
g.leaveOneGroupOutValidation(X, y, groups=gene_pairs, learning_rate=1e-2, max_iter=100)

####################
Beginning group FOXL2/GALT of size 1
####################
Beginning of fitting.
Round : 0 Magnitude : 16.3653213913 Energy : 8312.12337819
Round : 1 Magnitude : 16.3653213913 Energy : 8308.84854633
Round : 2 Magnitude : 9.96510247488 Energy : 8307.88523374
Round : 3 Magnitude : 6.06790816755 Energy : 8307.67913581
Round : 4 Magnitude : 2.78841559346 Energy : 8307.78469393
Round : 5 Magnitude : 1.13526684668 Energy : 8307.9153878
Round : 6 Magnitude : 0.143562100517 Energy : 8307.96803678
Round : 7 Magnitude : 0.17468587357 Energy : 8307.97766227
Round : 8 Magnitude : 0.229682137949 Energy : 8307.96508518
Round : 9 Magnitude : 0.157641392956 Energy : 8307.94937688
Round : 10 Magnitude : 0.0654371965842 Energy : 8307.93951994
Round : 11 Magnitude : 0.00622756286525 Energy : 8307.93474445
Round : 12 Magnitude : 0.0228545149918 Energy : 8307.93432723
Round : 13 Magnitude : 0.0254079570732 Energy : 8307.93575637
Round : 14 Magnitude : 0.0166699394939 Energy : 8307.93742

Round : 3 Magnitude : 1.3793817341 Energy : 8221.58086812
Round : 4 Magnitude : 0.633874051897 Energy : 8221.63303016
Round : 5 Magnitude : 0.258073508761 Energy : 8221.66780304
Round : 6 Magnitude : 0.0326351246086 Energy : 8221.67951535
Round : 7 Magnitude : 0.039710308158 Energy : 8221.68156659
Round : 8 Magnitude : 0.0522122841988 Energy : 8221.67888126
Round : 9 Magnitude : 0.0358356870237 Energy : 8221.67546442
Round : 10 Magnitude : 0.0148754515076 Energy : 8221.67328217
Round : 11 Magnitude : 0.00141567509383 Energy : 8221.67221372
Round : 12 Magnitude : 0.00519538194233 Energy : 8221.67212002
Round : 13 Magnitude : 0.00577584085325 Energy : 8221.67244075
Round : 14 Magnitude : 0.00378947891288 Energy : 8221.6728146
Round : 15 Magnitude : 0.00147574719684 Energy : 8221.67308915
Round : 16 Magnitude : 0.000222363214571 Energy : 8221.67319525
Round : 17 Magnitude : 0.000878333794902 Energy : 8221.67319237
Round : 18 Magnitude : 0.000860554167646 Energy : 8221.67313663
Round : 19 

Round : 4 Magnitude : 4.03384087043 Energy : 8134.42355753
Round : 5 Magnitude : 1.64232541796 Energy : 8134.59400025
Round : 6 Magnitude : 0.207683054802 Energy : 8134.67110648
Round : 7 Magnitude : 0.252708031739 Energy : 8134.68553457
Round : 8 Magnitude : 0.332267972334 Energy : 8134.66670092
Round : 9 Magnitude : 0.228050759459 Energy : 8134.64340997
Round : 10 Magnitude : 0.0946642382313 Energy : 8134.62893542
Round : 11 Magnitude : 0.00900905792809 Energy : 8134.62196396
Round : 12 Magnitude : 0.0330623156978 Energy : 8134.6213562
Round : 13 Magnitude : 0.0367562338685 Energy : 8134.6234389
Round : 14 Magnitude : 0.024115445128 Energy : 8134.62587519
Round : 15 Magnitude : 0.00939134413083 Energy : 8134.62767033
Round : 16 Magnitude : 0.00141507263182 Energy : 8134.62836539
Round : 17 Magnitude : 0.00558953115236 Energy : 8134.62834654
Round : 18 Magnitude : 0.00547638535172 Energy : 8134.62798131
Round : 19 Magnitude : 0.00328341993526 Energy : 8134.62758276
Round : 20 Magnitud

Round : 3 Magnitude : 6.06790816755 Energy : 8307.67913581
Round : 4 Magnitude : 2.78841559346 Energy : 8307.78469393
Round : 5 Magnitude : 1.13526684668 Energy : 8307.9153878
Round : 6 Magnitude : 0.143562100517 Energy : 8307.96803678
Round : 7 Magnitude : 0.17468587357 Energy : 8307.97766227
Round : 8 Magnitude : 0.229682137949 Energy : 8307.96508518
Round : 9 Magnitude : 0.157641392956 Energy : 8307.94937688
Round : 10 Magnitude : 0.0654371965842 Energy : 8307.93951994
Round : 11 Magnitude : 0.00622756286525 Energy : 8307.93474445
Round : 12 Magnitude : 0.0228545149918 Energy : 8307.93432723
Round : 13 Magnitude : 0.0254079570732 Energy : 8307.93575637
Round : 14 Magnitude : 0.0166699394939 Energy : 8307.93742598
Round : 15 Magnitude : 0.0064918203913 Energy : 8307.93865474
Round : 16 Magnitude : 0.000978177057284 Energy : 8307.93913017
Round : 17 Magnitude : 0.00386379540641 Energy : 8307.93911728
Round : 18 Magnitude : 0.00378558272405 Energy : 8307.93886748
Round : 19 Magnitude :

Round : 5 Magnitude : 1.51799034476 Energy : 8192.62499907
Round : 6 Magnitude : 0.191960051591 Energy : 8192.69605434
Round : 7 Magnitude : 0.233576335134 Energy : 8192.70927603
Round : 8 Magnitude : 0.307113053457 Energy : 8192.69201306
Round : 9 Magnitude : 0.210785784102 Energy : 8192.67061382
Round : 10 Magnitude : 0.0874975191022 Energy : 8192.65728384
Round : 11 Magnitude : 0.00832701168764 Energy : 8192.65085445
Round : 12 Magnitude : 0.0305592761679 Energy : 8192.65029365
Round : 13 Magnitude : 0.033973539904 Energy : 8192.65221522
Round : 14 Magnitude : 0.0222897438374 Energy : 8192.65446232
Round : 15 Magnitude : 0.00868035625524 Energy : 8192.65611758
Round : 16 Magnitude : 0.00130794212202 Energy : 8192.65675836
Round : 17 Magnitude : 0.00516636607344 Energy : 8192.65674099
Round : 18 Magnitude : 0.00506178617052 Energy : 8192.65640428
Round : 19 Magnitude : 0.00303484297639 Energy : 8192.65603683
Round : 20 Magnitude : 0.000821436196191 Energy : 8192.65581254
Round : 21 M

KeyboardInterrupt: 