In [2]:
from __future__ import division
from __future__ import print_function

import pickle
import random
import numpy as np
import pandas as pd

from time import time
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [3]:
with open('data/dti_store/graph_2.pkl') as f:
    graph = pickle.load(f)
num_u, num_v, u_nodes, v_nodes, y, u_feat, v_feat = graph
# _, _, _, _, _, u_feat, v_feat = graph

In [4]:
u_nodes.shape, v_nodes.shape, y.shape, u_feat.shape, v_feat.shape

((104809,), (104809,), (104809,), (1862, 881), (1554, 876))

In [5]:
 u_nodes.min(), u_nodes.max(), v_nodes.min(), v_nodes.max()

(0, 1861, 0, 1553)

In [6]:
df = np.vstack([u_nodes, v_nodes, y])

In [7]:
df, df.shape

(array([[1226, 1686, 1423, ...,  114,  960, 1306],
        [1019, 1217,  519, ...,  264, 1327, 1171],
        [   0,    0,    0, ...,    0,    0,    0]]), (3, 104809))

In [8]:
df_t = df.T

In [9]:
df_t

array([[1226, 1019,    0],
       [1686, 1217,    0],
       [1423,  519,    0],
       ...,
       [ 114,  264,    0],
       [ 960, 1327,    0],
       [1306, 1171,    0]])

In [10]:
df = pd.DataFrame(df_t, columns=['u_node', 'v_node', 'y'])

In [11]:
df.head()

Unnamed: 0,u_node,v_node,y
0,1226,1019,0
1,1686,1217,0
2,1423,519,0
3,708,2,0
4,412,1552,0


In [12]:
df_pos = df[df['y'] == 1]
df_neg = df[df['y'] == 0]

In [13]:
df_pos.shape, df_neg.shape

((4809, 3), (100000, 3))

In [15]:
M = 50
R = 0.9
NP_RATIO = 5

In [16]:
df_neg_sampled = df_neg.sample(80)

In [17]:
df_neg_sampled

Unnamed: 0,u_node,v_node,y
99468,925,239,0
57346,1331,515,0
15240,657,1067,0
6635,1697,1539,0
66281,1307,881,0
43414,287,380,0
7770,1392,694,0
36614,409,437,0
65753,538,559,0
84775,843,911,0


In [19]:
df_new = pd.concat([df_pos, df_neg_sampled])
df_new

Unnamed: 0,u_node,v_node,y
27,785,774,1
29,1181,1019,1
44,1758,671,1
53,219,103,1
75,1795,784,1
93,1447,790,1
140,305,94,1
151,25,874,1
198,262,22,1
204,33,1074,1


In [20]:
u_feat, v_feat

(<1862x881 sparse matrix of type '<type 'numpy.float32'>'
 	with 218294 stored elements in Compressed Sparse Row format>,
 <1554x876 sparse matrix of type '<type 'numpy.float32'>'
 	with 2344 stored elements in Compressed Sparse Row format>)

In [21]:
u_feat = u_feat.toarray()
v_feat = v_feat.toarray()

In [22]:
u_feat_headers = ['d' + str(i + 1) for i in range(u_feat.shape[1])]
v_feat_headers = ['t' + str(i + 1) for i in range(v_feat.shape[1])]

In [23]:
df_u = pd.DataFrame(u_feat, columns=u_feat_headers)
df_v = pd.DataFrame(v_feat, columns=v_feat_headers)

In [24]:
df_u

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d872,d873,d874,d875,d876,d877,d878,d879,d880,d881
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
df_v.head()

Unnamed: 0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,...,t867,t868,t869,t870,t871,t872,t873,t874,t875,t876
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
subspace_u = random.sample(range(df_u.shape[1]), 10)
subspace_v = random.sample(range(df_v.shape[1]), 10)

head_u = ['d' + str(i+1) for i in subspace_u]
head_v = ['t' + str(i+1) for i in subspace_v]

In [27]:
head_u, head_v

(['d514',
  'd180',
  'd476',
  'd144',
  'd706',
  'd474',
  'd577',
  'd789',
  'd392',
  'd375'],
 ['t635',
  't775',
  't751',
  't130',
  't211',
  't646',
  't619',
  't105',
  't145',
  't77'])

In [28]:
df_u_sub = df_u[head_u]
df_v_sub = df_v[head_v]

In [29]:
df_u_sub

Unnamed: 0,d514,d180,d476,d144,d706,d474,d577,d789,d392,d375
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
pca_u = PCA(n_components=6)
df_u_sub = pca_u.fit_transform(df_u_sub)

In [31]:
pca_v = PCA(n_components=6)
df_v_sub = pca_v.fit_transform(df_v_sub)

In [32]:
df_u_sub.shape, df_v_sub.shape

((1862, 6), (1554, 6))

In [33]:
df_new.shape

(4889, 3)

In [34]:
data = []
labels = []

for _, row in df_new.iterrows():
    try:
        data.append(np.concatenate([df_u_sub[row['u_node']], df_v_sub[row['v_node']]], axis=0))
        labels.append(row['y'])
    except:
        print (row['u_node'], row['v_node'])
X = np.vstack(data)
labels = np.vstack(labels)

In [35]:
X.shape, labels.shape

((4889, 12), (4889, 1))

In [36]:
X = data

In [37]:
X

[array([ 1.13250560e+00,  8.37814500e-02,  1.51050987e-01,  5.11873309e-02,
         5.13943899e-01, -3.16621972e-03, -5.70814170e-03, -3.83988144e-03,
        -3.29671866e-03, -2.67129535e-03, -4.76969712e-15,  2.60890638e-15]),
 array([ 8.95792764e-01, -8.50229161e-03, -1.47327993e-01, -1.09296820e-01,
        -5.39305815e-01,  8.30833887e-01, -5.70814170e-03, -3.83988144e-03,
        -3.29671866e-03, -2.67129535e-03, -4.76969712e-15,  2.60890638e-15]),
 array([-6.10350356e-01, -2.20256252e-01, -4.92499901e-01, -5.28909270e-01,
         7.16426684e-02,  9.42290775e-01, -5.70814170e-03, -3.83988144e-03,
        -3.29671866e-03, -2.67129535e-03, -4.76969712e-15,  2.60890638e-15]),
 array([-1.01969917e+00,  4.34531851e-01, -1.22247580e-01,  1.28288330e-01,
         1.24134029e-01,  3.33746418e-02, -5.70814170e-03, -3.83988144e-03,
        -3.29671866e-03, -2.67129535e-03, -4.76969712e-15,  2.60890638e-15]),
 array([-6.44928567e-01, -1.70034596e-01, -5.29364072e-01, -4.28481217e-01,
    

In [38]:
labels = np.reshape(labels, (labels.shape[0],))

In [39]:
labels

array([1, 1, 1, ..., 0, 0, 0])

In [40]:
dt = DecisionTreeClassifier()

In [41]:
dt.fit(X, labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [42]:
base_learner = {'clf': dt, 'u_cols': head_u, 'v_cols': head_v}

In [43]:
base_learner['clf'].predict([[0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10, 0.11, 0.12]])

array([1])

In [44]:
base_learner['clf']

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [75]:
class EnsemDT:
    
    def __init__(self, n_estimators=50, dim_red_ratio=0.9, 
                 np_ratio=5, reduce_dims=True, n_components=100,
                 max_depth=None):
        
        self.n_estimators = n_estimators
        self.dim_red_ratio = dim_red_ratio
        self.np_ratio = np_ratio
        self.reduce_dims = reduce_dims
        self.n_components = n_components
        self.max_depth = max_depth
        
        self.clfs = list()
        
    def fit(self, df_pos, df_neg, df_u, df_v):
        self.num_pos = df_pos.shape[0]
        self.df_u = df_u
        self.df_v = df_v
        
        self.training_time = time()
        
        for i in tqdm(range(self.n_estimators), desc='Training model...', unit='base learner'):
            df_neg_sampled = df_neg.sample(self.np_ratio*self.num_pos)
            
            training_set = pd.concat([df_neg_sampled, df_pos])
            
            subspace_u = random.sample(range(self.df_u.shape[1]), 
                                       int(self.dim_red_ratio*self.df_u.shape[1]))
            subspace_v = random.sample(range(self.df_v.shape[1]),
                                       int(self.dim_red_ratio*self.df_v.shape[1]))

            head_u = ['d' + str(i+1) for i in subspace_u]
            head_v = ['t' + str(i+1) for i in subspace_v]
            
            df_u_sub = self.df_u[head_u]
            df_v_sub = self.df_v[head_v]
            
            if self.reduce_dims:
                pca_u = PCA(n_components=self.n_components)
                pca_v = PCA(n_components=self.n_components)

                df_u_sub = pca_u.fit_transform(df_u_sub)
                df_v_sub = pca_v.fit_transform(df_v_sub)
            
            data = []
            labels = []

            for _, row in training_set.iterrows():
                try:
                    data.append(np.concatenate([df_u_sub[row['u_node']], df_v_sub[row['v_node']]], axis=0))
                    labels.append(row['y'])
                except:
                    print ("Skipping " + str(row['u_node']) + " " + str(row['v_node']) + "...")
            y = np.vstack(labels)
            y = np.reshape(y, (y.shape[0],))
                    
            X = np.vstack(data)
            
            dt = DecisionTreeClassifier(max_depth=self.max_depth)
            dt.fit(X, y)
            
            base_learner = {'clf': dt, 
                            'u_cols': head_u, 
                            'v_cols': head_v}
            
            self.clfs.append(base_learner)
            
        self.training_time = time() - self.training_time
        
    def predict(self, df_test):
        preds = list()
        
        for i in tqdm(range(self.n_estimators), desc='Testing model...', unit='base learner'):
            base_learner = self.clfs[i]
            
            head_u = base_learner['u_cols']
            head_v = base_learner['v_cols']
            
            df_u_sub = self.df_u[head_u]
            df_v_sub = self.df_v[head_v]
            
            if self.reduce_dims:
                pca_u = PCA(n_components=self.n_components)
                pca_v = PCA(n_components=self.n_components)

                df_u_sub = pca_u.fit_transform(df_u_sub)
                df_v_sub = pca_v.fit_transform(df_v_sub)
            
            data = []
            labels = []

            for _, row in df_test.iterrows():
                try:
                    data.append(np.concatenate([df_u_sub[row['u_node']], df_v_sub[row['v_node']]], axis=0))
                    labels.append(row['y'])
                except:
                    print ("Skipping " + str(row['u_node']) + " " + str(row['v_node']) + "...")
            y_test = np.vstack(labels)
            y_test = np.reshape(y_test, (y_test.shape[0],))
                    
            X_test = np.vstack(data)
            
            clf = base_learner['clf']
            pred = clf.predict(X_test)
            preds.append(pred)
            
        print (pred)
        return y_test

In [46]:
ensem_dt = EnsemDT(max_depth=2)

In [47]:
ensem_dt.fit(df_pos, df_neg, df_u, df_v)

Training model...: 100%|██████████| 50/50 [04:25<00:00,  5.30s/base learner]


In [48]:
ensem_dt.training_time

265.8094959259033

In [50]:
df_pos.shape, df_neg.shape, df_u.shape, df_v.shape

((4809, 3), (100000, 3), (1862, 881), (1554, 876))

In [51]:
df_pos.head()

Unnamed: 0,u_node,v_node,y
27,785,774,1
29,1181,1019,1
44,1758,671,1
53,219,103,1
75,1795,784,1


In [52]:
df_neg.head()

Unnamed: 0,u_node,v_node,y
0,1226,1019,0
1,1686,1217,0
2,1423,519,0
3,708,2,0
4,412,1552,0


In [53]:
df_u.head()

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d872,d873,d874,d875,d876,d877,d878,d879,d880,d881
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
df_v.head()

Unnamed: 0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,...,t867,t868,t869,t870,t871,t872,t873,t874,t875,t876
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
def train_test_split_ensemdt(df_pos, df_neg, test_size=0.2, shuffle=True):
    # Remove y from pos and neg set
    y_pos = df_pos['y']
    y_neg = df_neg['y']
    
    df_pos_split = df_pos.drop(['y'], axis=1)
    df_neg_split = df_neg.drop(['y'], axis=1)
    
    # Split into pos and neg train and test sets
    X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(df_pos_split, 
                                                    y_pos, test_size=test_size, random_state=42)
    X_train_neg, X_test_neg, y_train_neg, y_test_neg = train_test_split(df_neg_split, 
                                                    y_neg, test_size=test_size, random_state=42)
    
    # Recombine to form test set
    X_test_pos['y'] = y_test_pos
    X_test_neg['y'] = y_test_neg
    
    X_test = pd.concat([X_test_pos, X_test_neg])
    
    # Re-enter test labels
    X_train_pos['y'] = y_train_pos
    X_train_neg['y'] = y_train_neg
    
    # Shuffle test set
    if shuffle:
        X_test = X_test.sample(frac=1)
        
    return X_train_pos, X_train_neg, X_test

In [56]:
X_train_pos, X_train_neg, X_test = train_test_split_ensemdt(df_pos, df_neg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-

In [58]:
X_train_pos.shape, X_train_neg.shape, X_test.shape

((3847, 3), (80000, 3), (20962, 3))

In [59]:
X_train_pos.head()

Unnamed: 0,u_node,v_node,y
8971,1584,364,1
99206,5,348,1
77296,57,246,1
51621,19,1240,1
100149,1497,317,1


In [61]:
X_train_neg.head()

Unnamed: 0,u_node,v_node,y
78827,859,879,0
51319,408,436,0
47094,14,180,0
14232,640,1550,0
97172,1849,444,0


In [66]:
X_test.head()

Unnamed: 0,u_node,v_node,y
22592,1811,520,0
103032,57,1226,0
72474,563,453,0
18290,685,557,0
6612,186,1120,0


In [77]:
ensem_dt = EnsemDT(max_depth=1)
ensem_dt.fit(X_train_pos, X_train_neg, df_u, df_v)



Training model...:   0%|          | 0/50 [00:00<?, ?base learner/s][A[A

Training model...:   2%|▏         | 1/50 [00:04<03:47,  4.64s/base learner][A[A

Training model...:   4%|▍         | 2/50 [00:09<03:44,  4.68s/base learner][A[A

Training model...:   6%|▌         | 3/50 [00:13<03:34,  4.56s/base learner][A[A

Training model...:   8%|▊         | 4/50 [00:17<03:22,  4.41s/base learner][A[A

Training model...:  10%|█         | 5/50 [00:21<03:13,  4.30s/base learner][A[A

Training model...:  12%|█▏        | 6/50 [00:27<03:27,  4.71s/base learner][A[A

Training model...:  14%|█▍        | 7/50 [00:34<03:56,  5.49s/base learner][A[A

Training model...:  16%|█▌        | 8/50 [00:40<03:56,  5.63s/base learner][A[A

KeyboardInterrupt: 

In [73]:
ensem_dt.predict(X_test)


Testing model...:   0%|          | 0/50 [00:00<?, ?base learner/s][A

UnboundLocalError: local variable 'X_test' referenced before assignment