In [5]:
from __future__ import division
from __future__ import print_function

import click
import pickle
import random
import numpy as np
import pandas as pd
import concurrent.futures

from time import time
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
DATA_PATH = 'data/dti_store/graph_2.pkl'

In [16]:
def load_data_ensemdt(data_path):
    print ("Loading data from disk...")
    
    with open(data_path) as f:
        graph = pickle.load(f)
    num_u, num_v, u_nodes, v_nodes, y, u_feat, v_feat = graph
    
    print ("Total no. of nodes =", y.shape[0])
    print ("Shape of drug feature tensor =", u_feat.shape)
    print ("Shape of target feature tensor =", v_feat.shape)
    print ("")
    
    df = np.vstack([u_nodes, v_nodes, y])
    df_transpose = df.T
    df = pd.DataFrame(df_transpose, columns=['u_node', 'v_node', 'y'])
    
    df_pos = df[df['y'] == 1]
    df_neg = df[df['y'] == 0]
    
    u_feat = u_feat.toarray()
    v_feat = v_feat.toarray()
    
    u_feat_headers = ['d' + str(i + 1) for i in range(u_feat.shape[1])]
    v_feat_headers = ['t' + str(i + 1) for i in range(v_feat.shape[1])]
    
    df_u = pd.DataFrame(u_feat, columns=u_feat_headers)
    df_v = pd.DataFrame(v_feat, columns=v_feat_headers)
    
    print ("Shape of df_u =", df_u.shape)
    print ("Shape of df_v =", df_v.shape)
    print ("Shape of df_pos =", df_pos.shape)
    print ("Shape of df_neg =", df_neg.shape)

    return df_pos, df_neg, df_u, df_v


def train_test_split_ensemdt(df_pos, df_neg, test_size=0.2, shuffle=True):
    # Remove y from pos and neg set
    y_pos = df_pos['y']
    y_neg = df_neg['y']
    
    df_pos_split = df_pos.drop(['y'], axis=1)
    df_neg_split = df_neg.drop(['y'], axis=1)
    
    # Split into pos and neg train and test sets
    X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(df_pos_split, 
                                                    y_pos, test_size=test_size, random_state=42)
    X_train_neg, X_test_neg, y_train_neg, y_test_neg = train_test_split(df_neg_split, 
                                                    y_neg, test_size=test_size, random_state=42)
    
    # Recombine to form test set
    X_test_pos['y'] = y_test_pos
    X_test_neg['y'] = y_test_neg
    
    X_test = pd.concat([X_test_pos, X_test_neg])
    
    # Re-enter test labels
    X_train_pos['y'] = y_train_pos
    X_train_neg['y'] = y_train_neg
    
    # Shuffle test set
    if shuffle:
        X_test = X_test.sample(frac=1)
        
    y_test = np.array(X_test['y'])
    X_test = X_test.drop(['y'], axis=1)
    
    return X_train_pos, X_train_neg, X_test, y_test

In [17]:
df_pos, df_neg, df_u, df_v = load_data_ensemdt(DATA_PATH)
X_train_pos, X_train_neg, X_test, y_test = train_test_split_ensemdt(df_pos, df_neg)

Loading data from disk...
Total no. of nodes = 104809
Shape of drug feature tensor = (1862, 881)
Shape of target feature tensor = (1554, 876)

Shape of df_u = (1862, 881)
Shape of df_v = (1554, 876)
Shape of df_pos = (4809, 3)
Shape of df_neg = (100000, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [22]:
class EnsemDT:
    
    def __init__(self, n_estimators=50, dim_red_ratio=0.9, 
                 np_ratio=5, reduce_dims=True, n_components=100,
                 max_depth=None):
        
        self.n_estimators = n_estimators
        self.dim_red_ratio = dim_red_ratio
        self.np_ratio = np_ratio
        self.reduce_dims = reduce_dims
        self.n_components = n_components
        self.max_depth = max_depth
        
        self.clfs = list()
        
    def fit(self, df_pos, df_neg, df_u, df_v):
        self.num_pos = df_pos.shape[0]
        self.df_u = df_u
        self.df_v = df_v
        
        self.training_time = time()
        
        for i in tqdm(range(self.n_estimators), desc='Training model...', unit='base learner'):
            df_neg_sampled = df_neg.sample(self.np_ratio*self.num_pos)
            
            training_set = pd.concat([df_neg_sampled, df_pos])
            
            subspace_u = random.sample(range(self.df_u.shape[1]), 
                                       int(self.dim_red_ratio*self.df_u.shape[1]))
            subspace_v = random.sample(range(self.df_v.shape[1]),
                                       int(self.dim_red_ratio*self.df_v.shape[1]))

            head_u = ['d' + str(i+1) for i in subspace_u]
            head_v = ['t' + str(i+1) for i in subspace_v]
            
            df_u_sub = self.df_u[head_u]
            df_v_sub = self.df_v[head_v]
            
            if self.reduce_dims:
                pca_u = PCA(n_components=self.n_components)
                pca_v = PCA(n_components=self.n_components)

                df_u_sub = pca_u.fit_transform(df_u_sub)
                df_v_sub = pca_v.fit_transform(df_v_sub)
            
            data = []
            labels = []

            for _, row in training_set.iterrows():
                try:
                    data.append(np.concatenate([df_u_sub[row['u_node']], df_v_sub[row['v_node']]], axis=0))
                    labels.append(row['y'])
                except:
                    print ("Skipping " + str(row['u_node']) + " " + str(row['v_node']) + "...")
            y = np.vstack(labels)
            y = np.reshape(y, (y.shape[0],))
                    
            X = np.vstack(data)
            
            dt = DecisionTreeClassifier(max_depth=self.max_depth)
            dt.fit(X, y)
            
            base_learner = {'clf': dt, 
                            'u_cols': head_u, 
                            'v_cols': head_v}
            
            self.clfs.append(base_learner)
            
        self.training_time = time() - self.training_time
        
    
    def predict(self, df_test):
        preds = list()
        
        for i in tqdm(range(self.n_estimators), desc='Testing model...', unit='base learner'):
            base_learner = self.clfs[i]
            
            head_u = base_learner['u_cols']
            head_v = base_learner['v_cols']
            
            df_u_sub = self.df_u[head_u]
            df_v_sub = self.df_v[head_v]
            
            if self.reduce_dims:
                pca_u = PCA(n_components=self.n_components)
                pca_v = PCA(n_components=self.n_components)

                df_u_sub = pca_u.fit_transform(df_u_sub)
                df_v_sub = pca_v.fit_transform(df_v_sub)
            
            data = []

            for _, row in df_test.iterrows():
                try:
                    data.append(np.concatenate([df_u_sub[row['u_node']], df_v_sub[row['v_node']]], axis=0))
                except:
                    print ("Skipping " + str(row['u_node']) + " " + str(row['v_node']) + "...")
                    
            X_test = np.vstack(data)
            
            clf = base_learner['clf']
            pred = clf.predict(X_test)
            preds.append(pred)
            
        preds = np.vstack(preds)
        final_preds = np.sum(preds, axis=0).astype(np.float32)
        final_preds /= self.n_estimators
        
        return final_preds
    
    def predict_proba(self, df_test):
        preds = list()
        
        for i in tqdm(range(self.n_estimators), desc='Testing model...', unit='base learner'):
            base_learner = self.clfs[i]
            
            head_u = base_learner['u_cols']
            head_v = base_learner['v_cols']
            
            df_u_sub = self.df_u[head_u]
            df_v_sub = self.df_v[head_v]
            
            if self.reduce_dims:
                pca_u = PCA(n_components=self.n_components)
                pca_v = PCA(n_components=self.n_components)

                df_u_sub = pca_u.fit_transform(df_u_sub)
                df_v_sub = pca_v.fit_transform(df_v_sub)
            
            data = []

            for _, row in df_test.iterrows():
                try:
                    data.append(np.concatenate([df_u_sub[row['u_node']], df_v_sub[row['v_node']]], axis=0))
                except:
                    print ("Skipping " + str(row['u_node']) + " " + str(row['v_node']) + "...")
                    
            X_test = np.vstack(data)
            
            clf = base_learner['clf']
            pred = clf.predict_proba(X_test)[:, 1]
            preds.append(pred)
            
        preds = np.vstack(preds)
        final_preds = np.sum(preds, axis=0).astype(np.float32)
        final_preds /= self.n_estimators
        
        return final_preds

In [None]:
X_train_pos.shape, X_train_neg.shape, X_test.shape

In [23]:
ensem_dt = EnsemDT(max_depth=1)
ensem_dt.fit(X_train_pos, X_train_neg, df_u, df_v)

Training model...: 100%|██████████| 50/50 [03:22<00:00,  4.00s/base learner]


In [24]:
pred = ensem_dt.predict_proba(X_test)

Testing model...: 100%|██████████| 50/50 [02:37<00:00,  3.11s/base learner]


In [14]:
y_test = y_test.data

  """Entry point for launching an IPython kernel.


In [26]:
roc_auc_score(y_test, pred)

0.6016004937629937