In [3]:
from __future__ import print_function
from __future__ import division

import pickle
import numpy as np
import pandas as pd
import scipy.sparse as sp
import matplotlib.pyplot as plt

In [5]:
# Load the pre-built graph tuple.
# Pickle data is binary, so the file must be opened in 'rb' mode: text mode
# fails on Python 3 and can corrupt the stream on platforms that translate
# newlines.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from
# a trusted source. If this pickle was written by Python 2 and is read on
# Python 3, pickle.load may additionally need encoding='latin1' -- confirm.
with open('data/dti_store/graph_2.pkl', 'rb') as f:
    graph = pickle.load(f)
# Unpack the 7-tuple stored in the pickle.
num_u, num_v, u_nodes, v_nodes, ratings, u_feat, v_feat = graph

In [6]:
# Densify the sparse feature matrices so rows can be indexed and concatenated
# with plain NumPy below. The issparse guard makes this cell idempotent:
# re-running it after the arrays are already dense is a no-op instead of an
# AttributeError (ndarray has no .toarray()).
if sp.issparse(u_feat):
    u_feat = u_feat.toarray()
if sp.issparse(v_feat):
    v_feat = v_feat.toarray()

In [7]:
# Range of the u-side and v-side node indices; these values are used as row
# indices into u_feat / v_feat when the design matrix is built.
np.min(u_nodes), np.max(u_nodes), np.min(v_nodes), np.max(v_nodes)

(0, 1861, 0, 1553)

In [8]:
# Design matrix: one row per (u, v) pair, made of the u-node's feature row
# followed by the v-node's feature row.
# Integer-array indexing gathers all rows at once, replacing a Python-level
# loop of per-row np.concatenate calls with two vectorised gathers.
# Assumes u_nodes and v_nodes are equal-length integer index sequences.
X = np.hstack([u_feat[np.asarray(u_nodes)], v_feat[np.asarray(v_nodes)]])

In [12]:
# (number of (u, v) pairs, u_feat columns + v_feat columns)
X.shape

(104809, 1757)

In [10]:
# Target vector: the `ratings` entry unpacked from the pickled graph tuple.
# Presumably one label per (u, v) pair, aligned with the rows of X -- the
# shape check in the next cell is what confirms the lengths agree.
y = ratings

In [11]:
# Length should equal the number of rows of X.
y.shape

(104809,)

In [4]:
from time import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [15]:
# Stratified 80/20 hold-out split (stratify=y keeps the label proportions the
# same in both parts). random_state is pinned so that the split -- and
# therefore every metric computed below -- is reproducible across kernel
# restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, shuffle=True, random_state=42)

In [35]:
# Shapes of the train/test feature matrices and label vectors after the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((83847, 1757), (20962, 1757), (83847,), (20962,))

In [55]:
# Baseline 1: a single, unpruned decision tree, timed with wall-clock time.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so equally good splits can otherwise be resolved differently
# from run to run.
t0 = time()
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
print("Time taken =", time() - t0, "s")

Time taken = 169.81208396 s


In [50]:
# Keep only column 1 of the class-probability matrix to use as the ranking
# score for ROC-AUC. Column 1 is the second entry of dt.classes_ -- presumably
# the positive label 1; confirm.
pred = dt.predict_proba(X_test)[:, 1]

In [14]:
from sklearn.metrics import roc_auc_score

In [51]:
# ROC-AUC of the scores currently bound to `pred` against the held-out labels.
roc_auc_score(y_test, pred)

0.7833585758835759

In [48]:
# NOTE(review): the saved output below is a two-column array, whereas the cell
# that assigns `pred` keeps only column 1 (a 1-D vector). The output appears to
# come from an earlier execution -- re-run to refresh it.
pred

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [54]:
# Baseline 2: random forest with library-default hyper-parameters, timed with
# wall-clock time. Bootstrap sampling and per-split feature sampling are
# random, so random_state is fixed to make the reported AUC reproducible.
t1 = time()
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
print("Time taken =", time() - t1, "s")

Time taken = 11.9432201385 s


In [53]:
# Same evaluation for the random forest: column 1 of predict_proba as score.
# NOTE: this rebinds `pred`, discarding the decision-tree scores.
pred = rf.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, pred)

0.8147317567567568

In [1]:
from tqdm import tqdm

class EnsemDT(object):
    """ Bagging ensemble of decision trees aimed at class imbalance
        (EnsemDT, Ezzat et al. 2017).

        Every base learner is trained on all positive samples plus a
        fresh random subsample of the negative samples, optionally
        restricted to a random subset of the feature columns. The
        ensemble score of a sample is the mean of the labels predicted
        by the base learners. """

    def __init__(self, subset_features=False,
                    num_base_learners=100,
                    np_ratio=5, feat_dim=100,
                    max_depth=None):
        """ Create the (unfitted) base learners.

            :params subset_features: Set true to train every learner
                on its own random subset of feature columns.
            :params num_base_learners: Number of base learners
                (decision trees) to use.
            :params np_ratio: negative to positive samples ratio;
                each learner is given num_pos * np_ratio negatives.
            :params feat_dim: Number of features for
                subsetting.
            :params max_depth: Maximum depth of each
                individual learner. """

        self.num_base_learners = num_base_learners
        self.np_ratio = np_ratio
        self.feat_dim = feat_dim
        self.subset_features = subset_features
        self.max_depth = max_depth

        self.clfs = [DecisionTreeClassifier(max_depth=self.max_depth)
                     for _ in range(self.num_base_learners)]

        # Per learner: positions of the feature columns it was trained
        # on, or None when it uses every column. Filled in by fit().
        self.feat_subsets = [None] * self.num_base_learners

        self.clf_fit = False

    def fit(self, X_pos, X_neg):
        """ Fit EnsemDT on the dataset.

            X_pos and X_neg must be DataFrames
            with label as a column 'y' in each of
            them. Neither frame is modified.

            :params X_pos: Dataset with +ve samples.
            :params X_neg: Dataset with -ve samples.
        """

        self.X_pos = X_pos
        self.X_neg = X_neg

        self.y_pos = X_pos['y']
        self.y_neg = X_neg['y']

        self.num_pos = X_pos.shape[0]
        self.num_neg = X_neg.shape[0]

        # Sampling is without replacement, so never ask for more
        # negatives than exist (that would raise inside .sample()).
        num_neg_i = min(self.num_neg, int(self.num_pos * self.np_ratio))

        # Start from a clean slate so a refit never reuses old subsets.
        self.feat_subsets = [None] * self.num_base_learners

        for i in tqdm(range(self.num_base_learners),
                     desc='Training learners...',
                     unit='learners'):

            # Random sampling of the negatives; all positives are kept
            X_neg_i = self.X_neg.sample(num_neg_i)

            # Merge dataset and split off the label column
            X_i = pd.concat([X_neg_i, self.X_pos])
            y_i = X_i['y']
            X_i = X_i.drop(['y'], axis=1)

            # Feature subsetting. The chosen column positions are
            # remembered so that prediction can present each learner
            # with exactly the columns it was trained on.
            if self.subset_features:
                num_feat = X_i.shape[1]
                cols = np.random.choice(num_feat,
                                        size=min(self.feat_dim, num_feat),
                                        replace=False)
                X_i = X_i.iloc[:, cols]
                self.feat_subsets[i] = cols

            self.clfs[i].fit(X_i, y_i)

        self.clf_fit = True

    def _select_features(self, X_val, i):
        """ Restrict X_val to the columns learner i was trained on.

            Columns are selected by position, so X_val must have the
            same column layout as the training features (the training
            frames without 'y'). """

        subsets = getattr(self, 'feat_subsets', None)
        cols = subsets[i] if subsets is not None else None
        if cols is None:
            return X_val
        if hasattr(X_val, 'iloc'):
            return X_val.iloc[:, cols]
        return np.asarray(X_val)[:, cols]

    def get_scores(self, X_val):
        """ Returns scores of classes. The score of a sample is
            the mean of the labels predicted for it by the base
            learners (for 0/1 labels: the fraction voting 1).

            :params X_val: Validation set (or test).
            :returns: list of floats, one per row of X_val.
            :raises RuntimeError: if called before fit. """

        if not self.clf_fit:
            raise RuntimeError('Call clf.fit before clf.predict.')

        # One row of predictions per learner
        preds = np.vstack([
            self.clfs[i].predict(self._select_features(X_val, i))
            for i in range(self.num_base_learners)
        ])

        # Average over learners
        return preds.mean(axis=0).tolist()

    def predict(self, X_val):
        """ Predict labels for the given validation
            set (0 or 1). Calls the get_scores function
            for prediction.

            :params X_val: Validation set (or test).
            :returns: int32 numpy array of 0/1 labels. """

        scores = np.asarray(self.get_scores(X_val), dtype=float)

        # Explicit threshold instead of round(): round(0.5) is 1.0 on
        # Python 2 but 0 on Python 3 (round-half-to-even), so a tied
        # vote would flip with the interpreter. Ties go to label 1.
        return (scores >= 0.5).astype('int32')

In [17]:
# Displays the four split arrays; NumPy elides the middle of each array in the
# repr, so this is only a quick look at values and dtypes.
X_train, X_test, y_train, y_test

(array([[1., 0., 0., ..., 0., 0., 0.],
        [1., 1., 0., ..., 0., 0., 0.],
        [1., 1., 1., ..., 0., 0., 0.],
        ...,
        [1., 1., 1., ..., 0., 0., 0.],
        [1., 1., 1., ..., 0., 0., 0.],
        [1., 1., 0., ..., 0., 0., 0.]], dtype=float32),
 array([[1., 1., 1., ..., 0., 0., 0.],
        [1., 1., 1., ..., 0., 0., 0.],
        [1., 1., 0., ..., 0., 0., 0.],
        ...,
        [1., 1., 1., ..., 0., 0., 0.],
        [1., 1., 0., ..., 0., 0., 0.],
        [1., 1., 1., ..., 0., 0., 0.]], dtype=float32),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]))