# Classification based on node2vec embeddings and static features

In [1]:
import pickle
import numpy as np

### Train-test split

In [16]:
# Retrieve node embeddings and corresponding subjects
from gensim.models import KeyedVectors
model = KeyedVectors.load('word2vec_all.emb')
node_embeddings = []
node_targets = []
with open('train_data_all.pickle', 'rb') as f:
    train_data = pickle.load(f)
    f.close()
with open('train_index_all.pickle', 'rb') as f:
    list_index = pickle.load(f)
    f.close()
with open('train_target_all.pickle', 'rb') as f:
    list_target = pickle.load(f)
    f.close()
for idx in list(model.wv.vocab):
    if list(list_index).index(int(idx)):
        index = list(list_index).index(int(idx))
        list_data_vector = np.concatenate((list(model.wv.get_vector(idx)), train_data[index])) 
        node_embeddings.append(list_data_vector)
        node_targets.append(list_target[index])
print(len(node_targets))
print(len(node_embeddings))
print(len(train_data[index]))

27525
27525
103


In [17]:
# X will hold the 128-dimensional input features
X = node_embeddings
# y holds the corresponding target values
y = np.array(node_targets)

In [18]:
from sklearn.model_selection import train_test_split
import numpy as np

In [19]:
def create_train_test_split(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)
    print("Train size:", len(X_train))
    print("Test size:", len(X_test))
    print("Classes ratio:", np.mean(y_train))
    return X_train, X_test, y_train, y_test

In [20]:
X_train, X_test, y_train, y_test = create_train_test_split(X, y)

Train size: 22020
Test size: 5505
Classes ratio: 0.48555858310626704


In [21]:
from sklearn import preprocessing
print(X_train)
scaler = preprocessing.StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Sklearn classification models - learn from scratch

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [23]:
models_to_consider = [("LogReg", 
                       LogisticRegression(class_weight = "balanced", max_iter = 10000), 
                       {"solver" : ("lbfgs",),
                        'C':[1, 5, 10]}),
                      ("RandomForestClassifier", 
                       RandomForestClassifier(class_weight = "balanced"), 
                       {"n_estimators":[10, 50, 100]})]

In [24]:
def learn_and_compute_results(models_to_consider, X_train, X_test, y_train, y_test, 
                              print_classification_report = False):
    res = dict()
    for (model_name, model, model_params) in models_to_consider:
        clf = GridSearchCV(model, model_params, cv = 5, scoring = 'roc_auc')
        clf.fit(X_train, y_train)
        res[model_name] = clf.score(X_test, y_test)
        print(model_name, "test ROC AUC:", res[model_name])
        if print_classification_report:
            y_pred = clf.predict(X_test)
            print(classification_report(y_test, y_pred))
    return res

In [25]:
learn_and_compute_results(models_to_consider, X_train, X_test, y_train, y_test)

LogReg test ROC AUC: 0.9947638923235284
RandomForestClassifier test ROC AUC: 0.9922597760403787


{'LogReg': 0.9947638923235284, 'RandomForestClassifier': 0.9922597760403787}

### Combine classification models for node2vec and on static features

In [26]:
with open('best_node2vec_model_all.pickle', "rb") as f:
    clf_node2vec = pickle.load(f)

In [27]:
with open('best_static_model_all.pickle', "rb") as f:
    clf_static = pickle.load(f)

In [30]:
print(np.array(X_train).shape)
print(np.array(X_test).shape)
X_train_static = clf_static["scaler"].transform(X_train[:, :103])
X_train_node2vec = X_train[:, 103:]
X_test_static = clf_static["scaler"].transform(X_test[:, :103])
X_test_node2vec = X_test[:,  103:]

(22020, 231)
(5505, 231)


In [31]:
print("Pretrained static model ROC AUC: ", clf_static["classifier"].score(X_test_static, y_test))
print("Pretrained node2vec model ROC AUC: ", clf_node2vec.score(X_test_node2vec, y_test))

Pretrained static model ROC AUC:  0.5385171023902976
Pretrained node2vec model ROC AUC:  0.5186194368755677


In [32]:
train_node2vec_proba = clf_node2vec.predict_proba(X_train_node2vec)
train_static_proba = clf_static["classifier"].predict_proba(X_train_static)
test_node2vec_proba = clf_node2vec.predict_proba(X_test_node2vec)
test_static_proba = clf_static["classifier"].predict_proba(X_test_static)

In [33]:
train_concat = np.vstack([train_node2vec_proba[:,0], train_static_proba[:,0]]).T
test_concat = np.vstack([test_node2vec_proba[:,0], test_static_proba[:,0]]).T

In [34]:
models_to_consider = [("LogReg", 
                       LogisticRegression(class_weight = "balanced", max_iter = 1000), 
                       {"solver" : ("lbfgs",),
                        'C':[1, 5, 10]}),
                      ("RandomForestClassifier", 
                       RandomForestClassifier(class_weight = "balanced"), 
                       {"n_estimators":[10, 50, 100]})]

In [35]:
res = dict()
clf_grid = dict()

for (model_name, model, model_params) in models_to_consider:
    print(model_name)
    clf_grid[model_name] = GridSearchCV(model, model_params, cv = 5, scoring = 'roc_auc')
    clf_grid[model_name].fit(train_concat, y_train)
    y_pred = clf_grid[model_name].predict(test_concat)
    print("Best ROC AUC: ", clf_grid[model_name].score(test_concat, y_test))

LogReg
Best ROC AUC:  0.5523888180824779
RandomForestClassifier
Best ROC AUC:  0.5454943080100016
