# Baselines

We run link prediction and leadtime prediction heuristic and node2vec baselines in this notebook.

## Heuristic Extraction

We extract the heuristics for each node pair.

In [None]:
import networkx as nx
import pandas as pd
import numpy as np
import pickle
from tqdm.notebook import tqdm

### Positive Samples

In [None]:
# read in data
papers = pd.read_csv('../data/papers_processed.csv')
edges_cc = pd.read_csv('../data/edges_cc_dates.csv')
edges_pc = pd.read_csv('../data/edges_pc_dates.csv')
edges_pp = pd.read_csv('../data/edges_pp_dates.csv')
edges_pp['src'] = edges_pp['src'].astype(str)
edges_pp['dst'] = edges_pp['dst'].astype(str)
valid_concepts = pd.read_csv('../data/valid_concepts.csv')
valid_papers = pd.read_csv('../data/valid_papers.csv')

# read in positive samples
pos = pd.read_csv('../data/sampled_graphs/date_data_pos.csv')
pos.sort_values(by=['end_year', 'end_month'], inplace = True)

# load sampled graph
with open('../data/sampled_undirected.gpickle', 'rb') as handle:
    G = pickle.load(handle)

In [None]:
# drop all nodes/edges not in sampled graph
nodes = set(G.nodes())
papers = papers[papers['id'].isin(nodes)].reset_index(drop = True)
valid_papers = valid_papers[valid_papers['paper'].isin(nodes)].reset_index(drop = True)
valid_concepts = valid_concepts[valid_concepts['CUI'].isin(nodes)].reset_index(drop = True)
edges_cc = edges_cc[(edges_cc['src'].isin(nodes)) & (edges_cc['dst'].isin(nodes))].reset_index(drop = True)
edges_pc = edges_pc[(edges_pc['src'].isin(nodes)) & (edges_pc['dst'].isin(nodes))].reset_index(drop = True)
edges_pp = edges_pp[(edges_pp['src'].isin(nodes)) & (edges_pp['dst'].isin(nodes))].reset_index(drop = True)

In [None]:
# extract heuristics at formation date - leadtime for each positive node pair
DyG = nx.Graph()
DyG.add_nodes_from(valid_concepts['CUI'].values.tolist(), type='concept')

list_cc = []

papers_index = 0
cc_index = 0
pc_index = 0
pp_index = 0
for i in tqdm(range(len(pos.values))):
    
    row = pos.values[i]
    date = (row[6], row[7])
        
    if(papers_index < len(papers)):
        paper = papers.iloc[papers_index]
        while (paper['year'], paper['month']) < date:
            DyG.add_node(paper['id'], type='paper')
            papers_index+=1
            if papers_index<len(papers) :
                paper = papers.iloc[papers_index]
            else:
                break
            
    if(cc_index < len(edges_cc)):
        cc = edges_cc.iloc[cc_index]
        while (cc['year'], cc['month']) < date:
            DyG.add_edge(cc['src'], cc['dst'], type='cc')
            cc_index+=1
            if cc_index<len(edges_cc) :
                cc = edges_cc.iloc[cc_index]
            else:
                break
    
    if(pc_index < len(edges_pc)):
        pc = edges_pc.iloc[pc_index]
        while (pc['year'], pc['month']) < date:
            DyG.add_edge(pc['src'], pc['dst'], type='pc')
            pc_index+=1
            if pc_index<len(edges_pc) :
                pc = edges_pc.iloc[pc_index]
            else:
                break
    
    if(pp_index < len(edges_pp)):
        pp = edges_pp.iloc[pp_index]
        while (pp['year'], pp['month']) < date:
            DyG.add_edge(pp['src'], pp['dst'], type='pp')
            pp_index+=1
            if pp_index<len(edges_pp) :
                pp = edges_pp.iloc[pp_index]
            else:
                break
          
    src = row[0]
    dst = row[1]   
    
    num_neighbors = sum(1 for _ in nx.common_neighbors(DyG, src, dst))
    jaccard = list(nx.jaccard_coefficient(DyG, [(src, dst)]))[0][2]
    pa = list(nx.preferential_attachment(DyG, [(src, dst)]))[0][2]
    aa = list(nx.adamic_adar_index(DyG, [(src, dst)]))[0][2]
    ra = list(nx.resource_allocation_index(DyG, [(src, dst)]))[0][2]
    
    list_cc.append((src, dst, row[2], row[3], row[5], row[6], row[7], num_neighbors, jaccard, pa, aa, ra))

In [None]:
# convert to DataFrame
cc = pd.DataFrame(list_cc, columns = ['src', 'dst', 'year', 'month', 'future_time', 'end_year', 'end_month', 'num_neighbors', 'jaccard', 'pa', 'aa', 'ra'])

# sort by date
cc.sort_values(['year', 'month'], inplace = True)
cc.reset_index(inplace = True, drop =True)

# save
cc.to_csv('../data/sampled_baseline_cc.csv', index = False)

### Negative Samples

In [None]:
# read in data
papers = pd.read_csv('../data/papers_processed.csv')
edges_cc = pd.read_csv('../data/edges_cc_dates.csv')
edges_pc = pd.read_csv('../data/edges_pc_dates.csv')
edges_pp = pd.read_csv('../data/edges_pp_dates.csv')
edges_pp['src'] = edges_pp['src'].astype(str)
edges_pp['dst'] = edges_pp['dst'].astype(str)
valid_concepts = pd.read_csv('../data/valid_concepts.csv')
valid_papers = pd.read_csv('../data/valid_papers.csv')

# read in negative samples
neg = pd.read_csv('../data/sampled_graphs/date_data_neg.csv')
neg.sort_values(by=['end_year', 'end_month'], inplace = True)
    
with open('../data/sampled_undirected.gpickle', 'rb') as handle:
    G = pickle.load(handle)

In [None]:
# drop all nodes/edges not in sampled graph
nodes = set(G.nodes())
papers = papers[papers['id'].isin(nodes)].reset_index(drop = True)
valid_papers = valid_papers[valid_papers['paper'].isin(nodes)].reset_index(drop = True)
valid_concepts = valid_concepts[valid_concepts['CUI'].isin(nodes)].reset_index(drop = True)
edges_cc = edges_cc[(edges_cc['src'].isin(nodes)) & (edges_cc['dst'].isin(nodes))].reset_index(drop = True)
edges_pc = edges_pc[(edges_pc['src'].isin(nodes)) & (edges_pc['dst'].isin(nodes))].reset_index(drop = True)
edges_pp = edges_pp[(edges_pp['src'].isin(nodes)) & (edges_pp['dst'].isin(nodes))].reset_index(drop = True)

In [None]:
# extract heuristics at date - leadtime for each negative node pair

DyG = nx.Graph()
DyG.add_nodes_from(valid_concepts['CUI'].values.tolist(), type='concept')

list_no_cc = []

papers_index = 0
cc_index = 0
pc_index = 0
pp_index = 0
for i in tqdm(range(len(neg.values))):
    
    row = neg.values[i]
    date = (row[5], row[6])
        
    if(papers_index < len(papers)):
        paper = papers.iloc[papers_index]
        while (paper['year'], paper['month']) < date:
            DyG.add_node(paper['id'], type='paper')
            papers_index+=1
            if papers_index<len(papers) :
                paper = papers.iloc[papers_index]
            else:
                break
            
    if(cc_index < len(edges_cc)):
        cc = edges_cc.iloc[cc_index]
        while (cc['year'], cc['month']) < date:
            DyG.add_edge(cc['src'], cc['dst'], type='cc')
            cc_index+=1
            if cc_index<len(edges_cc) :
                cc = edges_cc.iloc[cc_index]
            else:
                break
    
    if(pc_index < len(edges_pc)):
        pc = edges_pc.iloc[pc_index]
        while (pc['year'], pc['month']) < date:
            DyG.add_edge(pc['src'], pc['dst'], type='pc')
            pc_index+=1
            if pc_index<len(edges_pc) :
                pc = edges_pc.iloc[pc_index]
            else:
                break
    
    if(pp_index < len(edges_pp)):
        pp = edges_pp.iloc[pp_index]
        while (pp['year'], pp['month']) < date:
            DyG.add_edge(pp['src'], pp['dst'], type='pp')
            pp_index+=1
            if pp_index<len(edges_pp) :
                pp = edges_pp.iloc[pp_index]
            else:
                break
          
    src = row[0]
    dst = row[1]   
    
    num_neighbors = sum(1 for _ in nx.common_neighbors(DyG, src, dst))
    jaccard = list(nx.jaccard_coefficient(DyG, [(src, dst)]))[0][2]
    pa = list(nx.preferential_attachment(DyG, [(src, dst)]))[0][2]
    aa = list(nx.adamic_adar_index(DyG, [(src, dst)]))[0][2]
    ra = list(nx.resource_allocation_index(DyG, [(src, dst)]))[0][2]
    
    list_no_cc.append((src, dst, row[2], row[3], row[4], row[5], row[6], num_neighbors, jaccard, pa, aa, ra))

In [None]:
# convert to DataFrame
no_cc = pd.DataFrame(list_no_cc, columns = ['src', 'dst', 'year', 'month', 'future_time', 'end_year', 'end_month', 'num_neighbors', 'jaccard', 'pa', 'aa', 'ra'])

# save
no_cc.to_csv('../data/sampled_baseline_no_cc.csv', index = False)

## Heuristic Models - Link Prediction

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid

In [2]:
# read in data
cc = pd.read_csv('../data/sampled_baseline_cc.csv')
no_cc = pd.read_csv('../data/sampled_baseline_no_cc.csv')

In [3]:
# perform train, val, test split. 1/2019 - 7/2020 = train; 8/2020 = val; 9/2020 - 5/2021 = test
cc_split_1 = cc[(cc['year']==2019) & (cc['month']==1)].index[0]
cc_split_2 = cc[(cc['year']==2020) & (cc['month']==7)].index[-1]+1
cc_split_3 = cc[(cc['year']==2020) & (cc['month']==8)].index[-1]+1
train_cc = cc[cc_split_1:cc_split_2].reset_index(drop=True)
train_no_cc = no_cc[cc_split_1:cc_split_2].reset_index(drop=True)
val_cc = cc[cc_split_2:cc_split_3].reset_index(drop=True)
val_no_cc = no_cc[cc_split_2:cc_split_3].reset_index(drop=True)
test_cc = cc[cc_split_3:].reset_index(drop=True)
test_no_cc = no_cc[cc_split_3:].reset_index(drop=True)

assert len(train_cc) == len(train_no_cc)
assert len(val_cc) == len(val_no_cc)
assert len(test_cc) == len(test_no_cc)

In [4]:
total = len(cc[cc_split_1:])
print("Training : {}; validation : {}; testing : {}".format(len(train_cc)/total, len(val_cc)/total, len(test_cc)/total))

Training : 0.8232219851051783; validation : 0.07425634772004704; testing : 0.10252166717477462


In [5]:
# grid search parameters
param_grid = ParameterGrid({'C': [1, 0.1, 0.01, 0.001, 0.0001], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'max_iter':[1000], 'verbose':[1], 'random_state':[12345]})

### Common Neighbors

In [7]:
# training, validation, and testing data
common_neighbors_train_x = np.concatenate((train_cc['num_neighbors'].values.reshape([-1, 1]), train_no_cc['num_neighbors'].values.reshape([-1, 1])), axis=0)
common_neighbors_train_y = np.concatenate((np.ones(len(train_cc)), np.zeros(len(train_no_cc))), axis = 0)
common_neighbors_val_x = np.concatenate((val_cc['num_neighbors'].values.reshape([-1, 1]), val_no_cc['num_neighbors'].values.reshape([-1, 1])), axis=0)
common_neighbors_val_y = np.concatenate((np.ones(len(val_cc)), np.zeros(len(val_no_cc))), axis = 0)
common_neighbors_test_x = np.concatenate((test_cc['num_neighbors'].values.reshape([-1, 1]), test_no_cc['num_neighbors'].values.reshape([-1, 1])), axis=0)
common_neighbors_test_y = np.concatenate((np.ones(len(test_cc)), np.zeros(len(test_no_cc))), axis = 0)

# scale data
common_neighbors_scaler = StandardScaler()
common_neighbors_train_x = common_neighbors_scaler.fit_transform(common_neighbors_train_x)
common_neighbors_val_x = common_neighbors_scaler.transform(common_neighbors_val_x)
common_neighbors_test_x = common_neighbors_scaler.transform(common_neighbors_test_x)

# shuffle train, val, and test arrays in unison
p_train = np.random.permutation(len(common_neighbors_train_x))
common_neighbors_train_x = common_neighbors_train_x[p_train]
common_neighbors_train_y = common_neighbors_train_y[p_train]
p_val = np.random.permutation(len(common_neighbors_val_x))
common_neighbors_val_x = common_neighbors_val_x[p_val]
common_neighbors_val_y = common_neighbors_val_y[p_val]
p_test = np.random.permutation(len(common_neighbors_test_x))
common_neighbors_test_x = common_neighbors_test_x[p_test]
common_neighbors_test_y = common_neighbors_test_y[p_test]

# fit logistic regression and measure F1 score on validation set, keeping track of the best set of hyperparameters
best_grid = None
best_f1 = 0
for g in param_grid:
    common_neighbors_clf = LogisticRegression(**g).fit(common_neighbors_train_x, common_neighbors_train_y)
    
    predictions = common_neighbors_clf.predict(common_neighbors_val_x)
    f1 = metrics.f1_score(common_neighbors_val_y, predictions)
    
    if f1 > best_f1:
        best_f1 = f1
        best_grid = g

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibLinear]convergence after 13 epochs took 0 seconds
convergence after 11 epochs took 0 seconds
[LibLinear]

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


convergence after 16 epochs took 0 seconds
convergence after 11 epochs took 0 seconds
[LibLinear]convergence after 12 epochs took 0 seconds
convergence after 11 epochs took 0 seconds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


[LibLinear]convergence after 10 epochs took 0 seconds
convergence after 10 epochs took 0 seconds
[LibLinear]rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


rescaling...
rescaling...
rescaling...
convergence after 5 epochs took 0 seconds
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
convergence after 10 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [8]:
common_neighbors_clf = LogisticRegression(**best_grid).fit(common_neighbors_train_x, common_neighbors_train_y)

predictions = common_neighbors_clf.predict(common_neighbors_test_x)
prob_predictions = common_neighbors_clf.predict_proba(common_neighbors_test_x)[:,-1]

accuracy = metrics.accuracy_score(common_neighbors_test_y, predictions)
roc_score = metrics.roc_auc_score(common_neighbors_test_y, prob_predictions)
precision = metrics.precision_score(common_neighbors_test_y, predictions)
recall = metrics.recall_score(common_neighbors_test_y, predictions)
f1 = metrics.f1_score(common_neighbors_test_y, predictions)
print('Accuracy : {}\nROC-AUC : {}\nPrecision : {}\nRecall : {}\nF1 : {}'.format(accuracy, roc_score, precision, recall, f1))

Accuracy : 0.6682242990654206
ROC-AUC : 0.738019632881431
Precision : 0.8173076923076923
Recall : 0.4333050127442651
F1 : 0.5663520266518601


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


### Jaccard

In [9]:
#training, validation, and testing data
jaccard_train_x = np.concatenate((train_cc['jaccard'].values.reshape([-1, 1]), train_no_cc['jaccard'].values.reshape([-1, 1])), axis=0)
jaccard_train_y = np.concatenate((np.ones(len(train_cc)), np.zeros(len(train_no_cc))), axis = 0)
jaccard_val_x = np.concatenate((val_cc['jaccard'].values.reshape([-1, 1]), val_no_cc['jaccard'].values.reshape([-1, 1])), axis=0)
jaccard_val_y = np.concatenate((np.ones(len(val_cc)), np.zeros(len(val_no_cc))), axis = 0)
jaccard_test_x = np.concatenate((test_cc['jaccard'].values.reshape([-1, 1]), test_no_cc['jaccard'].values.reshape([-1, 1])), axis=0)
jaccard_test_y = np.concatenate((np.ones(len(test_cc)), np.zeros(len(test_no_cc))), axis = 0)

# scale data
jaccard_scaler = StandardScaler()
jaccard_train_x = jaccard_scaler.fit_transform(jaccard_train_x)
jaccard_val_x = jaccard_scaler.transform(jaccard_val_x)
jaccard_test_x = jaccard_scaler.transform(jaccard_test_x)

# shuffle train, val, and test arrays in unison
p_train = np.random.permutation(len(jaccard_train_x))
jaccard_train_x = jaccard_train_x[p_train]
jaccard_train_y = jaccard_train_y[p_train]
p_val = np.random.permutation(len(jaccard_val_x))
jaccard_val_x = jaccard_val_x[p_val]
jaccard_val_y = jaccard_val_y[p_val]
p_test = np.random.permutation(len(jaccard_test_x))
jaccard_test_x = jaccard_test_x[p_test]
jaccard_test_y = jaccard_test_y[p_test]

# fit logistic regression and measure F1 score on validation set, keeping track of the best set of hyperparameters
best_grid = None
best_f1 = 0
for g in param_grid:
    jaccard_clf = LogisticRegression(**g).fit(jaccard_train_x, jaccard_train_y)
    
    predictions = jaccard_clf.predict(jaccard_val_x)
    f1 = metrics.f1_score(jaccard_val_y, predictions)
    
    if f1 > best_f1:
        best_f1 = f1
        best_grid = g

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


[LibLinear]convergence after 17 epochs took 0 seconds
convergence after 10 epochs took 0 seconds
[LibLinear]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 8 epochs took 0 seconds
convergence after 4 epochs took 0 seconds
[LibLinear]convergence after 17 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


convergence after 10 epochs took 0 seconds
[LibLinear]convergence after 12 epochs took 0 seconds
convergence after 10 epochs took 0 seconds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibLinear]rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
convergence after 10 epochs took 0 seconds
convergence after 9 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [10]:
jaccard_clf = LogisticRegression(**best_grid).fit(jaccard_train_x, jaccard_train_y)

predictions = jaccard_clf.predict(jaccard_test_x)
prob_predictions = jaccard_clf.predict_proba(jaccard_test_x)[:,-1]

accuracy = metrics.accuracy_score(jaccard_test_y, predictions)
roc_score = metrics.roc_auc_score(jaccard_test_y, prob_predictions)
precision = metrics.precision_score(jaccard_test_y, predictions)
recall = metrics.recall_score(jaccard_test_y, predictions)
f1 = metrics.f1_score(jaccard_test_y, predictions)
print('Accuracy : {}\nROC-AUC : {}\nPrecision : {}\nRecall : {}\nF1 : {}'.format(accuracy, roc_score, precision, recall, f1))

[LibLinear]Accuracy : 0.6879779099405268
ROC-AUC : 0.7414966950089112
Precision : 0.7983816587997303
Recall : 0.5029736618521665
F1 : 0.617148814177743


### Preferential Attachment

In [11]:
#training, validation, and testing data
pa_train_x = np.concatenate((train_cc['pa'].values.reshape([-1, 1]), train_no_cc['pa'].values.reshape([-1, 1])), axis=0)
pa_train_y = np.concatenate((np.ones(len(train_cc)), np.zeros(len(train_no_cc))), axis = 0)
pa_val_x = np.concatenate((val_cc['pa'].values.reshape([-1, 1]), val_no_cc['pa'].values.reshape([-1, 1])), axis=0)
pa_val_y = np.concatenate((np.ones(len(val_cc)), np.zeros(len(val_no_cc))), axis = 0)
pa_test_x = np.concatenate((test_cc['pa'].values.reshape([-1, 1]), test_no_cc['pa'].values.reshape([-1, 1])), axis=0)
pa_test_y = np.concatenate((np.ones(len(test_cc)), np.zeros(len(test_no_cc))), axis = 0)

# scale data
pa_scaler = StandardScaler()
pa_train_x = pa_scaler.fit_transform(pa_train_x)
pa_val_x = pa_scaler.transform(pa_val_x)
pa_test_x = pa_scaler.transform(pa_test_x)

# shuffle train, val, and test arrays in unison
p_train = np.random.permutation(len(pa_train_x))
pa_train_x = pa_train_x[p_train]
pa_train_y = pa_train_y[p_train]
p_val = np.random.permutation(len(pa_val_x))
pa_val_x = pa_val_x[p_val]
pa_val_y = pa_val_y[p_val]
p_test = np.random.permutation(len(pa_test_x))
pa_test_x = pa_test_x[p_test]
pa_test_y = pa_test_y[p_test]

# fit logistic regression and measure F1 score on validation set, keeping track of the best set of hyperparameters
best_grid = None
best_f1 = 0
for g in param_grid:
    pa_clf = LogisticRegression(**g).fit(pa_train_x, pa_train_y)
    
    predictions = pa_clf.predict(pa_val_x)
    f1 = metrics.f1_score(pa_val_y, predictions)
    
    if f1 > best_f1:
        best_f1 = f1
        best_grid = g

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibLinear]convergence after 15 epochs took 0 seconds
convergence after 11 epochs took 0 seconds
[LibLinear]

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


convergence after 15 epochs took 0 seconds
convergence after 11 epochs took 0 seconds
[LibLinear]convergence after 17 epochs took 0 seconds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 11 epochs took 0 seconds
[LibLinear]convergence after 6 epochs took 0 seconds
convergence after 11 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


[LibLinear]convergence after 12 epochs took 0 seconds
convergence after 9 epochs took 0 seconds


In [12]:
pa_clf = LogisticRegression(**best_grid).fit(pa_train_x, pa_train_y)

predictions = pa_clf.predict(pa_test_x)
prob_predictions = pa_clf.predict_proba(pa_test_x)[:,-1]

accuracy = metrics.accuracy_score(pa_test_y, predictions)
roc_score = metrics.roc_auc_score(pa_test_y, prob_predictions)
precision = metrics.precision_score(pa_test_y, predictions)
recall = metrics.recall_score(pa_test_y, predictions)
f1 = metrics.f1_score(pa_test_y, predictions)
print('Accuracy : {}\nROC-AUC : {}\nPrecision : {}\nRecall : {}\nF1 : {}'.format(accuracy, roc_score, precision, recall, f1))

Accuracy : 0.5276125743415463
ROC-AUC : 0.5468877790041211
Precision : 0.5627413127413128
Recall : 0.24766355140186916
F1 : 0.343952802359882


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


### Adamic-Adar

In [13]:
#training, validation, and testing data
aa_train_x = np.concatenate((train_cc['aa'].values.reshape([-1, 1]), train_no_cc['aa'].values.reshape([-1, 1])), axis=0)
aa_train_y = np.concatenate((np.ones(len(train_cc)), np.zeros(len(train_no_cc))), axis = 0)
aa_val_x = np.concatenate((val_cc['aa'].values.reshape([-1, 1]), val_no_cc['aa'].values.reshape([-1, 1])), axis=0)
aa_val_y = np.concatenate((np.ones(len(val_cc)), np.zeros(len(val_no_cc))), axis = 0)
aa_test_x = np.concatenate((test_cc['aa'].values.reshape([-1, 1]), test_no_cc['aa'].values.reshape([-1, 1])), axis=0)
aa_test_y = np.concatenate((np.ones(len(test_cc)), np.zeros(len(test_no_cc))), axis = 0)

# scale data
aa_scaler = StandardScaler()
aa_train_x = aa_scaler.fit_transform(aa_train_x)
aa_val_x = aa_scaler.transform(aa_val_x)
aa_test_x = aa_scaler.transform(aa_test_x)

# shuffle train, val, and test arrays in unison
p_train = np.random.permutation(len(aa_train_x))
aa_train_x = aa_train_x[p_train]
aa_train_y = aa_train_y[p_train]
p_val = np.random.permutation(len(aa_val_x))
aa_val_x = aa_val_x[p_val]
aa_val_y = aa_val_y[p_val]
p_test = np.random.permutation(len(aa_test_x))
aa_test_x = aa_test_x[p_test]
aa_test_y = aa_test_y[p_test]

# fit logistic regression and measure F1 score on validation set, keeping track of the best set of hyperparameters
best_grid = None
best_f1 = 0
for g in param_grid:
    aa_clf = LogisticRegression(**g).fit(aa_train_x, aa_train_y)
    
    predictions = aa_clf.predict(aa_val_x)
    f1 = metrics.f1_score(aa_val_y, predictions)
    
    if f1 > best_f1:
        best_f1 = f1
        best_grid = g

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LibLinear]convergence after 16 epochs took 0 seconds
convergence after 7 epochs took 0 seconds
[LibLinear]

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


convergence after 11 epochs took 0 seconds
convergence after 7 epochs took 0 seconds
[LibLinear]convergence after 14 epochs took 0 seconds
convergence after 10 epochs took 0 seconds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


[LibLinear]convergence after 8 epochs took 0 seconds
convergence after 10 epochs took 0 seconds
[LibLinear]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
convergence after 8 epochs took 0 seconds
convergence after 5 epochs took 0 seconds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [14]:
aa_clf = LogisticRegression(**best_grid).fit(aa_train_x, aa_train_y)

predictions = aa_clf.predict(aa_test_x)
prob_predictions = aa_clf.predict_proba(aa_test_x)[:,-1]

accuracy = metrics.accuracy_score(aa_test_y, predictions)
roc_score = metrics.roc_auc_score(aa_test_y, prob_predictions)
precision = metrics.precision_score(aa_test_y, predictions)
recall = metrics.recall_score(aa_test_y, predictions)
f1 = metrics.f1_score(aa_test_y, predictions)
print('Accuracy : {}\nROC-AUC : {}\nPrecision : {}\nRecall : {}\nF1 : {}'.format(accuracy, roc_score, precision, recall, f1))

Accuracy : 0.6607901444350043
ROC-AUC : 0.7332444134209274
Precision : 0.7982663514578409
Recall : 0.43033135089209856
F1 : 0.5592050786640905


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


### Resource Allocation

In [15]:
#training, validation, and testing data
ra_train_x = np.concatenate((train_cc['ra'].values.reshape([-1, 1]), train_no_cc['ra'].values.reshape([-1, 1])), axis=0)
ra_train_y = np.concatenate((np.ones(len(train_cc)), np.zeros(len(train_no_cc))), axis = 0)
ra_val_x = np.concatenate((val_cc['ra'].values.reshape([-1, 1]), val_no_cc['ra'].values.reshape([-1, 1])), axis=0)
ra_val_y = np.concatenate((np.ones(len(val_cc)), np.zeros(len(val_no_cc))), axis = 0)
ra_test_x = np.concatenate((test_cc['ra'].values.reshape([-1, 1]), test_no_cc['ra'].values.reshape([-1, 1])), axis=0)
ra_test_y = np.concatenate((np.ones(len(test_cc)), np.zeros(len(test_no_cc))), axis = 0)

# scale data
ra_scaler = StandardScaler()
ra_train_x = ra_scaler.fit_transform(ra_train_x)
ra_val_x = ra_scaler.transform(ra_val_x)
ra_test_x = ra_scaler.transform(ra_test_x)

# shuffle train, val, and test arrays in unison
p_train = np.random.permutation(len(ra_train_x))
ra_train_x = ra_train_x[p_train]
ra_train_y = ra_train_y[p_train]
p_val = np.random.permutation(len(ra_val_x))
ra_val_x = ra_val_x[p_val]
ra_val_y = ra_val_y[p_val]
p_test = np.random.permutation(len(ra_test_x))
ra_test_x = ra_test_x[p_test]
ra_test_y = ra_test_y[p_test]

# fit logistic regression and measure F1 score on validation set, keeping track of the best set of hyperparameters
best_grid = None
best_f1 = 0
for g in param_grid:
    ra_clf = LogisticRegression(**g).fit(ra_train_x, ra_train_y)
    
    predictions = ra_clf.predict(ra_val_x)
    f1 = metrics.f1_score(ra_val_y, predictions)
    
    if f1 > best_f1:
        best_f1 = f1
        best_grid = g

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


[LibLinear]convergence after 18 epochs took 0 seconds
convergence after 16 epochs took 0 seconds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


[LibLinear]convergence after 17 epochs took 0 seconds
convergence after 16 epochs took 0 seconds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


[LibLinear]convergence after 13 epochs took 0 seconds
convergence after 15 epochs took 0 seconds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


[LibLinear]convergence after 12 epochs took 0 seconds
convergence after 10 epochs took 0 seconds
[LibLinear]convergence after 9 epochs took 0 seconds
convergence after 14 epochs took 0 seconds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [16]:
ra_clf = LogisticRegression(**best_grid).fit(ra_train_x, ra_train_y)

predictions = ra_clf.predict(ra_test_x)
prob_predictions = ra_clf.predict_proba(ra_test_x)[:,-1]

accuracy = metrics.accuracy_score(ra_test_y, predictions)
roc_score = metrics.roc_auc_score(ra_test_y, prob_predictions)
precision = metrics.precision_score(ra_test_y, predictions)
recall = metrics.recall_score(ra_test_y, predictions)
f1 = metrics.f1_score(ra_test_y, predictions)
print('Accuracy : {}\nROC-AUC : {}\nPrecision : {}\nRecall : {}\nF1 : {}'.format(accuracy, roc_score, precision, recall, f1))

Accuracy : 0.5488530161427357
ROC-AUC : 0.7070537034884854
Precision : 0.6448362720403022
Recall : 0.21750212404418012
F1 : 0.32528589580686146


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


## Node2Vec - Link Prediction

### Training

In [None]:
from nodevectors import Node2Vec

In [None]:
# read in data
with open('../data/sampled_undirected.gpickle', 'rb') as handle:
    G = pickle.load(handle)

cc = pd.read_csv('../data/sampled_baseline_cc.csv')
papers = pd.read_csv('../data/papers_processed.csv')
no_cc = pd.read_csv('../data/sampled_baseline_no_cc.csv')
cc_split = cc[(cc['year']==2020) & (cc['month']==7)].index[-1]+1
test_cc = cc[cc_split:]

In [None]:
#remove testing and validation edges from graph
for i in tqdm(test_cc.index):
    row = test_cc.loc[i]
    src = row['src']
    dst = row['dst']
    G.remove_edge(src, dst)

In [None]:
nodes = set(G.nodes())
papers = papers[papers['id'].isin(nodes)].reset_index(drop = True)
papers_split = papers[(papers['year']==2020) & (papers['month']==7)].index[-1]+1
papers = papers[papers_split:]

In [None]:
#remove papers published after July 2020
for i in tqdm(papers.index):
    row = papers.loc[i]
    G.remove_node(row['id'])

In [None]:
# fit Node2Vec and save model
g2v = Node2Vec(n_components=128, walklen=80, epochs=10, threads=0)
g2v.fit(G)
g2v.save('../data/sampled_node2vec')

### Testing

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [None]:
# read in n2v embeddings
g2v = Node2Vec.load('../data/sampled_node2vec.zip')

In [None]:
# read in data
cc = pd.read_csv('../data/sampled_baseline_cc.csv')
no_cc = pd.read_csv('../data/sampled_baseline_no_cc.csv')

# train, val, test split. 1/2019 - 7/2020 = train; 8/2020 = val; 9/2020 - 5/2021 = test
cc_split_1 = cc[(cc['year']==2019) & (cc['month']==1)].index[0]
cc_split_2 = cc[(cc['year']==2020) & (cc['month']==7)].index[-1]+1
cc_split_3 = cc[(cc['year']==2020) & (cc['month']==8)].index[-1]+1
train_cc = cc[cc_split_1:cc_split_2].reset_index(drop=True)
train_no_cc = no_cc[cc_split_1:cc_split_2].reset_index(drop=True)
val_cc = cc[cc_split_2:cc_split_3].reset_index(drop=True)
val_no_cc = no_cc[cc_split_2:cc_split_3].reset_index(drop=True)
test_cc = cc[cc_split_3:].reset_index(drop=True)
test_no_cc = no_cc[cc_split_3:].reset_index(drop=True)

In [None]:
# training data
n2v_train_x = []
for row in tqdm(train_cc.values):
    n2v_train_x.append(np.multiply(g2v.predict(row[0]), g2v.predict(row[1])))
for row in tqdm(train_no_cc.values):
    n2v_train_x.append(np.multiply(g2v.predict(row[0]), g2v.predict(row[1])))
n2v_train_x = np.array(n2v_train_x)

# val data
n2v_val_x = []
for row in tqdm(val_cc.values):
    n2v_val_x.append(np.multiply(g2v.predict(row[0]), g2v.predict(row[1])))
for row in tqdm(val_no_cc.values):
    n2v_val_x.append(np.multiply(g2v.predict(row[0]), g2v.predict(row[1])))
n2v_val_x = np.array(n2v_val_x)

# test data
n2v_test_x = []
for row in tqdm(test_cc.values):
    n2v_test_x.append(np.multiply(g2v.predict(row[0]), g2v.predict(row[1])))
for row in tqdm(test_no_cc.values):
    n2v_test_x.append(np.multiply(g2v.predict(row[0]), g2v.predict(row[1])))
n2v_test_x = np.array(n2v_test_x)

# scale data
n2v_scaler = StandardScaler()
n2v_train_x = n2v_scaler.fit_transform(n2v_train_x)
n2v_val_x = n2v_scaler.transform(n2v_val_x)
n2v_test_x = n2v_scaler.transform(n2v_test_x)

# ground truth
n2v_train_y = np.concatenate((np.ones(len(train_cc)), np.zeros(len(train_no_cc))), axis = 0)
n2v_val_y = np.concatenate((np.ones(len(val_cc)), np.zeros(len(val_no_cc))), axis = 0)
n2v_test_y = np.concatenate((np.ones(len(test_cc)), np.zeros(len(test_no_cc))), axis = 0)

# shuffle train and test arrays in unison
p_train = np.random.permutation(len(n2v_train_x))
n2v_train_x = n2v_train_x[p_train]
n2v_train_y = n2v_train_y[p_train]
p_val = np.random.permutation(len(n2v_val_x))
n2v_val_x = n2v_val_x[p_val]
n2v_val_y = n2v_val_y[p_val]
p_test = np.random.permutation(len(n2v_test_x))
n2v_test_x = n2v_test_x[p_test]
n2v_test_y = n2v_test_y[p_test]

In [None]:
# convert to PyTorch tensor
n2v_train_x = torch.from_numpy(n2v_train_x).type(torch.float32)
n2v_train_y = torch.from_numpy(n2v_train_y).type(torch.float32)
n2v_val_x = torch.from_numpy(n2v_val_x).type(torch.float32)
n2v_val_y = torch.from_numpy(n2v_val_y).type(torch.float32)
n2v_test_x = torch.from_numpy(n2v_test_x).type(torch.float32)
n2v_test_y = torch.from_numpy(n2v_test_y).type(torch.float32)

In [None]:
# 2-layer vanilla neural network
class Net(nn.Module):
    def __init__(self):
        super().__init__()
     
        self.fc1 = nn.Linear(128, 32)
        self.fc2 = nn.Linear(32, 4)
        self.fc3 = nn.Linear(4, 1)
        self.dropout = nn.Dropout()
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        x = torch.sigmoid(x)
        return x

In [None]:
# hyperparameters
EPOCHS = 4
BATCH_SIZE = 16

In [None]:
# train model
device = torch.device('cuda:0')
net = Net().to(device)
optimizer = optim.Adam(net.parameters(), lr = 0.0001, weight_decay=1e-6)

train_loss_vals = []
val_loss_vals = []
for epoch in range(EPOCHS):
    net.train()
    epoch_train_loss = []
    for i in tqdm(range(0, len(n2v_train_x), BATCH_SIZE)):
        batch_x = n2v_train_x[i:i+BATCH_SIZE].to(device)
        batch_y = n2v_train_y[i:i+BATCH_SIZE].to(device).reshape((-1, 1)).float()
        
        optimizer.zero_grad()
        output = net(batch_x)
        
        loss = F.binary_cross_entropy(output, batch_y)
        epoch_train_loss.append(loss.item())
        
        loss.backward()
        optimizer.step()
    
    train_loss_vals.append(sum(epoch_train_loss)/len(epoch_train_loss))
    
    net.eval()
    epoch_val_loss = []
    for i in tqdm(range(0, len(n2v_val_x), BATCH_SIZE)):
        output = net(n2v_val_x[i:i+BATCH_SIZE].to(device)).cpu()
        loss = F.binary_cross_entropy(output, n2v_val_y[i:i+BATCH_SIZE].reshape(-1, 1))
        
        epoch_val_loss.append(loss.item())

    val_loss_vals.append(sum(epoch_val_loss)/len(epoch_val_loss))

    print("Epoch: {}; Train Loss: {}; Val Loss: {}".format(epoch, train_loss_vals[-1], val_loss_vals[-1]))

In [None]:
# plot train loss
plt.plot(np.linspace(1, EPOCHS, EPOCHS).astype(int), train_loss_vals)

In [None]:
# plot val loss
plt.plot(np.linspace(1, EPOCHS, EPOCHS).astype(int), val_loss_vals)

In [None]:
# evaluation metrics
with torch.no_grad():
    prob_predictions = net(n2v_test_x.to(device)).cpu().numpy()
    predictions = (prob_predictions>0.5)
    accuracy = metrics.accuracy_score(n2v_test_y.numpy(), predictions)
    roc_score = metrics.roc_auc_score(n2v_test_y.numpy(), prob_predictions)
    precision = metrics.precision_score(n2v_test_y.numpy(), predictions)
    recall = metrics.recall_score(n2v_test_y.numpy(), predictions)
    f1 = metrics.f1_score(n2v_test_y, predictions)
    print('Accuracy : {}\nROC-AUC : {}\nPrecision : {}\nRecall : {}\nF1 : {}'.format(accuracy, roc_score, precision, recall, f1))

## Heuristics - Leadtime Prediction

In [17]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid

In [22]:
# read in data
cc = pd.read_csv('../data/sampled_baseline_cc.csv')
no_cc = pd.read_csv('../data/sampled_baseline_no_cc.csv')

# train, val, test split. 
cc_split_1 = cc[(cc['year']==2019) & (cc['month']==1)].index[0]
cc_split_2 = cc[(cc['year']==2020) & (cc['month']==7)].index[-1]+1
cc_split_3 = cc[(cc['year']==2020) & (cc['month']==8)].index[-1]+1
train_cc = cc[cc_split_1:cc_split_2].reset_index(drop=True)
train_no_cc = no_cc[cc_split_1:cc_split_2].reset_index(drop=True)
val_cc = cc[cc_split_2:cc_split_3].reset_index(drop=True)
val_no_cc = no_cc[cc_split_2:cc_split_3].reset_index(drop=True)
test_cc = cc[cc_split_3:].reset_index(drop=True)
test_no_cc = no_cc[cc_split_3:].reset_index(drop=True)

assert len(train_cc) == len(train_no_cc)
assert len(val_cc) == len(val_no_cc)
assert len(test_cc) == len(test_no_cc)

total = len(cc[cc_split_1:])
print("Training : {}; validation : {}; testing : {}".format(len(train_cc)/total, len(val_cc)/total, len(test_cc)/total))

Training : 0.8232219851051783; validation : 0.07425634772004704; testing : 0.10252166717477462


In [24]:
# grid search parameters
param_grid = ParameterGrid({'C': [1, 0.1, 0.01, 0.001, 0.0001], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'max_iter':[1000], 'verbose':[1], 'random_state':[12345], 'multi_class':['multinomial']})

### Common Neighbors

In [25]:
#training, validation, and testing data
common_neighbors_train_x = np.concatenate((train_cc['num_neighbors'].values.reshape([-1, 1]), train_no_cc['num_neighbors'].values.reshape([-1, 1])), axis=0)
common_neighbors_train_y = np.concatenate((train_cc['future_time'].values, np.zeros(len(train_no_cc))), axis = 0)
common_neighbors_val_x = np.concatenate((val_cc['num_neighbors'].values.reshape([-1, 1]), val_no_cc['num_neighbors'].values.reshape([-1, 1])), axis=0)
common_neighbors_val_y = np.concatenate((val_cc['future_time'].values, np.zeros(len(val_no_cc))), axis = 0)
common_neighbors_test_x = np.concatenate((test_cc['num_neighbors'].values.reshape([-1, 1]), test_no_cc['num_neighbors'].values.reshape([-1, 1])), axis=0)
common_neighbors_test_y = np.concatenate((test_cc['future_time'].values, np.zeros(len(test_no_cc))), axis = 0)

# scale data
common_neighbors_scaler = StandardScaler()
common_neighbors_train_x = common_neighbors_scaler.fit_transform(common_neighbors_train_x)
common_neighbors_val_x = common_neighbors_scaler.transform(common_neighbors_val_x)
common_neighbors_test_x = common_neighbors_scaler.transform(common_neighbors_test_x)

# shuffle train, val, and test arrays in unison
p_train = np.random.permutation(len(common_neighbors_train_x))
common_neighbors_train_x = common_neighbors_train_x[p_train]
common_neighbors_train_y = common_neighbors_train_y[p_train]
p_val = np.random.permutation(len(common_neighbors_val_x))
common_neighbors_val_x = common_neighbors_val_x[p_val]
common_neighbors_val_y = common_neighbors_val_y[p_val]
p_test = np.random.permutation(len(common_neighbors_test_x))
common_neighbors_test_x = common_neighbors_test_x[p_test]
common_neighbors_test_y = common_neighbors_test_y[p_test]

# fit logistic regression and measure accuracy score on validation set, keeping track of the best set of hyperparameters
best_grid = None
best_acc = 0
for g in param_grid:
    common_neighbors_clf = LogisticRegression(**g).fit(common_neighbors_train_x, common_neighbors_train_y)
    
    predictions = common_neighbors_clf.predict_proba(common_neighbors_val_x)
    acc = metrics.roc_auc_score(common_neighbors_val_y, predictions, multi_class='ovr')
    
    if acc > best_acc:
        best_acc = acc
        best_grid = g
        

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 22 epochs took 0 seconds
convergence after 12 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 21 epochs took 1 seconds
convergence after 12 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 17 epochs took 0 seconds
convergence after 11 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 12 epochs took 0 seconds
convergence after 11 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
convergence after 11 epochs took 0 seconds
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
convergence after 12 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [26]:
common_neighbors_clf = LogisticRegression(**best_grid).fit(common_neighbors_train_x, common_neighbors_train_y)

predictions = common_neighbors_clf.predict(common_neighbors_test_x)
prob_predictions = common_neighbors_clf.predict_proba(common_neighbors_test_x)

print('Accuracy: {}\nAUC: {}'.format(metrics.accuracy_score(common_neighbors_test_y, predictions), metrics.roc_auc_score(common_neighbors_test_y, prob_predictions, multi_class='ovr')))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.6191588785046729
AUC: 0.5983486185874572


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


### Jaccard

In [27]:
#training, validation, and testing data
jaccard_train_x = np.concatenate((train_cc['jaccard'].values.reshape([-1, 1]), train_no_cc['jaccard'].values.reshape([-1, 1])), axis=0)
jaccard_train_y = np.concatenate((train_cc['future_time'].values, np.zeros(len(train_no_cc))), axis = 0)
jaccard_val_x = np.concatenate((val_cc['jaccard'].values.reshape([-1, 1]), val_no_cc['jaccard'].values.reshape([-1, 1])), axis=0)
jaccard_val_y = np.concatenate((val_cc['future_time'].values, np.zeros(len(val_no_cc))), axis = 0)
jaccard_test_x = np.concatenate((test_cc['jaccard'].values.reshape([-1, 1]), test_no_cc['jaccard'].values.reshape([-1, 1])), axis=0)
jaccard_test_y = np.concatenate((test_cc['future_time'].values, np.zeros(len(test_no_cc))), axis = 0)

# scale data
jaccard_scaler = StandardScaler()
jaccard_train_x = jaccard_scaler.fit_transform(jaccard_train_x)
jaccard_val_x = jaccard_scaler.transform(jaccard_val_x)
jaccard_test_x = jaccard_scaler.transform(jaccard_test_x)

# shuffle train, val, and test arrays in unison
p_train = np.random.permutation(len(jaccard_train_x))
jaccard_train_x = jaccard_train_x[p_train]
jaccard_train_y = jaccard_train_y[p_train]
p_val = np.random.permutation(len(jaccard_val_x))
jaccard_val_x = jaccard_val_x[p_val]
jaccard_val_y = jaccard_val_y[p_val]
p_test = np.random.permutation(len(jaccard_test_x))
jaccard_test_x = jaccard_test_x[p_test]
jaccard_test_y = jaccard_test_y[p_test]

# fit logistic regression and measure accuracy score on validation set, keeping track of the best set of hyperparameters
best_grid = None
best_acc = 0
for g in param_grid:
    jaccard_clf = LogisticRegression(**g).fit(jaccard_train_x, jaccard_train_y)
    
    predictions = jaccard_clf.predict_proba(jaccard_val_x)
    acc = metrics.roc_auc_score(jaccard_val_y, predictions, multi_class='ovr')
    
    if acc > best_acc:
        best_acc = acc
        best_grid = g

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 18 epochs took 0 seconds
convergence after 11 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 16 epochs took 0 seconds
convergence after 11 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 16 epochs took 0 seconds
convergence after 11 epochs took 1 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 14 epochs took 0 seconds
convergence after 11 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
convergence after 12 epochs took 0 seconds
convergence after 12 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [28]:
jaccard_clf = LogisticRegression(**best_grid).fit(jaccard_train_x, jaccard_train_y)

predictions = jaccard_clf.predict(jaccard_test_x)
prob_predictions = jaccard_clf.predict_proba(jaccard_test_x)

print('Accuracy: {}\nAUC: {}'.format(metrics.accuracy_score(jaccard_test_y, predictions), metrics.roc_auc_score(jaccard_test_y, prob_predictions, multi_class='ovr')))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.6257434154630417
AUC: 0.617586850282337


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


### PA

In [29]:
#training, validation, and testing data
pa_train_x = np.concatenate((train_cc['pa'].values.reshape([-1, 1]), train_no_cc['pa'].values.reshape([-1, 1])), axis=0)
pa_train_y = np.concatenate((train_cc['future_time'].values, np.zeros(len(train_no_cc))), axis = 0)
pa_val_x = np.concatenate((val_cc['pa'].values.reshape([-1, 1]), val_no_cc['pa'].values.reshape([-1, 1])), axis=0)
pa_val_y = np.concatenate((val_cc['future_time'].values, np.zeros(len(val_no_cc))), axis = 0)
pa_test_x = np.concatenate((test_cc['pa'].values.reshape([-1, 1]), test_no_cc['pa'].values.reshape([-1, 1])), axis=0)
pa_test_y = np.concatenate((test_cc['future_time'].values, np.zeros(len(test_no_cc))), axis = 0)

# scale data
pa_scaler = StandardScaler()
pa_train_x = pa_scaler.fit_transform(pa_train_x)
pa_val_x = pa_scaler.transform(pa_val_x)
pa_test_x = pa_scaler.transform(pa_test_x)

# shuffle train, val, and test arrays in unison
p_train = np.random.permutation(len(pa_train_x))
pa_train_x = pa_train_x[p_train]
pa_train_y = pa_train_y[p_train]
p_val = np.random.permutation(len(pa_val_x))
pa_val_x = pa_val_x[p_val]
pa_val_y = pa_val_y[p_val]
p_test = np.random.permutation(len(pa_test_x))
pa_test_x = pa_test_x[p_test]
pa_test_y = pa_test_y[p_test]

# fit logistic regression and measure accuracy score on validation set, keeping track of the best set of hyperparameters
best_grid = None
best_acc = 0
for g in param_grid:
    pa_clf = LogisticRegression(**g).fit(pa_train_x, pa_train_y)
    
    predictions = pa_clf.predict_proba(pa_val_x)
    acc = metrics.roc_auc_score(pa_val_y, predictions, multi_class='ovr')
    
    if acc > best_acc:
        best_acc = acc
        best_grid = g

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 30 epochs took 0 seconds
convergence after 14 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 29 epochs took 1 seconds
convergence after 14 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 26 epochs took 1 seconds
convergence after 14 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 19 epochs took 0 seconds
convergence after 14 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 15 epochs took 0 seconds
convergence after 14 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [30]:
pa_clf = LogisticRegression(**best_grid).fit(pa_train_x, pa_train_y)

predictions = pa_clf.predict(pa_test_x)
prob_predictions = pa_clf.predict_proba(pa_test_x)

print('Accuracy: {}\nAUC: {}'.format(metrics.accuracy_score(pa_test_y, predictions), metrics.roc_auc_score(pa_test_y, prob_predictions, multi_class='ovr')))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 30 epochs took 1 seconds
Accuracy: 0.6274426508071368
AUC: 0.4864146797563461


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


### AA

In [31]:
#training, validation, and testing data
aa_train_x = np.concatenate((train_cc['aa'].values.reshape([-1, 1]), train_no_cc['aa'].values.reshape([-1, 1])), axis=0)
aa_train_y = np.concatenate((train_cc['future_time'].values, np.zeros(len(train_no_cc))), axis = 0)
aa_val_x = np.concatenate((val_cc['aa'].values.reshape([-1, 1]), val_no_cc['aa'].values.reshape([-1, 1])), axis=0)
aa_val_y = np.concatenate((val_cc['future_time'].values, np.zeros(len(val_no_cc))), axis = 0)
aa_test_x = np.concatenate((test_cc['aa'].values.reshape([-1, 1]), test_no_cc['aa'].values.reshape([-1, 1])), axis=0)
aa_test_y = np.concatenate((test_cc['future_time'].values, np.zeros(len(test_no_cc))), axis = 0)

# scale data
aa_scaler = StandardScaler()
aa_train_x = aa_scaler.fit_transform(aa_train_x)
aa_val_x = aa_scaler.transform(aa_val_x)
aa_test_x = aa_scaler.transform(aa_test_x)

# shuffle train, val, and test arrays in unison
p_train = np.random.permutation(len(aa_train_x))
aa_train_x = aa_train_x[p_train]
aa_train_y = aa_train_y[p_train]
p_val = np.random.permutation(len(aa_val_x))
aa_val_x = aa_val_x[p_val]
aa_val_y = aa_val_y[p_val]
p_test = np.random.permutation(len(aa_test_x))
aa_test_x = aa_test_x[p_test]
aa_test_y = aa_test_y[p_test]

# fit logistic regression and measure accuracy score on validation set, keeping track of the best set of hyperparameters
best_grid = None
best_acc = 0
for g in param_grid:
    aa_clf = LogisticRegression(**g).fit(aa_train_x, aa_train_y)
    
    predictions = aa_clf.predict_proba(aa_val_x)
    acc = metrics.roc_auc_score(aa_val_y, predictions, multi_class='ovr')
    
    if acc > best_acc:
        best_acc = acc
        best_grid = g

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 19 epochs took 0 seconds
convergence after 14 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 19 epochs took 0 seconds
convergence after 14 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 16 epochs took 1 seconds
convergence after 14 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 16 epochs took 0 seconds
convergence after 14 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
convergence after 13 epochs took 0 seconds
convergence after 13 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [32]:
aa_clf = LogisticRegression(**best_grid).fit(aa_train_x, aa_train_y)

predictions = aa_clf.predict(aa_test_x)
prob_predictions = aa_clf.predict_proba(aa_test_x)

print('Accuracy: {}\nAUC: {}'.format(metrics.accuracy_score(aa_test_y, predictions), metrics.roc_auc_score(aa_test_y, prob_predictions, multi_class='ovr')))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.6193712829226848
AUC: 0.5984496989031858


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


### Resource Allocation

In [33]:
#training, validation, and testing data
ra_train_x = np.concatenate((train_cc['ra'].values.reshape([-1, 1]), train_no_cc['ra'].values.reshape([-1, 1])), axis=0)
ra_train_y = np.concatenate((train_cc['future_time'].values, np.zeros(len(train_no_cc))), axis = 0)
ra_val_x = np.concatenate((val_cc['ra'].values.reshape([-1, 1]), val_no_cc['ra'].values.reshape([-1, 1])), axis=0)
ra_val_y = np.concatenate((val_cc['future_time'].values, np.zeros(len(val_no_cc))), axis = 0)
ra_test_x = np.concatenate((test_cc['ra'].values.reshape([-1, 1]), test_no_cc['ra'].values.reshape([-1, 1])), axis=0)
ra_test_y = np.concatenate((test_cc['future_time'].values, np.zeros(len(test_no_cc))), axis = 0)

# scale data
ra_scaler = StandardScaler()
ra_train_x = ra_scaler.fit_transform(ra_train_x)
ra_val_x = ra_scaler.transform(ra_val_x)
ra_test_x = ra_scaler.transform(ra_test_x)

# shuffle train, val, and test arrays in unison
p_train = np.random.permutation(len(ra_train_x))
ra_train_x = ra_train_x[p_train]
ra_train_y = ra_train_y[p_train]
p_val = np.random.permutation(len(ra_val_x))
ra_val_x = ra_val_x[p_val]
ra_val_y = ra_val_y[p_val]
p_test = np.random.permutation(len(ra_test_x))
ra_test_x = ra_test_x[p_test]
ra_test_y = ra_test_y[p_test]

# fit logistic regression and measure accuracy score on validation set, keeping track of the best set of hyperparameters
best_grid = None
best_acc = 0
for g in param_grid:
    ra_clf = LogisticRegression(**g).fit(ra_train_x, ra_train_y)
    
    predictions = ra_clf.predict_proba(ra_val_x)
    acc = metrics.roc_auc_score(ra_val_y, predictions, multi_class='ovr')
    
    if acc > best_acc:
        best_acc = acc
        best_grid = g

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 59 epochs took 0 seconds
convergence after 18 epochs took 1 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 58 epochs took 0 seconds
convergence after 18 epochs took 1 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 33 epochs took 0 seconds
convergence after 18 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 32 epochs took 0 seconds
convergence after 13 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 14 epochs took 0 seconds
convergence after 14 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [34]:
ra_clf = LogisticRegression(**best_grid).fit(ra_train_x, ra_train_y)

predictions = ra_clf.predict(ra_test_x)
prob_predictions = ra_clf.predict_proba(ra_test_x)

print('Accuracy: {}\nAUC: {}'.format(metrics.accuracy_score(ra_test_y, predictions), metrics.roc_auc_score(ra_test_y, prob_predictions, multi_class='ovr')))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.6248937977909941
AUC: 0.5909136332898937


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
