In [99]:
%cd /scratch/bruingjde/SNAM2021-code/

import os
import typing

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import sklearn.linear_model
import sklearn.model_selection
import sklearn.metrics
import sklearn.pipeline
import sklearn.preprocessing
from tqdm.auto import tqdm

import tlp

/scratch/bruingjde/SNAM2021-code


In [5]:
tlp.Experiment._fields

('feature',
 'time_aware',
 'aggregation_strategy',
 'time_strategy',
 'nodepair_strategy')

In [8]:
filename = 'auc_all_features.float'
features = None

In [70]:
entries = sorted(os.scandir('data'), key=lambda x: x.name)

X_all = dict()
pipe_all = dict()
auc_all = dict()
cols_all = dict()

for entry in entries:
  index = int(entry.name)
  X_dict = dict()

  # If features are not calculated yet, quit.
  if not os.path.isdir(os.path.join(entry.path, 'features')): 
    tlp.print_status(f'#{index} No feature construction yet')
    continue

  tlp.print_status(f'#{index}')
  # Get all calculated features.
  for file in os.scandir(os.path.join(entry.path, 'features')):
    if features is None or file.name in features:
      X_dict.update(joblib.load(file.path))
      
  X = pd.DataFrame(X_dict)
  cols = list(X_dict.keys())

  # Get targets
  y = np.load(os.path.join(entry.path, 'targets_sampled.npy'))

  # Fit and predict pipeline
  X_train, X_test, y_train, y_test = (
    sklearn.model_selection.train_test_split(X, y))
  pipe = sklearn.pipeline.make_pipeline(
    sklearn.preprocessing.StandardScaler(),
    sklearn.linear_model.LogisticRegression(max_iter=10000)) # type: ignore
  pipe.fit(X_train, y_train)
  auc = sklearn.metrics.roc_auc_score(
    y_true=y_test, y_score=pipe.predict_proba(X_test)[:,1]) # type: ignore
  
  X_all[index] = pd.DataFrame(X_dict)  
  cols_all[index] = cols
  pipe_all[index] = pipe
  auc_all[index] = auc

2021-02-26 11:34:20.303968 #1
2021-02-26 11:34:21.118247 #2
2021-02-26 11:34:21.934172 #3
2021-02-26 11:34:22.421441 #4
2021-02-26 11:34:23.163326 #5
2021-02-26 11:34:24.018804 #6
2021-02-26 11:34:25.082042 #7
2021-02-26 11:34:25.859987 #8
2021-02-26 11:34:26.660716 #9
2021-02-26 11:34:27.508427 #10 No feature construction yet
2021-02-26 11:34:27.509066 #11 No feature construction yet
2021-02-26 11:34:27.509273 #12
2021-02-26 11:34:28.335411 #13
2021-02-26 11:34:29.491228 #14
2021-02-26 11:34:30.296896 #16
2021-02-26 11:34:30.997888 #18
2021-02-26 11:34:31.735205 #19
2021-02-26 11:34:32.140473 #20
2021-02-26 11:34:32.837558 #21
2021-02-26 11:34:34.055145 #22
2021-02-26 11:34:34.676440 #23
2021-02-26 11:34:35.362508 #24 No feature construction yet
2021-02-26 11:34:35.362808 #25 No feature construction yet
2021-02-26 11:34:35.363127 #28
2021-02-26 11:34:36.084276 #29
2021-02-26 11:34:36.778726 #30
2021-02-26 11:34:37.669132 #31


In [101]:
dfs = []
for idx in tqdm(X_all.keys()):
  df = pd.DataFrame(cols_all[idx])
  df['rank'] = np.mean(scipy.stats.rankdata(pipe_all[idx]['logisticregression'].coef_[0] * X_all[idx].values, axis=1), axis=0)
  df['dataset'] = idx
  dfs.append(df)

100%|██████████| 23/23 [00:18<00:00,  1.23it/s]


In [104]:
df = pd.concat(dfs)

In [113]:
df.groupby([col for col in df.columns if col not in ['rank', 'dataset']])['rank'].mean().sort_values().reset_index()

Unnamed: 0,feature,time_aware,aggregation_strategy,time_strategy,nodepair_strategy,rank
0,,True,sum,lin,sum,16.3155
1,,True,sum,sqrt,sum,16.897454
2,,True,sum,sqrt,max,20.022483
3,,True,sum,lin,max,20.598726
4,,True,max,lin,sum,20.880383
5,,True,sum,lin,min,22.167137
6,,True,sum,sqrt,min,22.231461
7,,True,median,exp,min,23.280017
8,,True,median,exp,sum,23.288852
9,,True,max,lin,max,23.604496


In [115]:
df.groupby(['feature', 'time_aware'])['rank'].mean().sort_values().reset_index()

Unnamed: 0,feature,time_aware,rank
0,sp,False,12.80083
1,,True,31.592052
2,AA,True,31.627441
3,aa,False,42.112274


In [116]:
df.groupby(['time_strategy'])['rank'].mean().sort_values().reset_index()

Unnamed: 0,time_strategy,rank
0,lin,27.264865
1,sqrt,31.498256
2,exp,36.034269
