# Feature generation

In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. the local lynks package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd

Load parameter settings

In [3]:
from lynks.helpers import load_config

In [4]:
# Directory holding the experiment configuration files, relative to this notebook.
experiment_config_dir = Path("..") / "configs"

# Configuration file that is handed to load_config.
feat_gen_config_path = experiment_config_dir / "experiments.toml"

In [5]:
# Load the experiment settings from the config file; the result is indexed by
# key further down (e.g. configuration["dataset_name"], configuration["features"]).
configuration = load_config(feat_gen_config_path)

In [6]:
# Display the loaded configuration to confirm which settings are in effect.
configuration

{'dataset_name': 'ogbl-collab',
 'preprocessing': {'n_samples': 2020, 'balancing': True},
 'features': {'common_neighbors_count': False,
  'common_neighbor_centrality': False,
  'jaccard_coefficient': True,
  'adamic_adar_index': True,
  'preferential_attachment': True,
  'resource_allocation_index': True,
  'scaling': 'normalise'}}

In [7]:
from lynks.pipeline import create_pipeline

from lynks.data import Dataset
from lynks.data import create_formatter
from lynks.data import create_graph_builder

In [8]:
from ogb.linkproppred import LinkPropPredDataset

In [9]:
# Load the dataset named in the configuration through OGB's link-prediction loader.
lp_dataset = LinkPropPredDataset(configuration["dataset_name"])

# The edge split is indexed by split name: "train", "valid" and "test".
split_edge = lp_dataset.get_edge_split()

# Bind each split to its own name, in train / valid / test order.
train_edge, valid_edge, test_edge = (
    split_edge[split_name] for split_name in ("train", "valid", "test")
)

In [10]:
# Wrap the raw training split in the project's Dataset container; its .edges
# attribute is inspected in the following cells.
train_ds = Dataset(lpdata=train_edge)

I am custom you know


In [11]:
# Inspect the dimensions of the training edge array.
train_ds.edges.shape

(1179052, 2)

In [12]:
# Peek at the first ten training edges (repeated pairs are visible in the saved output).
train_ds.edges[:10]

array([[150989, 224881],
       [150989, 224881],
       [180078, 199043],
       [ 49077, 199043],
       [ 49077, 199043],
       [ 49077, 199043],
       [ 72822, 199043],
       [ 22617, 162495],
       [ 34959, 115736],
       [168022, 128982]])

In [13]:
# Render the training edges as a DataFrame for a tabular view.
# NOTE(review): consider .head() here so the cell shows an explicit, small sample.
pd.DataFrame(train_ds.edges)

Unnamed: 0,0,1
0,150989,224881
1,150989,224881
2,180078,199043
3,49077,199043
4,49077,199043
...,...,...
1179047,32447,221741
1179048,103044,221741
1179049,60425,221741
1179050,135758,221741


In [14]:
# One float 1.0 per training edge.
# NOTE(review): presumably a prototype for positive-class labels — TODO confirm.
np.ones(len(train_ds.edges))

array([1., 1., 1., ..., 1., 1., 1.])

In [15]:
# Preprocessing settings; read later when the balancer is configured.
preprocessing_config = configuration["preprocessing"]

# Preprocessing runs the formatter first and the graph builder second.
preprocessing_steps = [create_formatter(), create_graph_builder()]
data_preprocessing_pipeline = create_pipeline(preprocessing_steps)


In [16]:
# Run the formatter and graph-builder steps on the raw training split.
data_processed = data_preprocessing_pipeline(train_edge)



I am custom you know


In [17]:
# data_processed.graph.edges

In [18]:
# Show the object returned by the preprocessing pipeline.
data_processed

<lynks.data.Dataset at 0x7fa5711ca3a0>

In [26]:
# Sort the first 100 processed edges so they display in order.
sorted(data_processed.edges[:100])

[(29838, 259),
 (29838, 46339),
 (29838, 93626),
 (29838, 115725),
 (29838, 141569),
 (29838, 165828),
 (29838, 177433),
 (150989, 13788),
 (150989, 29838),
 (150989, 35939),
 (150989, 39863),
 (150989, 46339),
 (150989, 52716),
 (150989, 60916),
 (150989, 68107),
 (150989, 69980),
 (150989, 73691),
 (150989, 74460),
 (150989, 87235),
 (150989, 87299),
 (150989, 98399),
 (150989, 101448),
 (150989, 133659),
 (150989, 158417),
 (150989, 161820),
 (150989, 163097),
 (150989, 165828),
 (150989, 168681),
 (150989, 170670),
 (150989, 191805),
 (150989, 193953),
 (150989, 194742),
 (150989, 202539),
 (150989, 206640),
 (150989, 209832),
 (150989, 211184),
 (150989, 213318),
 (150989, 218928),
 (150989, 222757),
 (150989, 224881),
 (150989, 230381),
 (191805, 13788),
 (191805, 52716),
 (191805, 66449),
 (191805, 73691),
 (191805, 75337),
 (191805, 77018),
 (191805, 98399),
 (191805, 101448),
 (191805, 119799),
 (191805, 130827),
 (191805, 157836),
 (191805, 168681),
 (191805, 194742),
 (19180

In [20]:
# data_processed

In [21]:
from lynks.features import create_balancer
from lynks.features import create_featuriser
from lynks.features import create_feature_transform

In [22]:
feature_config = configuration["features"]

# Feature switches read from the config, listed in the order they are passed on.
feature_flag_names = (
    "common_neighbors_count",
    "common_neighbor_centrality",
    "jaccard_coefficient",
    "adamic_adar_index",
    "preferential_attachment",
    "resource_allocation_index",
)
feature_flags = {flag: feature_config[flag] for flag in feature_flag_names}

# Build the featuriser with the configured switches; verbose=1 is passed through as before.
raw_feature_pipeline = create_featuriser(**feature_flags, verbose=1)

In [23]:
# Steps run in order: balancer, featuriser, feature transform.
# NOTE(review): preprocessing_config["balancing"] is never consulted, so the
# balancer is always included — confirm whether it should be conditional on that flag.
feature_steps = [
    create_balancer(n_samples=preprocessing_config["n_samples"]),
    raw_feature_pipeline,
    create_feature_transform(scaling=feature_config["scaling"]),
]
feature_pipeline = create_pipeline(feature_steps)

In [24]:
# %%time
# Apply the balancer, featuriser and feature-transform steps to the
# preprocessed training data in a single (serial) call.
data_feats = feature_pipeline(data_processed)

2020
Computing jaccard_coefficient...
Computing adamic_adar_index...
Computing preferential_attachment...
Computing resource_allocation_index...
Length feat list = 4 by 2020
Shape feat array (2020, 4)


In [25]:
# Inspect the feature array produced by the serial pipeline run.
data_feats.features

array([[-0.34312602, -0.06556203, -0.18228228,  0.55476807],
       [-0.31967476, -0.67751473, -0.37607986, -0.72819797],
       [ 1.2792588 ,  1.58573807, -0.12355575,  0.55683449],
       ...,
       [-0.62966536, -0.97034154, -0.37279808, -1.08446877],
       [-0.47789117, -0.583342  , -0.36157095, -0.47579235],
       [-0.43777445, -0.53343418, -0.24929963, -0.57775323]])

## Multiprocessing (thread-pool feature generation)

In [25]:
%%time
# Re-run preprocessing under %%time to measure its cost; this rebinds data_processed.
data_processed = data_preprocessing_pipeline(train_edge)

I am custom you know
CPU times: user 16 s, sys: 264 ms, total: 16.2 s
Wall time: 16.2 s


In [26]:
from lynks.data import split_dataset
from lynks.data import merge_datasets

In [27]:
# Split the preprocessed dataset into 8 slices; the pool used to featurise
# them is also created with 8 workers.
parallel_datasets = split_dataset(data_processed, n_slices=8)

In [28]:
from multiprocessing.pool import ThreadPool as Pool

In [29]:
%%time
# NOTE(review): Pool is multiprocessing.pool.ThreadPool (see the import above),
# i.e. threads rather than processes; CPU-bound pure-Python featurisation may
# not speed up under the GIL — confirm with timings.
# NOTE(review): feature_pipeline contains the balancer and the scaling step, so
# each slice is sampled and scaled independently — confirm this is intended.
with Pool(processes=8) as workers:
    featurized_datasets = workers.map(feature_pipeline, parallel_datasets)

# Recombine the per-slice results into a single dataset.
dataset_featurised = merge_datasets(featurized_datasets)

Computing jaccard_coefficient...
Computing jaccard_coefficient...Computing jaccard_coefficient...
Computing jaccard_coefficient...
Computing jaccard_coefficient...Computing jaccard_coefficient...

Computing jaccard_coefficient...

Computing jaccard_coefficient...
Computing adamic_adar_index...
Computing adamic_adar_index...
Computing adamic_adar_index...Computing adamic_adar_index...

Computing adamic_adar_index...
Computing adamic_adar_index...
Computing adamic_adar_index...
Computing adamic_adar_index...
Computing preferential_attachment...
Computing preferential_attachment...
Computing preferential_attachment...
Computing preferential_attachment...
Computing preferential_attachment...
Computing preferential_attachment...
Computing resource_allocation_index...
Computing resource_allocation_index...
Computing resource_allocation_index...
Computing resource_allocation_index...Computing preferential_attachment...

Computing resource_allocation_index...Computing resource_allocation_index

In [30]:
# Inspect the features of the merged, thread-pool-featurised dataset.
dataset_featurised.features

array([[-0.80866112, -0.23605544, -0.04802622,  0.19968591],
       [-0.80866112, -0.23605544, -0.04802622,  0.19968591],
       [-0.80007935, -0.53540083, -0.15406694, -0.81960741],
       ...,
       [ 1.59355561, -0.25307453, -0.31955875,  0.0995563 ],
       [ 1.59355561, -0.25307453, -0.31955875,  0.0995563 ],
       [-1.07825398, -0.8134851 , -0.31328275, -1.01527351]])

In [31]:
# Inspect the features of one individual slice result (index 1).
featurized_datasets[1].features

array([[ 0.45537745,  0.42042216, -0.15577179,  0.22701484],
       [ 0.45537745,  0.42042216, -0.15577179,  0.22701484],
       [-0.88559978, -0.3508786 , -0.25022176, -0.06474345],
       ...,
       [ 2.22584581,  0.36904341, -0.26569356,  0.53544834],
       [ 2.22584581,  0.36904341, -0.26569356,  0.53544834],
       [ 2.22584581,  0.36904341, -0.26569356,  0.53544834]])

In [32]:
# Display the serial-run features again for comparison with the merged result.
# NOTE(review): the saved output here differs from the earlier display of
# data_feats.features although no visible cell rebinds data_feats — likely
# out-of-order execution or a deleted cell; re-run from a fresh kernel to confirm.
data_feats.features

array([[-0.81693895, -0.2333251 , -0.03434316,  0.21727714],
       [-0.81693895, -0.2333251 , -0.03434316,  0.21727714],
       [-0.80835928, -0.53795413, -0.14875444, -0.81060635],
       ...,
       [ 1.57992053, -0.2586131 , -0.31374755,  0.10758432],
       [ 1.57992053, -0.2586131 , -0.31374755,  0.10758432],
       [-1.07909545, -0.80886023, -0.30792663, -1.10440626]])

In [34]:
# len(merged_dataset.edges)