# Feature generation

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import networkx as nx
# Run networkx's bundled test suite as a sanity check of the installed package.
# NOTE(review): this executes on every Run All and prints a long test log;
# consider removing it once the environment has been verified.
nx.test()

                              [ 24%]
algorithms/shortest_paths/tests/test_dense_numpy.py ......                                                           [ 24%]
algorithms/shortest_paths/tests/test_generic.py ......................                                               [ 25%]
algorithms/shortest_paths/tests/test_unweighted.py ............                                                      [ 25%]
algorithms/shortest_paths/tests/test_weighted.py .........................................                           [ 26%]
algorithms/tests/test_asteroidal.py .                                                                                [ 26%]
algorithms/tests/test_boundary.py .............                                                                      [ 26%]
algorithms/tests/test_bridges.py .....                                                                               [ 26%]
algorithms/tests/test_chains.py ....                                                           

True

In [3]:
from pathlib import Path
import numpy as np
import pandas as pd

Load parameter settings

In [4]:
from lynks.helpers import load_config

In [9]:
# Directory holding the experiment configuration files, relative to this notebook.
experiment_config_dir = Path("../configs")
# TOML file with the settings read by the cells below.
feat_gen_config_path = experiment_config_dir / "experiments.toml"

In [10]:
configuration = load_config(feat_gen_config_path)

In [11]:
configuration

{'dataset_name': 'ogbl-collab',
 'preprocessing': {'n_samples': 100, 'sampling_fn': 'random'},
 'features': {'common_neighbors_count': False,
  'common_neighbor_centrality': False,
  'jaccard_coefficient': True,
  'adamic_adar_index': True,
  'preferential_attachment': True,
  'resource_allocation_index': True,
  'scaling': 'normalise'}}

In [12]:
from ogb.linkproppred import LinkPropPredDataset

In [13]:
# Instantiate the link-prediction dataset named in the configuration
# and fetch its edge split.
lp_dataset = LinkPropPredDataset(configuration["dataset_name"])
split_edge = lp_dataset.get_edge_split()

# Pull the three partitions out of the split mapping in a fixed order.
train_edge, valid_edge, test_edge = (
    split_edge[part] for part in ("train", "valid", "test")
)

Downloading https://snap.stanford.edu/ogb/data/linkproppred/collab.zip
Downloaded 0.11 GB: 100%|██████████| 117/117 [00:58<00:00,  1.99it/s]
Extracting dataset/collab.zip
Loading necessary files...
This might take a while.
100%|██████████| 1/1 [00:00<00:00, 11.80it/s]Processing graphs...
Saving...



In [14]:
from lynks.helpers import create_pipeline

from lynks.data import build_graph
from lynks.sampling import create_sampler

In [16]:
# Turn every entry of the training edge array into a tuple, then keep the
# raw split and the tuple list together as a pair.
train_samples = [tuple(edge) for edge in train_edge["edge"]]
train_data = (train_edge, train_samples)

In [18]:
# Split the training partition into the three inputs used to build the graph:
# edge tuples, edge weights and edge years.
train_samples = [tuple(pair) for pair in train_edge["edge"]]
train_weights, train_years = train_edge["weight"], train_edge["year"]


In [29]:
from lynks.features import create_graph_topology_featuriser
from lynks.features import create_feature_transform
from lynks.features import create_feature_formatter

In [23]:
feature_config = configuration["features"]

# Build the topology featuriser on a graph made from the training edges.
# Each feature switch is read from the configuration under the same name
# as the keyword argument it is forwarded to.
topo_feature_pipeline = create_graph_topology_featuriser(
    graph_backbone=build_graph(train_samples, train_weights, train_years),
    **{
        flag: feature_config[flag]
        for flag in (
            "common_neighbors_count",
            "common_neighbor_centrality",
            "jaccard_coefficient",
            "adamic_adar_index",
            "preferential_attachment",
            "resource_allocation_index",
        )
    },
    verbose=1,
)

In [24]:
# Featurise the training edges. The pipeline returns a triple; the first
# element is discarded and the other two are kept as features and their labels.
# NOTE(review): the log recorded below spans several minutes for this call —
# consider caching the result so a full re-run stays feasible.
_, features, feat_labels = topo_feature_pipeline(train_samples)

2021-05-02 17:20:34,462 lynks.features INFO     Computing jaccard_coefficient...
2021-05-02 17:22:30,193 lynks.features INFO     Computing adamic_adar_index...
2021-05-02 17:24:57,722 lynks.features INFO     Computing preferential_attachment...
2021-05-02 17:25:04,110 lynks.features INFO     Computing resource_allocation_index...


In [26]:
features

[array([0.07042254, 0.07042254, 0.07272727, ..., 0.71428571, 0.71428571,
        0.        ]),
 array([2.51592912, 2.51592912, 1.18425534, ..., 2.40538366, 2.40538366,
        0.        ]),
 array([1428, 1428,  858, ...,   36,   36,   65]),
 array([0.68373016, 0.68373016, 0.15198788, ..., 0.62698413, 0.62698413,
        0.        ])]

In [30]:
feat_formatter = create_feature_formatter()

In [32]:
feats_np, feat_labels_np = feat_formatter(features, feat_labels)


In [34]:
feats_np.shape

(1179052, 4)

In [35]:
from lynks.model import create_train_RandomForestClassifier

In [None]:

# Wrap the random-forest training step in a single-stage pipeline.
# NOTE(review): the meaning of the literal 101 (seed? number of trees?) is not
# visible from this notebook — confirm and give it a named constant.
train_model_pipeline = create_pipeline([
    create_train_RandomForestClassifier(101)
])

In [None]:
rf = train_model_pipeline()

In [None]:
rf

Validation data

In [36]:
# Use only the first 200 rows of the validation edges, each converted to a tuple.
validation_samples = [tuple(edge) for edge in valid_edge["edge"][:200, :]]

In [37]:
_, valid_feat, valid_feat_labels = topo_feature_pipeline(validation_samples)

Featurise

In [41]:
# valid_feat_labels

In [None]:
# NOTE(review): neither `feature_pipeline` nor `validation_ds` is defined in any
# visible cell of this notebook (the cells above create `topo_feature_pipeline`
# and `validation_samples`), so this will raise NameError on a fresh kernel.
# TODO: confirm the intended inputs before running.
validation_ds_feat = feature_pipeline(validation_ds)

In [None]:
# infer_pipeline = create_pipeline([
#     rf
# ])

In [None]:
validation_ds.feature_names

In [None]:
# Call the trained model on columns 0, 2 and 3 of the validation features.
# NOTE(review): `validation_ds` is not defined in any visible cell, and the
# reason for dropping column 1 is not stated — confirm both before relying on
# these predictions.
predictions = rf(validation_ds.features[:, [0,2,3]])

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.metrics import roc_auc_score

In [None]:
predictions

In [None]:
confusion_matrix(validation_ds.labels, predictions)

In [None]:
print(classification_report(validation_ds.labels, predictions))

## Multiprocessing

In [None]:
# %%time
# data_processed = data_preprocessing_pipeline(train_edge)

In [None]:
# from lynks.data import split_dataset
# from lynks.data import merge_datasets

In [None]:
# parallel_datasets = split_dataset(data_processed, n_slices=8)

In [None]:
# from multiprocessing.pool import ThreadPool as Pool

In [None]:
# %%time
# with Pool(8) as p:
#     featurized_datasets = p.map(feature_pipeline, parallel_datasets)
    
# dataset_featurised = merge_datasets(featurized_datasets)

In [2]:
import numpy as np

In [3]:
# Scratch example: a single column holding the values 0, 1 and 3.
a = np.array([0, 1, 3]).reshape(-1, 1)

In [7]:
a.shape

(3, 1)

In [5]:
b = 2*a

In [6]:
# Scratch check: joining the two (3, 1) columns along axis 1 gives a (3, 2) array.
np.concatenate([a, b], axis=1).shape

(3, 2)