# Pipeline Graph Embeddings
The purpose of this notebook is to apply graph embeddings in our pipeline

In [1]:
# imports
import os
import pandas as pd
from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load the data
train_df = config.load_traindata_for_regression()
pipelineFactory = PipelineFactory()

# create the baseline pipeline
pipeline = pipelineFactory.create_pipeline(train_df,
                                            ModelType.REGRE_BASELINE,
                                            verbose_level=1,
                                            evaluation=EvaluationType.BASIC)
pipeline.run()

Starting pipeline using method: EvaluationType.BASIC
Finished running the pipeline
Evaluation metrics:
    validation_rmse: 0.2206 [std=0.]
    validation_mae: 0.1783 [std=0.]
    validation_r2: -0.005 [std=0.]
    validation_average_spearman: 0. [std=0.]


In [3]:
from src.features.encoder_utils import load_graph
graph = load_graph(config.ROOT_DIR / "data/external/graphs/encodings_graph.adjlist")

### Try out Node2Vec

In [4]:
pipeline.clear_steps()
from src.pipeline.pipeline_transformers import Node2VecEmbedding, PrintDataframe, ColumnKeeper
from sklearn.linear_model import LinearRegression
n2v_embedddings_transformer = Node2VecEmbedding(graph=graph, walk_length=20, num_walks=1000, workers=1)

# add the column transformer to the pipeline
pipeline.add_new_step(n2v_embedddings_transformer, "embeddings_transformer")

pipeline.add_new_step(PrintDataframe(verbose=pipeline._verbose_level), "print_df_1")

pipeline.add_new_step(ColumnKeeper(columns=["node2vec_embedding_dim1", "node2vec_embedding_dim2"]),
                                  "column_keeper")

pipeline.add_new_step(PrintDataframe(verbose=pipeline._verbose_level), "print_df_2")

print(pipeline.get_pipeline().named_steps)

pipeline.change_estimator(LinearRegression())

pipeline.run()

{'embeddings_transformer': Node2VecEmbedding(graph=<networkx.classes.graph.Graph object at 0x000001C33E628AD0>), 'print_df_1': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['node2vec_embedding_dim1', 'node2vec_embedding_dim2']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': DummyRegressor()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 10192.99it/s]
Generating walks (CPU: 1):   0%|          | 0/1000 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:03<00:00, 329.53it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  node2vec_embedding_dim1  \
0    23381    LR  model      F1                -0.002585   
1    23381    LR  model      F1                -0.157983   
2    23381    LR  model      F1                 0.549880   
3    23381    LR  model      F1                -0.176819   
4    23381    LR  model      F1                 0.531572   

   node2vec_embedding_dim2  
0                 0.030684  
1                -0.480326  
2                -0.306108  
3                -0.484077  
4                -0.271438  
----------------------------------------
----------------------------------------
Printing dataframe:
   node2vec_embedding_dim1  node2vec_embedding_dim2
0                -0.002585                 0.030684
1                -0.157983                -0.480326
2                 0.549880                -0.306108
3                -0.176819                -0.484077
4                 0.531572                

Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 14949.99it/s]
Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:03<00:00, 333.12it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  node2vec_embedding_dim1  \
0    41007   SVC     no     ACC                 0.169258   
1    41007   SVC     no     ACC                 0.317525   
2    41007   SVC     no     ACC                 0.221448   
3    41007   SVC     no     ACC                 0.355023   
4    41007   SVC     no     ACC                 0.168501   

   node2vec_embedding_dim2  
0                -0.130199  
1                -0.065645  
2                -0.067558  
3                -0.088878  
4                -0.132393  
----------------------------------------
----------------------------------------
Printing dataframe:
   node2vec_embedding_dim1  node2vec_embedding_dim2
0                 0.169258                -0.130199
1                 0.317525                -0.065645
2                 0.221448                -0.067558
3                 0.355023                -0.088878
4                 0.168501                

Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 10920.14it/s]
Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:03<00:00, 328.84it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  node2vec_embedding_dim1  \
0    23381    LR  model      F1                 0.037927   
1    23381    LR  model      F1                 0.000874   
2    23381    LR  model      F1                -0.160749   
3    23381    LR  model      F1                -0.013630   
4    23381    LR  model      F1                -0.165405   

   node2vec_embedding_dim2  
0                -0.031644  
1                -0.241206  
2                -0.182149  
3                -0.283871  
4                -0.150638  
----------------------------------------
----------------------------------------
Printing dataframe:
   node2vec_embedding_dim1  node2vec_embedding_dim2
0                 0.037927                -0.031644
1                 0.000874                -0.241206
2                -0.160749                -0.182149
3                -0.013630                -0.283871
4                -0.165405                

Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 12819.65it/s]
Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:02<00:00, 334.87it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  node2vec_embedding_dim1  \
0    41007   SVC     no     ACC                -0.164879   
1    41007   SVC     no     ACC                 0.255366   
2    41007   SVC     no     ACC                 0.373415   
3    41007   SVC     no     ACC                 0.177237   
4    41007   SVC     no     ACC                 0.387642   

   node2vec_embedding_dim2  
0                -0.005793  
1                -0.398586  
2                -0.187469  
3                -0.319283  
4                -0.160144  
----------------------------------------
----------------------------------------
Printing dataframe:
   node2vec_embedding_dim1  node2vec_embedding_dim2
0                -0.164879                -0.005793
1                 0.255366                -0.398586
2                 0.373415                -0.187469
3                 0.177237                -0.319283
4                 0.387642                

### Try out Node2Vec with kmeans

In [5]:
pipeline.clear_steps()
from src.pipeline.pipeline_transformers import Node2VecGraphEmbeddingWithKMeans

n2v_embedddings_transformer = Node2VecGraphEmbeddingWithKMeans(graph=graph)

# add the column transformer to the pipeline
pipeline.add_new_step(n2v_embedddings_transformer, "embeddings_transformer")

print(pipeline.get_pipeline().named_steps)

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x000001C33E628AD0>), 'estimator': LinearRegression()}


In [6]:
# check if the embedding worked
from src.pipeline.pipeline_transformers import PrintDataframe
pipeline.add_new_step(PrintDataframe(verbose=pipeline._verbose_level), "print_df")
print(pipeline.get_pipeline().named_steps)

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x000001C33E628AD0>), 'print_df': PrintDataframe(verbose=1), 'estimator': LinearRegression()}


As we can see the application of the kmeans encoder worked

In [7]:
# lets try to get a prediction with a regression model
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from src.pipeline.pipeline_transformers import ColumnKeeper

# only keep selected columns
column_keeper = ColumnKeeper(columns=["encoder_cluster"])

pipeline.add_new_step(column_keeper, "column_keeper")
pipeline.add_new_step(PrintDataframe(verbose=pipeline._verbose_level), "print_df_2")

pipeline.change_estimator(LinearRegression())

print(pipeline.get_pipeline().named_steps)
pipeline.run()

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x000001C33E628AD0>), 'print_df': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['encoder_cluster']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': LinearRegression()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 45121.61it/s]
Generating walks (CPU: 1):   0%|          | 0/10 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 79.52it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  encoder_cluster
0    23381    LR  model      F1                3
1    23381    LR  model      F1                2
2    23381    LR  model      F1                0
3    23381    LR  model      F1                2
4    23381    LR  model      F1                0
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                3
1                2
2                0
3                2
4                0
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 9934.92it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 85.57it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  encoder_cluster
0    41007   SVC     no     ACC                4
1    41007   SVC     no     ACC                2
2    41007   SVC     no     ACC                1
3    41007   SVC     no     ACC                2
4    41007   SVC     no     ACC                1
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                4
1                2
2                1
3                2
4                1
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 17929.48it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 84.12it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  encoder_cluster
0    23381    LR  model      F1                0
1    23381    LR  model      F1                3
2    23381    LR  model      F1                2
3    23381    LR  model      F1                3
4    23381    LR  model      F1                2
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                0
1                3
2                2
3                3
4                2
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 15008.24it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 76.49it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  encoder_cluster
0    41007   SVC     no     ACC                1
1    41007   SVC     no     ACC                0
2    41007   SVC     no     ACC                2
3    41007   SVC     no     ACC                0
4    41007   SVC     no     ACC                2
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                1
1                0
2                2
3                0
4                2
----------------------------------------
Finished running the pipeline
Evaluation metrics:
    validation_rmse: 0.2206 [std=0.]
    validation_mae: 0.1784 [std=0.]
    validation_r2: -0.005 [std=0.]
    validation_average_spearman: 0.0241 [std=0.]


In this test run the pipeline score is a 0.015. As we are using spearmans R as our metric where the best score is 1.0 or -1.0 we can see that we achieved a very poor score.

### Try out Poincare

In [11]:
pipeline.clear_steps()

pd.set_option('display.expand_frame_repr', False)
# create poincare transformer
from src.pipeline.pipeline_transformers import PoincareEmbedding
poincare_embedddings_transformer = PoincareEmbedding(graph=graph, epochs=100)

# add the column transformer to the pipeline
pipeline.add_new_step(poincare_embedddings_transformer, "embeddings_transformer")

pipeline.add_new_step(PrintDataframe(verbose=pipeline._verbose_level), "print_df_1")

pipeline.add_new_step(ColumnKeeper(columns=["poincare_embedding_dim1", "poincare_embedding_dim2"]),
                                  "column_keeper")

pipeline.add_new_step(PrintDataframe(verbose=pipeline._verbose_level), "print_df_2")

print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'embeddings_transformer': PoincareEmbedding(graph=<networkx.classes.graph.Graph object at 0x000001C33E628AD0>), 'print_df_1': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['poincare_embedding_dim1', 'poincare_embedding_dim2']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': LinearRegression()}
Starting pipeline using method: EvaluationType.BASIC
----------------------------------------
Printing dataframe:
   dataset model tuning scoring  poincare_embedding_dim1  poincare_embedding_dim2
0    23381    LR  model      F1                 0.007151                 0.085302
1    23381    LR  model      F1                 0.095912                 0.152033
2    23381    LR  model      F1                 0.048728                -0.160407
3    23381    LR  model      F1                 0.082937                 0.130173
4    23381    LR  model      F1                 0.042925                -0.138942
----------------------------------------
------------------------------

In [9]:
# using grid search to find the best parameters
param_grid = {
    "embeddings_transformer__epochs": [10, 20]
}

grid_pipeline = pipelineFactory.create_pipeline(train_df,
                                                ModelType.REGRE_BASELINE,
                                                verbose_level=1,
                                                evaluation=EvaluationType.GRID_SEARCH,
                                                param_grid=param_grid,
                                                split_factors=[])
grid_pipeline.add_new_step(poincare_embedddings_transformer, "embeddings_transformer")
grid_pipeline.add_new_step(ColumnKeeper(columns=["poincare_embedding_dim1", "poincare_embedding_dim2"]), "column_keeper")
grid_pipeline.add_new_step(PrintDataframe(verbose=grid_pipeline._verbose_level), "print_df_1")
grid_pipeline.change_estimator(LinearRegression())

grid_pipeline.run()


Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search
Fitting 5 folds for each of 2 candidates, totalling 10 fits


ValueError: Invalid parameter 'epochs' for estimator PoincareEmbedding(graph=<networkx.classes.graph.Graph object at 0x000002465468DDD0>). Valid parameters are: ['graph'].