# Pipeline Graph Embeddings
The purpose of this notebook is to apply graph embeddings (Node2Vec, Node2Vec with k-means clustering, and Poincaré) to the encoder column in our pipeline.

In [1]:
# imports
import os
import pandas as pd
from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType

In [2]:
# Load the regression training data; `train_df` and `pipelineFactory`
# are reused by later cells, so their names must stay stable.
train_df = config.load_traindata_for_regression()
pipelineFactory = PipelineFactory()

# Keyword settings of the baseline pipeline, grouped in one place.
baseline_settings = {
    "verbose_level": 1,
    "evaluation": EvaluationType.BASIC,
}

# Build the baseline pipeline and run it once without any extra steps,
# which gives a reference score for the embedding experiments below.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.REGRE_BASELINE,
    **baseline_settings,
)
pipeline.run()

Starting pipeline using method: EvaluationType.BASIC
0.0
Finished running the pipeline


In [3]:
from src.features.encoder_utils import load_graph

# Adjacency-list file of the encodings graph, resolved against config.ROOT_DIR.
graph_path = config.ROOT_DIR / "data/external/graphs/encodings_graph.adjlist"

# `graph` is shared by every embedding transformer created further down.
graph = load_graph(graph_path)

### Try out Node2Vec

In [4]:
from src.pipeline.pipeline_transformers import Node2VecEmbedding, PrintDataframe, ColumnKeeper

# Drop the steps registered so far before wiring up the Node2Vec experiment.
pipeline.clear_steps()

n2v_embedddings_transformer = Node2VecEmbedding(
    graph=graph,
    walk_length=20,
    num_walks=1000,
    workers=1,
)

# Columns produced by the embedding step that are kept as model input.
node2vec_columns = ["node2vec_embedding_dim1", "node2vec_embedding_dim2"]

# (transformer, step name) pairs in execution order; the two PrintDataframe
# steps show the frame before and after the column selection.
node2vec_steps = [
    (n2v_embedddings_transformer, "embeddings_transformer"),
    (PrintDataframe(verbose=pipeline._verbose_level), "print_df_1"),
    (ColumnKeeper(columns=node2vec_columns), "column_keeper"),
    (PrintDataframe(verbose=pipeline._verbose_level), "print_df_2"),
]
for transformer, step_name in node2vec_steps:
    pipeline.add_new_step(transformer, step_name)

print(pipeline.get_pipeline().named_steps)

pipeline.run()

  from .autonotebook import tqdm as notebook_tqdm


{'embeddings_transformer': Node2VecEmbedding(graph=<networkx.classes.graph.Graph object at 0x0000018D1BC4D7D0>), 'print_df_1': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['node2vec_embedding_dim1', 'node2vec_embedding_dim2']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': DummyRegressor()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 2180.57it/s]
Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:10<00:00, 96.82it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  node2vec_embedding_dim1  \
0    23381    LR  model      F1            BE                 0.085825   
1    23381    LR  model      F1  BUCV10RGLMME                -0.073088   
2    23381    LR  model      F1      BUCV10TE                 0.195790   
3    23381    LR  model      F1   BUCV2RGLMME                -0.063628   
4    23381    LR  model      F1       BUCV2TE                 0.224964   

   node2vec_embedding_dim2  
0                -0.219572  
1                -0.119526  
2                -0.281009  
3                -0.220700  
4                -0.329527  
----------------------------------------
----------------------------------------
Printing dataframe:
   node2vec_embedding_dim1  node2vec_embedding_dim2
0                 0.085825                -0.219572
1                -0.073088                -0.119526
2                 0.195790                -0.281009
3         

Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 2874.69it/s]
Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:09<00:00, 107.24it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  node2vec_embedding_dim1  \
0    41007   SVC     no     ACC            BE                -0.072245   
1    41007   SVC     no     ACC  BUCV10RGLMME                 0.198608   
2    41007   SVC     no     ACC      BUCV10TE                 0.240993   
3    41007   SVC     no     ACC   BUCV2RGLMME                 0.170766   
4    41007   SVC     no     ACC       BUCV2TE                 0.282515   

   node2vec_embedding_dim2  
0                -0.106322  
1                 0.015082  
2                 0.032120  
3                 0.050684  
4                 0.021936  
----------------------------------------
----------------------------------------
Printing dataframe:
   node2vec_embedding_dim1  node2vec_embedding_dim2
0                -0.072245                -0.106322
1                 0.198608                 0.015082
2                 0.240993                 0.032120
3         

### Try out Node2Vec with kmeans

In [5]:
from src.pipeline.pipeline_transformers import Node2VecGraphEmbeddingWithKMeans

# Start again from a pipeline without the previous experiment's steps.
pipeline.clear_steps()

# Register the Node2Vec + k-means transformer as the only preprocessing step.
n2v_embedddings_transformer = Node2VecGraphEmbeddingWithKMeans(graph=graph)
pipeline.add_new_step(n2v_embedddings_transformer, "embeddings_transformer")

# Show the resulting step layout.
registered_steps = pipeline.get_pipeline().named_steps
print(registered_steps)

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x0000018D1BC4D7D0>), 'estimator': DummyRegressor()}


In [6]:
# Sanity check: append a step that prints the transformed dataframe so the
# new cluster column can be inspected, then run the pipeline.
from src.pipeline.pipeline_transformers import PrintDataframe

dataframe_printer = PrintDataframe(verbose=pipeline._verbose_level)
pipeline.add_new_step(dataframe_printer, "print_df")

print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x0000018D1BC4D7D0>), 'print_df': PrintDataframe(verbose=1), 'estimator': DummyRegressor()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 2877.72it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 19.94it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    23381    LR  model      F1            BE                2
1    23381    LR  model      F1  BUCV10RGLMME                3
2    23381    LR  model      F1      BUCV10TE                0
3    23381    LR  model      F1   BUCV2RGLMME                3
4    23381    LR  model      F1       BUCV2TE                0
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 2004.48it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 24.74it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    41007   SVC     no     ACC            BE                1
1    41007   SVC     no     ACC  BUCV10RGLMME                2
2    41007   SVC     no     ACC      BUCV10TE                4
3    41007   SVC     no     ACC   BUCV2RGLMME                2
4    41007   SVC     no     ACC       BUCV2TE                4
----------------------------------------
0.0
Finished running the pipeline


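As we can see, the application of the k-means encoder worked: the printed dataframe now contains an `encoder_cluster` column with a cluster id for each encoder.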

In [7]:
# Fit an actual regression model on the cluster feature.
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from src.pipeline.pipeline_transformers import ColumnKeeper

# Restrict the model input to the cluster id produced by the embedding step.
column_keeper = ColumnKeeper(columns=["encoder_cluster"])

# Append the column selection followed by a second debug print, in that order.
for transformer, step_name in (
    (column_keeper, "column_keeper"),
    (PrintDataframe(verbose=pipeline._verbose_level), "print_df_2"),
):
    pipeline.add_new_step(transformer, step_name)

# Swap the baseline estimator for a linear regression.
pipeline.change_estimator(LinearRegression())

print(pipeline.get_pipeline().named_steps)
pipeline.run()

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x0000018D1BC4D7D0>), 'print_df': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['encoder_cluster']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': LinearRegression()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 2882.73it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 23.42it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    23381    LR  model      F1            BE                4
1    23381    LR  model      F1  BUCV10RGLMME                2
2    23381    LR  model      F1      BUCV10TE                1
3    23381    LR  model      F1   BUCV2RGLMME                2
4    23381    LR  model      F1       BUCV2TE                1
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                4
1                2
2                1
3                2
4                1
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 5021.51it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 24.81it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    41007   SVC     no     ACC            BE                1
1    41007   SVC     no     ACC  BUCV10RGLMME                2
2    41007   SVC     no     ACC      BUCV10TE                5
3    41007   SVC     no     ACC   BUCV2RGLMME                2
4    41007   SVC     no     ACC       BUCV2TE                5
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                1
1                2
2                5
3                2
4                5
----------------------------------------
-0.04293566411797524
Finished running the pipeline


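In this test run the pipeline score is approximately -0.043 (the value printed at the end of the output above; it changes from run to run, as the differing cluster assignments between runs show). As we are using Spearman's rank correlation as our metric, where the best score is 1.0 or -1.0 and a value near 0 indicates no monotonic relationship, we can see that we achieved a very poor score.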

### Try out Poincare

In [8]:
# create poincare transformer
from src.pipeline.pipeline_transformers import PoincareEmbedding

# Remove the steps of the previous experiment before adding the Poincare ones.
pipeline.clear_steps()

# This instance is reused by the grid-search cell, so the name must stay as is.
poincare_embedddings_transformer = PoincareEmbedding(graph=graph, epochs=100)

# Columns produced by the embedding step that are kept as model input.
poincare_columns = ["poincare_embedding_dim1", "poincare_embedding_dim2"]

# (transformer, step name) pairs in execution order; the two PrintDataframe
# steps show the frame before and after the column selection.
poincare_steps = [
    (poincare_embedddings_transformer, "embeddings_transformer"),
    (PrintDataframe(verbose=pipeline._verbose_level), "print_df_1"),
    (ColumnKeeper(columns=poincare_columns), "column_keeper"),
    (PrintDataframe(verbose=pipeline._verbose_level), "print_df_2"),
]
for transformer, step_name in poincare_steps:
    pipeline.add_new_step(transformer, step_name)

print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'embeddings_transformer': PoincareEmbedding(graph=<networkx.classes.graph.Graph object at 0x0000018D1BC4D7D0>), 'print_df_1': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['poincare_embedding_dim1', 'poincare_embedding_dim2']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': LinearRegression()}
Starting pipeline using method: EvaluationType.BASIC


<class 'pandas.core.frame.DataFrame'>
----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  poincare_embedding_dim1  \
0    23381    LR  model      F1            BE                 0.007151   
1    23381    LR  model      F1  BUCV10RGLMME                 0.095912   
2    23381    LR  model      F1      BUCV10TE                 0.048728   
3    23381    LR  model      F1   BUCV2RGLMME                 0.082937   
4    23381    LR  model      F1       BUCV2TE                 0.042925   

   poincare_embedding_dim2  
0                 0.085302  
1                 0.152033  
2                -0.160407  
3                 0.130173  
4                -0.138942  
----------------------------------------
----------------------------------------
Printing dataframe:
   poincare_embedding_dim1  poincare_embedding_dim2
0                 0.007151                 0.085302
1                 0.095912                 0.152033
2                 0.0487

In [9]:
# using grid search to find the best parameters
#
# FIXME(review): in the recorded run this cell fails with
#   ValueError: Invalid parameter 'epochs' for estimator PoincareEmbedding(...).
#   Valid parameters are: ['graph'].
# The grid key "embeddings_transformer__epochs" addresses the `epochs`
# parameter of the step named "embeddings_transformer", but the transformer
# only reports `graph` as a settable parameter. Presumably `epochs` has to be
# an explicit keyword argument of PoincareEmbedding.__init__ that is stored
# under the same attribute name -- confirm in src/pipeline/pipeline_transformers.
param_grid = {
    "embeddings_transformer__epochs": [10, 20]
}

# Separate pipeline configured for grid search over `param_grid`.
# NOTE(review): the meaning of split_factors=[] is not visible here -- confirm
# against PipelineFactory.create_pipeline.
grid_pipeline = pipelineFactory.create_pipeline(train_df,
                                                ModelType.REGRE_BASELINE,
                                                verbose_level=1,
                                                evaluation=EvaluationType.GRID_SEARCH,
                                                param_grid=param_grid,
                                                split_factors=[])
# NOTE(review): this reuses the transformer instance created in the Poincare
# cell above (epochs=100), so both pipelines share the same object.
grid_pipeline.add_new_step(poincare_embedddings_transformer, "embeddings_transformer")
grid_pipeline.add_new_step(ColumnKeeper(columns=["poincare_embedding_dim1", "poincare_embedding_dim2"]), "column_keeper")
grid_pipeline.add_new_step(PrintDataframe(verbose=grid_pipeline._verbose_level), "print_df_1")
# Replace the baseline estimator with a linear regression, as in the k-means experiment.
grid_pipeline.change_estimator(LinearRegression())

grid_pipeline.run()


Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search
Fitting 5 folds for each of 2 candidates, totalling 10 fits


ValueError: Invalid parameter 'epochs' for estimator PoincareEmbedding(graph=<networkx.classes.graph.Graph object at 0x000001F81CE72A90>). Valid parameters are: ['graph'].