# Pipeline Graph Embeddings
The purpose of this notebook is to apply graph embeddings (Node2Vec, Node2Vec with k-means clustering, and Poincaré) as transformer steps in our pipeline.

In [6]:
# imports
import os
import pandas as pd
from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType

In [7]:
# Load the regression training data that the pipeline is created from.
train_df = config.load_traindata_for_regression()

# Build the baseline regression pipeline; the cells below reuse `pipeline`
# and only exchange its steps.
pipelineFactory = PipelineFactory()
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.REGRE_BASELINE,
    verbose_level=1,
    evaluation=EvaluationType.BASIC,
)

# Run the untouched baseline once as a reference.
pipeline.run()

Starting pipeline using method: EvaluationType.BASIC
0.0
Finished running the pipeline


In [8]:
from src.features.encoder_utils import load_graph

# Location of the encodings graph file, resolved against the project root.
encodings_graph_path = config.ROOT_DIR / "data/external/graphs/encodings_graph.adjlist"

# The loaded graph is handed to every embedding transformer in the cells below.
graph = load_graph(encodings_graph_path)

### Try out Node2Vec

In [9]:
from src.pipeline.pipeline_transformers import ColumnKeeper, Node2VecEmbedding, PrintDataframe

# Remove the steps that were added to the pipeline so far.
pipeline.clear_steps()

embedddings_transformer = Node2VecEmbedding(graph=graph)

# (transformer, step name) pairs in the order they are added to the pipeline:
# Node2Vec embedding, a dataframe print, a ColumnKeeper restricted to the two
# embedding columns, and a second dataframe print.
node2vec_steps = [
    (embedddings_transformer, "embeddings_transformer"),
    (PrintDataframe(verbose=pipeline._verbose_level), "print_df_1"),
    (ColumnKeeper(columns=["node2vec_embedding_dim1", "node2vec_embedding_dim2"]), "column_keeper"),
    (PrintDataframe(verbose=pipeline._verbose_level), "print_df_2"),
]
for step, step_name in node2vec_steps:
    pipeline.add_new_step(step, step_name)

# Show the assembled steps before running them.
print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'embeddings_transformer': Node2VecEmbedding(graph=<networkx.classes.graph.Graph object at 0x000002A0905FC550>), 'print_df_1': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['node2vec_embedding_dim1', 'node2vec_embedding_dim2']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': DummyRegressor()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 5287.09it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 49.00it/s]


<class 'pandas.core.frame.DataFrame'>
----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  node2vec_embedding_dim1  \
0    23381    LR  model      F1            BE                 0.135819   
1    23381    LR  model      F1  BUCV10RGLMME                -0.125350   
2    23381    LR  model      F1      BUCV10TE                -0.052346   
3    23381    LR  model      F1   BUCV2RGLMME                -0.142361   
4    23381    LR  model      F1       BUCV2TE                -0.062420   

   node2vec_embedding_dim2  
0                 0.172843  
1                -0.050254  
2                -0.065892  
3                -0.024288  
4                -0.053443  
----------------------------------------
----------------------------------------
Printing dataframe:
   node2vec_embedding_dim1  node2vec_embedding_dim2
0                 0.135819                 0.172843
1                -0.125350                -0.050254
2                -0.0523

Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 5329.48it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 53.39it/s]


<class 'pandas.core.frame.DataFrame'>
----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  node2vec_embedding_dim1  \
0    41007   SVC     no     ACC            BE                 0.251390   
1    41007   SVC     no     ACC  BUCV10RGLMME                -0.157840   
2    41007   SVC     no     ACC      BUCV10TE                 0.134794   
3    41007   SVC     no     ACC   BUCV2RGLMME                -0.129164   
4    41007   SVC     no     ACC       BUCV2TE                 0.088115   

   node2vec_embedding_dim2  
0                -0.175512  
1                -0.308738  
2                -0.119933  
3                -0.325769  
4                -0.115201  
----------------------------------------
----------------------------------------
Printing dataframe:
   node2vec_embedding_dim1  node2vec_embedding_dim2
0                 0.251390                -0.175512
1                -0.157840                -0.308738
2                 0.1347

### Try out Node2Vec with kmeans

In [10]:
from src.pipeline.pipeline_transformers import Node2VecGraphEmbeddingWithKMeans

# Remove the steps of the previous experiment before adding new ones.
pipeline.clear_steps()

# Add the Node2Vec + k-means transformer to the (now empty) step list.
embedddings_transformer = Node2VecGraphEmbeddingWithKMeans(graph=graph)
pipeline.add_new_step(
    embedddings_transformer,
    "embeddings_transformer",
)

# Show the assembled steps.
print(pipeline.get_pipeline().named_steps)

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x000002A0905FC550>), 'estimator': DummyRegressor()}


In [11]:
from src.pipeline.pipeline_transformers import PrintDataframe

# Check if the embedding worked: add a print step after the embedding
# transformer so the transformed dataframe shows up in the run output.
print_df_step = PrintDataframe(verbose=pipeline._verbose_level)
pipeline.add_new_step(print_df_step, "print_df")
print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x000002A0905FC550>), 'print_df': PrintDataframe(verbose=1), 'estimator': DummyRegressor()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 5615.03it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 53.67it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    23381    LR  model      F1            BE                2
1    23381    LR  model      F1  BUCV10RGLMME                1
2    23381    LR  model      F1      BUCV10TE                0
3    23381    LR  model      F1   BUCV2RGLMME                1
4    23381    LR  model      F1       BUCV2TE                0
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 5108.50it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 54.73it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    41007   SVC     no     ACC            BE                2
1    41007   SVC     no     ACC  BUCV10RGLMME                3
2    41007   SVC     no     ACC      BUCV10TE                1
3    41007   SVC     no     ACC   BUCV2RGLMME                3
4    41007   SVC     no     ACC       BUCV2TE                1
----------------------------------------
0.0
Finished running the pipeline


As we can see, applying the k-means embedding transformer worked: the printed dataframes now contain an `encoder_cluster` column.

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from src.pipeline.pipeline_transformers import ColumnKeeper

# Let's try to get a prediction with a regression model.
# NOTE: this cell extends the steps added in the two cells above, so it
# relies on them having been executed first.

# Only keep the selected column, then print the reduced dataframe.
column_keeper = ColumnKeeper(columns=["encoder_cluster"])
for step, step_name in (
    (column_keeper, "column_keeper"),
    (PrintDataframe(verbose=pipeline._verbose_level), "print_df_2"),
):
    pipeline.add_new_step(step, step_name)

# Swap the pipeline's estimator for a linear regression.
pipeline.change_estimator(LinearRegression())

print(pipeline.get_pipeline().named_steps)
pipeline.run()

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x000002A0905FC550>), 'print_df': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['encoder_cluster']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': LinearRegression()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<?, ?it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 54.96it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    23381    LR  model      F1            BE                2
1    23381    LR  model      F1  BUCV10RGLMME                1
2    23381    LR  model      F1      BUCV10TE                3
3    23381    LR  model      F1   BUCV2RGLMME                1
4    23381    LR  model      F1       BUCV2TE                3
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                2
1                1
2                3
3                1
4                3
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<?, ?it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 48.12it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    41007   SVC     no     ACC            BE                0
1    41007   SVC     no     ACC  BUCV10RGLMME                3
2    41007   SVC     no     ACC      BUCV10TE                2
3    41007   SVC     no     ACC   BUCV2RGLMME                3
4    41007   SVC     no     ACC       BUCV2TE                2
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                0
1                3
2                2
3                3
4                2
----------------------------------------
0.05638620570951227
Finished running the pipeline


In this test run the pipeline score is roughly 0.056 (see the output above). As we are using Spearman's rank correlation coefficient as our metric, where the best score is 1.0 or -1.0, we can see that we achieved a very poor score.

### Try out Poincare

In [13]:
from src.pipeline.pipeline_transformers import PoincareEmbedding

# Remove the steps of the previous experiment; the estimator set earlier
# (LinearRegression) stays in place.
pipeline.clear_steps()

# Create the Poincare transformer.
embedddings_transformer = PoincareEmbedding(graph=graph)

# (transformer, step name) pairs in the order they are added to the pipeline:
# Poincare embedding, a dataframe print, a ColumnKeeper restricted to the two
# embedding columns, and a second dataframe print.
poincare_steps = [
    (embedddings_transformer, "embeddings_transformer"),
    (PrintDataframe(verbose=pipeline._verbose_level), "print_df_1"),
    (ColumnKeeper(columns=["poincare_embedding_dim1", "poincare_embedding_dim2"]), "column_keeper"),
    (PrintDataframe(verbose=pipeline._verbose_level), "print_df_2"),
]
for step, step_name in poincare_steps:
    pipeline.add_new_step(step, step_name)

# Show the assembled steps before running them.
print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'embeddings_transformer': PoincareEmbedding(graph=<networkx.classes.graph.Graph object at 0x000002A0905FC550>), 'print_df_1': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['poincare_embedding_dim1', 'poincare_embedding_dim2']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': LinearRegression()}
Starting pipeline using method: EvaluationType.BASIC
<class 'pandas.core.frame.DataFrame'>
----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  poincare_embedding_dim1  \
0    23381    LR  model      F1            BE                -0.752100   
1    23381    LR  model      F1  BUCV10RGLMME                 0.577925   
2    23381    LR  model      F1      BUCV10TE                -0.475352   
3    23381    LR  model      F1   BUCV2RGLMME                 0.448117   
4    23381    LR  model      F1       BUCV2TE                -0.342273   

   poincare_embedding_dim2  
0                -0.344086  
1                -0.53