# Pipeline Graph Embeddings
The purpose of this notebook is to apply graph embeddings within our pipeline and check how they affect the pipeline's score.

In [1]:
# imports
import os
import pandas as pd
from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType

In [2]:
# Read the regression training data and instantiate the pipeline factory.
train_df = config.load_traindata_for_regression()
pipelineFactory = PipelineFactory()

# Build the baseline regression pipeline (basic evaluation, verbosity 1)
# and execute it straight away.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.REGRE_BASELINE,
    evaluation=EvaluationType.BASIC,
    verbose_level=1,
)
pipeline.run()

Starting pipeline using method: EvaluationType.BASIC
0.0
Finished running the pipeline


In [3]:
from src.features.encoder_utils import load_graph

# Adjacency-list file of the encodings graph, located under config.ROOT_DIR.
graph_file = config.ROOT_DIR / "data/external/graphs/encodings_graph.adjlist"
graph = load_graph(graph_file)

### Try out Node2Vec

In [4]:
from src.pipeline.pipeline_transformers import (
    ColumnKeeper,
    Node2VecEmbedding,
    PrintDataframe,
)

# Reset the pipeline's steps before adding the Node2Vec steps.
pipeline.clear_steps()

# Step 1: the Node2Vec embedding transformer, built from the loaded graph.
embedddings_transformer = Node2VecEmbedding(graph=graph)
pipeline.add_new_step(embedddings_transformer, "embeddings_transformer")

# Step 2: print the dataframe as it looks after the embedding step.
pipeline.add_new_step(
    PrintDataframe(verbose=pipeline._verbose_level),
    "print_df_1",
)

# Step 3: keep only the two Node2Vec embedding columns.
node2vec_columns = ["node2vec_embedding_dim1", "node2vec_embedding_dim2"]
pipeline.add_new_step(ColumnKeeper(columns=node2vec_columns), "column_keeper")

# Step 4: print the dataframe again after the column selection.
pipeline.add_new_step(
    PrintDataframe(verbose=pipeline._verbose_level),
    "print_df_2",
)

# Show the configured steps, then execute the pipeline.
print(pipeline.get_pipeline().named_steps)

pipeline.run()

  from .autonotebook import tqdm as notebook_tqdm


{'embeddings_transformer': Node2VecEmbedding(graph=<networkx.classes.graph.Graph object at 0x000002A0FE635290>), 'print_df_1': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['poincare_embedding_dim1', 'poincare_embedding_dim2']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': DummyRegressor()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 9973.25it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 46.82it/s]


<class 'pandas.core.frame.DataFrame'>
----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  node2vec_embedding_dim1  \
0    23381    LR  model      F1            BE                 0.218471   
1    23381    LR  model      F1  BUCV10RGLMME                -0.118829   
2    23381    LR  model      F1      BUCV10TE                -0.287909   
3    23381    LR  model      F1   BUCV2RGLMME                -0.065149   
4    23381    LR  model      F1       BUCV2TE                -0.254103   

   node2vec_embedding_dim2  
0                -0.136139  
1                -0.040382  
2                 0.249924  
3                -0.028696  
4                 0.342775  
----------------------------------------


KeyError: "None of [Index(['poincare_embedding_dim1', 'poincare_embedding_dim2'], dtype='object')] are in the [columns]"

### Try out Node2Vec with kmeans

In [3]:
from src.pipeline.pipeline_transformers import Node2VecGraphEmbeddingWithKMeans

# Reset the pipeline's steps, then register the Node2Vec + KMeans transformer
# (built from the loaded graph) as the only transformer step.
pipeline.clear_steps()
embedddings_transformer = Node2VecGraphEmbeddingWithKMeans(graph=graph)
pipeline.add_new_step(embedddings_transformer, "embeddings_transformer")

# Show the steps that are now configured.
print(pipeline.get_pipeline().named_steps)

  from .autonotebook import tqdm as notebook_tqdm


{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x0000026B9286B810>), 'estimator': DummyRegressor()}


In [4]:
from src.pipeline.pipeline_transformers import PrintDataframe

# Verify the embedding: append a step that prints the dataframe, list the
# configured steps, and execute the pipeline.
print_step = PrintDataframe(verbose=pipeline._verbose_level)
pipeline.add_new_step(print_step, "print_df")
print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x0000026B9286B810>), 'print_df': PrintDataframe(verbose=1), 'estimator': DummyRegressor()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 7948.78it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 49.81it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    23381    LR  model      F1            BE                0
1    23381    LR  model      F1  BUCV10RGLMME                1
2    23381    LR  model      F1      BUCV10TE                4
3    23381    LR  model      F1   BUCV2RGLMME                1
4    23381    LR  model      F1       BUCV2TE                4
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 9933.36it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 49.76it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    41007   SVC     no     ACC            BE                2
1    41007   SVC     no     ACC  BUCV10RGLMME                3
2    41007   SVC     no     ACC      BUCV10TE                1
3    41007   SVC     no     ACC   BUCV2RGLMME                3
4    41007   SVC     no     ACC       BUCV2TE                1
----------------------------------------
0.0
Finished running the pipeline


As we can see, applying the KMeans encoder worked: the printed dataframes now contain an `encoder_cluster` column.

In [5]:
# Try to get a prediction from an actual regression model.
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from src.pipeline.pipeline_transformers import ColumnKeeper

# Restrict the features to the cluster column, then print the dataframe
# once more after the selection.
column_keeper = ColumnKeeper(columns=["encoder_cluster"])
pipeline.add_new_step(column_keeper, "column_keeper")
pipeline.add_new_step(
    PrintDataframe(verbose=pipeline._verbose_level),
    "print_df_2",
)

# Swap the pipeline's estimator for a linear regression.
pipeline.change_estimator(LinearRegression())

# Show the configured steps and execute the pipeline.
print(pipeline.get_pipeline().named_steps)
pipeline.run()

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x0000026B9286B810>), 'print_df': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['encoder_cluster']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': LinearRegression()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 9908.85it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 46.20it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    23381    LR  model      F1            BE                4
1    23381    LR  model      F1  BUCV10RGLMME                1
2    23381    LR  model      F1      BUCV10TE                0
3    23381    LR  model      F1   BUCV2RGLMME                1
4    23381    LR  model      F1       BUCV2TE                0
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                4
1                1
2                0
3                1
4                0
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 11250.82it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 57.28it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    41007   SVC     no     ACC            BE                1
1    41007   SVC     no     ACC  BUCV10RGLMME                3
2    41007   SVC     no     ACC      BUCV10TE                2
3    41007   SVC     no     ACC   BUCV2RGLMME                3
4    41007   SVC     no     ACC       BUCV2TE                2
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                1
1                3
2                2
3                3
4                2
----------------------------------------
-0.05068835041091806
Finished running the pipeline


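In the run shown above the pipeline score is approximately -0.051 (a previous version of this note reported 0.071; the score varies between runs). As we are using Spearman's R as our metric, where the best score is 1.0 or -1.0 and a value near 0 indicates no correlation, we can see that we achieved a very poor score.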

### Try out Poincare

In [6]:
from src.pipeline.pipeline_transformers import PoincareEmbedding

# Reset the pipeline's steps before adding the Poincare steps.
pipeline.clear_steps()

# Step 1: the Poincare embedding transformer, built from the loaded graph.
embedddings_transformer = PoincareEmbedding(graph=graph)
pipeline.add_new_step(embedddings_transformer, "embeddings_transformer")

# Step 2: print the dataframe as it looks after the embedding step.
pipeline.add_new_step(
    PrintDataframe(verbose=pipeline._verbose_level),
    "print_df_1",
)

# Step 3: keep only the two Poincare embedding columns.
poincare_columns = ["poincare_embedding_dim1", "poincare_embedding_dim2"]
pipeline.add_new_step(ColumnKeeper(columns=poincare_columns), "column_keeper")

# Step 4: print the dataframe again after the column selection.
pipeline.add_new_step(
    PrintDataframe(verbose=pipeline._verbose_level),
    "print_df_2",
)

# Show the configured steps, then execute the pipeline.
print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'embeddings_transformer': PoincareEmbedding(graph=<networkx.classes.graph.Graph object at 0x0000026B9286B810>), 'print_df_1': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['poincare_embedding_dim1', 'poincare_embedding_dim2']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': LinearRegression()}
Starting pipeline using method: EvaluationType.BASIC
<class 'pandas.core.frame.DataFrame'>
----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  poincare_embedding_dim1  \
0    23381    LR  model      F1            BE                -0.752100   
1    23381    LR  model      F1  BUCV10RGLMME                 0.577925   
2    23381    LR  model      F1      BUCV10TE                -0.475352   
3    23381    LR  model      F1   BUCV2RGLMME                 0.448117   
4    23381    LR  model      F1       BUCV2TE                -0.342273   

   poincare_embedding_dim2  
0                -0.344086  
1                -0.53