# Pipeline Graph Embeddings
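The purpose of this notebook is to apply graph embeddings in our pipeline: we add a Node2Vec + k-means embedding step for the `encoder` column, check its output, and then score a regression model on the resulting feature.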

In [2]:
# imports
import os
import pandas as pd
from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType

In [3]:
# Load the regression training data used by every pipeline run in this notebook.
train_df = config.load_traindata_for_regression()
pipelineFactory = PipelineFactory()

# Build the baseline regression pipeline via the factory; the keyword
# arguments select the verbosity and the evaluation strategy.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.REGRE_BASELINE,
    verbose_level=1,
    evaluation=EvaluationType.BASIC,
)

# Run it once unchanged so there is a baseline score to compare against.
pipeline.run()

Starting pipeline using method: EvaluationType.BASIC
0.0
Finished running the pipeline


In [4]:
from sklearn.compose import ColumnTransformer
from src.pipeline.pipeline_transformers import Node2VecGraphEmbeddingWithKMeans
from src.features.encoder_utils import load_graph

# Load the encodings graph (.adjlist file) from below the project root.
encodings_graph_path = config.ROOT_DIR / "data/external/graphs/encodings_graph.adjlist"
graph = load_graph(encodings_graph_path)

# Build the Node2Vec + k-means embedding transformer on top of that graph.
# NOTE(review): the variable name is misspelt (three d's); it is kept unchanged
# here in case other cells reference it.
embedddings_transformer = Node2VecGraphEmbeddingWithKMeans(graph=graph)

# Register the embeddings transformer as a new step of the pipeline.
pipeline.add_new_step(embedddings_transformer, "embeddings_transformer")

# Show the steps that are now registered in the pipeline.
print(pipeline.get_pipeline().named_steps)

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x00000216ECC0B410>), 'estimator': DummyRegressor()}


In [5]:
from src.pipeline.pipeline_transformers import PrintDataframe

# Check whether the embedding worked: add a step that prints the dataframe
# during the run, reusing the verbosity level of the pipeline.
df_printer = PrintDataframe(verbose=pipeline._verbose_level)
pipeline.add_new_step(df_printer, "print_df")

# Show the registered steps, then execute the pipeline with the new step.
print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x00000216ECC0B410>), 'print_df': PrintDataframe(verbose=1), 'estimator': DummyRegressor()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<?, ?it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 54.27it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    23381    LR  model      F1            BE                2
1    23381    LR  model      F1  BUCV10RGLMME                1
2    23381    LR  model      F1      BUCV10TE                0
3    23381    LR  model      F1   BUCV2RGLMME                1
4    23381    LR  model      F1       BUCV2TE                0
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 5477.50it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 45.80it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    41007   SVC     no     ACC            BE                3
1    41007   SVC     no     ACC  BUCV10RGLMME                0
2    41007   SVC     no     ACC      BUCV10TE                2
3    41007   SVC     no     ACC   BUCV2RGLMME                0
4    41007   SVC     no     ACC       BUCV2TE                2
----------------------------------------
0.0
Finished running the pipeline


As we can see, applying the k-means embedding transformer worked: the printed dataframes now contain an additional `encoder_cluster` column.

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from src.pipeline.pipeline_transformers import ColumnKeeper

# Let's try to get a prediction with a real regression model.

# Keep only the selected columns as model input.
# NOTE(review): encoder_cluster comes from the k-means step, and LinearRegression
# will treat it as a plain number - confirm that this is intended.
kept_columns = ["encoder_cluster"]
column_keeper = ColumnKeeper(columns=kept_columns)
pipeline.add_new_step(column_keeper, "column_keeper")

# Print the dataframe a second time to verify which columns were kept.
second_df_printer = PrintDataframe(verbose=pipeline._verbose_level)
pipeline.add_new_step(second_df_printer, "print_df_2")

# Replace the current estimator with a linear regression.
regressor = LinearRegression()
pipeline.change_estimator(regressor)

# Show the final step layout and run the pipeline.
print(pipeline.get_pipeline().named_steps)
pipeline.run()

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x00000216ECC0B410>), 'print_df': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['encoder_cluster']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': LinearRegression()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 7385.49it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 49.55it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    23381    LR  model      F1            BE                2
1    23381    LR  model      F1  BUCV10RGLMME                3
2    23381    LR  model      F1      BUCV10TE                4
3    23381    LR  model      F1   BUCV2RGLMME                3
4    23381    LR  model      F1       BUCV2TE                4
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                2
1                3
2                4
3                3
4                4
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<?, ?it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 53.43it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring       encoder  encoder_cluster
0    41007   SVC     no     ACC            BE                0
1    41007   SVC     no     ACC  BUCV10RGLMME                2
2    41007   SVC     no     ACC      BUCV10TE                3
3    41007   SVC     no     ACC   BUCV2RGLMME                2
4    41007   SVC     no     ACC       BUCV2TE                3
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                0
1                2
2                3
3                2
4                3
----------------------------------------
0.07425108966396504
Finished running the pipeline


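In this test run the pipeline score is 0.074 (the exact value may differ between runs, since the cluster assignments shown above change from run to run). As we are using Spearman's R as our metric, where the best score is 1.0 or -1.0, we can see that we achieved a very poor score.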