# Pipeline Graph Embeddings
The purpose of this notebook is to apply graph embeddings in our pipeline

In [1]:
# imports
import os
import pandas as pd
from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load the data
train_df = config.load_traindata_for_regression()
pipelineFactory = PipelineFactory()

# create the baseline pipeline
pipeline = pipelineFactory.create_pipeline(train_df,
                                            ModelType.REGRE_BASELINE,
                                            verbose_level=1,
                                            evaluation=EvaluationType.BASIC)
pipeline.run()

Starting pipeline using method: EvaluationType.BASIC
Finished running the pipeline
Evaluation metrics:
    validation_rmse: 0.2206 [std=0.]
    validation_mae: 0.1783 [std=0.]
    validation_r2: -0.005 [std=0.]
    validation_average_spearman: 0. [std=0.]


In [3]:
from src.features.encoder_utils import load_graph
graph = load_graph(config.ROOT_DIR / "data/external/graphs/encodings_graph.adjlist")

### Try out Node2Vec

In [4]:
pipeline.clear_steps()
from src.pipeline.pipeline_transformers import Node2VecEmbedding, PrintDataframe, ColumnKeeper
from sklearn.linear_model import LinearRegression
n2v_embedddings_transformer = Node2VecEmbedding(graph=graph, walk_length=20, num_walks=1000, workers=1)

# add the column transformer to the pipeline
pipeline.add_new_step(n2v_embedddings_transformer, "embeddings_transformer")

pipeline.add_new_step(PrintDataframe(verbose=pipeline._verbose_level), "print_df_1")

pipeline.add_new_step(ColumnKeeper(columns=["node2vec_embedding_dim1", "node2vec_embedding_dim2"]),
                                  "column_keeper")

pipeline.add_new_step(PrintDataframe(verbose=pipeline._verbose_level), "print_df_2")

print(pipeline.get_pipeline().named_steps)

pipeline.change_estimator(LinearRegression())

pipeline.run()

{'embeddings_transformer': Node2VecEmbedding(graph=<networkx.classes.graph.Graph object at 0x000002753EE5E590>), 'print_df_1': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['node2vec_embedding_dim1', 'node2vec_embedding_dim2']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': DummyRegressor()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities:   0%|          | 0/45 [00:00<?, ?it/s]

Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 22539.25it/s]
Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:03<00:00, 323.46it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  node2vec_embedding_dim1  \
0    23381    LR  model      F1                -0.329613   
1    23381    LR  model      F1                -0.073129   
2    23381    LR  model      F1                 0.025337   
3    23381    LR  model      F1                -0.105937   
4    23381    LR  model      F1                 0.055852   

   node2vec_embedding_dim2  
0                -0.056413  
1                -0.195376  
2                -0.027240  
3                -0.134483  
4                 0.008353  
----------------------------------------
----------------------------------------
Printing dataframe:
   node2vec_embedding_dim1  node2vec_embedding_dim2
0                -0.329613                -0.056413
1                -0.073129                -0.195376
2                 0.025337                -0.027240
3                -0.105937                -0.134483
4                 0.055852                

Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 14948.81it/s]
Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:02<00:00, 360.88it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  node2vec_embedding_dim1  \
0    41007   SVC     no     ACC                 0.233271   
1    41007   SVC     no     ACC                 0.004386   
2    41007   SVC     no     ACC                 0.196471   
3    41007   SVC     no     ACC                 0.023457   
4    41007   SVC     no     ACC                 0.160871   

   node2vec_embedding_dim2  
0                -0.131774  
1                -0.175404  
2                -0.045817  
3                -0.240275  
4                -0.045453  
----------------------------------------
----------------------------------------
Printing dataframe:
   node2vec_embedding_dim1  node2vec_embedding_dim2
0                 0.233271                -0.131774
1                 0.004386                -0.175404
2                 0.196471                -0.045817
3                 0.023457                -0.240275
4                 0.160871                

Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 21687.20it/s]
Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:02<00:00, 390.24it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  node2vec_embedding_dim1  \
0    23381    LR  model      F1                 0.169039   
1    23381    LR  model      F1                -0.049130   
2    23381    LR  model      F1                 0.042312   
3    23381    LR  model      F1                -0.062575   
4    23381    LR  model      F1                 0.028709   

   node2vec_embedding_dim2  
0                -0.312533  
1                -0.413293  
2                 0.052501  
3                -0.363093  
4                 0.058674  
----------------------------------------
----------------------------------------
Printing dataframe:
   node2vec_embedding_dim1  node2vec_embedding_dim2
0                 0.169039                -0.312533
1                -0.049130                -0.413293
2                 0.042312                 0.052501
3                -0.062575                -0.363093
4                 0.028709                

Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 21602.80it/s]
Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:02<00:00, 361.55it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  node2vec_embedding_dim1  \
0    41007   SVC     no     ACC                 0.032947   
1    41007   SVC     no     ACC                -0.027107   
2    41007   SVC     no     ACC                 0.185739   
3    41007   SVC     no     ACC                -0.031538   
4    41007   SVC     no     ACC                 0.185026   

   node2vec_embedding_dim2  
0                 0.025555  
1                 0.104762  
2                -0.300849  
3                 0.090770  
4                -0.339091  
----------------------------------------
----------------------------------------
Printing dataframe:
   node2vec_embedding_dim1  node2vec_embedding_dim2
0                 0.032947                 0.025555
1                -0.027107                 0.104762
2                 0.185739                -0.300849
3                -0.031538                 0.090770
4                 0.185026                

### Try out Node2Vec with kmeans

In [5]:
pipeline.clear_steps()
from src.pipeline.pipeline_transformers import Node2VecGraphEmbeddingWithKMeans

n2v_embedddings_transformer = Node2VecGraphEmbeddingWithKMeans(graph=graph)

# add the column transformer to the pipeline
pipeline.add_new_step(n2v_embedddings_transformer, "embeddings_transformer")

print(pipeline.get_pipeline().named_steps)

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x000002753EE5E590>), 'estimator': LinearRegression()}


In [6]:
# check if the embedding worked
from src.pipeline.pipeline_transformers import PrintDataframe
pipeline.add_new_step(PrintDataframe(verbose=pipeline._verbose_level), "print_df")
print(pipeline.get_pipeline().named_steps)

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x000002753EE5E590>), 'print_df': PrintDataframe(verbose=1), 'estimator': LinearRegression()}


As we can see the application of the kmeans encoder worked

In [7]:
# lets try to get a prediction with a regression model
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from src.pipeline.pipeline_transformers import ColumnKeeper

# only keep selected columns
column_keeper = ColumnKeeper(columns=["encoder_cluster"])

pipeline.add_new_step(column_keeper, "column_keeper")
pipeline.add_new_step(PrintDataframe(verbose=pipeline._verbose_level), "print_df_2")

pipeline.change_estimator(LinearRegression())

print(pipeline.get_pipeline().named_steps)
pipeline.run()

{'embeddings_transformer': Node2VecGraphEmbeddingWithKMeans(graph=<networkx.classes.graph.Graph object at 0x000002753EE5E590>), 'print_df': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['encoder_cluster']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': LinearRegression()}
Starting pipeline using method: EvaluationType.BASIC


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 15028.56it/s]
Generating walks (CPU: 1):   0%|          | 0/1000 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:02<00:00, 336.45it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  encoder_cluster
0    23381    LR  model      F1                3
1    23381    LR  model      F1                2
2    23381    LR  model      F1                0
3    23381    LR  model      F1                2
4    23381    LR  model      F1                0
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                3
1                2
2                0
3                2
4                0
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 20686.51it/s]
Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:03<00:00, 314.72it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  encoder_cluster
0    41007   SVC     no     ACC                3
1    41007   SVC     no     ACC                5
2    41007   SVC     no     ACC                1
3    41007   SVC     no     ACC                5
4    41007   SVC     no     ACC                1
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                3
1                5
2                1
3                5
4                1
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 22655.59it/s]
Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:03<00:00, 329.56it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  encoder_cluster
0    23381    LR  model      F1                3
1    23381    LR  model      F1                0
2    23381    LR  model      F1                5
3    23381    LR  model      F1                0
4    23381    LR  model      F1                5
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                3
1                0
2                5
3                0
4                5
----------------------------------------


Computing transition probabilities: 100%|██████████| 45/45 [00:00<00:00, 11143.21it/s]
Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:04<00:00, 234.13it/s]


----------------------------------------
Printing dataframe:
   dataset model tuning scoring  encoder_cluster
0    41007   SVC     no     ACC                0
1    41007   SVC     no     ACC                2
2    41007   SVC     no     ACC                1
3    41007   SVC     no     ACC                2
4    41007   SVC     no     ACC                1
----------------------------------------
----------------------------------------
Printing dataframe:
   encoder_cluster
0                0
1                2
2                1
3                2
4                1
----------------------------------------
Finished running the pipeline
Evaluation metrics:
    validation_rmse: 0.2206 [std=0.]
    validation_mae: 0.178 [std=0.]
    validation_r2: -0.0045 [std=0.]
    validation_average_spearman: -0.0721 [std=0.]


In this test run the pipeline score is a 0.015. As we are using spearmans R as our metric where the best score is 1.0 or -1.0 we can see that we achieved a very poor score.

### Try out Poincare

In [8]:
pipeline.clear_steps()

pd.set_option('display.expand_frame_repr', False)
# create poincare transformer
from src.pipeline.pipeline_transformers import PoincareEmbedding
poincare_embedddings_transformer = PoincareEmbedding(graph=graph, epochs=100)

# add the column transformer to the pipeline
pipeline.add_new_step(poincare_embedddings_transformer, "embeddings_transformer")

pipeline.add_new_step(PrintDataframe(verbose=pipeline._verbose_level), "print_df_1")

pipeline.add_new_step(ColumnKeeper(columns=["poincare_embedding_dim1", "poincare_embedding_dim2"]),
                                  "column_keeper")

pipeline.add_new_step(PrintDataframe(verbose=pipeline._verbose_level), "print_df_2")

print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'embeddings_transformer': PoincareEmbedding(graph=<networkx.classes.graph.Graph object at 0x000002753EE5E590>), 'print_df_1': PrintDataframe(verbose=1), 'column_keeper': ColumnKeeper(columns=['poincare_embedding_dim1', 'poincare_embedding_dim2']), 'print_df_2': PrintDataframe(verbose=1), 'estimator': LinearRegression()}
Starting pipeline using method: EvaluationType.BASIC
----------------------------------------
Printing dataframe:
   dataset model tuning scoring  poincare_embedding_dim1  poincare_embedding_dim2
0    23381    LR  model      F1                -0.752100                -0.344086
1    23381    LR  model      F1                 0.577925                -0.534111
2    23381    LR  model      F1                -0.475352                 0.639553
3    23381    LR  model      F1                 0.448117                -0.426418
4    23381    LR  model      F1                -0.342273                 0.484127
----------------------------------------
------------------------------