In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from node2vec import Node2Vec
import networkx as nx

import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
edges = pd.read_csv("berlin_digraph_edges.csv")
nodes = pd.read_csv("berlin_digraph_nodes.csv")

edges = edges.rename(columns = {'Unnamed: 0':'source', 'Unnamed: 1':'target'})
nodes = nodes.rename(columns = {'Unnamed: 0':'node id'})

In [3]:
display(nodes.head(3))
print('-'*100)
print('-'*100)
display(edges.head(3))

Unnamed: 0,node id,y,x,street_count,ref,highway
0,172546,52.567738,13.352255,3,,
1,530313,52.567835,13.352664,3,,
2,28345850,52.567623,13.351325,4,,


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


Unnamed: 0,source,target,osmid,name,highway,maxspeed,oneway,length,geometry,hour_of_day,speed_kph_mean,speed_kph_stddev,est_width
0,172546,530313,24295532,Emmentaler Straße,secondary,50,False,29.733,"LINESTRING (13.3522545 52.567738, 13.3526642 5...",9,45.276,6.04,
1,172546,28345850,4638312,Emmentaler Straße,secondary,50,True,64.155,"LINESTRING (13.3522545 52.567738, 13.3520655 5...",9,14.532,14.245,
2,530313,172546,24295532,Emmentaler Straße,secondary,50,False,29.733,"LINESTRING (13.3526642 52.5678354, 13.3522545 ...",7,35.007,11.342,


In [4]:
edges_simplified = edges[['source', 'target', 'osmid', 'highway', 'maxspeed', 'oneway', 
                          'length', 'geometry', 'hour_of_day', 'speed_kph_mean']]

In [10]:
G_data = nx.from_pandas_edgelist(df = edges_simplified, source = "source", target = "target", 
                                 edge_attr = True, create_using=nx.DiGraph())

In [19]:
# Checks
df = nx.to_pandas_edgelist(G_data)
display(df.head(5))
print()
display(pd.DataFrame.from_dict(G_data.edges(), orient='index').head(5))
print(type(G_data))

Unnamed: 0,source,target,maxspeed,hour_of_day,highway,osmid,oneway,speed_kph_mean,length,geometry
0,172546,530313,50,9,secondary,24295532,False,45.276,29.733,"LINESTRING (13.3522545 52.567738, 13.3526642 5..."
1,172546,28345850,50,9,secondary,4638312,True,14.532,64.155,"LINESTRING (13.3522545 52.567738, 13.3520655 5..."
2,530313,172546,50,7,secondary,24295532,False,35.007,29.733,"LINESTRING (13.3526642 52.5678354, 13.3522545 ..."
3,28345850,35244866,50,8,secondary,4638312,True,35.756,19.106,"LINESTRING (13.351325 52.5676227, 13.35105 52...."
4,28345850,29492967,50,8,secondary,4638317,True,46.77,197.683,"LINESTRING (13.351325 52.5676227, 13.351286 52..."





Unnamed: 0,Unnamed: 1,osmid,highway,maxspeed,oneway,length,geometry,hour_of_day,speed_kph_mean
172546,530313,24295532,secondary,50,False,29.733,"LINESTRING (13.3522545 52.567738, 13.3526642 5...",9,45.276
172546,28345850,4638312,secondary,50,True,64.155,"LINESTRING (13.3522545 52.567738, 13.3520655 5...",9,14.532
530313,172546,24295532,secondary,50,False,29.733,"LINESTRING (13.3526642 52.5678354, 13.3522545 ...",7,35.007
28345850,35244866,4638312,secondary,50,True,19.106,"LINESTRING (13.351325 52.5676227, 13.35105 52....",8,35.756
28345850,29492967,4638317,secondary,50,True,197.683,"LINESTRING (13.351325 52.5676227, 13.351286 52...",8,46.77


<class 'networkx.classes.digraph.DiGraph'>


In [22]:
node2vec = Node2Vec(G_data, dimensions=30, walk_length=7, num_walks=50, workers = 1, p = 10, q = 0.25)

HBox(children=(HTML(value='Computing transition probabilities'), FloatProgress(value=0.0, max=8292.0), HTML(va…



Generating walks (CPU: 1):   0%|                                                                                                                                             | 0/50 [00:00<?, ?it/s][A[A






Generating walks (CPU: 1):   4%|█████▎                                                                                                                               | 2/50 [00:02<01:09,  1.44s/it][A[A

Generating walks (CPU: 1):   6%|███████▉                                                                                                                             | 3/50 [00:05<01:18,  1.66s/it][A[A

Generating walks (CPU: 1):   8%|██████████▋                                                                                                                          | 4/50 [00:06<01:17,  1.69s/it][A[A

Generating walks (CPU: 1):  10%|█████████████▎                                                                                                                       | 5/50 [00:08<01:16,  1.71s/it][A[A

Generating walks (CPU: 1):  12%|███████████████▉                                                                                                                     | 6/50 [00:10<01:

Generating walks (CPU: 1):  84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 42/50 [02:09<00:24,  3.05s/it][A[A

Generating walks (CPU: 1):  86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 43/50 [02:11<00:20,  2.97s/it][A[A

Generating walks (CPU: 1):  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 44/50 [02:14<00:17,  2.96s/it][A[A

Generating walks (CPU: 1):  90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 45/50 [02:17<00:14,  2.99s/it][A[A

Generating walks (CPU: 1):  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 46/50 [02:21<00:12

In [23]:
model = node2vec.fit(window=10, min_count=1) 

In [24]:
model['172546']

  model['172546']


array([ 1.2259401 , -0.2618795 , -1.5824711 , -0.7380643 ,  1.7189546 ,
       -1.3138628 ,  0.15870473,  0.12895451, -0.7979673 ,  1.1153505 ,
       -2.152818  , -0.71310806,  0.4883935 ,  0.5049538 ,  1.2767116 ,
       -0.8088286 ,  0.19628453,  0.8560546 , -0.9179689 ,  1.785974  ,
        1.9957111 , -0.09563167, -0.14332326,  0.19446622, -0.33670878,
       -0.20018202, -0.5220272 , -0.7945271 , -0.59697086, -1.2696252 ],
      dtype=float32)

In [25]:
# Unsurprisingly, the neighbor nodes we saw a couple lines above are in the top 2.
model.wv.most_similar('172546')

[('530313', 0.998839259147644),
 ('28345850', 0.9972468018531799),
 ('29492967', 0.9965168833732605),
 ('35359625', 0.9958294034004211),
 ('530312', 0.991866409778595),
 ('35244866', 0.9863973259925842),
 ('29492954', 0.9706072807312012),
 ('29492968', 0.9701778888702393),
 ('29492956', 0.9620769023895264),
 ('271370539', 0.9510524272918701)]

In [26]:
model.wv.most_similar('28345850')

[('35359625', 0.9981916546821594),
 ('29492967', 0.9979572296142578),
 ('530313', 0.9975138902664185),
 ('172546', 0.9972468018531799),
 ('530312', 0.9959478974342346),
 ('35244866', 0.9937652945518494),
 ('29492968', 0.9825006127357483),
 ('29492954', 0.9773008227348328),
 ('29492956', 0.9723113179206848),
 ('271370539', 0.9681207537651062)]

In [27]:
model.wv.save_word2vec_format('embedding2.csv')
model.save('model2')

In [28]:
node_ids = model.wv.index2word   # list of node IDs
node_embeddings = model.wv.vectors

In [29]:
source_embeddings = []
target_embeddings = []

for node_id in list(edges.source.values):
    source_embedding = model[str(node_id)]
    source_embeddings.append(source_embedding)

for node_id in edges.target.values:
    target_embedding = model[str(node_id)]
    target_embeddings.append(target_embedding)

  source_embedding = model[str(node_id)]
  target_embedding = model[str(node_id)]


In [30]:
# Checks
print(source_embeddings[0] == source_embeddings[1])
print(target_embeddings[0] == source_embeddings[2])

print(len(source_embeddings))

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]
9012


### Node2Vec: First Run

In [33]:
X = [(model[str(i)]+model[str(j)]) for i,j in zip(edges['source'], edges['target'])]
y = edges['speed_kph_mean']

  X = [(model[str(i)]+model[str(j)]) for i,j in zip(edges['source'], edges['target'])]


In [34]:
# Check
model['172546'] + model['530313'] == X[0]

  model['172546'] + model['530313'] == X[0]


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [35]:
X_train, X_test, y_train, y_test = train_test_split(np.array(X), y, 
                                                test_size = 0.25, 
                                                random_state = 13)

In [37]:
lr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

rf = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Mean squared error
print('Mean squared error for linear regression: %.2f' % mean_squared_error(y_test, y_pred_lr))
print('Mean squared error for random forest: %.2f' % mean_squared_error(y_test, y_pred_rf))

Mean squared error for linear regression: 99.82
Mean squared error for random forest: 83.31


In [40]:
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

In [41]:
lr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

rf = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Mean squared error
print('Mean squared error for linear regression: %.2f' % mean_squared_error(y_test, y_pred_lr))
print('Mean squared error for random forest: %.2f' % mean_squared_error(y_test, y_pred_rf))

Mean squared error for linear regression: 99.82
Mean squared error for random forest: 83.23


### Second Run: Using only the mean_speed attribute before fitting node2vec

In [42]:
edges_simplified = edges[['source', 'target', 'osmid', 'highway', 'maxspeed', 'oneway', 
                          'length', 'geometry', 'hour_of_day', 'speed_kph_mean']]

G_data = nx.from_pandas_edgelist(df = edges_simplified, source = "source", target = "target", 
                                 edge_attr = 'speed_kph_mean', create_using=nx.DiGraph())

df = nx.to_pandas_edgelist(G_data)
display(df.head(5))
print()
display(pd.DataFrame.from_dict(G_data.edges(), orient='index').head(5))
print(type(G_data))

Unnamed: 0,source,target,speed_kph_mean
0,172546,530313,45.276
1,172546,28345850,14.532
2,530313,172546,35.007
3,28345850,35244866,35.756
4,28345850,29492967,46.77





Unnamed: 0,Unnamed: 1,speed_kph_mean
172546,530313,45.276
172546,28345850,14.532
172547,34694265,42.437
172558,530225,17.602
172562,530351,33.422


<class 'networkx.classes.digraph.DiGraph'>


In [43]:
node2vec = Node2Vec(G_data, dimensions=30, walk_length=7, num_walks=50, workers = 1, p = 10, q = 0.25)

HBox(children=(HTML(value='Computing transition probabilities'), FloatProgress(value=0.0, max=8292.0), HTML(va…

Generating walks (CPU: 1):   0%|                                                                                                                                             | 0/50 [00:00<?, ?it/s]




Generating walks (CPU: 1):   8%|█████████▉                                                                                                                      | 78/1000 [35:20<6:57:39, 27.18s/it]
Generating walks (CPU: 1): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:06<00:00,  2.54s/it]


In [44]:
model = node2vec.fit(window=10, min_count=1)

In [45]:
node_ids = model.wv.index2word   # list of node IDs
node_embeddings = model.wv.vectors

source_embeddings = []
target_embeddings = []

for node_id in list(edges.source.values):
    source_embedding = model[str(node_id)]
    source_embeddings.append(source_embedding)

for node_id in edges.target.values:
    target_embedding = model[str(node_id)]
    target_embeddings.append(target_embedding)
    
edges['source_embedding'] = source_embeddings
edges['target_embedding'] = target_embeddings



  source_embedding = model[str(node_id)]
  target_embedding = model[str(node_id)]


In [46]:
X = [(model[str(i)]+model[str(j)]) for i,j in zip(edges['source'], edges['target'])]
y = edges['speed_kph_mean']

X_train, X_test, y_train, y_test = train_test_split(np.array(X), y, 
                                                test_size = 0.25, 
                                                random_state = 13)

lr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

rf = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Mean squared error
print('Mean squared error for linear regression: %.2f' % mean_squared_error(y_test, y_pred_lr))
print('Mean squared error for random forest: %.2f' % mean_squared_error(y_test, y_pred_rf))



  X = [(model[str(i)]+model[str(j)]) for i,j in zip(edges['source'], edges['target'])]


Mean squared error for linear regression: 100.13
Mean squared error for random forest: 82.86


### Third Run: Flipping the values of Node2Vec parameters p and q

In [47]:
node2vec = Node2Vec(G_data, dimensions=30, walk_length=7, num_walks=50, workers = 1, p = 1, q = 10)

HBox(children=(HTML(value='Computing transition probabilities'), FloatProgress(value=0.0, max=8292.0), HTML(va…

Generating walks (CPU: 1):   0%|                                                                                                                                             | 0/50 [00:00<?, ?it/s]




Generating walks (CPU: 1): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:10<00:00,  2.60s/it]


In [48]:
model = node2vec.fit(window=10, min_count=1)

In [52]:
node_ids = model.wv.index2word   # list of node IDs
node_embeddings = model.wv.vectors

source_embeddings = []
target_embeddings = []

for node_id in list(edges.source.values):
    source_embedding = model[str(node_id)]
    source_embeddings.append(source_embedding)

for node_id in edges.target.values:
    target_embedding = model[str(node_id)]
    target_embeddings.append(target_embedding)
    
edges['source_embedding'] = source_embeddings
edges['target_embedding'] = target_embeddings

  source_embedding = model[str(node_id)]
  target_embedding = model[str(node_id)]


In [53]:
X = [(model[str(i)]+model[str(j)]) for i,j in zip(edges['source'], edges['target'])]
y = edges['speed_kph_mean']

X_train, X_test, y_train, y_test = train_test_split(np.array(X), y, 
                                                test_size = 0.25, 
                                                random_state = 13)

lr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

rf = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Mean squared error
print('Mean squared error for linear regression: %.2f' % mean_squared_error(y_test, y_pred_lr))
print('Mean squared error for random forest: %.2f' % mean_squared_error(y_test, y_pred_rf))


  X = [(model[str(i)]+model[str(j)]) for i,j in zip(edges['source'], edges['target'])]


Mean squared error for linear regression: 99.85
Mean squared error for random forest: 82.28


In [54]:
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

In [55]:
lr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

rf = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Mean squared error
print('Mean squared error for linear regression: %.2f' % mean_squared_error(y_test, y_pred_lr))
print('Mean squared error for random forest: %.2f' % mean_squared_error(y_test, y_pred_rf))

Mean squared error for linear regression: 99.85
Mean squared error for random forest: 82.34
