In [5]:
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
#import xgboost as xgb
import sklearn
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [6]:
edges = pd.read_csv("berlin_digraph_edges.csv")
nodes = pd.read_csv("berlin_digraph_nodes.csv")

#### Rename the source and target columns to 'source' and 'target', respectively

In [7]:
edges = edges.rename(columns = {'Unnamed: 0':'source', 'Unnamed: 1':'target'})

In [8]:
edges.columns

Index(['source', 'target', 'osmid', 'name', 'highway', 'maxspeed', 'oneway',
       'length', 'geometry', 'hour_of_day', 'speed_kph_mean',
       'speed_kph_stddev', 'est_width'],
      dtype='object')

In [9]:
edges.shape

(9012, 13)

In [10]:
edges.head(1)

Unnamed: 0,source,target,osmid,name,highway,maxspeed,oneway,length,geometry,hour_of_day,speed_kph_mean,speed_kph_stddev,est_width
0,172546,530313,24295532,Emmentaler Straße,secondary,50,False,29.733,"LINESTRING (13.3522545 52.567738, 13.3526642 5...",9,45.276,6.04,


#### Drop 'osmid', 'name', 'geometry' columns

These columns are not predictive of mean speed

In [11]:
edges = edges[['source', 'target', 'highway', 'maxspeed', 'oneway', 'length', 'hour_of_day', 'speed_kph_mean', 'speed_kph_stddev', 'est_width']]
edges.head()

Unnamed: 0,source,target,highway,maxspeed,oneway,length,hour_of_day,speed_kph_mean,speed_kph_stddev,est_width
0,172546,530313,secondary,50,False,29.733,9,45.276,6.04,
1,172546,28345850,secondary,50,True,64.155,9,14.532,14.245,
2,530313,172546,secondary,50,False,29.733,7,35.007,11.342,
3,28345850,35244866,secondary,50,True,19.106,8,35.756,11.516,
4,28345850,29492967,secondary,50,True,197.683,8,46.77,5.457,


#### All but two values in the 'est_width' column are null, so drop that column too

In [12]:
edges['est_width'].isnull().sum()

9010

In [13]:
edges = edges[['source', 'target', 'highway', 'maxspeed', 'oneway', 'length', 'hour_of_day', 'speed_kph_mean', 'speed_kph_stddev']]
edges.head()

Unnamed: 0,source,target,highway,maxspeed,oneway,length,hour_of_day,speed_kph_mean,speed_kph_stddev
0,172546,530313,secondary,50,False,29.733,9,45.276,6.04
1,172546,28345850,secondary,50,True,64.155,9,14.532,14.245
2,530313,172546,secondary,50,False,29.733,7,35.007,11.342
3,28345850,35244866,secondary,50,True,19.106,8,35.756,11.516
4,28345850,29492967,secondary,50,True,197.683,8,46.77,5.457


In [14]:
edges.dtypes

source                int64
target                int64
highway              object
maxspeed             object
oneway                 bool
length              float64
hour_of_day           int64
speed_kph_mean      float64
speed_kph_stddev    float64
dtype: object

#### 250 rows have a null 'maxspeed' value

In [15]:
edges['maxspeed'].isnull().sum()

250

In [16]:
edges['maxspeed'].value_counts()

50              6120
30              2444
60               101
10                43
80                32
20                18
['50', '30']       2
40                 1
70                 1
Name: maxspeed, dtype: int64

#### Replace nulls with the mode 'maxspeed' value

In [17]:
edges['maxspeed'].fillna(edges['maxspeed'].mode()[0], inplace=True)

In [18]:
edges['maxspeed'].isnull().sum()

0

In [19]:
edges.dtypes

source                int64
target                int64
highway              object
maxspeed             object
oneway                 bool
length              float64
hour_of_day           int64
speed_kph_mean      float64
speed_kph_stddev    float64
dtype: object

In [20]:
edges['maxspeed'].value_counts()

50              6370
30              2444
60               101
10                43
80                32
20                18
['50', '30']       2
40                 1
70                 1
Name: maxspeed, dtype: int64

#### Get rid of the two annoying '['50', '30']' values and replace them with '50'

In [21]:
edges['maxspeed'] = edges['maxspeed'].replace("['50', '30']", '50')

In [22]:
edges['maxspeed'].value_counts()

50    6372
30    2444
60     101
10      43
80      32
20      18
40       1
70       1
Name: maxspeed, dtype: int64

#### Convert the 'maxspeed' column from categorical to integer

In [23]:
edges['maxspeed'] = edges['maxspeed'].astype(str).astype(int)

In [24]:
edges.dtypes

source                int64
target                int64
highway              object
maxspeed              int32
oneway                 bool
length              float64
hour_of_day           int64
speed_kph_mean      float64
speed_kph_stddev    float64
dtype: object

#### One-hot encode the 'highway' categorical column

In [25]:
modeldf = pd.get_dummies(edges)
modeldf.head()

Unnamed: 0,source,target,maxspeed,oneway,length,hour_of_day,speed_kph_mean,speed_kph_stddev,"highway_['residential', 'living_street']",highway_living_street,...,highway_motorway_link,highway_primary,highway_primary_link,highway_residential,highway_secondary,highway_secondary_link,highway_tertiary,highway_tertiary_link,highway_trunk_link,highway_unclassified
0,172546,530313,50,False,29.733,9,45.276,6.04,0,0,...,0,0,0,0,1,0,0,0,0,0
1,172546,28345850,50,True,64.155,9,14.532,14.245,0,0,...,0,0,0,0,1,0,0,0,0,0
2,530313,172546,50,False,29.733,7,35.007,11.342,0,0,...,0,0,0,0,1,0,0,0,0,0
3,28345850,35244866,50,True,19.106,8,35.756,11.516,0,0,...,0,0,0,0,1,0,0,0,0,0
4,28345850,29492967,50,True,197.683,8,46.77,5.457,0,0,...,0,0,0,0,1,0,0,0,0,0


In [26]:
modeldf.dtypes

source                                        int64
target                                        int64
maxspeed                                      int32
oneway                                         bool
length                                      float64
hour_of_day                                   int64
speed_kph_mean                              float64
speed_kph_stddev                            float64
highway_['residential', 'living_street']      uint8
highway_living_street                         uint8
highway_motorway                              uint8
highway_motorway_link                         uint8
highway_primary                               uint8
highway_primary_link                          uint8
highway_residential                           uint8
highway_secondary                             uint8
highway_secondary_link                        uint8
highway_tertiary                              uint8
highway_tertiary_link                         uint8
highway_trun

#### Drop the 'speed_kph_stddev' column because it's a function of 'speed_kph_mean'

In [27]:
modeldf = modeldf.drop(['speed_kph_stddev'], axis=1)

In [28]:
modeldf.dtypes

source                                        int64
target                                        int64
maxspeed                                      int32
oneway                                         bool
length                                      float64
hour_of_day                                   int64
speed_kph_mean                              float64
highway_['residential', 'living_street']      uint8
highway_living_street                         uint8
highway_motorway                              uint8
highway_motorway_link                         uint8
highway_primary                               uint8
highway_primary_link                          uint8
highway_residential                           uint8
highway_secondary                             uint8
highway_secondary_link                        uint8
highway_tertiary                              uint8
highway_tertiary_link                         uint8
highway_trunk_link                            uint8
highway_uncl

In [29]:
y = modeldf['speed_kph_mean']

In [30]:
X = modeldf.drop(['speed_kph_mean'], axis=1)

In [31]:
X.head()

Unnamed: 0,source,target,maxspeed,oneway,length,hour_of_day,"highway_['residential', 'living_street']",highway_living_street,highway_motorway,highway_motorway_link,highway_primary,highway_primary_link,highway_residential,highway_secondary,highway_secondary_link,highway_tertiary,highway_tertiary_link,highway_trunk_link,highway_unclassified
0,172546,530313,50,False,29.733,9,0,0,0,0,0,0,0,1,0,0,0,0,0
1,172546,28345850,50,True,64.155,9,0,0,0,0,0,0,0,1,0,0,0,0,0
2,530313,172546,50,False,29.733,7,0,0,0,0,0,0,0,1,0,0,0,0,0
3,28345850,35244866,50,True,19.106,8,0,0,0,0,0,0,0,1,0,0,0,0,0
4,28345850,29492967,50,True,197.683,8,0,0,0,0,0,0,0,1,0,0,0,0,0


#### Split the dataframe into train, test and split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13)

#### Fit a linear regression model

In [33]:
regr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = regr.predict(X_test)

In [34]:
# The coefficients
# carried over from a homework notebook; not super crucial
print('Coefficients: \n', regr.coef_)

Coefficients: 
 [ 1.41098388e-11  5.87522070e-10  1.87031664e-01  5.45055685e-02
  2.76504646e-02 -5.71075869e-01 -5.82693157e+00 -1.30350125e+01
  2.56613358e+01  4.80637404e+00  5.47530823e+00 -3.51180273e+00
 -4.60293786e+00  3.29265596e+00 -5.32159858e+00  6.47695984e-01
 -6.72361728e+00  0.00000000e+00 -8.61469436e-01]


In [35]:
# Mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

Mean squared error: 72.89


#### Can't get the plot to work, but not super important

In [36]:
# # Plot outputs
# plt.scatter(X_test, y_test, color='black')
# plt.plot(X_test, y_pred, color='blue', linewidth=3)

# plt.xticks(())
# plt.yticks(())

# plt.show()

In [37]:
print(X_test.shape, y_test.shape, y_pred.shape)

(2253, 19) (2253,) (2253,)


#### Fit a random forest regression model

In [38]:
RFregressor = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(X_train, y_train)
y_pred = RFregressor.predict(X_test)

In [39]:
# Mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

Mean squared error: 68.37


## Now I'll try dropping the two node columns (source, target) to see if the MSE improves

In [40]:
modeldf = pd.get_dummies(edges)
modeldf.head()
modeldf = modeldf.drop(['speed_kph_stddev', 'source', 'target'], axis=1)
y = modeldf['speed_kph_mean']
X = modeldf.drop(['speed_kph_mean'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13)

In [41]:
X.head()

Unnamed: 0,maxspeed,oneway,length,hour_of_day,"highway_['residential', 'living_street']",highway_living_street,highway_motorway,highway_motorway_link,highway_primary,highway_primary_link,highway_residential,highway_secondary,highway_secondary_link,highway_tertiary,highway_tertiary_link,highway_trunk_link,highway_unclassified
0,50,False,29.733,9,0,0,0,0,0,0,0,1,0,0,0,0,0
1,50,True,64.155,9,0,0,0,0,0,0,0,1,0,0,0,0,0
2,50,False,29.733,7,0,0,0,0,0,0,0,1,0,0,0,0,0
3,50,True,19.106,8,0,0,0,0,0,0,0,1,0,0,0,0,0
4,50,True,197.683,8,0,0,0,0,0,0,0,1,0,0,0,0,0


#### Fit a linear regression model

In [42]:
regr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = regr.predict(X_test)

In [43]:
# Mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

Mean squared error: 73.37


#### Fit a random forest regression model

In [44]:
RFregressor = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(X_train, y_train)
y_pred = RFregressor.predict(X_test)

In [45]:
# Mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

Mean squared error: 86.62


In [46]:
edges

Unnamed: 0,source,target,highway,maxspeed,oneway,length,hour_of_day,speed_kph_mean,speed_kph_stddev
0,172546,530313,secondary,50,False,29.733,9,45.276,6.040
1,172546,28345850,secondary,50,True,64.155,9,14.532,14.245
2,530313,172546,secondary,50,False,29.733,7,35.007,11.342
3,28345850,35244866,secondary,50,True,19.106,8,35.756,11.516
4,28345850,29492967,secondary,50,True,197.683,8,46.770,5.457
...,...,...,...,...,...,...,...,...,...
9007,6206501300,6206500560,residential,30,False,121.381,8,26.346,5.005
9008,6366276983,29789747,secondary,50,True,33.976,8,40.362,13.430
9009,6400342990,6400342991,tertiary,50,False,13.361,9,31.353,15.362
9010,6469357849,700777138,secondary,50,True,23.033,7,21.917,13.241


Collecting xgboost
  Downloading xgboost-1.4.2-py3-none-win_amd64.whl (97.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.4.2
Collecting xgboost
  Downloading xgboost-1.4.2-py3-none-win_amd64.whl (97.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.4.2


#### Build a graph

In [48]:
G_data = nx.from_pandas_edgelist(edges, "source", "target", create_using=nx.Graph())

In [49]:
# 1. Number of common neighbors

neighbors = []
for i in range(len(edges)):
    neighbors.append(len(list(nx.common_neighbors(G_data, edges.source[i], edges.target[i]))))

edges['common_neighbor'] = neighbors
print(edges.head())
print(edges['common_neighbor'].unique())
print(edges['common_neighbor'].value_counts())

In [53]:
# 2. Jaccard coefficient

jaccard = []
for i in range(len(edges)):
    #jaccard.append(len(list(nx.jaccard_coefficient(G_data, edges.source[i], edges.target[i]))))
    jaccard.append(list(nx.jaccard_coefficient(G_data, ebunch=[(edges.source[i], edges.target[i])]))[0][2])

edges['jaccard_similarity'] = jaccard
print(edges.head())
print(edges['jaccard_similarity'].unique())
print(edges['jaccard_similarity'].value_counts())

     source    target    highway  maxspeed  oneway   length  hour_of_day  \
0    172546    530313  secondary        50   False   29.733            9   
1    172546  28345850  secondary        50    True   64.155            9   
2    530313    172546  secondary        50   False   29.733            7   
3  28345850  35244866  secondary        50    True   19.106            8   
4  28345850  29492967  secondary        50    True  197.683            8   

   speed_kph_mean  speed_kph_stddev  common_neighbor  jaccard_similarity  
0          45.276             6.040                0            0.000000  
1          14.532            14.245                1            0.166667  
2          35.007            11.342                0            0.000000  
3          35.756            11.516                0            0.000000  
4          46.770             5.457                0            0.000000  
[0.         0.16666667 0.33333333 0.25       0.14285714 0.125
 0.2        0.4        0.285714

In [54]:
# 3. Adamic/Adar

adamic_adar_index = []
for i in range(0, len(edges), 1):
  if edges.source[i] == edges.target[i]:
    adamic_adar_index.append(0)
  else:
    adamic_adar_index.append(list(nx.adamic_adar_index(G_data, ebunch=[(edges.source[i], edges.target[i])]))[0][2])

edges['adamic_adar_index'] = adamic_adar_index
print(edges.head())
print(edges['adamic_adar_index'].unique())
print(edges['adamic_adar_index'].value_counts())

     source    target    highway  maxspeed  oneway   length  hour_of_day  \
0    172546    530313  secondary        50   False   29.733            9   
1    172546  28345850  secondary        50    True   64.155            9   
2    530313    172546  secondary        50   False   29.733            7   
3  28345850  35244866  secondary        50    True   19.106            8   
4  28345850  29492967  secondary        50    True  197.683            8   

   speed_kph_mean  speed_kph_stddev  common_neighbor  jaccard_similarity  \
0          45.276             6.040                0            0.000000   
1          14.532            14.245                1            0.166667   
2          35.007            11.342                0            0.000000   
3          35.756            11.516                0            0.000000   
4          46.770             5.457                0            0.000000   

   adamic_adar_index  
0           0.000000  
1           0.721348  
2           0.000

In [56]:
# 4. Shortest path

def compute_shortest_path_length(u,v):
    try:
        if G_data.has_edge(u,v):
            return -1
        else:
            p= nx.shortest_path_length(G_data,source=u,target=v)
        return p
    except:
        return -1

shortest_path = []
for i in range(len(edges)):
  shortest_path.append(compute_shortest_path_length(edges.source[i],edges.target[i]))

edges['shortest_path'] = shortest_path
print(edges['shortest_path'].unique())
print(edges['shortest_path'].value_counts())
print(edges.head())

[-1]
-1    9012
Name: shortest_path, dtype: int64
     source    target    highway  maxspeed  oneway   length  hour_of_day  \
0    172546    530313  secondary        50   False   29.733            9   
1    172546  28345850  secondary        50    True   64.155            9   
2    530313    172546  secondary        50   False   29.733            7   
3  28345850  35244866  secondary        50    True   19.106            8   
4  28345850  29492967  secondary        50    True  197.683            8   

   speed_kph_mean  speed_kph_stddev  common_neighbor  jaccard_similarity  \
0          45.276             6.040                0            0.000000   
1          14.532            14.245                1            0.166667   
2          35.007            11.342                0            0.000000   
3          35.756            11.516                0            0.000000   
4          46.770             5.457                0            0.000000   

   adamic_adar_index  shortest_path 

In [58]:
# 5. Katz score

node_list = list(G_data.nodes) 

from numpy.linalg import inv
L = nx.normalized_laplacian_matrix(G_data)
e = (np.linalg.eigvals(L.A)).real

beta = 1/max(e)
I = np.identity(len(G_data.nodes))
katz = inv(I - nx.to_numpy_array(G_data)*beta) - I

katz_score = []

for i in range(len(edges)):
  katz_score.append(katz[int(node_list.index(edges.source[i]))][int(node_list.index(edges.target[i]))])

edges['katz_score'] = katz_score
print(edges['katz_score'].unique())
print(edges['katz_score'].value_counts())
print(edges.head())

[ 0.88589621  0.35476419  0.88589621 ... -1.78181818 -0.78930751
  1.59052231]
 0.666667    725
 1.000000    418
 1.200000    221
 1.000000    173
 1.200000    105
            ... 
-5.559017      1
-0.482236      1
-1.363636      1
-2.566667      1
-7.063478      1
Name: katz_score, Length: 5343, dtype: int64
     source    target    highway  maxspeed  oneway   length  hour_of_day  \
0    172546    530313  secondary        50   False   29.733            9   
1    172546  28345850  secondary        50    True   64.155            9   
2    530313    172546  secondary        50   False   29.733            7   
3  28345850  35244866  secondary        50    True   19.106            8   
4  28345850  29492967  secondary        50    True  197.683            8   

   speed_kph_mean  speed_kph_stddev  common_neighbor  jaccard_similarity  \
0          45.276             6.040                0            0.000000   
1          14.532            14.245                1            0.166667   
2   

In [None]:
# 6. Preferrential attachment
pref = []
for i in range(len(edges)):
  pref.append(list(nx.preferential_attachment(G_data, ebunch=[(edges.source[i],edges.target[i])]))[0][2])

edges['preferrential_attachment'] = pref
print(edges['preferrential_attachment'].unique())
print(edges['preferrential_attachment'].value_counts())
print(edges.head())

In [76]:
# 7. Hitting times SUBSET !!!!!!
# just to try if it works, as running on the full length of edges runs for too long! (I stopped at 50mins)
dist2 = []
for i in range(0,100):
  if edges.source[i] == edges.target[i]:
    dist.append(0)
    #print("added 0")
  else:
    asp = nx.all_simple_paths(G_data, edges.source[i], edges.target[i])
    for path in asp:
      #print(len(path))
      dist2.append(len(path))
#print(dist2)

In [78]:
# 7. Hitting times FULL SCOPE - runs for too long!
'''
dist = []
for i in range(len(edges)):
  if edges.source[i] == edges.target[i]:
    dist.append(0)
    #print("added 0")
  else:
    asp = nx.all_simple_paths(G_data, edges.source[i], edges.target[i])
    for path in asp:
      #print(len(path))
      dist.append(len(path))
      
edges['hitting_time'] = dist
print(edges['hitting_time'].unique())
print(edges['hitting_time'].value_counts())
print(edges.head())
'''

KeyboardInterrupt: 

In [None]:
# 8. SimRank ##### RUNS FOR TOO LONG !!!!!
'''
sim = nx.simrank_similarity(G_data)
sim_list = [[sim[u][v] for v in sorted(sim[u])] for u in sorted(sim)]
simrank = np.array(sim_list)
print("here")

sim_rank = []
for i in range(len(edges)):
  a = int(node_list.index(edges.source[i]))
  b = int(node_list.index(edges.target[i]))
  sim_rank.append(simrank[a][b])
print("here2")
edges['simrank'] = sim_rank
print(edges['simrank'].unique())
print(edges['simrank'].value_counts())
print(edges.head())
'''

#### Preprocessing "One-hot encode the 'highway' categorical column" etc

In [61]:
edges

Unnamed: 0,source,target,highway,maxspeed,oneway,length,hour_of_day,speed_kph_mean,speed_kph_stddev,common_neighbor,jaccard_similarity,adamic_adar_index,shortest_path,katz_score
0,172546,530313,secondary,50,False,29.733,9,45.276,6.040,0,0.000000,0.000000,-1,0.885896
1,172546,28345850,secondary,50,True,64.155,9,14.532,14.245,1,0.166667,0.721348,-1,0.354764
2,530313,172546,secondary,50,False,29.733,7,35.007,11.342,0,0.000000,0.000000,-1,0.885896
3,28345850,35244866,secondary,50,True,19.106,8,35.756,11.516,0,0.000000,0.000000,-1,-0.853327
4,28345850,29492967,secondary,50,True,197.683,8,46.770,5.457,0,0.000000,0.000000,-1,0.406717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9007,6206501300,6206500560,residential,30,False,121.381,8,26.346,5.005,0,0.000000,0.000000,-1,0.666667
9008,6366276983,29789747,secondary,50,True,33.976,8,40.362,13.430,0,0.000000,0.000000,-1,1.000000
9009,6400342990,6400342991,tertiary,50,False,13.361,9,31.353,15.362,0,0.000000,0.000000,-1,1.200000
9010,6469357849,700777138,secondary,50,True,23.033,7,21.917,13.241,0,0.000000,0.000000,-1,4.000000


In [62]:
modeldf = pd.get_dummies(edges)
print(modeldf.head())

modeldf = modeldf.drop(['speed_kph_stddev'], axis=1)

     source    target  maxspeed  oneway   length  hour_of_day  speed_kph_mean  \
0    172546    530313        50   False   29.733            9          45.276   
1    172546  28345850        50    True   64.155            9          14.532   
2    530313    172546        50   False   29.733            7          35.007   
3  28345850  35244866        50    True   19.106            8          35.756   
4  28345850  29492967        50    True  197.683            8          46.770   

   speed_kph_stddev  common_neighbor  jaccard_similarity  ...  \
0             6.040                0            0.000000  ...   
1            14.245                1            0.166667  ...   
2            11.342                0            0.000000  ...   
3            11.516                0            0.000000  ...   
4             5.457                0            0.000000  ...   

   highway_motorway_link  highway_primary  highway_primary_link  \
0                      0                0              

In [63]:
y = modeldf['speed_kph_mean']
X = modeldf.drop(['speed_kph_mean'], axis=1)
X.head()

Unnamed: 0,source,target,maxspeed,oneway,length,hour_of_day,common_neighbor,jaccard_similarity,adamic_adar_index,shortest_path,...,highway_motorway_link,highway_primary,highway_primary_link,highway_residential,highway_secondary,highway_secondary_link,highway_tertiary,highway_tertiary_link,highway_trunk_link,highway_unclassified
0,172546,530313,50,False,29.733,9,0,0.0,0.0,-1,...,0,0,0,0,1,0,0,0,0,0
1,172546,28345850,50,True,64.155,9,1,0.166667,0.721348,-1,...,0,0,0,0,1,0,0,0,0,0
2,530313,172546,50,False,29.733,7,0,0.0,0.0,-1,...,0,0,0,0,1,0,0,0,0,0
3,28345850,35244866,50,True,19.106,8,0,0.0,0.0,-1,...,0,0,0,0,1,0,0,0,0,0
4,28345850,29492967,50,True,197.683,8,0,0.0,0.0,-1,...,0,0,0,0,1,0,0,0,0,0


#### Split the dataframe into train, test and split

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13)

#### Fit a linear regression model

In [65]:
regr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = regr.predict(X_test)

In [66]:
# Mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

Mean squared error: 71.26


In [67]:
print(X_test.shape, y_test.shape, y_pred.shape)

(2253, 24) (2253,) (2253,)


#### Fit a random forest regression model

In [68]:
RFregressor = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(X_train, y_train)
y_pred = RFregressor.predict(X_test)

In [69]:
# Mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

Mean squared error: 64.29


#### Now we'll try dropping the two node columns (source, target) to see if the MSE improves

In [70]:
modeldf = pd.get_dummies(edges)
modeldf.head()
modeldf = modeldf.drop(['speed_kph_stddev', 'source', 'target'], axis=1)
y = modeldf['speed_kph_mean']
X = modeldf.drop(['speed_kph_mean'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13)

In [71]:
X.head()

Unnamed: 0,maxspeed,oneway,length,hour_of_day,common_neighbor,jaccard_similarity,adamic_adar_index,shortest_path,katz_score,"highway_['residential', 'living_street']",...,highway_motorway_link,highway_primary,highway_primary_link,highway_residential,highway_secondary,highway_secondary_link,highway_tertiary,highway_tertiary_link,highway_trunk_link,highway_unclassified
0,50,False,29.733,9,0,0.0,0.0,-1,0.885896,0,...,0,0,0,0,1,0,0,0,0,0
1,50,True,64.155,9,1,0.166667,0.721348,-1,0.354764,0,...,0,0,0,0,1,0,0,0,0,0
2,50,False,29.733,7,0,0.0,0.0,-1,0.885896,0,...,0,0,0,0,1,0,0,0,0,0
3,50,True,19.106,8,0,0.0,0.0,-1,-0.853327,0,...,0,0,0,0,1,0,0,0,0,0
4,50,True,197.683,8,0,0.0,0.0,-1,0.406717,0,...,0,0,0,0,1,0,0,0,0,0


#### Fit a linear regression model

In [72]:
regr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = regr.predict(X_test)

In [73]:
# Mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

Mean squared error: 71.84


#### Fit a random forest regression model

In [74]:
RFregressor = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(X_train, y_train)
y_pred = RFregressor.predict(X_test)

In [75]:
# Mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

Mean squared error: 71.52
