In [150]:
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import sklearn
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [59]:
edges = pd.read_csv("berlin_digraph_edges.csv")
nodes = pd.read_csv("berlin_digraph_nodes.csv")

#### Rename the source and target columns to 'source' and 'target', respectively

In [60]:
edges = edges.rename(columns = {'Unnamed: 0':'source', 'Unnamed: 1':'target'})

In [61]:
edges.columns

Index(['source', 'target', 'osmid', 'name', 'highway', 'maxspeed', 'oneway',
       'length', 'geometry', 'hour_of_day', 'speed_kph_mean',
       'speed_kph_stddev', 'est_width'],
      dtype='object')

In [62]:
edges.shape

(9012, 13)

In [63]:
edges.head(1)

Unnamed: 0,source,target,osmid,name,highway,maxspeed,oneway,length,geometry,hour_of_day,speed_kph_mean,speed_kph_stddev,est_width
0,172546,530313,24295532,Emmentaler Straße,secondary,50,False,29.733,"LINESTRING (13.3522545 52.567738, 13.3526642 5...",9,45.276,6.04,


#### Drop 'osmid', 'name', 'geometry' columns

These columns are not predictive of mean speed

In [64]:
edges = edges[['source', 'target', 'highway', 'maxspeed', 'oneway', 'length', 'hour_of_day', 'speed_kph_mean', 'speed_kph_stddev', 'est_width']]
edges.head()

Unnamed: 0,source,target,highway,maxspeed,oneway,length,hour_of_day,speed_kph_mean,speed_kph_stddev,est_width
0,172546,530313,secondary,50,False,29.733,9,45.276,6.04,
1,172546,28345850,secondary,50,True,64.155,9,14.532,14.245,
2,530313,172546,secondary,50,False,29.733,7,35.007,11.342,
3,28345850,35244866,secondary,50,True,19.106,8,35.756,11.516,
4,28345850,29492967,secondary,50,True,197.683,8,46.77,5.457,


#### All but two values in the 'est_width' column are null, so drop that column too

In [68]:
edges['est_width'].isnull().sum()

9010

In [69]:
edges = edges[['source', 'target', 'highway', 'maxspeed', 'oneway', 'length', 'hour_of_day', 'speed_kph_mean', 'speed_kph_stddev']]
edges.head()

Unnamed: 0,source,target,highway,maxspeed,oneway,length,hour_of_day,speed_kph_mean,speed_kph_stddev
0,172546,530313,secondary,50,False,29.733,9,45.276,6.04
1,172546,28345850,secondary,50,True,64.155,9,14.532,14.245
2,530313,172546,secondary,50,False,29.733,7,35.007,11.342
3,28345850,35244866,secondary,50,True,19.106,8,35.756,11.516
4,28345850,29492967,secondary,50,True,197.683,8,46.77,5.457


In [91]:
edges.dtypes

source                int64
target                int64
highway              object
maxspeed             object
oneway                 bool
length              float64
hour_of_day           int64
speed_kph_mean      float64
speed_kph_stddev    float64
dtype: object

#### 250 rows have a null 'maxspeed' value

In [74]:
edges['maxspeed'].isnull().sum()

250

In [80]:
edges['maxspeed'].value_counts()

50              6120
30              2444
60               101
10                43
80                32
20                18
['50', '30']       2
70                 1
40                 1
Name: maxspeed, dtype: int64

#### Replace nulls with the mode 'maxspeed' value

In [82]:
edges['maxspeed'].fillna(edges['maxspeed'].mode()[0], inplace=True)

In [88]:
edges['maxspeed'].isnull().sum()

0

In [93]:
edges.dtypes

source                int64
target                int64
highway              object
maxspeed             object
oneway                 bool
length              float64
hour_of_day           int64
speed_kph_mean      float64
speed_kph_stddev    float64
dtype: object

In [109]:
edges['maxspeed'].value_counts()

50              6370
30              2444
60               101
10                43
80                32
20                18
['50', '30']       2
70                 1
40                 1
Name: maxspeed, dtype: int64

#### Get rid of the two annoying '['50', '30']' values and replace them with '50'

In [116]:
edges['maxspeed'] = edges['maxspeed'].replace("['50', '30']", '50')

In [117]:
edges['maxspeed'].value_counts()

50    6372
30    2444
60     101
10      43
80      32
20      18
70       1
40       1
Name: maxspeed, dtype: int64

#### Convert the 'maxspeed' column from categorical to integer

In [118]:
edges['maxspeed'] = edges['maxspeed'].astype(str).astype(int)

In [119]:
edges.dtypes

source                int64
target                int64
highway              object
maxspeed              int32
oneway                 bool
length              float64
hour_of_day           int64
speed_kph_mean      float64
speed_kph_stddev    float64
dtype: object

#### One-hot encode the 'highway' categorical column

In [125]:
modeldf = pd.get_dummies(edges)
modeldf.head()

Unnamed: 0,source,target,maxspeed,oneway,length,hour_of_day,speed_kph_mean,speed_kph_stddev,"highway_['residential', 'living_street']",highway_living_street,...,highway_motorway_link,highway_primary,highway_primary_link,highway_residential,highway_secondary,highway_secondary_link,highway_tertiary,highway_tertiary_link,highway_trunk_link,highway_unclassified
0,172546,530313,50,False,29.733,9,45.276,6.04,0,0,...,0,0,0,0,1,0,0,0,0,0
1,172546,28345850,50,True,64.155,9,14.532,14.245,0,0,...,0,0,0,0,1,0,0,0,0,0
2,530313,172546,50,False,29.733,7,35.007,11.342,0,0,...,0,0,0,0,1,0,0,0,0,0
3,28345850,35244866,50,True,19.106,8,35.756,11.516,0,0,...,0,0,0,0,1,0,0,0,0,0
4,28345850,29492967,50,True,197.683,8,46.77,5.457,0,0,...,0,0,0,0,1,0,0,0,0,0


In [126]:
modeldf.dtypes

source                                        int64
target                                        int64
maxspeed                                      int32
oneway                                         bool
length                                      float64
hour_of_day                                   int64
speed_kph_mean                              float64
speed_kph_stddev                            float64
highway_['residential', 'living_street']      uint8
highway_living_street                         uint8
highway_motorway                              uint8
highway_motorway_link                         uint8
highway_primary                               uint8
highway_primary_link                          uint8
highway_residential                           uint8
highway_secondary                             uint8
highway_secondary_link                        uint8
highway_tertiary                              uint8
highway_tertiary_link                         uint8
highway_trun

#### Drop the 'speed_kph_stddev' column because it's a function of 'speed_kph_mean'

In [135]:
modeldf = modeldf.drop(['speed_kph_stddev'], axis=1)

In [139]:
modeldf.dtypes

source                                        int64
target                                        int64
maxspeed                                      int32
oneway                                         bool
length                                      float64
hour_of_day                                   int64
speed_kph_mean                              float64
highway_['residential', 'living_street']      uint8
highway_living_street                         uint8
highway_motorway                              uint8
highway_motorway_link                         uint8
highway_primary                               uint8
highway_primary_link                          uint8
highway_residential                           uint8
highway_secondary                             uint8
highway_secondary_link                        uint8
highway_tertiary                              uint8
highway_tertiary_link                         uint8
highway_trunk_link                            uint8
highway_uncl

In [136]:
y = modeldf['speed_kph_mean']

In [137]:
X = modeldf.drop(['speed_kph_mean'], axis=1)

In [138]:
X.head()

Unnamed: 0,source,target,maxspeed,oneway,length,hour_of_day,"highway_['residential', 'living_street']",highway_living_street,highway_motorway,highway_motorway_link,highway_primary,highway_primary_link,highway_residential,highway_secondary,highway_secondary_link,highway_tertiary,highway_tertiary_link,highway_trunk_link,highway_unclassified
0,172546,530313,50,False,29.733,9,0,0,0,0,0,0,0,1,0,0,0,0,0
1,172546,28345850,50,True,64.155,9,0,0,0,0,0,0,0,1,0,0,0,0,0
2,530313,172546,50,False,29.733,7,0,0,0,0,0,0,0,1,0,0,0,0,0
3,28345850,35244866,50,True,19.106,8,0,0,0,0,0,0,0,1,0,0,0,0,0
4,28345850,29492967,50,True,197.683,8,0,0,0,0,0,0,0,1,0,0,0,0,0


#### Split the dataframe into train, test and split

In [132]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13)

#### Fit a linear regression model

In [156]:
regr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = regr.predict(X_test)

In [161]:
# The coefficients
# carried over from a homework notebook; not super crucial
print('Coefficients: \n', regr.coef_)

Coefficients: 
 [ 1.36104243e-10  4.64771140e-10  2.26043472e-01  4.31497673e-01
  2.13138847e-02 -7.66700777e-01 -9.52310457e-01 -8.99561892e+00
 -1.60656737e+01  2.98988921e+01  6.71840545e+00  6.31328930e+00
 -3.49757674e+00 -5.93640421e+00  3.66655091e+00 -4.27897308e+00
  2.08222534e-01 -6.27859128e+00  0.00000000e+00 -1.75252235e+00]


In [158]:
# Mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

Mean squared error: 63.34


#### Can't get the plot to work, but not super important

In [None]:
# Plot outputs
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()

In [154]:
print(X_test.shape, y_test.shape, y_pred.shape)

(2253, 20) (2253,) (2253,)


#### Fit a random forest regression model

In [159]:
RFregressor = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(X_train, y_train)
y_pred = RFregressor.predict(X_test)

In [160]:
# Mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

Mean squared error: 53.87


## Now I'll try dropping the two node columns (source, target) to see if the MSE improves

In [162]:
X = modeldf.drop(['source', 'target'], axis=1)

In [163]:
X.head()

Unnamed: 0,maxspeed,oneway,length,hour_of_day,speed_kph_mean,"highway_['residential', 'living_street']",highway_living_street,highway_motorway,highway_motorway_link,highway_primary,highway_primary_link,highway_residential,highway_secondary,highway_secondary_link,highway_tertiary,highway_tertiary_link,highway_trunk_link,highway_unclassified
0,50,False,29.733,9,45.276,0,0,0,0,0,0,0,1,0,0,0,0,0
1,50,True,64.155,9,14.532,0,0,0,0,0,0,0,1,0,0,0,0,0
2,50,False,29.733,7,35.007,0,0,0,0,0,0,0,1,0,0,0,0,0
3,50,True,19.106,8,35.756,0,0,0,0,0,0,0,1,0,0,0,0,0
4,50,True,197.683,8,46.77,0,0,0,0,0,0,0,1,0,0,0,0,0


In [164]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13)

#### Fit a linear regression model

In [165]:
regr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = regr.predict(X_test)

In [166]:
# The coefficients
# carried over from a homework notebook; not super crucial
print('Coefficients: \n', regr.coef_)

Coefficients: 
 [ 3.74027809e-16 -3.02457505e-15  1.26841636e-17  1.84966072e-16
  1.00000000e+00 -3.88467772e-15  1.72676830e-16 -3.00752217e-15
  3.73718181e-16  8.35713308e-16  6.61644201e-16  8.18320028e-16
  7.02963990e-16  9.73015334e-16  8.56043900e-16  1.06105006e-15
  0.00000000e+00  4.32298607e-16]


In [167]:
# Mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

Mean squared error: 0.00


#### Can't get the plot to work, but not super important

In [None]:
# Plot outputs
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()

In [154]:
print(X_test.shape, y_test.shape, y_pred.shape)

(2253, 20) (2253,) (2253,)


#### Fit a random forest regression model

In [168]:
RFregressor = RandomForestRegressor(n_estimators = 100, random_state = 0).fit(X_train, y_train)
y_pred = RFregressor.predict(X_test)

In [169]:
# Mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

Mean squared error: 0.00
