In [45]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [46]:
orders = pd.read_csv("orders.csv")
nodes  = pd.read_csv("nodes.csv")
nodes = nodes[nodes["Id"].isin(orders.Id)]


In [47]:
#Fill NA with mean
nodes.speed.fillna(nodes.speed.mean(), inplace=True)

nodes.speed = nodes.speed / 3.6 # From km/h to m/s
nodes['expected_node_time'] = nodes.distance / nodes.speed

orders = orders.merge(nodes.groupby('Id').sum()['expected_node_time'], left_on='Id', right_index=True).rename({'expected_node_time': 'expected_time'}, axis=1)

In [48]:
orders.running_time = pd.to_datetime(orders.running_time)
orders.completed_time = pd.to_datetime(orders.completed_time)

In [50]:
orders.running_time = orders.running_time.dt.hour * 60 + orders.running_time.dt.minute
orders.completed_time = orders.completed_time.dt.hour * 60 + orders.completed_time.dt.minute

apply_rbf = lambda X: rbf_kernel(X, [[450]], gamma=0.00002) + rbf_kernel(X, [[1050]], gamma=0.000006)

In [51]:
orders = orders.sort_values('running_time')
orders.loc[(orders['running_time'] > 1300) & (orders['completed_time'] < 1000), 'completed_time'] = 1440 + orders.loc[(orders['running_time'] > 1300) & (orders['completed_time'] < 1000), 'completed_time']

In [52]:
# Using the information about latest 10 rides (at the time of a ride start)
# Comparing the true target and the expected target calculated using distance/speed from nodes file
# This will improve the prediction of a speed

latest10_list = []

for ind, row in orders.iterrows():    
    latest10 = orders.loc[orders.completed_time < row.running_time].tail(10).mean()
    latest10_value = (latest10.delta_time - latest10.expected_time) / latest10.delta_time + 1
    latest10_list.append(latest10_value)


latest10_list[:10] = [1] * 10
orders['latest10'] = latest10_list
orders['expected_time'] = orders.latest10 * orders.expected_time

In [53]:
df = pd.merge(orders, nodes, left_on='Id', right_on='Id')
df.drop(['completed_time', 'node_start', 'node_finish'], axis=1, inplace=True)

# One order has 0 distance, removed it from the set
df = df.loc[df['route_distance_km'] != 0]

In [54]:
df['node_part_time'] = df['expected_node_time'] / df['expected_time']
df['node_part_distance'] = df['distance'] / df['route_distance_km'] / 1000

In [55]:
df['current_time'] = df.groupby('Id').expected_node_time.transform(np.cumsum) / 60 + df.running_time

df['time_rbf'] = apply_rbf(df[['current_time']])

In [56]:
df['delta_time'] = df['delta_time'] * df['node_part_time']
df.set_index('Id', inplace=True)

In [57]:
X = df.drop(['expected_time', 'delta_time', 'running_time', 'current_time'], axis=1)
y = df['delta_time']

In [60]:
test_ind = np.random.choice(X.index.unique(), size=1000, replace=False)
X_test = X.loc[test_ind]
y_test = y.loc[test_ind]
X_train = X.loc[~X.index.isin(test_ind)]
y_train = y.loc[~X.index.isin(test_ind)]

In [62]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

In [64]:
y_pred_node = rf.predict(X_test)
mean_squared_error(y_pred, y_test, squared=False)

2.280415781564325

In [66]:
y_pred_node = pd.Series(y_pred_node, index=y_test.index)
y_pred_total = y_pred_node.groupby('Id').sum()
y_test_total = y_test.groupby('Id').sum()
mean_squared_error(y_pred_total, y_test_total, squared=False)

90.23626092334955