In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

with open("tomtom-traffic-data.json", 'r') as file:
  try:
    data = json.load(file)
  except json.JSONDecodeError:
    print("Cannot work here")


df_segment_results = pd.json_normalize(data['network']['segmentResults'],
                                        record_path='segmentTimeResults',
                                        meta=['segmentId', 'newSegmentId', 'speedLimit', 'frc', 'streetName', 'distance','latitude','longitude'],errors='ignore')

df_segment_results = df_segment_results[['harmonicAverageSpeed','standardDeviationSpeed','averageSpeed','averageTravelTime','speedLimit','streetName','distance','frc','latitude','longitude']]
# print(df_segment_results)

In [2]:
df_segment_results['distance'] = df_segment_results['distance'].astype(float)
df_segment_results['speedLimit'] = df_segment_results['speedLimit'].astype(float)
df_segment_results['frc'] = df_segment_results['frc'].astype(float)

In [3]:
days_distribution = [845] * 29 + [837] * 1 + [836]
dates = []

for day, record_count in enumerate(days_distribution, 1):
    for _ in range(record_count):
        dates.append(pd.Timestamp(f"2023-10-{day:02}"))

df_segment_results['date'] = dates

# To save the dataframe back to CSV or any other format, use:
# df.to_csv('filename.csv', index=False)

print(df_segment_results.head())

   harmonicAverageSpeed  standardDeviationSpeed  averageSpeed  \
0                 31.33                   10.12         35.08   
1                 18.57                    8.61         22.01   
2                 16.88                    6.11         19.05   
3                 12.41                    7.37         16.51   
4                 28.87                    7.61         31.01   

   averageTravelTime  speedLimit              streetName  distance  frc  \
0              11.93        50.0    Jalan Mohammad Saidi    103.80  4.0   
1              19.59        50.0      Jalan Ulujami Raya    101.07  3.0   
2              10.40        50.0  Jalan Peninggaran Raya     48.75  5.0   
3              10.25        50.0  Jalan Peninggaran Raya     35.32  5.0   
4              12.72        50.0    Jalan Mohammad Saidi    102.05  4.0   

  latitude longitude       date  
0      NaN       NaN 2023-10-01  
1      NaN       NaN 2023-10-01  
2      NaN       NaN 2023-10-01  
3      NaN       NaN 2

In [4]:
df_segment_results['date'] = pd.to_datetime(df_segment_results['date'])

# Create 'day_of_week' and 'is_weekend' columns
df_segment_results['day_of_week'] = df_segment_results['date'].dt.dayofweek
df_segment_results['is_weekend'] = df_segment_results['day_of_week'].isin([5,6]).astype(int)

# Set the 'date' column as the index and sort the DataFrame based on this index
df_segment_results = df_segment_results.set_index('date')
df_segment_results = df_segment_results.sort_index()

In [5]:
# Sorting by average speed and travel time
top_15_roads = df_segment_results.sort_values(by=['averageSpeed', 'averageTravelTime'], ascending=[False, True]).head(15)
worst_15_roads = df_segment_results.sort_values(by=['averageSpeed', 'averageTravelTime']).head(15)

In [6]:
train, test = train_test_split(df_segment_results, test_size=0.2, shuffle=False)
X_train = train[['harmonicAverageSpeed', 'standardDeviationSpeed', 'averageTravelTime', 'speedLimit', 'distance', 'frc', 'day_of_week', 'is_weekend']]
y_train = train['averageSpeed']
X_test = test[['harmonicAverageSpeed', 'standardDeviationSpeed', 'averageTravelTime', 'speedLimit', 'distance', 'frc', 'day_of_week', 'is_weekend']]
y_test = test['averageSpeed']
X_train['standardDeviationSpeed'].fillna(X_train['standardDeviationSpeed'].mean(), inplace=True)
# Training
model = LinearRegression()
model.fit(X_train, y_train)
# Prediction
predictions = model.predict(X_test)
# Evaluation
rmse = mean_squared_error(y_test, predictions, squared=False)
print(f"RMSE: {rmse}")

n_forecast = 30  # 30 days
X_forecast = X_test.iloc[-n_forecast:].copy()
y_forecast = model.predict(X_forecast)



RMSE: 2.007033777027117


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['standardDeviationSpeed'].fillna(X_train['standardDeviationSpeed'].mean(), inplace=True)


In [7]:

y_train_time = train['averageTravelTime']
y_test_time = test['averageTravelTime']
# Training for averageTravelTime
model_time = LinearRegression()
model_time.fit(X_train, y_train_time)

predictions_time = model_time.predict(X_test)
def predict_single_values(features, model_speed, model_time):
    speed = model_speed.predict(features)
    time = model_time.predict(features)
    return speed, time

# Example:
selected_road_features = X_test.iloc[1000].to_frame().T
predicted_speed, predicted_time = predict_single_values(selected_road_features, model, model_time)
print("Predicted Average Speed:", predicted_speed)
print("Predicted Average Travel Time:", predicted_time)


Predicted Average Speed: [41.81014759]
Predicted Average Travel Time: [1.39]


In [8]:
X_top_15 = top_15_test[['harmonicAverageSpeed', 'standardDeviationSpeed', 'averageTravelTime', 'speedLimit', 'distance', 'frc', 'day_of_week', 'is_weekend']]
y_top_15_true = top_15_test['averageSpeed']

# Forecast for top 15 roads
predictions_top_15 = model.predict(X_top_15)

# Saving the results in a DataFrame
forecast_top_15 = top_15_test[['streetName', 'latitude', 'longitude']].copy()
forecast_top_15['PredictedAverageSpeed'] = predictions_top_15


NameError: name 'top_15_test' is not defined

In [None]:
predictions_time = model_time.predict(X_test)

In [None]:
n_forecast = 7
X_forecast = X_test.iloc[-n_forecast:].copy()
y_forecast = model.predict(X_forecast)
plt.figure(figsize=(15, 6))
plt.plot(y_test.index, y_test.values, label='Actual Values')
plt.plot(y_test.index, predictions, label='Predicted Values')
forecast_dates = pd.date_range(y_test.index[-1], periods=n_forecast+1, closed='right')
plt.plot(forecast_dates, y_forecast, label='Forecast', linestyle='dashed')
plt.legend()
plt.title("Linear Regression Forecast")
plt.xlabel("Date")
plt.ylabel("Average Travel Time")
plt.tight_layout()
plt.grid(True)
plt.show()


In [None]:
from sklearn.ensemble import RandomForestRegressor

# One-hot encode streetName
df_segment_results_encoded = pd.get_dummies(df_segment_results, columns=['streetName'])


train, test = train_test_split(df_segment_results_encoded, test_size=0.2, shuffle=False)
X_train = train.drop('averageSpeed', axis=1)
y_train = train['averageSpeed']
X_test = test.drop('averageSpeed', axis=1)
y_test = test['averageSpeed']

# Handle missing values in training data
X_train['standardDeviationSpeed'].fillna(X_train['standardDeviationSpeed'].mean(), inplace=True)

# Ensure test data has the same columns as training data after one-hot encoding
missing_cols = set(X_train.columns) - set(X_test.columns)
for c in missing_cols:
    X_test[c] = 0
X_test = X_test[X_train.columns]  # reorder columns to match the training set

# Train Random Forest model
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

# Predict
y_pred_rf = model_rf.predict(X_test)

In [None]:
rmse = mean_squared_error(y_test, y_pred_rf, squared=False)
print(f"RMSE: {rmse}")

In [None]:
# Plotting
plt.figure(figsize=(15, 6))
plt.plot(y_test.index, y_test.values, label='Actual Values')
plt.plot(y_test.index, y_pred_rf, label='Predicted Values')
plt.legend()
plt.title("Random Forest Forecast")
plt.xlabel("Date")
plt.ylabel("Average Travel Time")
plt.tight_layout()
plt.grid(True)
plt.show()


In [None]:
# doesnt work well
from statsmodels.tsa.arima.model import ARIMA

time_series = df_segment_results['averageSpeed']
train_ts, test_ts = train_test_split(time_series, test_size=0.2, shuffle=False)


model_arima = ARIMA(train_ts, order=(5,1,0))
model_arima_fit = model_arima.fit()

# Predict
y_pred_arima = model_arima_fit.forecast(steps=len(test_ts))

# Forecast
y_forecast_arima = model_arima_fit.forecast(steps=len(test_ts)+n_forecast)[-n_forecast:]

# Plotting
plt.figure(figsize=(15, 6))
plt.plot(test_ts.index, test_ts.values, label='Actual Values')
plt.plot(test_ts.index, y_pred_arima, label='Predicted Values')
plt.plot(forecast_dates, y_forecast_arima, label='Forecast', linestyle='dashed')
plt.legend()
plt.title("ARIMA Forecast")
plt.xlabel("Date")
plt.ylabel("Average Travel Time")
plt.tight_layout()
plt.grid(True)
plt.show()
