In [56]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, mean_absolute_error
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

In [57]:
# Load input data: one comma-separated record per line of the text file
input_file = 'traffic_data.txt'
data = []
with open(input_file, 'r') as f:
    # Iterate over the file object directly; there is no need to read every
    # line into memory first.
    for line in f:
        # Remove only the line terminator. Slicing off the last character
        # unconditionally would truncate the final record whenever the file
        # does not end with a newline.
        line = line.rstrip('\n')
        if not line:
            # Ignore blank lines so they do not turn into one-field rows
            continue
        items = line.split(',')
        data.append(items)

In [58]:
# Turn the list of rows into a 2-D NumPy array of strings so that whole
# columns can be selected with data[:, i]
data = np.array(data)

In [59]:
# Peek at the second column of the raw data (values are still strings here)
data[:, 1]

array(['00:00', '00:05', '00:10', ..., '23:45', '23:50', '23:55'],
      dtype='<U10')

In [60]:
# Build a numeric copy of the data, column by column.
# A column whose first-row value is all digits is copied as-is; any other
# column is label-encoded, and its fitted encoder is kept (in column order)
# in label_encoder so the same mapping can be applied to new samples later.
label_encoder = []
X_encoded = np.empty(data.shape)
for col_idx, first_value in enumerate(data[0]):
    column = data[:, col_idx]
    if not first_value.isdigit():
        encoder = preprocessing.LabelEncoder()
        label_encoder.append(encoder)
        column = encoder.fit_transform(column)
    X_encoded[:, col_idx] = column

In [61]:
# Inspect the encoded matrix (every column is numeric at this point)
X_encoded

array([[  5.,   0.,   1.,   0.,  39.],
       [  5.,   1.,   1.,   0.,  89.],
       [  5.,   2.,   1.,   0., 109.],
       ...,
       [  4., 285.,   7.,   0., 117.],
       [  4., 286.,   7.,   0., 147.],
       [  4., 287.,   7.,   0., 177.]])

In [62]:
# Every column except the last is a feature; the last column is the target
X, y = X_encoded[:, :-1].astype(int), X_encoded[:, -1].astype(int)

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=5, test_size=0.25)

In [63]:
# Extremely randomized trees regressor: 100 shallow trees (depth <= 4),
# seeded so that repeated runs build the same ensemble
params = dict(n_estimators=100, max_depth=4, random_state=0)
regressor = ExtraTreesRegressor(**params)
regressor.fit(X_train, y_train)

In [64]:
# Score the fitted regressor on the held-out rows using mean absolute error
y_pred = regressor.predict(X_test)
test_mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute error:", round(test_mae, 2))

Mean absolute error: 74.95


In [65]:
# Encode a single data instance with the encoders fitted on the full dataset
test_datapoint = ['Saturday', '10:20', 'Sumy', 'no']
test_datapoint_encoded = [-1] * len(test_datapoint)
# label_encoder stores one encoder per non-numeric column, in column order.
# It therefore has to be indexed by the number of non-numeric fields seen so
# far (count), not by the field position i: the two diverge as soon as a
# numeric column comes before a text column.
count = 0
for i, item in enumerate(test_datapoint):
    if item.isdigit():
        # Numeric fields need no encoder; just convert the text to an int
        test_datapoint_encoded[i] = int(item)
    else:
        # transform() returns a one-element array; pick element 0 explicitly,
        # because calling int() on an array with ndim > 0 is deprecated in
        # recent NumPy releases.
        test_datapoint_encoded[i] = int(label_encoder[count].transform([item])[0])
        count = count + 1

In [66]:
test_datapoint_encoded = np.array(test_datapoint_encoded)

# The regressor expects a 2-D (samples x features) input, so present the
# single encoded datapoint as one row before predicting
predicted = regressor.predict(test_datapoint_encoded.reshape(1, -1))
print("Predicted traffic:", int(predicted[0]))

Predicted traffic: 259
