In [177]:
import numpy as np
import pandas as pd
import xgboost as xgb
import joblib

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

In [178]:
#import data
df = pd.read_csv('cleaned_data.csv',index_col="Unnamed: 0")

In [5]:
#columns we want to keep for our model
features = ["Temperature(F)", "Visibility(mi)", "Wind_Speed(mph)",
"Precipitation(in)", "weather", "Junction", "Stop", "Traffic_Signal",
"Sunrise_Sunset", "Day", "Hour", "Station", "Give_Way", "Crossing", "Railway"]

val_to_predict = ["duration(min)"]

In [6]:
#encode categoricals for weather, junction, stop, traffic_signal, sunrise_sunset, day, hour
categorical = ["weather","Junction","Stop","Traffic_Signal","Sunrise_Sunset","Day","Hour","Station", "Give_Way", "Crossing", "Railway"]

In [7]:
X = df[features]
Y = df[val_to_predict]

In [8]:
X = X.join(pd.get_dummies(df[categorical]),lsuffix="ohe_")
X = X.drop(labels=categorical, axis=1)

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [10]:
reg = GradientBoostingRegressor(random_state=42)

In [11]:
reg.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


In [12]:
Y_pred = reg.predict(X_test)

In [13]:
reg.score(X_test, Y_test)

0.07761198328224106

In [14]:
reg.score(X_train, Y_train)

0.07744840234713746

### Changing to classification
#### The goal of this model is to be able to predict when an accident will be clear, in an application such as computing optimal route a couple hours in the future. 
#### We don't need to know the exact time it will take, so instead, we can bin the duration in a few relevant categories and see if performance improves.

In [15]:
#create new ETA column based off of duration(min) val
#15mins or less = 0
#15-30mins = 1
#30mins-1hr = 2
#1-3hr = 3
#3hr-6hr = 4
#rest of day = 5
df['ETA'] = 0
df.loc[df['duration(min)'] <= 15, 'ETA'] = 0 
df.loc[(df['duration(min)'] <= 30) & (df["duration(min)"] > 15), 'ETA'] = 1
df.loc[(df['duration(min)'] <= 60) & (df["duration(min)"] > 30), 'ETA'] = 2
df.loc[(df['duration(min)'] <= 180) & (df["duration(min)"] > 60), 'ETA'] = 3
df.loc[(df['duration(min)'] <= 360) & (df["duration(min)"] > 180), 'ETA'] = 4
df.loc[df['duration(min)'] > 360, 'ETA'] = 5

In [17]:
#functionize encoding
def prep_data(df):
    cat_vals = ["weather", "Junction", "Stop", "Traffic_Signal", "Sunrise_Sunset", "Day", "Hour", "Station", "Give_Way", "Crossing", "Railway"]
    con_vals = ["Visibility(mi)", "Precipitation(in)", "Temperature(F)", "Wind_Speed(mph)"]
    x = df[cat_vals+con_vals]
    y = df['ETA']
    x = x.join(pd.get_dummies(df[cat_vals]),lsuffix="_ohe")
    x = x.drop(labels=cat_vals, axis=1)
    y = np.array(y).ravel()
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)
    
    return x_train, x_test, y_train, y_test

In [18]:
x_train, x_test, y_train, y_test = prep_data(df)

In [19]:
#set XGBoost params, objects
train = xgb.DMatrix(data=x_train, label=y_train)
test = xgb.DMatrix(data=x_test, label=y_test)
params = {
    "eta" : 0.05,
    "objective" : "multi:softprob",
    "num_class" : 6,
    "max_depth" : 15,
    }
epochs = 50

In [20]:
model = xgb.train(params, train, epochs)

In [21]:
y_pred = model.predict(test)

In [22]:
accuracy_score(y_test, np.argmax(y_pred,axis=1))

0.5918028294090278

In [23]:
pd.Series(model.get_score()).sort_values(ascending=False)

Temperature(F)        548277.0
Wind_Speed(mph)       352448.0
Hour_ohe              225630.0
Visibility(mi)        185536.0
Precipitation(in)      98096.0
weather_cloudy         45865.0
Day_Thursday           38526.0
Day_Friday             38249.0
Day_Monday             37062.0
Day_Wednesday          36857.0
Day_Tuesday            35804.0
Crossing_ohe           34481.0
weather_fair           34104.0
Day_Saturday           30417.0
Day_Sunday             28783.0
Station_ohe            20295.0
weather_rain           19069.0
Traffic_Signal_ohe     18332.0
Junction_ohe           17816.0
Stop_ohe               13916.0
Railway_ohe            13282.0
weather_fog            11093.0
Sunrise_Sunset_Day      8969.0
weather_storm           6887.0
weather_snow            5288.0
Give_Way_ohe            4983.0
weather_smoke           4119.0
weather_windy           3883.0
weather_dust             405.0
weather_hail               7.0
dtype: float64

##### This seems much better, and we can see our feature importance! Visibility, Hour of the day, and Precipitation seem to be our most important features.
##### Below, I use gridsearchCV to find best hyperparameters, then export the model.

In [24]:
estimator = xgb.XGBClassifier(objective="multi:softprob",
                             seed=42)
params = {
    "max_depth" : range(2, 20, 4),
    "n_estimators" : range(50, 250, 50),
    "learning_rate" : [0.2, 0.1, 0.05]
}

In [25]:
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    n_jobs = -1,
    cv = 2,
    verbose=3
)

In [26]:
grid_search.fit(x_train, y_train)

Fitting 2 folds for each of 60 candidates, totalling 120 fits
[CV 1/2] END learning_rate=0.2, max_depth=6, n_estimators=100;, score=0.592 total time=264.7min
[CV 2/2] END learning_rate=0.2, max_depth=14, n_estimators=100;, score=0.598 total time=207.1min
[CV 2/2] END learning_rate=0.1, max_depth=2, n_estimators=50;, score=0.565 total time= 8.3min
[CV 2/2] END learning_rate=0.1, max_depth=2, n_estimators=100;, score=0.569 total time=16.7min
[CV 2/2] END learning_rate=0.1, max_depth=6, n_estimators=50;, score=0.584 total time=24.9min
[CV 2/2] END learning_rate=0.1, max_depth=6, n_estimators=200;, score=0.592 total time=101.7min
[CV 1/2] END learning_rate=0.1, max_depth=14, n_estimators=150;, score=0.598 total time=209.5min
[CV 1/2] END learning_rate=0.05, max_depth=10, n_estimators=50;, score=0.585 total time=44.5min
[CV 1/2] END learning_rate=0.05, max_depth=10, n_estimators=200;, score=0.594 total time=170.5min
[CV 1/2] END learning_rate=0.2, max_depth=6, n_estimators=200;, score=0.593

In [27]:
grid_search.best_params_

{'learning_rate': 0.2, 'max_depth': 14, 'n_estimators': 200}

In [28]:
grid_search.best_score_

0.600199278293851

In [29]:
y_pred = grid_search.predict(x_test)

In [30]:
accuracy_score(y_test, y_pred)

0.6056715650553751

In [34]:
grid_search.best_estimator_

In [38]:
#export model
joblib.dump(grid_search.best_estimator_, 'classifier.pkl', compress = 1)

['classifier.pkl']