In [1]:
#imports
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [2]:
# Load the cleaned dataset into the DataFrame `df`. The path is relative, so
# 'cleaned_data.csv' must sit in the notebook's working directory.
df = pd.read_csv('cleaned_data.csv')

In [3]:
# Inspect the column names to see which fields are available as features.
df.columns

Index(['Unnamed: 0', 'ID', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
       'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
       'Number', 'Street', 'Side', 'City', 'County', 'State', 'Zipcode',
       'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp',
       'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight', 'duration(min)', 'Time_of_Incident', 'Day',
       'Hour', 'weather', 'ETA', 'State_County', 'subtime'],
      dtype='object')

In [4]:
# Remove the "Unnamed: 0" column (presumably the row index written out when the
# CSV was saved -- confirm against how cleaned_data.csv was produced).
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [5]:
def prep_data(df, random_state=42):
    """Build the feature matrix and target, then split them into train/test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``cat_vals`` and ``con_vals`` below,
        plus the target column ``'ETA'``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so the split is identical on
        every run. Pass ``None`` to get a different shuffle on each call.

    Returns
    -------
    x_tr, x_te : pandas.DataFrame
        Training (80%) and test (20%) feature rows.
    y_tr, y_te : numpy.ndarray
        Matching 1-D target arrays taken from ``df['ETA']``.
    """
    cat_vals = ["Severity", "weather", "Junction", "Stop", "Traffic_Signal", "Sunrise_Sunset", "Day", "Hour", "Station", "Roundabout", "Give_Way", "Crossing", "Bump", "No_Exit", "Railway"]
    con_vals = ["Visibility(mi)", "Precipitation(in)"]

    # get_dummies expands object/string/category columns into indicator columns
    # and passes every other dtype through unchanged under its original name.
    # Joining the result straight onto the continuous columns yields the same
    # feature values as before, without the join/suffix/drop round trip that
    # left raw pass-through columns mislabelled with an "_ohe" suffix.
    encoded = pd.get_dummies(df[cat_vals])
    x = df[con_vals].join(encoded)

    # Flatten the target to a 1-D array.
    y = np.array(df['ETA']).ravel()

    # Seeded 80/20 split so downstream results are reproducible.
    x_tr, x_te, y_tr, y_te = train_test_split(
        x, y, train_size=0.8, random_state=random_state
    )

    return x_tr, x_te, y_tr, y_te

In [6]:
# Build the features/target from `df` and split them into train and test sets.
x_train, x_test, y_train, y_test = prep_data(df)

In [7]:
# Base XGBoost classifier handed to the grid search. The seed is supplied via
# `random_state`, the scikit-learn-style parameter documented for
# XGBClassifier, instead of the legacy `seed` keyword.
estimator = xgb.XGBClassifier(objective="multi:softmax",
                              random_state=42)

In [9]:
# Hyperparameter grid for the search:
# 8 tree depths x 4 ensemble sizes x 3 learning rates = 96 combinations.
parameters = dict(
    max_depth=range(2, 10),
    n_estimators=range(60, 220, 40),
    learning_rate=[0.1, 0.01, 0.05],
)

In [10]:
# Exhaustive search over `parameters` for `estimator`: 2-fold cross-validation,
# 10 parallel jobs, progress messages enabled. No `scoring` argument is given.
grid_search = GridSearchCV(estimator, parameters, cv=2, n_jobs=10, verbose=True)

In [11]:
# Run the cross-validated search over every parameter combination on the
# training split.
grid_search.fit(x_train, y_train)

Fitting 2 folds for each of 96 candidates, totalling 192 fits


In [12]:
# Show the parameter combination the search selected as best.
grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 180}

In [14]:
# Inspect the scorer the search used (no `scoring` argument was passed to
# GridSearchCV).
grid_search.scorer_

<function sklearn.metrics._scorer._passthrough_scorer(estimator, *args, **kwargs)>

In [15]:
# Display the selected parameter combination again.
grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 180}

In [20]:
# GridSearchCV fits clones of `estimator`, so the original object is never
# fitted and `estimator.feature_importances_` raises NotFittedError. The model
# refit with the best parameters (refit=True is the GridSearchCV default) is
# available as `grid_search.best_estimator_`, so read the importances there.
grid_search.best_estimator_.feature_importances_