In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV

In [2]:
df_train_set = pd.read_csv('train_aggregated.csv', low_memory=False)

In [3]:
df_train_set.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets
0,1442,17-10-17,7:15,Migori,Bus,49,1.0
1,5437,19-11-17,7:12,Migori,Bus,49,1.0
2,5710,26-11-17,7:05,Keroka,Bus,49,1.0
3,5777,27-11-17,7:10,Homa Bay,Bus,49,5.0
4,5778,27-11-17,7:12,Migori,Bus,49,31.0


In [4]:
df_train_set.drop(['ride_id'], axis=1, inplace=True) #ride_id is unnecessary in training set

In [5]:
df_train_set["travel_date"] = pd.to_datetime(df_train_set["travel_date"],infer_datetime_format=True)
df_train_set["travel_day"] = df_train_set["travel_date"].dt.dayofweek #change the full date to day of week
df_train_set["travel_week"] = df_train_set["travel_date"].dt.week
df_train_set.drop(['travel_date'], axis=1, inplace=True) 

In [6]:
df_train_set["car_type"] = pd.Categorical(df_train_set["car_type"])
car_type_categories = df_train_set.car_type.cat.categories
df_train_set["car_type"] = df_train_set.car_type.cat.codes

In [7]:
df_train_set["travel_from"] = pd.Categorical(df_train_set["travel_from"])
travel_from_categories = df_train_set.travel_from.cat.categories
df_train_set["travel_from"] = df_train_set.travel_from.cat.codes

In [8]:
#express travel time in minutes
df_train_set["travel_time"] = df_train_set["travel_time"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))

In [9]:
df_train_set.head()

Unnamed: 0,travel_time,travel_from,car_type,max_capacity,number_of_tickets,travel_day,travel_week
0,435,9,0,49,1.0,1,42
1,432,9,0,49,1.0,6,46
2,425,4,0,49,1.0,6,47
3,430,1,0,49,5.0,0,48
4,432,9,0,49,31.0,0,48


In [10]:
X = df_train_set.drop(["number_of_tickets"], axis=1)
y = df_train_set.number_of_tickets

In [11]:
loss = ['ls', 'lad', 'huber']

# Number of trees used in the boosting process
n_estimators = [100, 500, 900, 1100, 1500]

# Maximum depth of each tree
max_depth = [2, 3, 5, 10, 15]

# Minimum number of samples per leaf
min_samples_leaf = [1, 2, 4, 6, 8]

# Minimum number of samples to split a node
min_samples_split = [2, 4, 6, 10]

# Maximum number of features to consider for making splits
max_features = ['auto', 'sqrt', 'log2', None]

# Define the grid of hyperparameters to search
hyperparameter_grid = {'loss': loss,
                       'n_estimators': n_estimators,
                       'max_depth': max_depth,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

In [12]:
model = GradientBoostingRegressor(random_state=10, criterion="mae")
random_cv = RandomizedSearchCV(estimator=model,
                               param_distributions=hyperparameter_grid,
                               cv=4, n_iter=13, 
                               scoring = 'neg_mean_absolute_error',
                               n_jobs = -1, verbose = 1, 
                               random_state=10)
#model = RandomForestRegressor(n_estimators=100, criterion="mae", n_jobs=-1)
#model = RandomForestRegressor(criterion="mae", oob_score=True)
#model.score(X,y)

In [None]:
#import pickle
#pickle.dump(random_cvv, open('traffic_model.pkl', 'wb'))

In [13]:
#random_cv = pickle.load(open('traffic_model.pkl','rb')) 
#random_cv.fit(X,y)



AttributeError: Can't get attribute 'DeprecationDict' on <module 'sklearn.utils.deprecation' from 'C:\\Program Files (x86)\\Microsoft Visual Studio\\Shared\\Anaconda3_64\\lib\\site-packages\\sklearn\\utils\\deprecation.py'>

In [None]:
preds_train_set = random_cv.predict(X)

In [None]:
print (mean_absolute_error(preds_train_set,y))

# Predictions for test set

In [None]:
df_test_set = pd.read_csv('test_questions.csv', low_memory=False)

Let's first format the data as we did for the training set.

In [None]:
df_test_set.drop(['travel_to'], axis=1, inplace=True)

In [None]:
df_test_set["travel_date"] = pd.to_datetime(df_test_set["travel_date"],infer_datetime_format=True)
df_test_set["travel_day"] = df_test_set["travel_date"].dt.dayofweek 
df_test_set["travel_week"] = df_test_set["travel_date"].dt.week
df_test_set.drop(['travel_date'], axis=1, inplace=True) 

In [None]:
df_test_set["car_type"] = pd.Categorical(df_test_set["car_type"], categories=car_type_categories)
df_test_set["car_type"] = df_test_set.car_type.cat.codes

In [None]:
df_test_set["travel_from"] = pd.Categorical(df_test_set["travel_from"], categories=travel_from_categories)
df_test_set["travel_from"] = df_test_set.travel_from.cat.codes

In [None]:
df_test_set["travel_time"] = df_test_set["travel_time"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))

Now let's calculate predictions using the random forest model we trained.

In [None]:
X_test = df_test_set.drop(['ride_id'], axis=1)
test_set_predictions = random_cv.predict(X_test)

And finally let's create a csv file with predictions. 

In [None]:
d = {'ride_id': df_test_set["ride_id"], 'number_of_ticket': test_set_predictions}
df_predictions = pd.DataFrame(data=d)
df_predictions = df_predictions[['ride_id','number_of_ticket']]

In [None]:
df_predictions.head()

In [None]:
df_predictions.to_csv('prediction_set.csv', index=False) #save to csv file