## Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import csv
import datetime

## Feature Selection 

In [2]:
# Read the cleaned training and test sets from the working directory.
train, test = (pd.read_csv(path) for path in ("train_clean.csv", "test_clean.csv"))


In [3]:
# Show (rows, columns) of both frames, one per line.
print(train.shape, test.shape, sep="\n")

(343583, 21)
(21018, 19)


In [4]:
# Count missing values per column of the test frame.
test.isna().sum()

site                 0
continent_id         0
buyer_country        0
buyer_region         0
buyer_city           0
distance             0
buyer_id             0
mobile               0
package              0
channel_id           0
adults               0
children             0
room                 0
destination_id       0
destination_type     0
regency_continent    0
regency_country      0
regency_market       0
Number of days       0
dtype: int64

In [5]:
## keep the buyer_id columns aside before they are dropped from the frames
buyer_train, buyer_test = train['buyer_id'], test['buyer_id']

In [6]:
## dropping buyer_id (and regency_cluster for train) from the main datasets.
## The frames are rebound instead of mutated with inplace=True, and
## errors='ignore' lets this cell be re-run on frames that were already trimmed
## without raising KeyError.
train = train.drop(columns=['buyer_id', 'regency_cluster'], errors='ignore')
test = test.drop(columns=['buyer_id'], errors='ignore')

In [7]:
# Preview the first five rows of the trimmed training frame.
train.head(n=5)

Unnamed: 0,site,continent_id,buyer_country,buyer_region,buyer_city,distance,mobile,package,channel_id,adults,children,room,destination_id,destination_type,regency_continent,regency_country,regency_market,cnt,Number of days
0,2,3,66,348,48862,2234.2641,0,1,9,2,0,1,8250,1,3,2,50,628,4
1,2,3,66,348,48862,2234.2641,0,1,9,2,0,1,8250,1,1,2,50,628,4
2,2,3,66,348,48862,2234.2641,0,0,9,2,0,1,8250,1,1,2,50,628,4
3,2,3,66,442,35390,913.1932,0,0,3,2,0,1,14984,1,1,2,50,1457,5
4,2,3,66,442,35390,913.6259,0,0,3,2,0,1,14984,1,1,2,50,1457,5


In [8]:
# Independent snapshot of the training frame as it stands at this point.
main_df = train.copy(deep=True)

In [9]:
# Separate the regression target `cnt` from the predictor columns.
df_target = train['cnt']
df_feature = train.drop(columns=['cnt'])

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is seeded (same seed as
# the forest) so the reported metrics can be reproduced on a fresh kernel.
x_train, x_test, y_train, y_test = train_test_split(
    df_feature, df_target, test_size=0.2, random_state=42
)

In [15]:
##evaluate model
def evaluate(model, test_features, test_labels):
    """Print and return an accuracy-style score for a fitted regressor.

    The score is ``100 - 100 * mean(|prediction - label|) / mean(label)``,
    i.e. the mean absolute error expressed as a percentage of the mean label.
    Despite the ``mape`` variable name this is not the per-sample MAPE.

    Parameters
    ----------
    model : object exposing ``predict(test_features)``.
    test_features : inputs handed unchanged to ``model.predict``.
    test_labels : array-like of true target values.

    Returns
    -------
    float
        ``100 - mape``; the same value is printed as "Accuracy".
    """
    predictions = model.predict(test_features)
    errors = np.abs(predictions - test_labels)
    # The per-sample ratio errors / test_labels used to be computed here but
    # was never read; it only triggered divide-by-zero warnings for labels
    # equal to 0, so it has been removed.
    mape = 100 * (np.mean(errors) / np.mean(test_labels))
    print(mape)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

# NOTE: `regressor` is created and fitted in the "Applying Algorithm" cell
# further down, so that cell has to be executed before this one.
regressor_accuracy = evaluate(
    model=regressor,
    test_features=x_test,
    test_labels=y_test,
)

6.62074509719223
Model Performance
Average Error: 36.7942 degrees.
Accuracy = 93.38%.


## Applying Algorithm


In [16]:
from sklearn.ensemble import RandomForestRegressor

# Baseline model: a 10-tree random forest with a fixed seed.
regressor = RandomForestRegressor(
    n_estimators=10,
    random_state=42,
)

# Train on the 80% split; the fitted estimator is the cell's displayed value.
regressor.fit(X=x_train, y=y_train)

RandomForestRegressor(n_estimators=10, random_state=42)

In [17]:
# Predict `cnt` for every row of the test frame.
cnt = regressor.predict(X=test)


In [20]:
# Truncate each prediction to an integer. int() is applied directly rather
# than through formatNumber, which is only defined in a later cell and would
# raise NameError on a top-to-bottom run.
cnt_new = [int(value) for value in cnt]

In [21]:
## attach the integer predictions to the test dataset as the `cnt` column
test = test.assign(cnt=cnt_new)

In [22]:
##save the dataframe with predictions as csv
## It is written to its own file: overwriting 'test_clean.csv' (this notebook's
## input) would add `cnt` and a row-index column to it and break the next run.
## index=False keeps the row index out of the file.
test.to_csv('test_predictions.csv', index=False)

In [None]:
#saving base model as a pickle file
import pickle

filename = 'RFbaseCnt.pkl'
# The context manager closes the file handle even if pickling fails; the
# original left the handle returned by open() unclosed.
with open(filename, 'wb') as model_file:
    pickle.dump(regressor, model_file)

In [19]:
def formatNumber(num):
    """Return ``num`` converted with ``int()`` (any fractional part is discarded)."""
    as_integer = int(num)
    return as_integer
    

## Hyperparameter tuning

In [None]:
import pprint as pp

from scipy.stats import randint, truncnorm, uniform
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest: 10 evenly spaced values in [200, 1000],
# truncated to integers.
n_estimators = np.linspace(start=200, stop=1000, num=10).astype(int).tolist()
# Maximum number of levels in tree: 10 evenly spaced values in [10, 40],
# truncated to integers.
max_depth = np.linspace(10, 40, num=10).astype(int).tolist()

# Create the random grid: two candidate lists plus two scipy distributions.
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=truncnorm(a=0, b=1, loc=0.25, scale=0.1),
    min_samples_split=uniform(0.01, 0.199),
)
pp.pprint(random_grid)

## Random Search Training

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune. The target `cnt` is modelled with a
# RandomForestRegressor in the baseline cell, so the search tunes a regressor
# too; a classifier would treat every distinct count as a separate class.
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(x_train, y_train)


In [None]:
# Display the winning set of hyperparameters found by the random search
# (the attribute is read from the fitted rf_random object).
rf_random.best_params_