# Random Forest Model Training

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict

In [2]:
# Read the data inb
df = pd.read_csv('data/the_data_we_work_with.csv')
tourism = df


# Scaling Pipeline

In [3]:
# Numerical Features
num_features = list(['total_female', 'total_male', 'night_mainland', 'night_zanzibar'])
print(num_features)

['total_female', 'total_male', 'night_mainland', 'night_zanzibar']


In [4]:
#Imputer Strategy is not needed as data was preprocessed (cleaned of nan)

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])

processor = ColumnTransformer([
    ('num', num_pipeline, num_features)
])

# One Hot Encoding Pipeline

In [5]:
y = tourism.total_cost
X = tourism.drop('total_cost', axis=1)

X = pd.get_dummies(X, drop_first=True)
X

Unnamed: 0,total_female,total_male,night_mainland,night_zanzibar,country_ANGOLA,country_ARGENTINA,country_AUSTRALIA,country_AUSTRIA,country_BELGIUM,country_BERMUDA,...,payment_mode_Credit Card,payment_mode_Other,payment_mode_Travellers Cheque,first_trip_tz_Yes,most_impressing_Friendly People,most_impressing_Good service,most_impressing_No comments,most_impressing_Satisfies and Hope Come Back,most_impressing_Wildlife,"most_impressing_Wonderful Country, Landscape, Nature"
0,1.0,1.0,13.0,0.0,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,1.0,0.0,14.0,7.0,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
2,0.0,1.0,1.0,31.0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1.0,1.0,11.0,0.0,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,False,False
4,1.0,0.0,7.0,4.0,False,False,False,False,False,False,...,False,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4754,0.0,1.0,2.0,0.0,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
4755,1.0,1.0,11.0,0.0,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,False,False
4756,1.0,0.0,3.0,7.0,False,False,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
4757,1.0,1.0,5.0,0.0,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False


# Train Test Split

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score
# Set random seed 
RSEED = 42

#Train-Test Split
X_train, X_test, y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=RSEED)


# Random Forest Model

In [7]:
# Random Forest Pipeline

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

pipe_random_forest = Pipeline([
    ('random_forest', RandomForestRegressor())
])

In [8]:
# Train the forest

#y_train_predicted = cross_val_predict(pipe_random_forest, X_test, y_test, cv=5)
random_forest = RandomForestRegressor()
random_forest.fit(X_train, y_train)



In [9]:

y_predicted = random_forest.predict(X_test)

In [22]:
def calc_mpe(y_true, y_predicted):
    mpe = np.mean(np.power((y_true-y_predicted)/y_true, 2))*100
    mpe = np.sqrt(mpe)
    return mpe

In [23]:
print("RMSE: " , np.round(np.power(mean_squared_error(y_test, y_predicted),1/2), 2)*0.00036)
print("MAE: " , np.round(mean_absolute_error(y_test, y_predicted), 2)*0.00036)
print("MPE: ", calc_mpe(y_test, y_predicted))
#print("RSquared: " , np.round(r2_score(y_test, y_predicted), 2))
#print("RSquared (adjusted): ", np.round(1 - ( 1 - r2_score(y_test, y_predicted) ) * ( len(y_test) - 1 ) / ( len(y_test) - X_test.shape[1] - 1 ), 2))

RMSE:  3479.8790628
MAE:  1932.4778904
MPE:  224.79054013939856
RSquared:  0.37
