In [65]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

from xgboost import XGBRegressor

from pipelines import pipe, pipe2

In [126]:
train = pd.read_csv("..\Datasets\Train (1).csv")
train.head()

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,...,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost
0,tour_0,SWIZERLAND,45-64,Friends/Relatives,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Friends, relatives",Independent,...,No,No,No,No,13.0,0.0,Cash,No,Friendly People,674602.5
1,tour_10,UNITED KINGDOM,25-44,,1.0,0.0,Leisure and Holidays,Cultural tourism,others,Independent,...,No,No,No,No,14.0,7.0,Cash,Yes,"Wonderful Country, Landscape, Nature",3214906.5
2,tour_1000,UNITED KINGDOM,25-44,Alone,0.0,1.0,Visiting Friends and Relatives,Cultural tourism,"Friends, relatives",Independent,...,No,No,No,No,1.0,31.0,Cash,No,Excellent Experience,3315000.0
3,tour_1002,UNITED KINGDOM,25-44,Spouse,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,...,Yes,Yes,Yes,No,11.0,0.0,Cash,Yes,Friendly People,7790250.0
4,tour_1004,CHINA,1-24,,1.0,0.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Independent,...,No,No,No,No,7.0,4.0,Cash,Yes,No comments,1657500.0


In [127]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     4809 non-null   object 
 1   country                4809 non-null   object 
 2   age_group              4809 non-null   object 
 3   travel_with            3695 non-null   object 
 4   total_female           4806 non-null   float64
 5   total_male             4804 non-null   float64
 6   purpose                4809 non-null   object 
 7   main_activity          4809 non-null   object 
 8   info_source            4809 non-null   object 
 9   tour_arrangement       4809 non-null   object 
 10  package_transport_int  4809 non-null   object 
 11  package_accomodation   4809 non-null   object 
 12  package_food           4809 non-null   object 
 13  package_transport_tz   4809 non-null   object 
 14  package_sightseeing    4809 non-null   object 
 15  pack

In [128]:
train = train.drop(columns=['ID', 'payment_mode', 'most_impressing'], axis=1)

In [129]:
np.array(train[3:4])

array([['UNITED KINGDOM', '25-44', 'Spouse', 1.0, 1.0,
        'Leisure and Holidays', 'Wildlife tourism',
        'Travel, agent, tour operator', 'Package Tour', 'No', 'Yes',
        'Yes', 'Yes', 'Yes', 'Yes', 'No', 11.0, 0.0, 'Yes', 7790250.0]],
      dtype=object)

In [130]:
train.isna().sum()

country                     0
age_group                   0
travel_with              1114
total_female                3
total_male                  5
purpose                     0
main_activity               0
info_source                 0
tour_arrangement            0
package_transport_int       0
package_accomodation        0
package_food                0
package_transport_tz        0
package_sightseeing         0
package_guided_tour         0
package_insurance           0
night_mainland              0
night_zanzibar              0
first_trip_tz               0
total_cost                  0
dtype: int64

In [131]:
for col in train.select_dtypes('object', 'category').columns:
    train.loc[:, col] = train.loc[:, col].fillna("None").astype(str)

In [132]:
for col in train.select_dtypes(np.number):
    train.loc[:, col] = train.loc[:, col].fillna(np.median)

In [135]:
X = train.drop(columns=['total_cost'], axis=1)
y = train['total_cost']

In [140]:
cat = X.select_dtypes(exclude=np.number).columns.tolist()

for col in cat:
    X[col] = X[col].astype('category')

In [141]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=123)

In [142]:
import xgboost as xgb

d_train = xgb.DMatrix(X_train, y_train, enable_categorical=True)
d_test = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [148]:
params = {"objective": "reg:squarederror",
         "tree_method": "hist"}

In [160]:

model = xgb.train(params=params,
                 dtrain=d_train,
                 num_boost_round=20000,
                 evals=[(d_test, "validation"), (d_train, "train")],
                 verbose_eval=500,
                 early_stopping_rounds=200)

[0]	validation-rmse:11497821.12181	train-rmse:12227763.61270
[500]	validation-rmse:10292087.26038	train-rmse:1637075.48754
[1000]	validation-rmse:10524839.71099	train-rmse:1437556.17969
[1500]	validation-rmse:10568970.32621	train-rmse:1398676.01198
[2000]	validation-rmse:10596823.09370	train-rmse:1386607.17883
[2500]	validation-rmse:10609563.90257	train-rmse:1383842.73232
[3000]	validation-rmse:10616496.71792	train-rmse:1382752.12519
[3500]	validation-rmse:10620437.67913	train-rmse:1382343.38008
[4000]	validation-rmse:10623409.90743	train-rmse:1382192.67556
[4500]	validation-rmse:10624556.66071	train-rmse:1382123.34012
[5000]	validation-rmse:10625261.77146	train-rmse:1382095.94867
[5500]	validation-rmse:10625816.54227	train-rmse:1382084.86292
[6000]	validation-rmse:10626030.96193	train-rmse:1382079.50965
[6500]	validation-rmse:10626231.96180	train-rmse:1382077.17786
[7000]	validation-rmse:10626393.10791	train-rmse:1382075.87762
[7500]	validation-rmse:10626460.31167	train-rmse:1382075.2

In [161]:
y_pred = model.predict(d_test)

In [162]:
mae = mean_absolute_error(y_pred, y_test)
mae

5954822.636061093

In [163]:
mse = mean_squared_error(y_test, y_pred, squared=False)
mse

10626649.437103499

In [164]:
rmse = np.sqrt(mse)
rmse

2440.2505273149914

In [167]:
cross_val = xgb.cv(params=params,
                      dtrain=d_train,
                      num_boost_round=20000,
                      nfold=10,
                      verbose_eval=500,
                      early_stopping_rounds=200)

[0]	train-rmse:12209023.21834+130407.27950	test-rmse:12445028.01747+1378812.47257
[206]	train-rmse:2201390.36538+99307.54918	test-rmse:10957695.39394+967246.00518


In [168]:
cross_val.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,12209020.0,130407.279496,12445030.0,1378812.0
1,10544490.0,100143.537841,11133820.0,1401718.0
2,9444347.0,95076.50633,10452770.0,1355498.0
3,8681405.0,104840.388974,10084370.0,1318275.0
4,8119434.0,93130.216618,9923298.0,1291163.0


In [169]:
best_rmse = cross_val['test-rmse-mean'].min()

best_rmse

9841961.371428449

In [176]:
test = pd.read_csv('../Datasets/Test (1).csv')
test = test.drop(columns=['ID', 'most_impressing', 'payment_mode'], axis=1)

In [178]:
for col in test.select_dtypes('object', 'category'):
    test.loc[:, col] = test.loc[:, col].fillna("None").astype(str)
    
for col in test.select_dtypes(np.number):
    test.loc[:, col] = test.loc[:, col].fillna(np.median)
    
cats = test.select_dtypes(exclude=np.number).columns

for col in cats:
    test[col] = test[col].astype('category')

In [185]:
model.predict(xgb.DMatrix(test[:5], enable_categorical=True))

array([-5583209.5,  9406935. ,  9256287. , 10154423. , 24295304. ],
      dtype=float32)

In [186]:
test[:5]

Unnamed: 0,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,first_trip_tz
0,AUSTRALIA,45-64,Spouse,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,10,3,Yes
1,SOUTH AFRICA,25-44,Friends/Relatives,0.0,4.0,Business,Wildlife tourism,Tanzania Mission Abroad,Package Tour,Yes,Yes,No,No,No,No,No,13,0,No
2,GERMANY,25-44,Friends/Relatives,3.0,0.0,Leisure and Holidays,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,7,14,No
3,CANADA,24-Jan,Friends/Relatives,2.0,0.0,Leisure and Holidays,Cultural tourism,others,Independent,No,No,No,No,No,No,No,0,4,Yes
4,UNITED KINGDOM,45-64,Friends/Relatives,2.0,2.0,Leisure and Holidays,Wildlife tourism,"Friends, relatives",Package Tour,Yes,Yes,Yes,Yes,No,No,No,10,0,Yes
