In [47]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_columns = None
import os

In [49]:
# Load the competition files.
train_df = pd.read_csv("/content/Train.csv")
test_df = pd.read_csv("/content/Test.csv")
sub_df = pd.read_csv("/content/SampleSubmission.csv")

# A later cell evaluates `descp`, so the name has to exist on a fresh kernel
# (Restart & Run All). The definitions file is treated as optional: fall back
# to None when it is not present instead of failing later with a NameError.
descp_path = "/content/VariableDefinitions.csv"
descp = pd.read_csv(descp_path) if os.path.exists(descp_path) else None

In [None]:
# Show the data dictionary: one row per column name with its definition.
descp

Unnamed: 0,Column Name,Definition
0,id,Unique identifier for each tourist
1,country,The country a tourist coming from.
2,age_group,The age group of a tourist.
3,travel_with,The relation of people a tourist travel with t...
4,total_female,Total number of females
5,total_male,Total number of males
6,purpose,The purpose of visiting Tanzania
7,main_activity,The main activity of tourism in Tanzania
8,infor_source,The source of information about tourism in Tan...
9,tour_arrangment,The arrangment of visiting Tanzania


## Data Exploration and Feature Engineering

In [50]:
# Report (rows, columns) for both data sets.
for caption, frame in (("Size of train", train_df), ("Size of test", test_df)):
    print(caption, frame.shape)

Size of train (4809, 23)
Size of test (1601, 22)


In [51]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost
0,tour_0,SWIZERLAND,45-64,Friends/Relatives,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,13.0,0.0,Cash,No,Friendly People,674602.5
1,tour_10,UNITED KINGDOM,25-44,,1.0,0.0,Leisure and Holidays,Cultural tourism,others,Independent,No,No,No,No,No,No,No,14.0,7.0,Cash,Yes,"Wonderful Country, Landscape, Nature",3214906.5
2,tour_1000,UNITED KINGDOM,25-44,Alone,0.0,1.0,Visiting Friends and Relatives,Cultural tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,1.0,31.0,Cash,No,Excellent Experience,3315000.0
3,tour_1002,UNITED KINGDOM,25-44,Spouse,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,11.0,0.0,Cash,Yes,Friendly People,7790250.0
4,tour_1004,CHINA,1-24,,1.0,0.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Independent,No,No,No,No,No,No,No,7.0,4.0,Cash,Yes,No comments,1657500.0


In [52]:
# (rows, columns) of the training frame.
train_df.shape

(4809, 23)

In [53]:
# Number of training rows in each age bracket.
train_df["age_group"].value_counts()

25-44    2487
45-64    1391
1-24      624
65+       307
Name: age_group, dtype: int64

In [54]:
## convert float dtypes to int

def convert_int(cols, df):
  """Cast the given columns of ``df`` to ``int``, modifying ``df`` in place.

  Missing values are replaced with 0 before the cast, because NaN cannot be
  represented in an integer column.

  Args:
    cols: iterable of column names to convert.
    df: DataFrame whose columns are overwritten.

  Returns:
    None; ``df`` itself is mutated.
  """
  for name in cols:
    df[name] = df[name].fillna(0).astype(int)

# Count-like columns that are stored as floats in the raw files.
int_cols = ["total_female", "total_male", "night_mainland", "night_zanzibar"]

# Apply the same in-place conversion to the train and test frames.
for frame in (train_df, test_df):
    convert_int(int_cols, frame)



In [55]:
# Distribution of the female head-count after the integer conversion.
train_df["total_female"].value_counts()

1     2418
0     1672
2      463
3      144
4       46
5       25
6       15
7       10
10       4
9        4
11       3
12       3
15       1
49       1
Name: total_female, dtype: int64

In [56]:
# Feature engineering: derive the same three columns on the train and the test
# frame so both keep an identical set of engineered features.
for frame in (train_df, test_df):
    # Party size and trip length are the sums of their two component counts.
    frame["total_persons"] = frame["total_female"] + frame["total_male"]
    frame["total_nights_spent"] = frame["night_mainland"] + frame["night_zanzibar"]
    # 1 when the row falls in the "1-24" age bracket, otherwise 0.
    frame["below_25"] = (frame["age_group"] == "1-24").astype(int)


In [57]:
# Modelling
# Predictors are every column except the row identifier and the target.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
feat_cols = train_df.drop(columns=["ID", "total_cost"])
cols = feat_cols.columns
target = train_df["total_cost"]

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score

In [59]:
# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# keeps the shuffled split reproducible between runs.
X_train, X_eval, y_train, y_eval = train_test_split(
    train_df.loc[:, cols], target, test_size=0.20, shuffle=True, random_state=23
)

### Mean encode categorical variables

In [60]:
# Names of the object-dtype predictors; these are the columns that get
# mean-encoded below.
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

In [61]:

# Target (mean) encoding for the hold-out experiment.
# The per-category means are learned from the training split only and then
# applied to both splits, so values with no learned mean end up as NaN.
# NOTE(review): training rows are encoded with means that include their own
# target value, which can make the training features look better than they
# will on unseen data -- consider out-of-fold encoding.
X_tr = X_train.assign(target=y_train)
X_val = X_eval.assign(target=y_eval)

for col in cat_cols:
    encoded_name = col + " mean_target"
    category_means = X_tr.groupby(col)["target"].mean()
    X_tr[encoded_name] = X_train[col].map(category_means)
    X_val[encoded_name] = X_eval[col].map(category_means)

#### Do the same for the full training dataset that we will use in the final model

In [62]:
# Same mean encoding, this time learned from every labelled row, for the
# final model and the test set.
full_tr = train_df.copy()
X_test = test_df.copy()

for col in cat_cols:
    encoded_name = col + " mean_target"
    category_means = full_tr.groupby(col)["total_cost"].mean()
    full_tr[encoded_name] = train_df[col].map(category_means)
    X_test[encoded_name] = X_test[col].map(category_means)

#### Drop the categorical variables

In [63]:
# Raw categorical columns to remove now that encoded versions exist.
# This is an alias of cat_cols (the same list object), not a copy.
drop_cols=cat_cols

In [64]:
# Swap each raw categorical column for its mean-encoded counterpart, in both
# the training and the evaluation split. The frames are modified in place.
for frame, encoded in ((X_train, X_tr), (X_eval, X_val)):
    for col in drop_cols:
        encoded_name = col + " mean_target"
        frame.drop(col, inplace=True, axis=1)
        frame[encoded_name] = encoded[encoded_name]

In [65]:
# Preview the evaluation features after the categorical columns were replaced.
X_eval.head()

Unnamed: 0,total_female,total_male,night_mainland,night_zanzibar,total_persons,total_nights_spent,below_25,country mean_target,age_group mean_target,travel_with mean_target,purpose mean_target,main_activity mean_target,info_source mean_target,tour_arrangement mean_target,package_transport_int mean_target,package_accomodation mean_target,package_food mean_target,package_transport_tz mean_target,package_sightseeing mean_target,package_guided_tour mean_target,package_insurance mean_target,payment_mode mean_target,first_trip_tz mean_target,most_impressing mean_target
2614,2,0,6,0,2,6,0,9741297.0,10894040.0,10230450.0,3262994.0,4092298.0,4368185.0,2649494.0,4560164.0,2818900.0,3434822.0,3773705.0,4733525.0,4914539.0,6675155.0,7658532.0,10055340.0,7879783.0
3998,0,1,5,4,1,9,0,7069767.0,5891898.0,3332672.0,11939220.0,8760690.0,6058200.0,14214330.0,16089480.0,14204820.0,14271210.0,14665280.0,15424260.0,14702360.0,15789250.0,7658532.0,10055340.0,6680328.0
1753,0,1,14,2,1,16,0,12527360.0,10894040.0,3332672.0,3262994.0,10525360.0,13126510.0,14214330.0,4560164.0,14204820.0,14271210.0,14665280.0,4733525.0,4914539.0,6675155.0,7658532.0,10055340.0,7817298.0
1513,0,1,23,0,1,23,0,1354819.0,10894040.0,3332672.0,1616978.0,1842626.0,6058200.0,14214330.0,16089480.0,14204820.0,14271210.0,14665280.0,4733525.0,4914539.0,6675155.0,7658532.0,3883777.0,
2446,0,1,2,0,1,2,0,1349215.0,5891898.0,3332672.0,1708193.0,10525360.0,6058200.0,2649494.0,4560164.0,2818900.0,3434822.0,3773705.0,4733525.0,4914539.0,6675155.0,7658532.0,3883777.0,7879783.0


In [66]:
# Remove the raw categorical columns from the full-training and test frames;
# their mean-encoded versions were already added to both.
full_tr.drop(columns=drop_cols, inplace=True)
X_test.drop(columns=drop_cols, inplace=True)

In [67]:
# Final feature names fed to the model.
X_train.columns

Index(['total_female', 'total_male', 'night_mainland', 'night_zanzibar',
       'total_persons', 'total_nights_spent', 'below_25',
       'country mean_target', 'age_group mean_target',
       'travel_with mean_target', 'purpose mean_target',
       'main_activity mean_target', 'info_source mean_target',
       'tour_arrangement mean_target', 'package_transport_int mean_target',
       'package_accomodation mean_target', 'package_food mean_target',
       'package_transport_tz mean_target', 'package_sightseeing mean_target',
       'package_guided_tour mean_target', 'package_insurance mean_target',
       'payment_mode mean_target', 'first_trip_tz mean_target',
       'most_impressing mean_target'],
      dtype='object')

In [68]:
# Sanity check: both splits should report the same number of columns.
for split in (X_train, X_eval):
    print(split.shape)

(3847, 24)
(962, 24)


## Build the model

In [69]:
from xgboost import XGBRegressor

# Hyperparameters for the hold-out experiment.
# NOTE(review): `random_state` and `seed` are both supplied with different
# values; confirm which one the installed xgboost version actually honours.
xgb_params = dict(
    n_estimators=300,
    learning_rate=0.01,
    eval_metric="mae",
    max_depth=7,
    random_state=3,
    seed=23,
    subsample=0.6,
    colsample_bytree=0.5,
    gamma=1,
)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = xgb.predict(X_eval)




In [70]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the hold-out predictions against the true total_cost.
mae = mean_absolute_error(y_true=y_eval, y_pred=y_pred)
print('Error {}'.format(mae))

Error 5055825.587744023


Training on the full dataset

In [71]:
# Refit the same configuration on every labelled row, then predict the test set.
# NOTE(review): `random_state` and `seed` are both supplied with different
# values; confirm which one the installed xgboost version actually honours.
xgb_full_params = dict(
    n_estimators=300,
    learning_rate=0.01,
    eval_metric="mae",
    max_depth=7,
    random_state=3,
    seed=23,
    subsample=0.6,
    colsample_bytree=0.5,
    gamma=1,
)
xgb_full = XGBRegressor(**xgb_full_params)

# ID is an identifier and total_cost is the label, so neither is a feature.
full_features = full_tr.drop(["total_cost", "ID"], axis=1)
xgb_full.fit(full_features, full_tr["total_cost"])
y_pred_full = xgb_full.predict(X_test.drop("ID", axis=1))



In [74]:
# Pair each test ID with its predicted cost and write the submission file
# (without the index column).
submission_df = pd.DataFrame(
    {
        'ID': test_df["ID"],
        'total_cost': y_pred_full,
    }
)
submission_df.to_csv('mean_encodingbelow25-6-new-collab2.csv', index=False)

In [73]:
# Inspect the submission table (ID and predicted total_cost).
submission_df

Unnamed: 0,ID,total_cost
0,tour_1,2.138223e+07
1,tour_100,1.332125e+07
2,tour_1001,9.939004e+06
3,tour_1006,2.993766e+06
4,tour_1009,2.253369e+07
...,...,...
1596,tour_988,8.955044e+05
1597,tour_990,2.481826e+07
1598,tour_992,1.960139e+06
1599,tour_996,1.215919e+06
