## **SETUP**

In [None]:
!pip install catboost==0.22 --quiet

[K     |████████████████████████████████| 64.4 MB 31 kB/s 
[?25h

## **Libraries**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re
import nltk
import string
import random
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
import lightgbm as lgb  
from tqdm import tqdm_notebook
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, log_loss
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier ,Pool
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder , MinMaxScaler

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.simplefilter('ignore')

## **UTILS**

In [None]:
ContryContinent = {'afghanistan': 'AS',
 'algeria': 'AF',
 'angola': 'AF',
 'argentina': 'SA',
 'armenia': 'EU',
 'australia': 'OC',
 'austria': 'EU',
 'bahrain': 'AS',
 'bangladesh': 'AS',
 'barbados': 'NA',
 'belarus': 'EU',
 'belgium': 'EU',
 'benin': 'AF',
 'bermuda': 'NA',
 'bolivia': 'SA',
 'bosnia': 'EU',
 'botswana': 'AF',
 'brazil': 'SA',
 'bulgaria': 'EU',
 'burgaria': 'EU',
 'burundi': 'AF',
 'cambodia': 'AS',
 'cameroon': 'AF',
 'canada': 'NA',
 'cape_verde': 'AF',
 'chad': 'AF',
 'chile': 'SA',
 'china': 'AS',
 'colombia': 'SA',
 'comoro': 'AF',
 'congo': 'AF',
 'costarica': 'NA',
 'croatia': 'EU',
 'cyprus': 'EU',
 'czech_republic': 'EU',
 'denmark': 'EU',
 'djibout': 'AF',
 'dominica': 'NA',
 'drc': 'NA',
 'ecuado': 'SA',
 'egypt': 'AF',
 'el_salvador': 'NA',
 'eritrea': 'AF',
 'estonia': 'EU',
 'ethiopia': 'AF',
 'finland': 'EU',
 'france': 'EU',
 'gambia': 'AF',
 'georgia': 'EU',
 'germany': 'EU',
 'ghana': 'AF',
 'greece': 'EU',
 'guinea': 'AF',
 'haiti': 'NA',
 'hungary': 'EU',
 'iceland': 'EU',
 'india': 'AS',
 'indonesia': 'AS',
 'iran': 'AS',
 'iraq': 'AS',
 'ireland': 'EU',
 'israel': 'AS',
 'italy': 'EU',
 'ivory_coast': 'AF',
 'jamaica': 'NA',
 'japan': 'AS',
 'jordan': 'AS',
 'kazakhstan': 'AS',
 'kenya': 'AF',
 'korea': 'AS',
 'kuwait': 'AS',
 'latvia': 'EU',
 'lebanon': 'AS',
 'lesotho': 'AF',
 'liberia': 'AF',
 'lithuania': 'EU',
 'luxembourg': 'EU',
 'macedonia': 'EU',
 'madagascar': 'AF',
 'malawi': 'AF',
 'malaysia': 'AS',
 'malt': 'EU',
 'mauritius': 'AF',
 'mexico': 'NA',
 'monecasque': 'EU',
 'montenegro': 'EU',
 'morroco': 'AF',
 'mozambique': 'AF',
 'myanmar': 'AS',
 'namibia': 'AF',
 'nepal': 'AS',
 'netherlands': 'EU',
 'new_zealand': 'OC',
 'niger': 'AF',
 'nigeria': 'AF',
 'norway': 'EU',
 'oman': 'AS',
 'pakistan': 'AS',
 'papua_new_guinea': 'OC',
 'peru': 'SA',
 'philipines': 'AS',
 'poland': 'EU',
 'portugal': 'EU',
 'qatar': 'AS',
 'romania': 'EU',
 'russia': 'AS',
 'rwanda': 'AF',
 'saud_arabia': 'AS',
 'scotland': 'EU',
 'senegal': 'AF',
 'serbia': 'EU',
 'seychelles': 'AF',
 'singapore': 'AS',
 'slovakia': 'EU',
 'slovenia': 'EU',
 'somali': 'AF',
 'south_africa': 'AF',
 'spain': 'EU',
 'sri_lanka': 'AS',
 'sudan': 'AF',
 'swaziland': 'AF',
 'sweden': 'EU',
 'swizerland': 'EU',
 'taiwan': 'AS',
 'tanzania': 'AF',
 'thailand': 'AS',
 'trinidad_tobacco': 'NA',
 'tunisia': 'AF',
 'turkey': 'AS',
 'uae': 'AS',
 'uganda': 'AF',
 'ukrain': 'EU',
 'united_arab_emirates': 'AS',
 'united_kingdom': 'EU',
 'united_states_of_america': 'NA',
 'uruguay': 'SA',
 'venezuela': 'SA',
 'vietnam': 'AS',
 'yemen': 'AS',
 'zambia': 'AF',
 'zimbabwe': 'AF'}


subreg = {'afghanistan': 'southern_asia',
 'algeria': 'northern_africa',
 'angola': 'middle_africa',
 'argentina': 'south_america',
 'armenia': 'western_asia',
 'australia': 'australia_and_new_zealand',
 'austria': 'western_europe',
 'bahrain': 'western_asia',
 'bangladesh': 'southern_asia',
 'barbados': 'caribbean',
 'belarus': 'eastern_europe',
 'belgium': 'western_europe',
 'benin': 'western_africa',
 'bermuda': 'northern_america',
 'bolivia': 'south_america',
 'bosnia': 'southern_europe',
 'botswana': 'southern_africa',
 'brazil': 'south_america',
 'bulgaria': 'eastern_europe',
 'burgaria': 'eastern_europe',
 'burundi': 'eastern_africa',
 'cambodia': 'south-eastern_asia',
 'cameroon': 'middle_africa',
 'canada': 'northern_america',
 'cape_verde': 'western_africa',
 'chad': 'middle_africa',
 'chile': 'south_america',
 'china': 'eastern_asia',
 'colombia': 'south_america',
 'comoro': 'eastern_africa',
 'congo': 'middle_africa',
 'costarica': 'central_america',
 'croatia': 'southern_europe',
 'cyprus': 'western_asia',
 'czech_republic': 'eastern_europe',
 'denmark': 'northern_europe',
 'djibout': 'horn_of_africa',
 'dominica': 'caribbean',
 'drc': 'middle_africa',
 'ecuado': 'south_america',
 'egypt': 'northern_africa',
 'el_salvador': 'central_america',
 'eritrea': 'eastern_africa',
 'estonia': 'northern_europe',
 'ethiopia': 'eastern_africa',
 'finland': 'northern_europe',
 'france': 'western_europe',
 'gambia': 'western_africa',
 'georgia': 'western_asia',
 'germany': 'western_europe',
 'ghana': 'western_africa',
 'greece': 'southern_europe',
 'guinea': 'western_africa',
 'haiti': 'caribbean',
 'hungary': 'eastern_europe',
 'iceland': 'northern_europe',
 'india': 'southern_asia',
 'indonesia': 'south-eastern_asia',
 'iran': 'southern_asia',
 'iraq': 'western_asia',
 'ireland': 'northern_europe',
 'israel': 'western_asia',
 'italy': 'southern_europe',
 'ivory_coast': 'west_africa',
 'jamaica': 'caribbean',
 'japan': 'eastern_asia',
 'jordan': 'western_asia',
 'kazakhstan': 'central_asia',
 'kenya': 'eastern_africa',
 'korea': 'southeast_asia',
 'kuwait': 'western_asia',
 'latvia': 'northern_europe',
 'lebanon': 'western_asia',
 'lesotho': 'southern_africa',
 'liberia': 'western_africa',
 'lithuania': 'northern_europe',
 'luxembourg': 'western_europe',
 'macedonia': 'traditional_region of_greece',
 'madagascar': 'eastern_africa',
 'malawi': 'eastern_africa',
 'malaysia': 'south-eastern_asia',
 'malt': 'southern_europe',
 'mauritius': 'eastern_africa',
 'mexico': 'central_america',
 'monecasque': 'mediterranean_coast',
 'montenegro': 'southern_europe',
 'morroco': 'northern_africa',
 'mozambique': 'eastern_africa',
 'myanmar': 'south-eastern_asia',
 'namibia': 'southern_africa',
 'nepal': 'southern_asia',
 'netherlands': 'western_europe',
 'new_zealand': 'australia_and_new_zealand',
 'niger': 'western_africa',
 'nigeria': 'western_africa',
 'norway': 'northern_europe',
 'oman': 'western_asia',
 'pakistan': 'southern_asia',
 'papua_new_guinea': 'melanesia',
 'peru': 'south_america',
 'philipines': 'south-eastern_asia',
 'poland': 'eastern_europe',
 'portugal': 'southern_europe',
 'qatar': 'western_asia',
 'romania': 'eastern_europe',
 'russia': 'eastern_europe',
 'rwanda': 'eastern_africa',
 'saud_arabia': 'western_asia',
 'scotland': ' northern_third_of_great_britain',
 'senegal': 'western_africa',
 'serbia': 'southern_europe',
 'seychelles': 'eastern_africa',
 'singapore': 'south-eastern_asia',
 'slovakia': 'eastern_europe',
 'slovenia': 'southern_europe',
 'somali': 'eastern_africa',
 'south_africa': 'southern_africa',
 'spain': 'southern_europe',
 'sri_lanka': 'southern_asia',
 'sudan': 'northern_africa',
 'swaziland': 'southern_africa',
 'sweden': 'northern_europe',
 'swizerland': 'western_europe',
 'taiwan': 'south-eastern_asia',
 'tanzania': 'eastern_africa',
 'thailand': 'south-eastern_asia',
 'trinidad_tobacco': 'caribbean',
 'tunisia': 'northern_africa',
 'turkey': 'western_asia',
 'uae': 'western_asia',
 'uganda': 'eastern_africa',
 'ukrain': 'eastern_europe',
 'united_arab_emirates': 'western_asia',
 'united_kingdom': 'northern_europe',
 'united_states_of_america': 'northern_america',
 'uruguay': 'south_america',
 'venezuela': 'south_america',
 'vietnam': 'south-eastern_asia',
 'yemen': 'western_asia',
 'zambia': 'eastern_africa',
 'zimbabwe': 'eastern_africa'}

In [None]:
def sentCleaner(sent):
  sent = sent.lower()
  new_sent = re.sub(r'_', ' ', sent)
  new_sent = re.sub(r',', '', new_sent)
  new_sent = re.sub(r'-', '', new_sent)
  return new_sent.lower()

In [None]:
def sentTokenizer(sent):
  new_sent = re.sub(r'_', ' ', sent)
  tokenizer = TweetTokenizer(preserve_case= False, reduce_len=False, strip_handles=False)
  return tokenizer.tokenize(new_sent)

#BOW function
def Bag_of_words(sent_token):
    BOW = []
    for token in sent_token:
        for word in token:
            BOW.append(word)
    return BOW

def positive_count(sent_token):
  count = 0
  for word in sent_token:
    if word == "yes":
      count += 1
  return count

def negative_count(sent_token):
  count = 0
  for word in sent_token:
    if word == "no":
      count += 1
  return count

BOW_weights = {}

def sent_weight(sent_token):
  weight = 0
  for word in sent_token:
    weight += BOW_weights.get(word, 0)
  return weight

## **Dataset**

In [None]:
train=pd.read_csv("data/Train.csv")
test=pd.read_csv("data/Test.csv")
sub=pd.read_csv("data/SampleSubmission.csv")
desc=pd.read_csv("data/VariableDefinitions.csv")

In [None]:
train.head(2)

Unnamed: 0,Tour_ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,...,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,first_trip_tz,cost_category
0,tour_id1hffseyw,ITALY,45-64,With Children,0.0,2.0,Visiting Friends and Relatives,Beach Tourism,"Friends, relatives",Package Tour,...,Yes,Yes,Yes,No,No,No,0,7,Yes,High Cost
1,tour_idnacd7zag,UNITED KINGDOM,25-44,With Spouse,1.0,1.0,Leisure and Holidays,Wildlife Tourism,"Travel agent, tour operator",Package Tour,...,Yes,Yes,Yes,No,No,No,0,7,Yes,High Cost


In [None]:
data = pd.concat([train , test],sort=False).reset_index(drop=True)

In [None]:
data.shape

(24675, 21)

## **Feature Engineering**

In [None]:
data["tripdescr"] = data['package_transport_int'] + "_" + data['package_accomodation'] + "_" + data['package_food'] + "_" + data['package_transport_tz'] + "_" + data[ 'package_sightseeing'] + "_" + data['package_guided_tour'] + "_" + data['package_insurance'] + "_" + data['first_trip_tz']
data["tripdescrtoken"] = data["tripdescr"].apply(sentTokenizer)
data["tripdescrtoken"] = data["tripdescr"].apply(sentTokenizer)

BOW = Bag_of_words(data["tripdescrtoken"])
BOW_weights = pd.Series(BOW).value_counts().to_dict()

data["yes_count"]  = data["tripdescrtoken"] .apply(positive_count)
data["no_count"]  = data["tripdescrtoken"] .apply(negative_count)
data["tokenweight"] = data["tripdescrtoken"].apply(sent_weight)

In [None]:
vocabData = [sentCleaner(' '.join(data[["purpose", "country", "travel_with", "main_activity", "info_source", "tour_arrangement"]].astype(str).iloc[i].values)) for i in range(len(data))]

Vectorizer= CountVectorizer( 
                tokenizer = None,  
                preprocessor = None, 
                stop_words = None,  
                max_features = 30,
)
Vectorizer.fit(vocabData)
letter_array = Vectorizer.transform(vocabData).toarray()
vocab = Vectorizer.vocabulary_
wordData = pd.DataFrame(letter_array,columns=vocab)

In [None]:
wordData.head()

Unnamed: 0,visiting,friends,and,relatives,italy,with,children,beach,tourism,package,...,states,of,america,widlife,independent,alone,business,hunting,other,conference
0,0,0,0,1,1,0,1,0,2,0,...,0,0,1,1,0,0,1,0,0,1
1,1,0,0,1,0,0,0,0,0,1,...,1,0,2,1,1,1,0,0,1,1
2,1,0,1,1,0,0,0,0,0,1,...,1,1,2,1,1,1,0,1,0,1
3,0,0,0,2,1,0,1,0,0,1,...,1,0,0,1,0,0,0,0,0,1
4,1,1,1,1,0,0,0,0,0,1,...,0,1,2,1,1,1,0,1,0,0


In [None]:
##Filling Missing Values
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_mode.fit(data[['total_female', 'total_male']])
data[['total_female', 'total_male']] = imp_mode.transform(data[['total_female', 'total_male']])

data["travel_with"] = data["travel_with"].fillna("Not Specified")

# Cleaning features data entries
catColProcess =  ["country", 'travel_with', 'purpose', 'main_activity', 'info_source', 'tour_arrangement']
for col in catColProcess:
  data[col] = data[col].str.lower().str.replace(" ", "_")
  data[col] = data[col].str.lower().str.replace(" ", "_")

data["continents"] = data["country"].map(ContryContinent)

data["subregion"] = data["country"].map(subreg)


le = LabelEncoder()
multiLabelCatCol = ["country","subregion", "continents", "age_group",'package_transport_int', 'package_accomodation', 'package_food', 
                'package_transport_tz', 'package_sightseeing', 'package_guided_tour', 'package_insurance',
                'first_trip_tz']
for col in multiLabelCatCol:
  data[col] = le.fit_transform(data[col])


## For other variables let's use get dummies
columns_to_transform = ['tour_arrangement','travel_with','purpose','main_activity','info_source']
for col in columns_to_transform:
  data[col] = le.fit_transform(data[col])

data["night_mainland"] = data['night_mainland'].astype('int')
data["night_zanzibar"] = data['night_zanzibar'].astype('int')

# feature creation
data["total_people"] = data["total_female"] + data["total_male"]
data["total_nights"] = data["night_mainland"] + data["night_zanzibar"]

In [None]:
data.head(3)

Unnamed: 0,Tour_ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,...,cost_category,tripdescr,tripdescrtoken,yes_count,no_count,tokenweight,continents,subregion,total_people,total_nights
0,tour_id1hffseyw,62,2,2,0.0,2.0,6,0,0,1,...,High Cost,Yes_Yes_Yes_Yes_No_No_No_Yes,"[yes, yes, yes, yes, no, no, no, yes]",5,3,747408,2,20,2.0,7
1,tour_idnacd7zag,133,1,4,1.0,1.0,1,9,7,1,...,High Cost,Yes_Yes_Yes_Yes_No_No_No_Yes,"[yes, yes, yes, yes, no, no, no, yes]",5,3,747408,2,14,2.0,7
2,tour_id62vz7e71,134,3,4,1.0,1.0,1,8,7,1,...,Higher Cost,Yes_Yes_Yes_Yes_Yes_Yes_No_Yes,"[yes, yes, yes, yes, yes, yes, no, yes]",7,1,663024,3,13,2.0,12


In [None]:
data = data.drop(["tripdescr", "tripdescrtoken"], axis = 1)

In [None]:
Dataset = pd.concat([data , wordData],axis = 1)

train = Dataset[Dataset.cost_category.notnull()].reset_index(drop=True)
test = Dataset[Dataset.cost_category.isna()].reset_index(drop=True)

In [None]:
train.head()

Unnamed: 0,Tour_ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,...,states,of,america,widlife,independent,alone,business,hunting,other,conference
0,tour_id1hffseyw,62,2,2,0.0,2.0,6,0,0,1,...,0,0,1,1,0,0,1,0,0,1
1,tour_idnacd7zag,133,1,4,1.0,1.0,1,9,7,1,...,1,0,2,1,1,1,0,0,1,1
2,tour_id62vz7e71,134,3,4,1.0,1.0,1,8,7,1,...,1,1,2,1,1,1,0,1,0,1
3,tour_idrc76tzix,106,1,5,3.0,1.0,1,0,4,0,...,1,0,0,1,0,0,0,0,0,1
4,tour_idn723m0n9,134,2,0,0.0,1.0,1,8,7,1,...,0,1,2,1,1,1,0,1,0,0


In [None]:
feat_cols = train.drop(['Tour_ID','cost_category'],1)
target = train['cost_category']

testTourID = test['Tour_ID']
test = test.drop(['Tour_ID', 'cost_category'], axis = 1)
cols = feat_cols.columns

In [None]:
mapperFunc = {'Lower Cost':0, 'Low Cost':1, 'Normal Cost':2, 'High Cost':3, 'Higher Cost':4, 'Highest Cost':5}
train_target = target.map(mapperFunc)

## **Modeling**

In [None]:
class CFG :
  SEED = 2022
  n_splits = 8
  lgb_params = {'boosting_type': 'gbdt','objective': 'multiclass','metric': 'multi_logloss',
                'n_estimators': 500,'colsample_bytree' : 0.8,
                'seed': SEED,'silent':False,'early_stopping_rounds': 100,'learning_rate' :0.1, 'num_class':6}

  catboost_params = {"verbose":False, 'learning_rate' :0.05,'iterations':10000,'random_seed':0,
                     'use_best_model':True,'verbose':100,
                     #'task_type':"GPU",
                     'devices':'0:1' }

  xgboost_params = {"n_estimators" : 265, 'random_state': SEED, "objective" :"multi:softmax", 'num_class':6}

In [None]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)

In [None]:
seed_everything(CFG.SEED)

In [None]:
# create training and testing vars
X_train, X_test, y_train, y_test = train_test_split(train[cols],train_target, test_size=0.20, random_state = CFG.SEED)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(14804, 56) (14804,)
(3702, 56) (3702,)


## **XGBOOST**

In [None]:
folds =  StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.SEED)

In [None]:
XGBCol = ['tour_arrangement', 'yes_count', 'package', 'continents',
       'package_accomodation', 'total_people', 'package_transport_int',
       'travel_with', 'america', 'holidays', 'total_nights', 'purpose',
       'age_group', 'main_activity', 'children', 'night_zanzibar', 'business',
       'hunting', 'and', 'tour', 'alone', 'night_mainland', 'country',
       'package_guided_tour', 'beach', 'united', 'total_female', 'relatives',
       'tourism', 'package_sightseeing', 'package_transport_tz', 'total_male',
       'info_source', 'travel', 'package_food', 'first_trip_tz',
       'package_insurance', 'widlife', 'with',"subregion"]

In [None]:
test_pred_xgb = np.zeros((len(test), 6))
train_pred_xgb  = np.zeros((len(train[XGBCol]), 6))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train[XGBCol], train_target)):
      
    X_trn, y_trn = train[XGBCol].iloc[trn_idx], train_target.iloc[trn_idx]
    X_val, y_val = train[XGBCol].iloc[val_idx], train_target.iloc[val_idx]


    model_xgb = XGBClassifier(**CFG.xgboost_params)
    model_xgb.fit(X_trn, y_trn)
    train_p  = model_xgb.predict_proba(X_val)
    train_pred_xgb[val_idx, :] = train_p

    val_score = log_loss(y_val, train_p)
    print(4*'-- -- -- --')
    print(f'Fold {fold_+1} Val score: {val_score}')
    print(4*'-- -- -- --')
      
    test_pred_xgb += model_xgb.predict_proba(test[XGBCol])
      
test_pred_xgb /= CFG.n_splits
print()
print(3*'###',10*"^",3*'###')
print(f"Train LogLoss : {log_loss(train_target, train_pred_xgb)}")

-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 1 Val score: 1.0580642806783047
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 2 Val score: 1.0655259951899212
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 3 Val score: 1.0514817355475254
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 4 Val score: 1.0548743595563639
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 5 Val score: 1.0856803250223257
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 6 Val score: 1.0932361639817065
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 7 Val score: 1.0794947503816692
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 8 Val score: 1.0820376124898181
-- -- -- ---- -- -- ---- -- -

In [None]:
xgb_pred = pd.DataFrame(test_pred_xgb, columns = ['Lower Cost', 'Low Cost', 'Normal Cost', 'High Cost', 'Higher Cost', 'Highest Cost'])
subXGB = sub.copy()
subXGB['Tour_ID'] = testTourID
for col in  ['Lower Cost', 'Low Cost', 'Normal Cost', 'High Cost', 'Higher Cost', 'Highest Cost']:
  subXGB[col] = xgb_pred[col]

In [None]:
subXGB.to_csv('XGBoost_Solution.csv',index=False)

## **LIGHTGBM**

In [None]:
LGBCol = ['country', 'total_nights', 'night_mainland', 'total_people',
       'night_zanzibar', 'age_group', 'main_activity', 'continents',
       'yes_count', 'travel_with', 'total_male', 'total_female', 'info_source',
       'package_transport_int', 'purpose', 'tourism', 'hunting', 'beach',
       'package', 'tour_arrangement', 'first_trip_tz', 'no_count', 'united',
       'package_accomodation', 'america', 'holidays', 'and', 'travel',
       'package_sightseeing', 'relatives', 'tour', 'children',
       'package_guided_tour', 'package_transport_tz', 'package_insurance',
       'business', 'widlife', 'alone', 'package_food', 'operator', 'with',
       'kingdom', 'other', 'friends', 'italy', 'visiting', 'states', 'leisure',
       'tokenweight', 'spouse', 'conference', 'agent', "subregion"]#, 'wildlife', 'of','independent']

In [None]:
lgbm_pred = np.zeros((len(test), 6))
train_pred  = np.zeros((len(train[LGBCol]), 6))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train[LGBCol], train_target)):
      
    X_trn, y_trn = train[LGBCol].iloc[trn_idx], train_target.iloc[trn_idx]
    X_val, y_val = train[LGBCol].iloc[val_idx], train_target.iloc[val_idx]

    train_data = lgb.Dataset(X_trn, label= y_trn)#, categorical_feature=CFG.categ_features)
    valid_data = lgb.Dataset(X_val, label= y_val)#, categorical_feature=CFG.categ_features)

    estimator = lgb.train(CFG.lgb_params, train_data, valid_sets = [train_data, valid_data],verbose_eval = 0)
    train_p  = estimator.predict(X_val , num_iteration = estimator.best_iteration)
    train_pred[val_idx, :] = train_p

    val_score = log_loss(y_val, train_p)
    print(4*'-- -- -- --')
    print(f'Fold {fold_+1} Val score: {val_score}')
    print(4*'-- -- -- --')
      
    lgbm_pred += estimator.predict(test[LGBCol], num_iteration = estimator.best_iteration)
      
lgbm_pred /= CFG.n_splits
print()
print(3*'###',10*"^",3*'###')
print(f"Train LogLoss : {log_loss(train_target, train_pred)}")

-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 1 Val score: 1.0500524263769546
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 2 Val score: 1.061237413005223
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 3 Val score: 1.0341555652057057
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 4 Val score: 1.0511955919514628
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 5 Val score: 1.0838328906044037
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 6 Val score: 1.084218126911031
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 7 Val score: 1.0770877981286564
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 8 Val score: 1.0770815921090218
-- -- -- ---- -- -- ---- -- -- 

In [None]:
lgbm_pred = pd.DataFrame(lgbm_pred, columns = ['Lower Cost', 'Low Cost', 'Normal Cost', 'High Cost', 'Higher Cost', 'Highest Cost'])

In [None]:
subLGBM = sub.copy()
subLGBM['Tour_ID'] = testTourID
for col in  ['Lower Cost', 'Low Cost', 'Normal Cost', 'High Cost', 'Higher Cost', 'Highest Cost']:
  subLGBM[col] = lgbm_pred[col]

In [None]:
subLGBM.to_csv('LGBMSolution.csv',index=False)

## **CATBOOST**

In [None]:
CatCol = cols

In [None]:
cat_pred = np.zeros((len(test), 6))
train_pred_cat  = np.zeros((len(train[CatCol]), 6))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train[CatCol], train_target)):
      
    X_trn, y_trn = train[CatCol].iloc[trn_idx], train_target.iloc[trn_idx]
    X_val, y_val = train[CatCol].iloc[val_idx], train_target.iloc[val_idx]


    model_cat = CatBoostClassifier(**CFG.catboost_params)
    model_cat.fit(Pool(X_trn, y_trn), eval_set = Pool(X_val, y_val),
            early_stopping_rounds=200)
    train_p  = model_cat.predict_proba(X_val)
    train_pred_cat[val_idx, :] = train_p

    val_score = log_loss(y_val, train_p)
    print(4*'-- -- -- --')
    print(f'Fold {fold_+1} Val score: {val_score}')
    print(4*'-- -- -- --')
      
    cat_pred += model_cat.predict_proba(test[CatCol])
     
cat_pred /= CFG.n_splits
print()
print(3*'###',10*"^",3*'###')
print(f"Train LogLoss : {log_loss(train_target, train_pred_cat)}")

0:	learn: 1.7341472	test: 1.7336088	best: 1.7336088 (0)	total: 33.1ms	remaining: 5m 30s
100:	learn: 1.0609066	test: 1.0730943	best: 1.0730943 (100)	total: 3.27s	remaining: 5m 20s
200:	learn: 1.0166829	test: 1.0546863	best: 1.0546863 (200)	total: 7.49s	remaining: 6m 4s
300:	learn: 0.9866256	test: 1.0487618	best: 1.0487618 (300)	total: 15.1s	remaining: 8m 5s
400:	learn: 0.9629108	test: 1.0458804	best: 1.0458804 (400)	total: 20.5s	remaining: 8m 11s
500:	learn: 0.9415702	test: 1.0463177	best: 1.0458513 (401)	total: 24s	remaining: 7m 34s
600:	learn: 0.9209827	test: 1.0466084	best: 1.0458513 (401)	total: 27.7s	remaining: 7m 13s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 1.045851334
bestIteration = 401

Shrink model to first 402 iterations.
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 1 Val score: 1.0458513339706748
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
0:	learn: 1.7322567	test: 1.7327661	best: 1.7327661 (0)	total: 41ms	remaining: 6m 49s
100:	learn: 1.06089

In [None]:
cat_pred = pd.DataFrame(cat_pred, columns = ['Lower Cost', 'Low Cost', 'Normal Cost', 'High Cost', 'Higher Cost', 'Highest Cost'])
subCAT = sub.copy()
subCAT['Tour_ID'] = testTourID
for col in  ['Lower Cost', 'Low Cost', 'Normal Cost', 'High Cost', 'Higher Cost', 'Highest Cost']:
  subCAT[col] = cat_pred[col]

In [None]:
subCAT.to_csv('CATSolution.csv',index=False)

## **BLENDING**

In [None]:
subBlend = sub.copy()
subBlend['Tour_ID'] = testTourID
for col in  ['Lower Cost', 'Low Cost', 'Normal Cost', 'High Cost', 'Higher Cost', 'Highest Cost']:
  subBlend[col] = (0.3 * subCAT[col]) + (0.6 * subLGBM[col]) + (0.2 * subXGB[col])

In [None]:
subBlend.to_csv('FinalSubmission.csv',index=False)

In [None]:
subBlend.head()

Unnamed: 0,Tour_ID,High Cost,Higher Cost,Highest Cost,Low Cost,Lower Cost,Normal Cost
0,tour_idynufedne,0.139944,0.040938,0.000624,0.071356,0.039164,0.807974
1,tour_id9r3y5moe,0.297113,0.602104,0.028862,0.013395,0.022662,0.135863
2,tour_idf6itml6g,0.781571,0.15671,0.003643,0.011078,0.006672,0.140326
3,tour_id99u4znru,0.050581,0.016478,0.000402,0.262496,0.152045,0.617998
4,tour_idj4i9urbx,0.02401,0.009879,0.000333,0.293732,0.346481,0.425565
