In [1]:
import pandas as pd
from pandas_profiling import profile_report
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import (RandomizedSearchCV, 
                                     GridSearchCV, KFold, 
                                     StratifiedShuffleSplit, 
                                     TimeSeriesSplit)
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier


In [2]:
%matplotlib inline

In [3]:
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")
# vard = pd.read_csv("VariableDefinitions.csv")

In [4]:
train.head(3)

Unnamed: 0,Tour_ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,...,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,first_trip_tz,cost_category
0,tour_id1hffseyw,ITALY,45-64,With Children,0.0,2.0,Visiting Friends and Relatives,Beach Tourism,"Friends, relatives",Package Tour,...,Yes,Yes,Yes,No,No,No,0,7,Yes,High Cost
1,tour_idnacd7zag,UNITED KINGDOM,25-44,With Spouse,1.0,1.0,Leisure and Holidays,Wildlife Tourism,"Travel agent, tour operator",Package Tour,...,Yes,Yes,Yes,No,No,No,0,7,Yes,High Cost
2,tour_id62vz7e71,UNITED STATES OF AMERICA,65+,With Spouse,1.0,1.0,Leisure and Holidays,Widlife Tourism,"Travel agent, tour operator",Package Tour,...,Yes,Yes,Yes,Yes,Yes,No,6,6,Yes,Higher Cost


In [5]:
test.head(3)

Unnamed: 0,Tour_ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,first_trip_tz
0,tour_idynufedne,KOREA,25-44,Alone,0.0,1.0,Leisure and Holidays,Widlife Tourism,Others,Independent,No,No,No,No,No,No,No,7,4,Yes
1,tour_id9r3y5moe,UNITED KINGDOM,45-64,With Children,1.0,1.0,Leisure and Holidays,Conference Tourism,"Travel agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,7,0,Yes
2,tour_idf6itml6g,ITALY,25-44,With Spouse,1.0,1.0,Leisure and Holidays,Beach Tourism,"Travel agent, tour operator",Package Tour,Yes,Yes,No,Yes,No,No,No,0,6,Yes


In [6]:
# vard["Definition"]

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18506 entries, 0 to 18505
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Tour_ID                18506 non-null  object 
 1   country                18506 non-null  object 
 2   age_group              18506 non-null  object 
 3   travel_with            17431 non-null  object 
 4   total_female           18504 non-null  float64
 5   total_male             18500 non-null  float64
 6   purpose                18506 non-null  object 
 7   main_activity          18506 non-null  object 
 8   info_source            18506 non-null  object 
 9   tour_arrangement       18506 non-null  object 
 10  package_transport_int  18506 non-null  object 
 11  package_accomodation   18506 non-null  object 
 12  package_food           18506 non-null  object 
 13  package_transport_tz   18506 non-null  object 
 14  package_sightseeing    18506 non-null  object 
 15  pa

In [8]:
train.describe()

Unnamed: 0,total_female,total_male,night_mainland,night_zanzibar
count,18504.0,18500.0,18506.0,18506.0
mean,0.93623,0.998757,9.141576,2.493516
std,1.215582,1.173177,14.127449,5.275156
min,0.0,0.0,0.0,0.0
25%,0.0,1.0,3.0,0.0
50%,1.0,1.0,6.0,0.0
75%,1.0,1.0,11.0,4.0
max,49.0,58.0,365.0,240.0


In [9]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6169 entries, 0 to 6168
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Tour_ID                6169 non-null   object 
 1   country                6169 non-null   object 
 2   age_group              6169 non-null   object 
 3   travel_with            5808 non-null   object 
 4   total_female           6167 non-null   float64
 5   total_male             6168 non-null   float64
 6   purpose                6169 non-null   object 
 7   main_activity          6169 non-null   object 
 8   info_source            6169 non-null   object 
 9   tour_arrangement       6169 non-null   object 
 10  package_transport_int  6169 non-null   object 
 11  package_accomodation   6169 non-null   object 
 12  package_food           6169 non-null   object 
 13  package_transport_tz   6169 non-null   object 
 14  package_sightseeing    6169 non-null   object 
 15  pack

In [10]:
train["cost_category"].unique()

array(['High Cost', 'Higher Cost', 'Lower Cost', 'Normal Cost',
       'Low Cost', 'Highest Cost'], dtype=object)

In [11]:
train["cost_category"].value_counts()

Normal Cost     5471
Higher Cost     4865
High Cost       3678
Lower Cost      2567
Low Cost        1566
Highest Cost     359
Name: cost_category, dtype: int64

In [12]:
# train.profile_report()

## Handling missing values

In [13]:
# total nan values in columns
train.loc[:, train.isnull().any()].isnull().sum()

travel_with     1075
total_female       2
total_male         6
dtype: int64

In [14]:
test.loc[:, test.isnull().any()].isnull().sum()

travel_with     361
total_female      2
total_male        1
dtype: int64

In [15]:
print("travel_with : ", train["travel_with"].unique())
print("total_female : ", train["total_female"].unique())
print("total_male : ", train["total_male"].unique())

travel_with :  ['With Children' 'With Spouse' 'With Spouse and Children' 'Alone' nan
 'With Other Friends/Relatives']
total_female :  [ 0.  1.  3.  2.  4. 49.  5.  6.  7. 10. 13. 14.  8. 15. 20. 17. 11. 12.
 24.  9. 22. nan 26. 19. 23.]
total_male :  [ 2.  1.  0.  8.  4.  5.  3. 44.  7.  6. 14. 10. 12. 15. 24. 11.  9. 58.
 nan 13. 40. 23. 20. 17. 25.]


In [16]:
# Filling nan values with most frequent category
def fill_nan(df):
    nan_cols = df.loc[:, df.isnull().any()].isnull().sum().index
    for col in nan_cols:
        df[col].fillna(df[col].mode()[0], inplace=True)
        # print(f"{col} : {train[col].isnull().sum()}")
    return df

In [17]:
train_1 = fill_nan(train)
test_1 = fill_nan(test)

print(train_1.loc[:, train_1.isnull().any()].isnull().sum())
print(test_1.loc[:, test_1.isnull().any()].isnull().sum())

Series([], dtype: float64)
Series([], dtype: float64)


## Feature Engineering

In [18]:
# Creating 2 new features "total_people" and "total_night_spent"
train_1["total_people"] = train_1["total_male"] + train_1["total_female"]
train_1["total_night_spent"] = train_1["night_mainland"] + train_1["night_zanzibar"]

test_1["total_people"] = test_1["total_male"] + test_1["total_female"]
test_1["total_night_spent"] = test_1["night_mainland"] + test_1["night_zanzibar"]

In [19]:
# Mapping "Yes" to 1 and "No" to 0
# and creating a new feature "total_package"

map_bool = {"Yes":1,
           "No":0}
cols = [col for col in train_1.columns if col.startswith("package")]
train_2 = train_1.loc[:,cols]
for col in cols:
    train_2[col] = train_2[col].map(map_bool)

train_1["total_package"] = (train_2["package_accomodation"] + train_2["package_food"] + train_2["package_guided_tour"] + 
                           train_2["package_insurance"] + train_2["package_sightseeing"] + 
                           train_2["package_transport_int"] + train_2["package_transport_tz"])

In [20]:
train_1.head(1)

Unnamed: 0,Tour_ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,...,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,first_trip_tz,cost_category,total_people,total_night_spent,total_package
0,tour_id1hffseyw,ITALY,45-64,With Children,0.0,2.0,Visiting Friends and Relatives,Beach Tourism,"Friends, relatives",Package Tour,...,No,No,No,0,7,Yes,High Cost,2.0,7,4


In [21]:
cols = [col for col in test_1.columns if col.startswith("package")]
test_2 = test_1.loc[:,cols]
for col in cols:
    test_2[col] = test_2[col].map(map_bool)

test_1["total_package"] = (test_2["package_accomodation"] + test_2["package_food"] + test_2["package_guided_tour"] + 
                           test_2["package_insurance"] + test_2["package_sightseeing"] + 
                           test_2["package_transport_int"] + test_2["package_transport_tz"])

test_1.head(2)

Unnamed: 0,Tour_ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,...,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,first_trip_tz,total_people,total_night_spent,total_package
0,tour_idynufedne,KOREA,25-44,Alone,0.0,1.0,Leisure and Holidays,Widlife Tourism,Others,Independent,...,No,No,No,No,7,4,Yes,1.0,11,0
1,tour_id9r3y5moe,UNITED KINGDOM,45-64,With Children,1.0,1.0,Leisure and Holidays,Conference Tourism,"Travel agent, tour operator",Package Tour,...,Yes,Yes,Yes,Yes,7,0,Yes,2.0,7,7


## Encoding Categorical variables

In [22]:
# function to print name of boolean categorical columns and their unique values
def disp_bool_cat_cols(df):
    boolean_cols = []
    for col in df.columns:
        if df[col].nunique() <= 2:
            boolean_cols.append(col)
            print(f"{col} : {df[col].unique()}")
        
    
    
    
# function to return categorical columns with 2 unique categories
def ret_bool_cat_cols(df):
    boolean_cols = []
    for col in df.columns:
        if df[col].nunique() <=2:
            boolean_cols.append(col)
    
    return boolean_cols

In [23]:
disp_bool_cat_cols(train)

tour_arrangement : ['Package Tour' 'Independent']
package_transport_int : ['Yes' 'No']
package_accomodation : ['Yes' 'No']
package_food : ['Yes' 'No']
package_transport_tz : ['Yes' 'No']
package_sightseeing : ['No' 'Yes']
package_guided_tour : ['No' 'Yes']
package_insurance : ['No' 'Yes']
first_trip_tz : ['Yes' 'No']


In [24]:
# Function to encode categorical columns wiht 2 unique values
def encode_bool_cat_cols(df):
    boolean_cols = ret_bool_cat_cols(df)
    for col in boolean_cols:
        less_occured_cat = df[col].value_counts().nlargest().index[1] # category which occured less in the column
        less_occured_cat_col = col+"_"+less_occured_cat
        df = pd.get_dummies(data=df, columns=[col]).drop(less_occured_cat_col, axis="columns") # encoding most occuring category
    
    return df

In [25]:
# Encoding categorical variables with 2 unique categories in train set
train_1 = encode_bool_cat_cols(train)
train_1.head(2)

Unnamed: 0,Tour_ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,night_mainland,...,total_package,tour_arrangement_Package Tour,package_transport_int_No,package_accomodation_No,package_food_No,package_transport_tz_No,package_sightseeing_No,package_guided_tour_No,package_insurance_No,first_trip_tz_Yes
0,tour_id1hffseyw,ITALY,45-64,With Children,0.0,2.0,Visiting Friends and Relatives,Beach Tourism,"Friends, relatives",0,...,4,1,0,0,0,0,1,1,1,1
1,tour_idnacd7zag,UNITED KINGDOM,25-44,With Spouse,1.0,1.0,Leisure and Holidays,Wildlife Tourism,"Travel agent, tour operator",0,...,4,1,0,0,0,0,1,1,1,1


In [26]:
# disp_bool_cat_cols(test)

In [27]:
# Encoding categorical columns having 2 unique categories
test_1 = encode_bool_cat_cols(test)
test_1.drop("tour_arrangement_Independent", axis=1, inplace=True)
test_1["tour_arrangement_Package Tour"] = pd.get_dummies(data=test["tour_arrangement"], drop_first=True)

test_1.head(3)

Unnamed: 0,Tour_ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,night_mainland,...,total_package,package_transport_int_No,package_accomodation_No,package_food_No,package_transport_tz_No,package_sightseeing_No,package_guided_tour_No,package_insurance_No,first_trip_tz_Yes,tour_arrangement_Package Tour
0,tour_idynufedne,KOREA,25-44,Alone,0.0,1.0,Leisure and Holidays,Widlife Tourism,Others,7,...,0,1,1,1,1,1,1,1,1,0
1,tour_id9r3y5moe,UNITED KINGDOM,45-64,With Children,1.0,1.0,Leisure and Holidays,Conference Tourism,"Travel agent, tour operator",7,...,7,0,0,0,0,0,0,0,1,1
2,tour_idf6itml6g,ITALY,25-44,With Spouse,1.0,1.0,Leisure and Holidays,Beach Tourism,"Travel agent, tour operator",0,...,3,0,0,1,0,1,1,1,1,1


In [28]:
# Checking for columns only present in train set
for col in train_1.columns:
    if col not in test_1.columns:
        print(col)

cost_category


In [29]:
# Slecting columns with more than 2 unique categories
train_1.select_dtypes(include='object').head()

Unnamed: 0,Tour_ID,country,age_group,travel_with,purpose,main_activity,info_source,cost_category
0,tour_id1hffseyw,ITALY,45-64,With Children,Visiting Friends and Relatives,Beach Tourism,"Friends, relatives",High Cost
1,tour_idnacd7zag,UNITED KINGDOM,25-44,With Spouse,Leisure and Holidays,Wildlife Tourism,"Travel agent, tour operator",High Cost
2,tour_id62vz7e71,UNITED STATES OF AMERICA,65+,With Spouse,Leisure and Holidays,Widlife Tourism,"Travel agent, tour operator",Higher Cost
3,tour_idrc76tzix,RWANDA,25-44,With Spouse and Children,Leisure and Holidays,Beach Tourism,"Radio, TV, Web",Lower Cost
4,tour_idn723m0n9,UNITED STATES OF AMERICA,45-64,Alone,Leisure and Holidays,Widlife Tourism,"Travel agent, tour operator",Higher Cost


In [30]:
# Encoding categorical columns having more than 1 unique categories
train_1 = pd.get_dummies(data=train_1, columns=[
                                     "age_group",
                                     "travel_with",
                                     "purpose",
                                     "main_activity",
                                     "info_source"])

train_1["country"] = LabelEncoder().fit_transform(train_1["country"])

# cols = ["country",
#         "tour_arrangement",
#          "age_group",
#           "travel_with",
#           "purpose",
#          "main_activity",
#         "info_source"]
# for col in cols:
#     train_1[col] = OrdinalEncoder().fit_transform(train_1[[col]])
#     test_1[col] = OrdinalEncoder().fit_transform(test_1[[col]])
train_1.head(3)

Unnamed: 0,Tour_ID,country,total_female,total_male,night_mainland,night_zanzibar,cost_category,total_people,total_night_spent,total_package,...,main_activity_Widlife Tourism,main_activity_Wildlife Tourism,"info_source_Friends, relatives",info_source_Inflight magazines,"info_source_Newspaper, magazines, brochures",info_source_Others,"info_source_Radio, TV, Web",info_source_Tanzania Mission Abroad,info_source_Trade fair,"info_source_Travel agent, tour operator"
0,tour_id1hffseyw,54,0.0,2.0,0,7,High Cost,2.0,7,4,...,0,0,1,0,0,0,0,0,0,0
1,tour_idnacd7zag,123,1.0,1.0,0,7,High Cost,2.0,7,4,...,0,1,0,0,0,0,0,0,0,1
2,tour_id62vz7e71,124,1.0,1.0,6,6,Higher Cost,2.0,12,6,...,1,0,0,0,0,0,0,0,0,1


In [31]:
# Encoding categorical columns having more than 1 unique categories
test_1 = pd.get_dummies(data=test_1, columns=[
                                     "age_group",
                                     "travel_with",
                                     "purpose",
                                     "main_activity",
                                     "info_source"])

test_1["country"] = OrdinalEncoder().fit_transform(test_1[["country"]])
test_1.head(3)

Unnamed: 0,Tour_ID,country,total_female,total_male,night_mainland,night_zanzibar,total_people,total_night_spent,total_package,package_transport_int_No,...,main_activity_Widlife Tourism,main_activity_Wildlife Tourism,"info_source_Friends, relatives",info_source_Inflight magazines,"info_source_Newspaper, magazines, brochures",info_source_Others,"info_source_Radio, TV, Web",info_source_Tanzania Mission Abroad,info_source_Trade fair,"info_source_Travel agent, tour operator"
0,tour_idynufedne,57.0,0.0,1.0,7,4,1.0,11,0,1,...,1,0,0,0,0,1,0,0,0,0
1,tour_id9r3y5moe,112.0,1.0,1.0,7,0,2.0,7,7,0,...,0,0,0,0,0,0,0,0,0,1
2,tour_idf6itml6g,52.0,1.0,1.0,0,6,2.0,6,3,0,...,0,0,0,0,0,0,0,0,0,1


In [32]:
tour_id = test_1["Tour_ID"]
test_1.drop("Tour_ID", axis=1, inplace=True)

In [33]:
test_1.head(2)

Unnamed: 0,country,total_female,total_male,night_mainland,night_zanzibar,total_people,total_night_spent,total_package,package_transport_int_No,package_accomodation_No,...,main_activity_Widlife Tourism,main_activity_Wildlife Tourism,"info_source_Friends, relatives",info_source_Inflight magazines,"info_source_Newspaper, magazines, brochures",info_source_Others,"info_source_Radio, TV, Web",info_source_Tanzania Mission Abroad,info_source_Trade fair,"info_source_Travel agent, tour operator"
0,57.0,0.0,1.0,7,4,1.0,11,0,1,1,...,1,0,0,0,0,1,0,0,0,0
1,112.0,1.0,1.0,7,0,2.0,7,7,0,0,...,0,0,0,0,0,0,0,0,0,1


## Feature Selection

In [34]:
X = train_1.drop(["cost_category", "Tour_ID"], axis=1)
le = LabelEncoder()
y = le.fit_transform(train_1["cost_category"])

In [35]:
le.classes_

array(['High Cost', 'Higher Cost', 'Highest Cost', 'Low Cost',
       'Lower Cost', 'Normal Cost'], dtype=object)

In [36]:
X.shape

(18506, 53)

In [37]:
y.shape

(18506,)

In [38]:
y

array([0, 0, 1, ..., 1, 1, 3])

In [39]:
# # Train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.05, shuffle=True, random_state=42)


# Machine Learning Algorithms

In [40]:
import warnings
warnings.filterwarnings('ignore', '.*do not.*', )

## RandomForrestClassifier

In [41]:
# Kfold_split
# KF = KFold(n_splits=5)
# for train_index, test_index in KF.split(X.values):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y[train_index], y[test_index]
    
#     rf = RandomForestClassifier(n_estimators=400, max_depth=7)
#     rf.fit(X_train, y_train)
#     rf_preds_proba = rf.predict_proba(X_test)
#     lg_loss = log_loss(y_test, rf_preds_proba)
#     print(f"log_loss : {lg_loss}")

In [42]:
#StratifiedShuffleSplit
# ssp = StratifiedShuffleSplit(test_size=0.02, random_state=42)
# for train_index, test_index in ssp.split(X.values, y):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y[train_index], y[test_index]
    
    # rf = RandomForestClassifier(n_estimators=400, max_depth=7)
    # rf.fit(X_train, y_train)
    # rf_preds_proba = rf.predict_proba(X_test)
    # lg_loss = log_loss(y_test, rf_preds_proba)
    # print(f"log_loss : {lg_loss}")
    

In [43]:
# Random Forrest 
# rf = RandomForestClassifier(n_estimators=400, max_depth=7)
# rf.fit(X_train, y_train)

# rf_preds = rf.predict(X_test)
# rf_preds_proba = rf.predict_proba(X_test)

# ll = log_loss(y_test, rf_preds_proba)
# ll

In [44]:
# rf_preds_proba_test = rf.predict_proba(test_1.values)
# rf_preds_test = rf.predict(test_1.values)

In [45]:
# sub_file = pd.DataFrame(data=rf_preds_proba_test, columns = le.classes_)
# sub_file.insert(0, column="Tour_ID", value=tour_id)
# # sub_file["preds"] = rf_preds_test
# sub_file

In [46]:
# kf = KFold()
# params = {"n_estimators":[200, 300, 400, 500],
#         "max_depth": [3, 4, 5, 6, 7, 8]}

# rsv = RandomizedSearchCV(estimator=rf, param_distributions=params, cv=kf)
# rsv.fit(X_train, y_train)
# print(rsv.best_params_)
# print(rsv.best_score_)

## XGboost

In [47]:
ssp = StratifiedShuffleSplit(test_size=0.02, random_state=42)
for train_index, test_index in ssp.split(X.values, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    xgb = XGBClassifier(use_label_encoder=False,
                        n_estimators=63,
                       num_class=6,
                       learning_rate=0.13,
                       max_depth=7,
                       eval_metric='mlogloss',
                       random_state=42)
    xgb.fit(X_train.values, y_train)
    xgb_preds_proba = xgb.predict_proba(X_test.values)
    lg_loss = log_loss(y_test, xgb_preds_proba)
    print(f"log_loss : {lg_loss}")

log_loss : 1.1038707623012223
log_loss : 1.0514968166233234
log_loss : 1.1181471011327766
log_loss : 1.0393887412335192
log_loss : 1.036961639684065
log_loss : 1.1172419112035126
log_loss : 1.0853148686355658
log_loss : 1.110337963134472
log_loss : 1.1010671780567687
log_loss : 1.052442390647418


In [48]:
xgb = XGBClassifier(use_label_encoder=False,
                    n_estimators=63,
                    num_class=6,
                  learning_rate=0.13,
                  max_depth=7,
                  eval_metric='mlogloss',
                   random_state=42)
xgb.fit(X_train.values, y_train)
xgb_preds_proba = xgb.predict_proba(X_test.values)
lg_loss = log_loss(y_test, xgb_preds_proba)
print(f"log_loss : {lg_loss}")

log_loss : 1.052442390647418


In [49]:
xgb_preds =  xgb.predict(X_test.values)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, xgb_preds)

0.5956873315363881

In [50]:
xgb_preds_proba_test = xgb.predict_proba(test_1.values)

In [51]:
sub_file = pd.DataFrame(data=xgb_preds_proba_test, columns = le.classes_)
sub_file.insert(0, column="Tour_ID", value=tour_id)
# sub_file["preds"] = rf_preds_test
sub_file.head()

Unnamed: 0,Tour_ID,High Cost,Higher Cost,Highest Cost,Low Cost,Lower Cost,Normal Cost
0,tour_idynufedne,0.304219,0.108099,0.003224,0.007461,0.069228,0.507769
1,tour_id9r3y5moe,0.332436,0.425187,0.022449,0.030327,0.036878,0.152723
2,tour_idf6itml6g,0.368384,0.333632,0.015181,0.020479,0.010543,0.25178
3,tour_id99u4znru,0.123708,0.11791,0.003483,0.041377,0.107029,0.606493
4,tour_idj4i9urbx,0.066696,0.034316,0.002199,0.006058,0.051098,0.839633


In [52]:
sub_file.to_csv("xgb_1.csv")

In [53]:
# params = {"n_estimators":[50, 53, 60, 65],
#          "max_depth":[6, 7, 8, 9],
#          "learning_rate":[0.1],
#          }
# rsv = RandomizedSearchCV(estimator=xgb, param_distributions=params, cv=ssp)
# rsv.fit(X_train.values, y_train)
# print(rsv.best_params_)
# print(rsv.best_score_)
# print(rsv.best_estimator_)

In [54]:
# xgb = XGBClassifier(use_label_encoder=False,
#                     n_estimators=60,
#                     num_class=6,
#                   learning_rate=0.13,
#                   max_depth=7,
#                   eval_metric='mlogloss',
#                    random_state=42)
# xgb.fit(X_train.values, y_train)
# xgb_preds_proba = xgb.predict_proba(X_test.values)
# lg_loss = log_loss(y_test, xgb_preds_proba)
# print(f"log_loss : {lg_loss}")

## Catboost

In [60]:
err1=[]

fold=StratifiedShuffleSplit(n_splits=10, random_state=42)
for train_index, test_index in fold.split(X.values,y):
    X_train, X_test = X.iloc[train_index],X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    m2 = CatBoostClassifier(random_state=42)
    m2.fit(X_train.values,y_train,eval_set=[(X_train.values,y_train),(X_test.values, y_test)],early_stopping_rounds = 50,verbose=100,)
    preds = m2.predict_proba(X_test.values)
    print("err: ",(log_loss(y_test,preds)))
    err1.append((log_loss(y_test,preds)))
    p2 = m2.predict_proba(test_1)
print("The average score of Catboost is: ",np.mean(err1))

Learning rate set to 0.114224
0:	learn: 1.6592282	test: 1.6592282	test1: 1.6616318	best: 1.6616318 (0)	total: 16.4ms	remaining: 16.4s
100:	learn: 1.0260664	test: 1.0260664	test1: 1.1048393	best: 1.1048098 (99)	total: 1.26s	remaining: 11.3s
200:	learn: 0.9706267	test: 0.9706267	test1: 1.0959785	best: 1.0950444 (197)	total: 2.43s	remaining: 9.66s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1.095044432
bestIteration = 197

Shrink model to first 198 iterations.
err:  1.0950444324555075
Learning rate set to 0.114224
0:	learn: 1.6600508	test: 1.6600508	test1: 1.6610585	best: 1.6610585 (0)	total: 12.1ms	remaining: 12.1s
100:	learn: 1.0278495	test: 1.0278495	test1: 1.0840907	best: 1.0840907 (100)	total: 1.2s	remaining: 10.7s
200:	learn: 0.9690353	test: 0.9690353	test1: 1.0753769	best: 1.0752577 (198)	total: 2.36s	remaining: 9.4s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1.075216229
bestIteration = 229

Shrink model to first 230 iterations.
err:  1.

In [61]:
ssp = StratifiedShuffleSplit(test_size=0.02, random_state=42)
for train_index, test_index in ssp.split(X.values, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    cb = CatBoostClassifier(learning_rate=0.1,
                           iterations=1000,
                           use_best_model=True,
                           loss_function='MultiClass',
                           random_state=42,
                           logging_level='Silent',
                           )
    cb.fit(X_train.values, y_train, eval_set=(X_test.values, y_test))
    cb_preds_proba = cb.predict_proba(X_test.values)
    lg_loss = log_loss(y_test, cb_preds_proba)
    print(f"log_loss : {lg_loss}")

log_loss : 1.116222706084465
log_loss : 1.0591738132755855
log_loss : 1.1177894366930536
log_loss : 1.0604207528499887
log_loss : 1.034622174442958
log_loss : 1.1328874144088472
log_loss : 1.0727654987128947
log_loss : 1.0843951673314705
log_loss : 1.079289728089745
log_loss : 1.061982451903247


In [62]:
cb = CatBoostClassifier(learning_rate=0.1,
                       iterations=1000,
                       use_best_model=True,
                       loss_function='MultiClass',
                       random_state=42,
                       logging_level='Silent',
                       )
cb.fit(X_train.values, y_train, eval_set=(X_test.values, y_test))
cb_preds_proba = cb.predict_proba(X_test.values)
lg_loss = log_loss(y_test, cb_preds_proba)
print(f"log_loss : {lg_loss}")

log_loss : 1.061982451903247


In [69]:
cb_test_proba = cb.predict_proba(test_1.values)

In [72]:
sub_file = pd.DataFrame(data=cb_test_proba, columns = le.classes_)
sub_file.insert(0, column="Tour_ID", value=tour_id)
# sub_file["preds"] = rf_preds_test
sub_file.head()

Unnamed: 0,Tour_ID,High Cost,Higher Cost,Highest Cost,Low Cost,Lower Cost,Normal Cost
0,tour_idynufedne,0.220529,0.069345,0.000682,0.052041,0.05262,0.604782
1,tour_id9r3y5moe,0.211672,0.451729,0.019988,0.030862,0.021507,0.264243
2,tour_idf6itml6g,0.408035,0.292482,0.002706,0.014179,0.003532,0.279067
3,tour_id99u4znru,0.101887,0.136653,0.00042,0.166464,0.161949,0.432628
4,tour_idj4i9urbx,0.112351,0.056605,0.000197,0.100815,0.145795,0.584238


In [73]:
sub_file.to_csv("cb_1.csv", index=None)