In [7]:
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from xgboostextension import XGBRanker 
from xgboost import XGBModel
import pandas as pd
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from math import isnan

In [8]:
# Path of the raw training CSV that the cleaning cells read with pd.read_csv.
FILE = "training_set_VU_DM.csv"

In [9]:
# Names of the per-competitor columns (comp1..comp8): rate / inventory flags,
# and the percent-difference columns that are dropped without being used.
comp_attributes = ['comp{}_rate'.format(i) for i in range(1, 9)] + ['comp{}_inv'.format(i) for i in range(1, 9)]
comp_percent_diffs = ['comp{}_rate_percent_diff'.format(i) for i in range(1, 9)]

# load data set
data = pd.read_csv(FILE)

# Summarise competitor attributes as their row-wise minimum and drop them.
# DataFrame.min skips NaN like np.nanmin, but returns NaN for all-NaN rows
# without emitting "All-NaN slice" warnings; those rows then become 0.
data['competitors'] = data[comp_attributes].min(axis=1).fillna(0)
data = data.drop(columns=comp_attributes + comp_percent_diffs)

# Adjust affinity score attribute: exponentiate, missing scores become 0.
data['srch_query_affinity_score'] = np.exp(data['srch_query_affinity_score']).fillna(0)

# Add column 'prop_reviewed': False when the review score is 0 or missing.
# Built in a single assignment instead of chained `data[col].loc[mask] = ...`,
# which triggers SettingWithCopyWarning and is a no-op under copy-on-write.
data['prop_reviewed'] = data['prop_review_score'].fillna(0) != 0

# Fill missing prop_location_score2 with the mean of the same search
# (srch_id); searches with no score at all fall back to 0.5.
# groupby/transform computes every per-search mean in one pass instead of
# re-scanning the whole frame once per search id.
search_mean_pls2 = (
    data.groupby('srch_id')['prop_location_score2']
    .transform('mean')
    .fillna(0.5)
)
data['prop_location_score2'] = data['prop_location_score2'].fillna(search_mean_pls2)

# finally, drop unused columns
data = data.drop(columns=['gross_bookings_usd'])

  


NameError: name 'i' is not defined

In [10]:
# Write the cleaned frame to disk; the index is kept as the first CSV column.
data.to_csv(path_or_buf='train_data_cleaned.csv', index=True)

In [None]:
print('--- Starting Data Cleanup ---')

# NOTE: FILE must still point at the raw training CSV here; a later cell
# rebinds FILE to the cleaned output, so run this cell before that one.
print('Loading Full Dataset...')
data_full = pd.read_csv(FILE)
print('Loaded')


def _similar_trip_stats(origin, dest_id, prop_country):
    """Return (mean, std) of orig_destination_distance over comparable trips.

    Candidate row sets from the full data set are tried in order until one
    has a non-NaN mean:
      1. same visitor country AND (same destination OR same property country)
      2. same destination
      3. same property country
    If none has any distance data, the (NaN, NaN) result of the last
    candidate set is returned.
    """
    distances = data_full['orig_destination_distance']
    same_origin = data_full['visitor_location_country_id'] == origin
    same_dest = data_full['srch_destination_id'] == dest_id
    same_country = data_full['prop_country_id'] == prop_country

    candidate_masks = (
        # Each comparison is parenthesised on its own: `==` binds more
        # loosely than `&` / `|`, so a misplaced parenthesis silently
        # changes which rows are selected.
        same_origin & (same_dest | same_country),
        same_dest,
        same_country,
    )
    for mask in candidate_masks:
        matched = distances[mask]
        avg = matched.mean()
        if not isnan(avg):
            break
    return avg, matched.std()


# clean up orig_destination_distance attribute:
# per-search mean / std, computed for all searches in a single pass.
dist_by_search = data.groupby('srch_id')['orig_destination_distance']
data['avg_orig_dest_dist'] = dist_by_search.transform('mean')
data['std_orig_dest_dist'] = dist_by_search.transform('std')

# A NaN mean means every distance in that search is missing; only those
# searches need the (expensive) lookup for similar trips.
missing = data['avg_orig_dest_dist'].isna()
unresolved = data.loc[
    missing,
    ['srch_id', 'visitor_location_country_id', 'srch_destination_id', 'prop_country_id'],
].drop_duplicates(subset='srch_id')  # first row of each search

stats_cache = {}   # (origin, dest_id, prop_country) -> (avg, std)
fallback_avg = {}  # srch_id -> avg
fallback_std = {}  # srch_id -> std
for row in unresolved.itertuples(index=False):
    key = (row.visitor_location_country_id, row.srch_destination_id, row.prop_country_id)
    if key not in stats_cache:
        stats_cache[key] = _similar_trip_stats(*key)
    avg_odd, std_odd = stats_cache[key]

    if isnan(avg_odd):
        print('Dest_id {} has no distance data at all!'.format(row.srch_destination_id))

    fallback_avg[row.srch_id] = avg_odd
    fallback_std[row.srch_id] = std_odd

if fallback_avg:
    # Only rows of all-missing searches are overwritten; searches with a
    # valid mean keep their own std even when that std is NaN.
    data.loc[missing, 'avg_orig_dest_dist'] = data.loc[missing, 'srch_id'].map(fallback_avg)
    data.loc[missing, 'std_orig_dest_dist'] = data.loc[missing, 'srch_id'].map(fallback_std)

data = data.drop(columns=['orig_destination_distance'])
data.to_csv('training_final_clean.csv')

print('Saved')

--- Starting Data Cleanup ---
Loading Full Dataset...
Loaded
Dest_id 19446 has no distance data at all!
Dest_id 21333 has no distance data at all!
Dest_id 21333 has no distance data at all!
Dest_id 19446 has no distance data at all!
Dest_id 15429 has no distance data at all!
Dest_id 837 has no distance data at all!
Dest_id 21333 has no distance data at all!
Dest_id 837 has no distance data at all!


In [None]:
# Column -> dtype mapping passed to pd.read_csv when reloading the cleaned CSV.
# Boolean columns use np.bool_: the bare `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
DTYPES = {
    'srch_id'                    : np.int32,
    'visitor_location_country_id': np.int32,
    'visitor_hist_starrating'    : np.float32,
    'visitor_hist_adr_usd'       : np.float32,
    'prop_country_id'            : np.int32,
    'prop_id'                    : np.int32,
    'prop_starrating'            : np.float32,
    'prop_review_score'          : np.float32,
    'prop_brand_bool'            : np.bool_,
    'prop_location_score1'       : np.float32,
    'prop_location_score2'       : np.float32,
    'prop_log_historical_price'  : np.float32,
    'position'                   : np.int32,
    'price_usd'                  : np.float32,
    'promotion_flag'             : np.bool_,
    'srch_destination_id'        : np.int32,
    'srch_length_of_stay'        : np.float32,
    'srch_booking_window'        : np.float32,
    'srch_adults_count'          : np.float32,
    'srch_children_count'        : np.float32,
    'srch_room_count'            : np.float32,
    'srch_saturday_night_bool'   : np.bool_,
    'srch_query_affinity_score'  : np.float32,
    'random_bool'                : np.bool_,
    'click_bool'                 : np.bool_,
    'booking_bool'               : np.bool_,
    'competitors'                : np.float32,
    'prop_reviewed'              : np.bool_,
    'avg_orig_dest_dist'         : np.float32,
    'std_orig_dest_dist'         : np.float32,
    'domestic_travel'            : np.bool_,
}

# Rebind FILE to the fully cleaned CSV and reload it using the DTYPES mapping.
# NOTE: after this cell runs, any earlier cell that reads FILE will load the
# cleaned file instead of the raw training set.
FILE = 'training_final_clean.csv'

data = pd.read_csv(
    FILE,
    dtype=DTYPES,
    index_col=0,  # first CSV column holds the index saved by to_csv
)