Imports


In [1]:
import datetime
import gc
import lightgbm as lgb
from lightgbm.callback import early_stopping
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from sklearn.impute import KNNImputer
from sklearn.metrics import ndcg_score
from sklearn.model_selection import GroupShuffleSplit, ParameterGrid
from tqdm import tqdm
import dask.dataframe as dd

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def downscale_and_save(df, filename):
    df_copy = df.copy()

    for col in df_copy.select_dtypes(include='float64').columns:
        df_copy[col] = df_copy[col].astype('float32')

    df_copy.to_csv(filename, index=False)

    del df_copy
    gc.collect()

In [3]:
train_df = dd.read_csv('data/training_set_VU_DM.csv', parse_dates=['date_time'])
test_df = dd.read_csv('data/test_set_VU_DM.csv', parse_dates=['date_time'])

In [4]:
# Creating the relevance target
train_df['relevance'] = train_df['booking_bool'] * 2 + (train_df['click_bool'] * (1 - train_df['booking_bool']))

Handling Datetime

In [5]:
# Extract useful features from 'date_time'
train_df['year'] = train_df['date_time'].dt.year
train_df['month'] = train_df['date_time'].dt.month
train_df['day'] = train_df['date_time'].dt.day
train_df = train_df.drop(columns=['date_time'])

test_df['year'] = test_df['date_time'].dt.year
test_df['month'] = test_df['date_time'].dt.month
test_df['day'] = test_df['date_time'].dt.day
test_df = test_df.drop(columns=['date_time'])

Removing Outliers

In [6]:
num_feats_with_outliers = ['price_usd', 'comp1_rate_percent_diff', 'comp2_rate_percent_diff', 'comp3_rate_percent_diff', 'comp4_rate_percent_diff', 'comp5_rate_percent_diff', 'comp6_rate_percent_diff', 'comp7_rate_percent_diff', 'comp8_rate_percent_diff']

for feature in num_feats_with_outliers:  # Based on EDA only price_usd & compX_rate_percent_diff
    Q1 = train_df[feature].quantile(0.25)
    Q3 = train_df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 3 * IQR
    upper_bound = Q3 + 3 * IQR
    
    # Replace outliers with NaN
    train_df[feature].mask(~train_df[feature].between(lower_bound, upper_bound), np.nan)

Selecting Subset of Records

In [7]:
# Calculate the count of missing values in each row
train_df['missing_count'] = train_df.isnull().sum(axis=1)
# Sort the dataframe by 'missing_count' in ascending order
train_df = train_df.sort_values(by='missing_count')
# Select the top x% of the rows with the least missing values
top_percentage = 0.75
cut_off = int(len(train_df) * top_percentage)
train_df = train_df.head(cut_off)
train_df = dd.from_pandas(train_df,10)
train_df

Unnamed: 0_level_0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,relevance,year,month,day,missing_count
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1
0,int64,int64,int64,float64,float64,int64,int64,int64,float64,int64,float64,float64,float64,int64,float64,int64,int64,int64,int64,int64,int64,int64,int64,float64,float64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int64,float64,int64,int64,int32,int32,int32,int64
25868,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234883,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262289,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


Feature Engineering

Mean Position

In [8]:
mean_positions = train_df[train_df['random_bool'] == False].groupby('prop_id')['position'].mean().rename('mean_train_position')  # Exclude records where the results order is random
train_df = train_df.join(mean_positions, on='prop_id')
test_df = test_df.join(mean_positions, on='prop_id')


Click/Booking Prop

In [9]:
def compute_prior(df, group_field, value_field):
    # Sum and count values per group
    sums = df.groupby(group_field)[value_field].transform('sum')
    count = df.groupby(group_field)[value_field].transform('count')
    # Calculate leave-one-out prior
    prior = (sums - df[value_field]) / (count - 1)
    return prior

In [10]:

# Apply function for click and booking bool
train_df['click_prior'] = compute_prior(train_df, 'prop_id', 'click_bool')
train_df['booking_prior'] = compute_prior(train_df, 'prop_id', 'booking_bool')



# Handling cases with only one record per group
train_df = train_df.fillna({'click_prior': train_df['click_bool'].mean()})
train_df = train_df.fillna({'booking_prior': train_df['booking_bool'].mean()})


  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  sums = df.groupby(group_field)[value_field].transform('sum')
  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  count = df.groupby(group_field)[value_field].transform('count')
  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  sums = df.groupby(group_field)[value_field].transform('sum')
  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  count = df.groupby(group_field)[value_field].transform('count')


In [11]:
# Priors for click and booking bool from the training set
test_df['click_prior'] = test_df['prop_id'].map(train_df.groupby('prop_id')['click_bool'].mean())
test_df['booking_prior'] = test_df['prop_id'].map(train_df.groupby('prop_id')['booking_bool'].mean())

# Handling cases with only one record per group
test_df = test_df.fillna({'click_prior': train_df['click_bool'].mean()})
test_df = test_df.fillna({'booking_prior': train_df['booking_bool'].mean()})


You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.


Number of Previous searches

In [12]:
# Number of occurences "minus the current row"
train_df['previous_searches'] = train_df.groupby('prop_id')['prop_id'].transform('count') - 1
test_df['previous_searches'] = test_df['prop_id'].map(train_df['prop_id'].value_counts() - 1).fillna(0)


  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  train_df['previous_searches'] = train_df.groupby('prop_id')['prop_id'].transform('count') - 1
You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.


Number of Bookings for Property/Destination Combination

In [13]:
# Aggregate number of bookings for each property and destination combination
booking_counts = train_df.groupby(['prop_id', 'srch_destination_id'])['booking_bool'].sum().reset_index()
booking_counts = booking_counts.rename(columns={'booking_bool': 'booking_count'})

# Merge this count back to the train and test datasets
train_df = train_df.merge(booking_counts, on=['prop_id', 'srch_destination_id'], how='left')
test_df = test_df.merge(booking_counts, on=['prop_id', 'srch_destination_id'], how='left')

Mean Distance to Other Properties in the Query

In [14]:
# Calculate the maximum difference in distance to the user within each search query
train_df['max_distance_diff'] = train_df.groupby('srch_id')['orig_destination_distance'].transform(lambda x: x.max() - x.min())

# Compute the mean of these maximum differences by property and add it back to the dataset
mean_distance = train_df.groupby('prop_id')['max_distance_diff'].mean().reset_index()
mean_distance = mean_distance.rename(columns={'max_distance_diff': 'mean_max_distance_diff'})

train_df = train_df.merge(mean_distance, on='prop_id', how='left')
test_df = test_df.merge(mean_distance, on='prop_id', how='left')

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  train_df['max_distance_diff'] = train_df.groupby('srch_id')['orig_destination_distance'].transform(lambda x: x.max() - x.min())


Statistical Features

In [17]:
features_to_stat = ['visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_starrating', 'prop_review_score', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 'orig_destination_distance', 'srch_query_affinity_score', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count']  # Perhaps change this based on LightGBM.feature_importances_

for feature in tqdm(features_to_stat):
    meta_df = pd.DataFrame(columns=[f'{feature}_mean', f'{feature}_median', f'{feature}_std'], dtype=float)
    stats = train_df.groupby('prop_id')[feature].agg(['mean', 'median', 'std'], meta = meta_df).rename(
        columns={'mean': f'{feature}_mean', 'median': f'{feature}_median', 'std': f'{feature}_std'})
    train_df = train_df.merge(stats, on='prop_id', how = 'left')

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]


MergeError: Not allowed to merge between different levels. (1 levels on the left, 2 on the right)

In [None]:
for feature in tqdm(features_to_stat):
    meta_df = pd.DataFrame(columns=[f'{feature}_mean', f'{feature}_median', f'{feature}_std'], dtype=float)
    stats = train_df.groupby('prop_id')[feature].agg(['mean', 'median', 'std']).rename(
        columns={'mean': f'{feature}_mean', 'median': f'{feature}_median', 'std': f'{feature}_std'})
    test_df = test_df.join(stats[feature], on='prop_id')

100%|██████████| 15/15 [00:00<00:00, 95.94it/s] 


In [None]:
train_df

Unnamed: 0_level_0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,promotion_flag,srch_destination_id,srch_length_of_stay,srch_booking_window,srch_adults_count,srch_children_count,srch_room_count,srch_saturday_night_bool,srch_query_affinity_score,orig_destination_distance,random_bool,comp1_rate,comp1_inv,comp1_rate_percent_diff,comp2_rate,comp2_inv,comp2_rate_percent_diff,comp3_rate,comp3_inv,comp3_rate_percent_diff,comp4_rate,comp4_inv,comp4_rate_percent_diff,comp5_rate,comp5_inv,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,relevance,year,month,day,missing_count,mean_train_position,click_prior,booking_prior,previous_searches,booking_count,max_distance_diff,mean_max_distance_diff,visitor_hist_starrating_mean,visitor_hist_starrating_median,visitor_hist_starrating_std,visitor_hist_adr_usd_mean,visitor_hist_adr_usd_median,visitor_hist_adr_usd_std,prop_starrating_mean,prop_starrating_median,prop_starrating_std,prop_review_score_mean,prop_review_score_median,prop_review_score_std,prop_location_score1_mean,prop_location_score1_median,prop_location_score1_std,prop_location_score2_mean,prop_location_score2_median,prop_location_score2_std,prop_log_historical_price_mean,prop_log_historical_price_median,prop_log_historical_price_std,price_usd_mean,price_usd_median,price_usd_std,orig_destination_distance_mean,orig_destination_distance_median,orig_destination_distance_std,srch_query_affinity_score_mean,srch_query_affinity_score_median,srch_query_affinity_score_std,srch_length_of_stay_mean,srch_length_of_stay_median,srch_length_of_stay_std,srch_booking_window_mean,srch_booking_window_median,srch_booking_window_std,srch_adults_count_mean,srch_adults_count_median,srch_adults_count_std,srch_children_count_mean,srch_children_count_median,srch_children_count_std,srch_room_count_mean,srch_room_count_median,srch_room_count_std
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1
,int64,int64,int64,float64,float64,int64,int64,int64,float64,int64,float64,float64,float64,int64,float64,int64,int64,int64,int64,int64,int64,int64,int64,float64,float64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int64,float64,int64,int64,int32,int32,int32,int64,float64,float64,float64,int64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


Start Computation

In [18]:
train_df = train_df.compute()
test_df = test_df.compute()

KeyError: 'visitor_hist_starrating'

See results

In [26]:
train_df

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,relevance,year,month,day
0,1,12,187,,,219,893,3,3.5,1,...,0.0,0.0,,0,,0,0,2013,4,4
1,1,12,187,,,219,10404,4,4.0,1,...,0.0,0.0,,0,,0,0,2013,4,4
2,1,12,187,,,219,21315,3,4.5,1,...,0.0,0.0,,0,,0,0,2013,4,4
3,1,12,187,,,219,27348,2,4.0,1,...,-1.0,0.0,5.0,0,,0,0,2013,4,4
4,1,12,187,,,219,29604,4,3.5,1,...,0.0,0.0,,0,,0,0,2013,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260646,332785,5,219,,,219,77700,3,4.0,1,...,,,,0,,0,0,2013,6,30
260647,332785,5,219,,,219,88083,3,4.0,1,...,,,,0,,0,0,2013,6,30
260648,332785,5,219,,,219,94508,3,3.5,1,...,,,,0,,0,0,2013,6,30
260649,332785,5,219,,,219,128360,3,5.0,1,...,,,,1,157.84,1,2,2013,6,30


In [27]:
test_df

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,year,month,day
0,1,24,216,,,219,3180,3,4.5,1,...,,,,,,,,2013,2,2
1,1,24,216,,,219,5543,3,4.5,1,...,,,,,,,,2013,2,2
2,1,24,216,,,219,14142,2,3.5,1,...,,,,,,,,2013,2,2
3,1,24,216,,,219,22393,3,4.5,1,...,,,,,,,,2013,2,2
4,1,24,216,,,219,24194,3,4.5,1,...,,,,,,,,2013,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275258,332787,24,216,,,117,32019,4,3.5,0,...,,,,,,,,2013,5,21
275259,332787,24,216,,,117,33959,4,3.0,1,...,,,,,,,,2013,5,21
275260,332787,24,216,,,117,35240,4,0.0,0,...,,,,,,,,2013,5,21
275261,332787,24,216,,,117,94437,4,0.0,0,...,,,,,,,,2013,5,21


In [None]:
downscale_and_save(train_df, 'data/processed_train.csv')
downscale_and_save(test_df, 'data/processed_test.csv')

#MODEL