In [16]:
import pandas as pd
import numpy as np

In [17]:
train_raw = pd.read_csv("data/train.csv")
train_raw.head()

Unnamed: 0,srch_id,prop_key,srch_date_time,srch_visitor_id,srch_visitor_visit_nbr,srch_visitor_loc_country,srch_visitor_loc_region,srch_visitor_loc_city,srch_visitor_wr_member,srch_posa_continent,...,prop_super_region,prop_continent,prop_country,prop_market_id,prop_submarket_id,prop_room_capacity,prop_review_score,prop_review_count,prop_hostel_bool,srch_local_date
0,-1046322713,257690,2014-09-13 18:37:32,9373b009-4e10-495a-afae-204dd1fe4b7c,5,TWN,TPE,TAIPEI,Signed in - Persistent|WR Member|Remembered FC...,ASIA,...,APAC,ASIA,JAPAN,60041,109140,575,4.1,403.0,0,2014-09-13
1,-1046322713,3066218,2014-09-13 18:37:32,9373b009-4e10-495a-afae-204dd1fe4b7c,5,TWN,TPE,TAIPEI,Signed in - Persistent|WR Member|Remembered FC...,ASIA,...,APAC,ASIA,JAPAN,60041,109140,339,3.6,101.0,0,2014-09-13
2,-1046322713,2271987,2014-09-13 18:37:32,9373b009-4e10-495a-afae-204dd1fe4b7c,5,TWN,TPE,TAIPEI,Signed in - Persistent|WR Member|Remembered FC...,ASIA,...,APAC,ASIA,JAPAN,60041,109140,179,4.1,1189.0,0,2014-09-13
3,-1046322713,3308025,2014-09-13 18:37:32,9373b009-4e10-495a-afae-204dd1fe4b7c,5,TWN,TPE,TAIPEI,Signed in - Persistent|WR Member|Remembered FC...,ASIA,...,APAC,ASIA,JAPAN,60041,98278,272,4.8,221.0,0,2014-09-13
4,-1046322713,3222046,2014-09-13 18:37:32,9373b009-4e10-495a-afae-204dd1fe4b7c,5,TWN,TPE,TAIPEI,Signed in - Persistent|WR Member|Remembered FC...,ASIA,...,APAC,ASIA,JAPAN,60041,109140,198,3.9,702.0,0,2014-09-13


In [18]:
def create_property_df(train_raw):
    """
    Convert raw training set into property dataframe for ranking. 
    
    Ranked by frequency of booking, then rating, then number of reviews, then 
    """
    prop_cols = [col for col in train_raw.columns if 'prop' in col]
    prop_df = train_raw[prop_cols].copy()
    # Get average review & booking incidence
    review_summ = prop_df.groupby(['prop_key']).agg(['mean', 'count'])[['prop_review_score', 
                                                                        'prop_booking_bool']].reset_index()
    # Relabel columns 
    review_summ.columns = ['_'.join(col) for col in review_summ.columns]
    review_summ.drop('prop_review_score_count', axis=1)
    # Add additional columns
    prop_extra = ['prop_key', 'prop_brand_bool', 'prop_starrating', 'prop_super_region', 'prop_continent', 'prop_country', 
              'prop_market_id', 'prop_submarket_id', 'prop_room_capacity', 'prop_hostel_bool']
    prop_df_min = prop_df.groupby(['prop_key']).agg(pd.Series.mode).reset_index()[prop_extra]
    # 
    property_df = review_summ.merge(prop_df_min, how = 'outer', 
                                    left_on='prop_key_', right_on='prop_key').drop(['prop_key'], axis=1)
    property_df = property_df[property_df['prop_booking_bool_count'] > 5].sort_values(by = ['prop_booking_bool_count',
#                                                                                             'prop_booking_bool_mean', 
                                                                                            'prop_review_score_mean', 
                                                                                            'prop_review_score_count'],
                                         ascending = False)
    return property_df

In [19]:
property_df = create_property_df(train_raw)

In [20]:
property_df.head()

Unnamed: 0,prop_key_,prop_review_score_mean,prop_review_score_count,prop_booking_bool_mean,prop_booking_bool_count,prop_brand_bool,prop_starrating,prop_super_region,prop_continent,prop_country,prop_market_id,prop_submarket_id,prop_room_capacity,prop_hostel_bool
1017,259046,3.399519,4862,0.065611,4862,1,3.0,AMER,NORTHAMERICA,UNITED STATES OF AMERICA,95602,110287,3770,0
62,241770,4.500177,4641,0.031459,4641,1,5.0,AMER,NORTHAMERICA,UNITED STATES OF AMERICA,95602,110287,4049,0
1195,263319,3.699852,4615,0.054388,4615,0,3.5,AMER,NORTHAMERICA,UNITED STATES OF AMERICA,95602,110287,2500,0
1206,263484,3.600843,4580,0.066157,4580,1,3.0,AMER,NORTHAMERICA,UNITED STATES OF AMERICA,95602,110287,4008,0
658,252583,4.003589,4514,0.086619,4514,0,4.0,AMER,NORTHAMERICA,UNITED STATES OF AMERICA,95602,110287,2885,0


In [69]:
def filtered_recommendations(srch_id, n_top, 
                             prop_room_capacity=None, prop_country=None, 
                             prop_market_id = None, prop_submarket_id = None,
                             property_df=property_df):
    '''
    INPUT:
    user_id - the user_id of the individual you are making recommendations for
    n_top - an integer of the number recommendations you want back
    FILTERS: 
    prop_country (list), prop_market_id (list), prop_submarket_id (list), prop_room_capacity (non-list)
    OUTPUT:
    top_movies - a list of the n_top recommended movies by movie title in order best to worst
    '''
    if prop_country:
        property_df = property_df[property_df['prop_country'].isin(prop_country)]
    if prop_market_id:
        property_df = property_df[property_df['prop_market_id'].isin(prop_market_id)]
    if prop_submarket_id:
        property_df = property_df[property_df['prop_submarket_id'].isin(prop_submarket_id)]
    if prop_room_capacity:
        property_df = property_df[property_df['prop_room_capacity'] > prop_room_capacity]
    
    top_hotels = [i for i in property_df['prop_key_'][:n_top]]
    return top_hotels 

In [23]:
filtered_recommendations(srch_id = 'abc', n_top = 4, 
                             prop_room_capacity=40, prop_country=['ITALY'], 
                             prop_market_id = None, prop_submarket_id = None,
                             property_df=property_df)

[436159, 298309, 323557, 3227839]

### Knowledge Based 

In [85]:
tst_search = train_raw.iloc[0]
tst_search[0]
# train_raw.iloc[0]

-1046322713

In [33]:
test_raw = pd.read_csv('data/test.csv')
test_raw.columns

Index(['srch_id', 'prop_key', 'srch_date_time', 'srch_visitor_id',
       'srch_visitor_visit_nbr', 'srch_visitor_loc_country',
       'srch_visitor_loc_region', 'srch_visitor_loc_city',
       'srch_visitor_wr_member', 'srch_posa_continent', 'srch_posa_country',
       'srch_hcom_destination_id', 'srch_dest_longitude', 'srch_dest_latitude',
       'srch_ci', 'srch_co', 'srch_ci_day', 'srch_co_day', 'srch_los',
       'srch_bw', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt',
       'srch_mobile_bool', 'srch_mobile_app', 'srch_device', 'srch_currency',
       'prop_travelad_bool', 'prop_dotd_bool',
       'prop_price_without_discount_local', 'prop_price_without_discount_usd',
       'prop_price_with_discount_local', 'prop_price_with_discount_usd',
       'prop_imp_drr', 'prop_brand_bool', 'prop_starrating',
       'prop_super_region', 'prop_continent', 'prop_country', 'prop_market_id',
       'prop_submarket_id', 'prop_room_capacity', 'prop_review_score',
       'prop_review_cou

In [100]:
def recommendation_eng(df, search_id):
    """
    Extract all properties for a given search id. 
    Order properties according to highest 'score' of likeliness to book.
    RETURN: *ordered* srch_id, prop_key, (score)
    """
    search_df = df[df['srch_id'] == search_id]
    n_searches = search_df.shape[0]
    prop_list =  search_df['prop_key']
    prop_market_id = search_df['prop_market_id'].agg('mode')[0]
#     prop_submarket_id = search_df['prop_submarket_id'].agg('mode')[0]
    prop_submarket_id = None
    prop_country = search_df['prop_country'].agg('mode')[0]
    
    recs = filtered_recommendations(srch_id = search_id, n_top=1000, 
                             prop_room_capacity=None, prop_country=[prop_country], 
                             prop_market_id = [prop_market_id], prop_submarket_id = None,
                             property_df=property_df)
    
    return_recs = [r for r in recs if r in search_df['prop_key']]
    return return_recs
    
    

In [103]:
search_df = train_raw[train_raw['srch_id'] == tst_search[0]]
prop_keys_orig = search_df['prop_key']

recs = recommendation_eng(train_raw, tst_search[0])

len(prop_keys_orig)
len(recs)

set(prop_keys_orig).difference(set(recs))

{241578,
 245408,
 246133,
 253037,
 253808,
 257690,
 265260,
 274121,
 274123,
 274965,
 279699,
 331084,
 349853,
 418876,
 439452,
 443003,
 447470,
 474268,
 522960,
 542765,
 557217,
 580658,
 592590,
 595014,
 1673652,
 1981700,
 2026758,
 2271987,
 2668252,
 3066218,
 3075622,
 3185546,
 3187316,
 3214020,
 3222046,
 3225583,
 3245700,
 3260327,
 3298414,
 3308025,
 3396692,
 3799587,
 3873959,
 3878649,
 3906821,
 3942297,
 3944441,
 3946005,
 3948259}

In [81]:
def create_results(df):
    """
    INPUT: train / test df
    OUTPUT: Results df (ordered)
    """
    results_dict = dict()
    for i in df['srch_id']:
        results_dict[i] = recommendation_eng(df, i)
        
    results_df = pd.DataFrame(results_dict)
    

In [86]:
train_raw['srch_id']

0        -1046322713
1        -1046322713
2        -1046322713
3        -1046322713
4        -1046322713
             ...    
841110     964783409
841111     964783409
841112     964783409
841113     964783409
841114     964783409
Name: srch_id, Length: 841115, dtype: int64