# Letor Convertor

In [9]:
import pandas as pd
import os
import numpy as np
import random

In [10]:
def getheaders():
    with open('properties.txt', "r") as f:
        properties = f.read().splitlines()
    return properties

In [11]:
def bagging(df):   
    groups = df.groupby('srch_id')
    id_list = np.random.choice(df.srch_id.unique(), df.srch_id.unique().shape[0], replace=True)
    all_indices = []
    for group_id in id_list:
        all_indices += list(groups.get_group(group_id).index.values) 
    return df.iloc[np.array(all_indices)]

In [12]:
def get_target(hotel_set):
    hotel_set['score'] = 0
    hotel_set.loc[hotel_set['click_bool'] == 1, 'score'] = 1
    hotel_set.loc[hotel_set['booking_bool'] == 1, 'score'] = 5
    return hotel_set

In [83]:
def downsampling(train):
    # DOWNSAMPLING

    # use all clicked data
    pos_train = train[(train["click_bool"] == 1)]
    pos_ids = np.unique(list(pos_train.srch_id.values))

    # use a random sample of each search
    
    neg_train = train[(train["click_bool"] == 0) & (train["srch_id"].isin(pos_ids))]
    
    #groups = train.groupby('srch_id',group_keys = False).groups
    #neg_train = train.loc[[random.choice(groups.get(key)) for key in groups]]
    
    # combine these samples
    sel_train = pd.concat([pos_train, neg_train])
    
    return sel_train

In [84]:
def open_set(hotel_set_file, downsample = False, target = True, bagg = False):
    hotel_set = pd.read_csv(hotel_set_file)
    hotel_set.drop(hotel_set.select_dtypes(['object']), inplace=True, axis=1)
    
    if downsample == True:
        hotel_set = downsampling(hotel_set).sort_values('srch_id')
    
    if bagging == True:
        hotel_set = bagging(hotel_set).sort_values('srch_id')
        
    if target == True:
        hotel_set = get_target(hotel_set)
        y = hotel_set['score'].values
    else:
        y = None
        
    qid = hotel_set['srch_id'].values
    X = hotel_set[properties].values
        
    return hotel_set, qid, y, X

In [101]:
#train_downsampled = 'test_files/hoteltest_training_downsampled_set'
train = 'test_files/hoteltest_training_set.csv'

In [102]:
properties = getheaders()
train_set, Tqids, Ty, TX = open_set(train, downsample = True, bagg = True)
val_set, Vqids, Vy, VX = open_set(train)

In [None]:
test = 'complete_files/hoteltest_testing_set.csv'

In [None]:
eval_set, Eqids, _, EX = open_set(test, target = False)

In [None]:
print(TX.shape, VX.shape, EX.shape)

# LambdaMART

In [103]:
import pyltr
from sklearn.model_selection import KFold, train_test_split

In [104]:
metric = pyltr.metrics.NDCG(k=5)

In [113]:
model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=500,
    learning_rate=0.1,
    max_features='log2',
    query_subsample=0.5,
    max_leaf_nodes=10,
    min_samples_leaf=64,
    verbose=1,
)

In [114]:
#TX, VX, Ty, Vy, Tqids, Vqids = train_test_split(X, y, qids, test_size=0.2, shuffle=False)
print('fitting model...')
monitor = pyltr.models.monitors.ValidationMonitor(
            VX, Vy, Vqids, metric=metric)
#monitor = None
model.fit(TX, Ty, Tqids, monitor=monitor)

fitting model...
 Iter  Train score  OOB Improve    Remaining                           Monitor Output 
    1       0.1356       0.0980        4.05m      C:      0.1208 B:      0.1208 S:  0
    2       0.1718       0.0177        4.17m      C:      0.1516 B:      0.1516 S:  0
    3       0.1945       0.0340        4.27m      C:      0.1940 B:      0.1940 S:  0
    4       0.2011       0.0004        4.18m      C:      0.1995 B:      0.1995 S:  0
    5       0.2289       0.0295        4.13m      C:      0.2325 B:      0.2325 S:  0
    6       0.2635       0.0118        4.08m      C:      0.2532 B:      0.2532 S:  0
    7       0.2593       0.0016        4.04m      C:      0.2620 B:      0.2620 S:  0
    8       0.2749       0.0016        4.01m      C:      0.2733 B:      0.2733 S:  0
    9       0.2875      -0.0024        3.98m      C:      0.2767 B:      0.2767 S:  0
   10       0.2990       0.0001        3.95m      C:      0.2802 B:      0.2802 S:  0
   15       0.3195       0.0001     

<pyltr.models.lambdamart.LambdaMART at 0x7fd5844bf828>

In [115]:
predict = model.predict(TX)
print ('Random ranking:', metric.calc_mean_random(Tqids, Ty))
print ('Our model:', metric.calc_mean(Tqids, Ty, predict))

Random ranking: 0.15966958123678288
Our model: 0.5228847940546034


In [116]:
print(predict.shape)

(19987,)


In [117]:
print(Vqids.shape)
print(VX.shape)

(20000,)
(20000, 29)


In [118]:
result = pd.DataFrame({'srch_id' : Tqids,
                        'prop_id' : train_set.prop_id,
                        'target' : Ty,
                        'score' : predict})

In [119]:
#result['score'] = result['score'].abs()
result = result.sort_values(by=['srch_id', 'score'], ascending=[True, False])


In [120]:
result

Unnamed: 0,srch_id,prop_id,target,score
18,1,88218,0,-0.955091
8,1,53341,0,-1.231627
4,1,29604,0,-1.333824
16,1,88096,0,-1.336005
20,1,95166,0,-1.424761
5,1,30184,0,-1.734470
7,1,50984,0,-1.862479
26,1,114766,0,-1.946547
25,1,111106,0,-1.952152
12,1,68914,5,-2.035338


In [29]:
result.sort_values(by=['srch_id', 'target'], ascending=[True, False])

Unnamed: 0,srch_id,prop_id,target,score
12,1,68914,5,0.508796
2,1,21315,0,0.580251
13,1,74474,0,0.534132
24,1,111000,0,0.534132
27,1,122844,0,0.533506
3,1,27348,0,0.529319
1,1,10404,0,0.467154
15,1,85728,0,0.466853
23,1,107872,0,0.466853
26,1,114766,0,0.435209
