# Models, Evaluation, TrainTestSplit implementations

#### if you wanna run experiment scroll down

In [63]:
from numpy import dot
from numpy.linalg import norm
import csv
import pandas as pd
import math

# user based collaborative filtering
# if you want to run CF to create submission file uncomment write lines in fit function and comment output_list
# lines so you dont store 2gb in your ram memory
class CollaborativeFiltering:
    
    # we need more search features
    # we can remove all hotel features
    def __init__(self):
        # self.search_features = ["srch_adults_count", "srch_saturday_night_bool", "site_id", "visitor_location_country_id", "srch_length_of_stay", "srch_room_count"]
        self.search_features = ['promotion_flag', 'srch_length_of_stay','srch_booking_window', 'srch_saturday_night_bool',\
                                'srch_query_affinity_score', 'orig_destination_distance', 'user_hotel_price', 'desirability',\
                                'd_p_ratio', 'people_per_room', 'overall_advantage','site_id', 'visitor_location_country_id']
        
    def cos_sim(self, user_data, test_data):
        categorical = ["site_id", "visitor_location_country_id", "srch_saturday_night_bool", "promotion_flag"]
        user_categorical, user_numeric = user_data[categorical] , user_data.drop(categorical)
        test_categorical, test_numeric = test_data[categorical], test_data.drop(categorical)
        
        # for numeric calc cos_sim
        a = user_numeric.to_numpy()
        b = test_numeric.to_numpy()
        
        cos_sim_score = dot(a, b)/(norm(a)*norm(b))
        
        if(math.isnan(cos_sim_score)):
            return -1
        
        # for cateforical_data increase cos_sim_score if it matches
        for idx, col in enumerate(user_categorical):
            if user_categorical[idx] == test_categorical[idx]:
                cos_sim_score += 0.2
    
        return cos_sim_score
    
    def get_cosine_similarities(self, train_data, user_data):
        cosine_scores = []
        for idx, t_data in train_data.iterrows():
            cosine_scores.append((self.cos_sim(user_data, t_data) , idx))
            
        return sorted(cosine_scores, reverse=True)
    
    def find_n_nearest_searches(self, user_data, n, hotel_id):
        # take only those users who have data for this hotel
        train_data = self.train_data.loc[self.train_data["prop_id"] == hotel_id]
        train_data = train_data[self.search_features]
        # get n best rows (with highest cosine similiarity)
        similiarities = self.get_cosine_similarities(train_data, user_data)[:n]
        # print(f"sims: {similiarities}")
        idx = [i[1] for i in similiarities]
        # print(f"indeces {idx}")
        return self.train_data.loc[idx]
        
    def score_function(self, n_nearest_searches):
        if n_nearest_searches.empty:
            return -10
        score = 0
        b_w = 100 # booking weight
        for _, x in n_nearest_searches.iterrows():
            score += b_w * x["booking_bool"] + x["click_bool"] - x["position"]
        # return avg score 
        return score/len(n_nearest_searches)
    
    def fit(self, train_data, test_data, n=5):
        self.train_data = train_data
        output_list = []
        # for every user search
    # with open('cf.csv', 'w') as f:
        # writer = csv.writer(f)
        # writer.writerow(["srch_id", "prop_id"])
        for srch_id in test_data["srch_id"].unique():
            user_searches = test_data.loc[test_data["srch_id"] == srch_id]
            hotel_scores = []
            user_data = user_searches[self.search_features].iloc[0]
            for idx, search in user_searches.iterrows():
                # find n nearest users user_data + search_specific_data
                hotel_id = search["prop_id"]
                # n nearest searches that scored a given hotel
                n_nearest_searches = self.find_n_nearest_searches(user_data, n, hotel_id)

                score = self.score_function(n_nearest_searches)
                hotel_scores.append((score, hotel_id))

            # ordered list of hotel_ids w.r.t. scores
            hotel_scores = sorted(hotel_scores, reverse=True)
            # only save hotel ids
            hotel_scores = [[srch_id, int(h[1])] for h in hotel_scores]
            # writer.writerows(hotel_scores)
            output_list += hotel_scores

        return output_list

# train_data = pd.read_csv("/Users/kuba/VU/DMT/data-mining-techniques/A2/100k_train_data.csv")
# test_data = pd.read_csv("/Users/kuba/VU/DMT/data-mining-techniques/A2/processed_test_data.csv")
# cf = CollaborativeFiltering()
# out = cf.fit(train_data, test_data)

In [106]:
import random
import csv

# baseline random
def random_ordering(test_data):
    with open('sub_random.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["srch_id", "prop_id"])
        for srch_id in test_data["srch_id"].unique():
            search_list = []
            user_searches = test_data.loc[test_data["srch_id"] == srch_id]
            for idx, search in user_searches.iterrows():
                hotel_id = search["prop_id"]
                search_list.append([srch_id, int(hotel_id)])

            random.shuffle(search_list)
            writer.writerows(search_list)       

In [16]:
import math

def evaluate(predicted_ranking, actual_ranking, n=5):
    
    idcg = 5
    for i in range(2, n + 1):
        idcg += 1/math.log2(i + 1)
        
    i = 1
    prev = predicted_ranking[0][0] # srch_id
    all_ndcg = []
    dcg = 0
    
    for row in predicted_ranking:
    
        if row[0] != prev:
            i = 1
            dcg = 0
            
        # take only first n values per search
        if i < n + 1:
            hotel_id = row[1]
            # check relevance score based on if hotel was booked
            relevance_score = 0
            if actual_ranking[(actual_ranking['srch_id'] == row[0]) & (actual_ranking['prop_id'] == hotel_id)]['booking_bool'].any() == 1:
                relevance_score = 5
                # chceck relevance score based on if hotel was clicked
            elif actual_ranking[(actual_ranking['srch_id'] == row[0]) & (actual_ranking['prop_id'] == hotel_id)]['click_bool'].any() == 1:
                relevance_score = 1
            # print(f"{i}_{row[0]}_{hotel_id}: {relevance_score}")
            # calculate dcg iteratively per value
            dcg += relevance_score/math.log2(i+1)

        # if last partial score for dcg added then normalize and add to all ndcg list
        if i == n:
            # normalize
            ndcg = dcg/idcg
            all_ndcg.append(ndcg)
            
        prev = row[0]
        i+= 1

    return sum(all_ndcg)/len(all_ndcg)

In [11]:
import random

def train_test_split(dat, n=50):
    random.seed(1)
    train, test = get_train_test(dat, n)
    features_to_remove = ['click_bool','position','booking_bool', 'gross_bookings_usd']
    test_data_sorted = test.sort_values(['srch_id','booking_bool', 'click_bool','position'], ascending = [True, False, False, True])
    test_data_sorted.reset_index(drop=True, inplace=True)
    # correct_order just needed for debugging
    # correct_order = test_data_sorted[['srch_id', 'prop_id']]
    test_data_without_labels = test_data_sorted.drop(features_to_remove, axis=1)
    test_data_with_labels = test_data_sorted
    
    return test_data_with_labels, test_data_without_labels, train
    
def sample_ids(dat, n): # test_prop is proportionate to the number of unique srch_id's, not number of instances

    # unique_ids for searches that have more than 5 instances
    grouped = dat.groupby('srch_id').count()
    more_than_5_instances = grouped[grouped['site_id'] > 4]
    
    unique_ids = list(more_than_5_instances.index)
    test_ids = random.sample(unique_ids,n)
    
    return test_ids

def get_train_test(dat, n):
    
    test_ids = sample_ids(dat, n) # get ids for test set
    test_ids_df = pd.DataFrame({'srch_id':test_ids}) # convert to df for .merge function
    test = test_ids_df.merge(dat, on = 'srch_id', how = 'left')
    
    outer = dat.merge(test_ids_df, on = 'srch_id', how = 'outer', indicator = True)
    train = outer[(outer._merge=='left_only')].drop('_merge', axis=1)
    
    return train, test

## Experiment section

#### load processed data

In [None]:
import pandas as pd
dat = pd.read_csv('/Users/kuba/VU/DMT/data-mining-techniques/A2/100k_train_data.csv')

#### split data and save it (if you want to save it)

In [None]:
test_data_with_labels, test_data_without_labels, train = train_test_split(orginal_data)
# save locally
test_data_without_labels.to_csv('split_test_inference_data_100k.csv')
test_data_with_labels.to_csv('split_test_data_100k.csv')
train.to_csv('split_train_data_100k.csv')

#### load data

In [107]:
import pandas as pd
train_data = pd.read_csv('split_train_data_100k.csv')
test_data_with_labels = pd.read_csv('split_test_data_100k.csv')
test_data_inference = pd.read_csv('split_test_inference_data_100k.csv')

#### run cf experiment with evaluation
CF is being run for different n values
NDCG@5 scores are printed as an output


if you want to play with CF settings focus on modyfing scoring function

In [108]:
import numpy as np
# to turn off numpy warnings
np.seterr(all='ignore')

cf = CollaborativeFiltering()
# to check what is the best n for nearest searches
for i in range(5):
    out = cf.fit(train_data, test_data_inference, i)
    print(evaluate(out, test_data_with_labels))


0.3595469173306926
0.4561681861902457
0.4561531091801045
0.4495358276015838
0.45918203671908536


In [140]:
import xgboost as xgb
import pandas as pd
import csv

train_data = pd.read_csv("/Users/kuba/VU/DMT/2/data-mining-techniques/A2/100k_train_data_adjusted.csv")
test_data = pd.read_csv("/Users/kuba/VU/DMT/2/data-mining-techniques/A2/test_data_processed_adjusted.csv")

t_d = train_data
# create a label for train data
t_d['relevance_score'] = 5*t_d["booking_bool"] + t_d["click_bool"] - t_d["position"] - t_d['gross_bookings_usd']
t_d.drop(['click_bool', 'gross_bookings_usd', 'booking_bool', 'position'], inplace=True, axis=1)
label = pd.DataFrame({'relevance_score': t_d['relevance_score'].values})
t_d.drop(['relevance_score'], inplace=True, axis=1)
dtrain = xgb.DMatrix(t_d, label=label)

# train model
param = {'max_depth':6, 'eta':0.3, 'objective':'rank:pairwise'}
model = xgb.train(param, dtrain, 50)

# run inference
test = test_data
test = test.reindex(columns=t_d.columns)
test.reset_index(inplace=True, drop=True)
dtest = xgb.DMatrix(test)
scores = model.predict(dtest)
scores = list(scores)
output_list = []

# sort and save data due to their relevance scores
with open('relevance_scores_submission.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['srch_id', 'prop_id'])
    order_list = []
    prev = None
    for idx, score in enumerate(scores):
        hotel_id = test.iloc[idx]['prop_id']
        srch_id = test.iloc[idx]['srch_id']
        if(prev is not None and srch_id != prev):
            order_list = sorted(order_list, reverse=True)
            order_list = [[x[1], x[2]] for x in order_list]
            # for el in order_list:
            #    output_list.append(el)
            writer.writerows(order_list)
            order_list = []
        prev = srch_id
            
        order_list.append([score, int(srch_id), int(hotel_id)])
# print(output_list)            
# print(evaluate(output_list, test_data_with_labels))

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.