In [None]:
from numpy import dot
from numpy.linalg import norm
import csv
import pandas as pd

# user based collaborative filtering
class CollaborativeFiltering:
    """User-based collaborative filtering over hotel searches.

    For every (search, hotel) pair in the test data, the ``n`` training
    searches for the same hotel that are most similar to the test search are
    looked up, and the hotel is scored from their booking/click/position
    columns. Hotels are then ranked per search by that score.
    """

    # Score bonus added for each categorical feature that matches exactly.
    CATEGORICAL_MATCH_BONUS = 0.15
    # Weight of a booking relative to a click in the hotel score.
    BOOKING_WEIGHT = 5
    # Score given to a hotel that has no training rows at all.
    NO_NEIGHBOURS_SCORE = -10

    # we need more search features
    # we can remove all hotel features
    def __init__(self):
        """Set the search-level columns used to compare two searches."""
        self.search_features = ["srch_adults_count", "srch_saturday_night_bool", "site_id", "visitor_location_country_id", "srch_length_of_stay", "srch_room_count"]

    def cos_sim(self, user_data, test_data):
        """Return the similarity between two searches.

        Both arguments are label-indexed ``pandas.Series`` that contain the
        two categorical labels used below plus numeric features in the same
        order. The result is the cosine similarity of the numeric features
        plus ``CATEGORICAL_MATCH_BONUS`` for every categorical feature whose
        values are equal.
        """
        categorical = ["site_id", "visitor_location_country_id"]

        # Numeric part: plain cosine similarity.
        a = user_data.drop(categorical).to_numpy(dtype=float)
        b = test_data.drop(categorical).to_numpy(dtype=float)
        denominator = norm(a) * norm(b)
        # A zero vector (or a NaN feature) has no defined angle. Treat it as
        # "no numeric similarity" so a NaN never reaches the sort in
        # get_cosine_similarities.
        cos_sim_score = float(dot(a, b) / denominator) if denominator > 0 else 0.0

        # Categorical part: compare by label, not by integer position
        # (positional [] access on a label-indexed Series is deprecated).
        for col in categorical:
            if user_data[col] == test_data[col]:
                cos_sim_score += self.CATEGORICAL_MATCH_BONUS

        return cos_sim_score

    def get_cosine_similarities(self, train_data, user_data):
        """Return ``(similarity, index_label)`` pairs, most similar first.

        ``train_data`` is a DataFrame whose rows are compared against
        ``user_data`` with :meth:`cos_sim`. Ties on similarity are broken by
        the index label, in descending order.
        """
        cosine_scores = [
            (self.cos_sim(user_data, row), label)
            for label, row in train_data.iterrows()
        ]
        return sorted(cosine_scores, reverse=True)

    def find_n_nearest_searches(self, user_data, n, hotel_id):
        """Return the ``n`` training rows for ``hotel_id`` closest to ``user_data``.

        Reads ``self.train_data`` (set by :meth:`fit`). The returned frame
        keeps every training column and is ordered most similar first; it is
        empty when the hotel does not occur in the training data.
        """
        # take only those users who have data for this hotel
        hotel_rows = self.train_data.loc[self.train_data["prop_id"] == hotel_id]
        # Re-number the candidates 0..k-1 so the labels returned by
        # get_cosine_similarities are positions inside ``hotel_rows``. This
        # keeps the lookup correct for any index on ``self.train_data``.
        features = hotel_rows[self.search_features].reset_index(drop=True)
        # get n best rows (with highest cosine similarity)
        nearest = self.get_cosine_similarities(features, user_data)[:n]
        positions = [position for _, position in nearest]
        return hotel_rows.iloc[positions]

    def score_function(self, n_nearest_searches):
        """Return the average hotel score over the given neighbour rows.

        Each row contributes
        ``BOOKING_WEIGHT * booking_bool + click_bool - position``.
        Returns ``NO_NEIGHBOURS_SCORE`` when there are no rows.
        """
        if n_nearest_searches.empty:
            return self.NO_NEIGHBOURS_SCORE
        per_row = (
            self.BOOKING_WEIGHT * n_nearest_searches["booking_bool"]
            + n_nearest_searches["click_bool"]
            - n_nearest_searches["position"]
        )
        # skipna=False keeps a NaN input visible in the result, as a manual
        # row-by-row sum would.
        return per_row.sum(skipna=False) / len(n_nearest_searches)

    def fit(self, train_data, test_data, n=5, output_path="cf.csv"):
        """Rank the hotels of every test search and write the ranking to CSV.

        Parameters
        ----------
        train_data : DataFrame
            Must contain ``prop_id``, ``booking_bool``, ``click_bool``,
            ``position`` and every column in ``self.search_features``.
        test_data : DataFrame
            Must contain ``srch_id``, ``prop_id`` and every column in
            ``self.search_features``.
        n : int
            Number of nearest training searches used per hotel.
        output_path : str
            CSV file to write; defaults to ``'cf.csv'``.

        Returns
        -------
        list
            The ``[srch_id, prop_id]`` rows in the order they were written:
            searches in order of first appearance, hotels best first.
        """
        self.train_data = train_data
        output_list = []
        # newline="" is what the csv module requires for files it writes;
        # without it, extra blank lines appear on platforms that translate
        # line endings.
        with open(output_path, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["srch_id", "prop_id"])
            # One pass over the data instead of re-filtering it per search.
            # sort=False keeps searches in order of first appearance.
            for srch_id, user_searches in test_data.groupby("srch_id", sort=False):
                # The search-level features are taken from the first row of
                # the search and reused for all of its hotels.
                user_data = user_searches[self.search_features].iloc[0]
                hotel_scores = []
                for hotel_id in user_searches["prop_id"]:
                    # n nearest searches that scored a given hotel
                    n_nearest_searches = self.find_n_nearest_searches(user_data, n, hotel_id)
                    hotel_scores.append((self.score_function(n_nearest_searches), hotel_id))

                # ordered list of hotel_ids w.r.t. scores, best first
                hotel_scores.sort(reverse=True)
                # only save hotel ids
                ranked_rows = [[srch_id, int(hotel_id)] for _, hotel_id in hotel_scores]
                writer.writerows(ranked_rows)
                output_list += ranked_rows

        return output_list

    def eval(self, train_data, test_data):
        """Placeholder: evaluation is not implemented here; returns ``None``."""
        pass

# Load the training and test CSVs, then rank the test searches with
# CollaborativeFiltering.
# NOTE(review): both paths are absolute paths on one machine, so this cell
# will not run elsewhere without editing them.
train_data = pd.read_csv("/Users/kuba/VU/DMT/data-mining-techniques/A2/100k_train_data.csv")
test_data = pd.read_csv("/Users/kuba/VU/DMT/data-mining-techniques/A2/processed_test_data.csv")
cf = CollaborativeFiltering()
# fit() writes the ranking to 'cf.csv' in the current working directory.
out = cf.fit(train_data, test_data)

In [None]:
# random ordering as a baseline

In [None]:
import random
import csv

def random_ordering(test_data, output_path='sub_random.csv', seed=None):
    """Write a random per-search hotel ordering to CSV as a baseline.

    Parameters
    ----------
    test_data : DataFrame
        Must contain ``srch_id`` and ``prop_id`` columns.
    output_path : str
        CSV file to write; defaults to ``'sub_random.csv'``.
    seed : int or None
        When given, a private ``random.Random(seed)`` is used so the baseline
        is reproducible. When ``None``, the module-level ``random`` state is
        used.

    The file has a ``srch_id,prop_id`` header followed by one row per
    (search, hotel) pair. Searches are written in order of first appearance,
    and the hotels of each search are shuffled.
    """
    rng = random.Random(seed) if seed is not None else random
    # newline='' is what the csv module requires for files it writes.
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["srch_id", "prop_id"])
        # One grouping pass instead of re-filtering the frame per search.
        for srch_id, user_searches in test_data.groupby("srch_id", sort=False):
            search_list = [[srch_id, int(hotel_id)] for hotel_id in user_searches["prop_id"]]
            rng.shuffle(search_list)
            writer.writerows(search_list)

# Write the random baseline for the test searches (creates 'sub_random.csv'
# in the current working directory).
random_ordering(test_data)

In [None]:
import math

def eval(predicted_ranking, actual_ranking, n=5):
    """Return the mean NDCG@n of a predicted ranking.

    Parameters
    ----------
    predicted_ranking : iterable of rows
        Each row is indexable with ``row[0]`` = srch_id and ``row[1]`` =
        prop_id. Within one search, hotels are listed best first.
    actual_ranking : DataFrame
        Must contain ``srch_id``, ``prop_id``, ``booking_bool`` and
        ``click_bool`` columns.
    n : int
        Cut-off: only the first ``n`` hotels of each search are scored.

    Relevance is 5 for a booked hotel, 1 for a hotel that was clicked but not
    booked, and 0 otherwise (including pairs missing from
    ``actual_ranking``). DCG uses the relevance as the gain and
    ``log2(rank + 1)`` as the discount. The ideal DCG is computed per search
    from that search's own labels, so a perfect ordering scores exactly 1.
    A search with no relevant hotel scores 0.

    Returns 0.0 when ``predicted_ranking`` is empty.

    NOTE: the name shadows the built-in ``eval``; it is kept because callers
    use it.
    """
    # Relevance per (search, hotel) pair, and the relevances of each search.
    relevance = {}
    labels_by_search = {}
    label_rows = zip(
        actual_ranking['srch_id'],
        actual_ranking['prop_id'],
        actual_ranking['booking_bool'],
        actual_ranking['click_bool'],
    )
    for srch_id, prop_id, booked, clicked in label_rows:
        if booked == 1:
            score = 5
        elif clicked == 1:
            score = 1
        else:
            score = 0
        relevance[(srch_id, prop_id)] = score
        labels_by_search.setdefault(srch_id, []).append(score)

    # Predicted hotels per search, in the order given.
    ranked_by_search = {}
    for row in predicted_ranking:
        ranked_by_search.setdefault(row[0], []).append(row[1])

    def discounted_gain(scores):
        # Ranks are 1-based, so the first item is divided by log2(2) == 1.
        return sum(s / math.log2(rank + 1) for rank, s in enumerate(scores, start=1))

    all_ndcg = []
    for srch_id, hotels in ranked_by_search.items():
        # take only first n values per search
        dcg = discounted_gain(relevance.get((srch_id, h), 0) for h in hotels[:n])
        ideal = sorted(labels_by_search.get(srch_id, []), reverse=True)[:n]
        idcg = discounted_gain(ideal)
        # normalize; a search with nothing relevant contributes 0
        all_ndcg.append(dcg / idcg if idcg > 0 else 0.0)

    if not all_ndcg:
        return 0.0
    return sum(all_ndcg) / len(all_ndcg)

In [2]:
import pandas as pd

# Load the three 100k split files from the current working directory.
# Presumably: training rows, test rows with labels, and test rows prepared
# for inference - confirm against how the split files were produced.
# NOTE(review): this rebinds `train_data`, which an earlier cell also assigns.
train_data = pd.read_csv('split_train_data_100k.csv')
test_data_with_labels = pd.read_csv('split_test_data_100k.csv')
test_data_inference = pd.read_csv('split_test_inference_data_100k.csv')

In [None]:
# Re-run collaborative filtering on the split training data.
# NOTE(review): `test_data` is only assigned in the first cell of this
# notebook; the cell above loads `test_data_with_labels` and
# `test_data_inference` instead. Confirm which frame is meant here.
cf = CollaborativeFiltering()
out = cf.fit(train_data, test_data)