In [1]:
import pandas as pd
import numpy as np

from collections import Counter
import operator
from fractions import gcd

### read data

In [2]:
%%time
# Read the two headerless preference tables as NumPy arrays. The `[:, 1:]`
# slice discards the first column (presumably the row's own child/gift id --
# confirm against the CSV layout). `.values` replaces `.as_matrix()`, which
# was deprecated in pandas 0.23 and removed in pandas 1.0.
wish = pd.read_csv('/group/amfs_saving_model/jli/santa/child_wishlist_v2.csv', header=None).values[:, 1:]
gift = pd.read_csv('/group/amfs_saving_model/jli/santa/gift_goodkids_v2.csv', header=None).values[:, 1:]

CPU times: user 8.79 s, sys: 743 ms, total: 9.54 s
Wall time: 9.53 s


In [3]:
# Sanity-check the loaded shapes. The call form of print produces the same
# output on Python 2 and also runs on Python 3, matching the print(...) calls
# used in the later cells.
print(wish.shape)
print(gift.shape)

(1000000, 100)
(1000, 1000)


In [19]:
# answ[child] holds the gift id given to that child; -1 marks "nothing yet"
answ = np.full(len(wish), -1, dtype=np.int32)

# gift_count[gift] counts how many units of that gift have been handed out
gift_count = np.zeros(gift.shape[0], dtype=np.int32)

### get overall happiness
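Collect every (child, gift) pair that scores positive happiness, whether it comes from the child's wishlist or from Santa's list of preferred children for that gift.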

In [5]:
def get_overall_happiness(wish, gift):
    """Score every (child, gift) pair that appears in either preference table.

    A pair listed at position ``r`` of a child's row in ``wish`` earns
    ``(wish.shape[1] - r) * 2000``; a pair listed at position ``r`` of a
    gift's row in ``gift`` earns ``(gift.shape[1] - r) * 2``.  A pair found in
    both tables gets the sum of the two.

    Parameters
    ----------
    wish : 2-D array; row ``c`` lists gift ids in child ``c``'s order.
    gift : 2-D array; row ``g`` lists child ids in gift ``g``'s order.

    Returns
    -------
    dict mapping ``(child, gift)`` tuples to their combined integer score.
    The number of distinct pairs is printed as a side effect.
    """
    # Child side: earlier positions in the wishlist are worth more.
    n_wished = wish.shape[1]
    child_side = {}
    for kid, wished_gifts in enumerate(wish):
        for position, present in enumerate(wished_gifts):
            child_side[(kid, present)] = int((n_wished - position) * 2000)

    # Santa side: earlier positions in the gift's child list are worth more.
    n_liked = gift.shape[1]
    santa_side = {}
    for present, liked_kids in enumerate(gift):
        for position, kid in enumerate(liked_kids):
            santa_side[(kid, present)] = int((n_liked - position) * 2)

    positive_cases = list(set(santa_side.keys()) | set(child_side.keys()))
    print('Positive case tuples (child, gift): {}'.format(len(positive_cases)))

    # A side that does not list the pair contributes nothing to its score.
    return {pair: child_side.get(pair, 0) + santa_side.get(pair, 0)
            for pair in positive_cases}

In [6]:
import time

In [7]:
# Build the (child, gift) -> score table and report how long it took.
start_time = time.time()
happiness = get_overall_happiness(wish, gift)
elapsed = time.time() - start_time

print('Cost %.3f seconds..' % elapsed)

Positive case tuples (child, gift): 100899773
Cost 509.490 seconds..


In [8]:
# Number of distinct (child, gift) pairs that received a score.
len(happiness)

100899773

### get most desired gifts
Rank gifts by their popularity

In [9]:
def get_most_desired_gifts(wish):
    """Count how often each gift id occurs in ``wish`` and rank the gifts.

    Returns a list of ``(gift_id, occurrences)`` tuples ordered from the most
    to the least frequently listed gift.
    """
    tally = dict(Counter(np.ravel(wish)))
    ranked = list(tally.items())
    # Stable sort on the count, largest first.
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked

In [10]:
# Rank gift ids by how many wishlist slots name them, most requested first.
%time best_gifts = get_most_desired_gifts(wish)

CPU times: user 36.9 s, sys: 204 ms, total: 37.1 s
Wall time: 37 s


In [11]:
# The most frequently wished-for gift, as a (gift_id, count) tuple.
best_gifts[0]

(474, 193075)

### normalize happiness based on the popularity of gifts

In [12]:
def recalc_happiness(happiness, best_gifts):
    """Scale each pair's happiness by the popularity of its gift.

    Every score in ``happiness`` (keyed by ``(child, gift)``) is divided by
    ``count / 200000.``, where ``count`` is the gift's entry in ``best_gifts``
    (a sequence of ``(gift, count)`` pairs).  Gifts with a smaller count
    therefore end up with proportionally larger scores.

    Note: ``happiness`` is modified in place and the same dict is returned.
    Raises ``KeyError`` if a gift in ``happiness`` is missing from
    ``best_gifts``.
    """
    popularity = {gift_id: count / 200000. for gift_id, count in best_gifts}

    # Only values are replaced, so iterating over the dict while updating is safe.
    for pair in happiness:
        _, present = pair
        happiness[pair] = happiness[pair] / popularity[present]

    return happiness

In [13]:
# Note: recalc_happiness divides the scores inside `happiness` in place, so
# `happiness_adj` is the same dict object; re-running this cell rescales again.
%time happiness_adj = recalc_happiness(happiness, best_gifts)

CPU times: user 1min 28s, sys: 1.47 s, total: 1min 29s
Wall time: 1min 29s


In [14]:
# Order the ((child, gift), score) items by score, highest first.
%time sorted_happiness = sorted(happiness_adj.items(), key=operator.itemgetter(1), reverse=True)

CPU times: user 7min 7s, sys: 8.7 s, total: 7min 16s
Wall time: 7min 16s


In [15]:
# Best-scoring entry, shaped as ((child, gift), adjusted score).
sorted_happiness[0]

((161400, 494), 2800000000.0000005)

### greedy assignment start

In [20]:
%%time
start_time = time.time()

GIFT_QUANTITY = 1000   # units available of each gift
LAST_TRIPLET = 5000    # children 0..5000 are assigned in groups of three
LAST_TWIN = 45000      # children 5001..45000 are assigned in pairs

# Walk the (child, gift) pairs from best to worst adjusted happiness and hand
# out gifts greedily. enumerate() avoids materialising the huge index list
# that range(len(sorted_happiness)) would build under Python 2.
for i, (pair, _score) in enumerate(sorted_happiness):
    if (i % 1000000 == 0) and (i != 0):
        print('Finsh %d happiness, cost %.3f mins' %(i, (time.time() - start_time) / 60.))

    c, g = pair

    # the child already has a gift (sibling groups are always filled together)
    if answ[c] != -1:
        continue

    # no unit of this gift is left
    if gift_count[g] >= GIFT_QUANTITY:
        continue

    if c <= LAST_TRIPLET:
        # all three siblings get the same gift, so three units must remain
        if gift_count[g] <= GIFT_QUANTITY - 3:
            first = c - c % 3              # triplet groups start at multiples of 3
            answ[first:first + 3] = g
            gift_count[g] += 3
    elif c <= LAST_TWIN:
        # both twins get the same gift, so two units must remain
        if gift_count[g] <= GIFT_QUANTITY - 2:
            first = c if c % 2 == 1 else c - 1   # twin pairs start at odd indices
            answ[first:first + 2] = g
            gift_count[g] += 2
    else:
        answ[c] = g
        gift_count[g] += 1

# Format into one string: print('text', n) would display a tuple on Python 2.
print('Left unhappy children: {}'.format(len(answ[answ == -1])))

Finsh 1000000 happiness, cost 0.079 mins
Finsh 2000000 happiness, cost 0.134 mins
Finsh 3000000 happiness, cost 0.191 mins
Finsh 4000000 happiness, cost 0.245 mins
Finsh 5000000 happiness, cost 0.297 mins
Finsh 6000000 happiness, cost 0.348 mins
Finsh 7000000 happiness, cost 0.398 mins
Finsh 8000000 happiness, cost 0.449 mins
Finsh 9000000 happiness, cost 0.497 mins
Finsh 10000000 happiness, cost 0.546 mins
Finsh 11000000 happiness, cost 0.594 mins
Finsh 12000000 happiness, cost 0.641 mins
Finsh 13000000 happiness, cost 0.691 mins
Finsh 14000000 happiness, cost 0.737 mins
Finsh 15000000 happiness, cost 0.785 mins
Finsh 16000000 happiness, cost 0.831 mins
Finsh 17000000 happiness, cost 0.876 mins
Finsh 18000000 happiness, cost 0.920 mins
Finsh 19000000 happiness, cost 0.964 mins
Finsh 20000000 happiness, cost 1.007 mins
Finsh 21000000 happiness, cost 1.050 mins
Finsh 22000000 happiness, cost 1.092 mins
Finsh 23000000 happiness, cost 1.133 mins
Finsh 24000000 happiness, cost 1.175 mins
F

### check unhappy children

In [21]:
# Indices of children that still have no gift (answ == -1).
np.where(answ==-1)

(array([ 45930,  48599,  49306,  50975,  54163,  55927,  56165,  66974,
         84896,  86570,  97053, 100988, 101304, 108352, 111584, 114636,
        121593, 126961, 133209, 134526, 146715, 147442, 159161, 161019,
        162161, 163596, 171147, 175655, 185782, 191471, 199577, 204784,
        238021, 242780, 245968, 253213, 259379, 261340, 263790, 266913,
        272474, 272623, 274132, 279534, 282775, 283226, 288614, 295265,
        297484, 307810, 312797, 317070, 341499, 341881, 350424, 356769,
        391300, 407820, 414268, 417867, 421403, 427746, 429093, 430830,
        431975, 440304, 445383, 451809, 459682, 460262, 461951, 466819,
        468677, 468704, 478028, 484360, 485448, 491454, 503161, 504823,
        508209, 508873, 508891, 518678, 533522, 534573, 535809, 543050,
        543676, 552422, 558267, 558978, 570656, 572814, 575932, 576822,
        590952, 603687, 603897, 611017, 634098, 640527, 652008, 653767,
        656896, 666092, 677813, 691809, 711715, 720589, 723535, 

In [22]:
# Give each still-unassigned child from index 45001 upward one unit of
# whichever gift has been handed out the fewest times so far. Children below
# that index are not touched by this loop.
for c in range(45001, len(answ)):
    if answ[c] != -1:
        continue
    g = gift_count.argmin()
    answ[c] = g
    gift_count[g] += 1

### check finish

In [23]:
# A remaining -1 in answ means at least one child never received a gift.
has_unassigned = answ.min() == -1
if has_unassigned:
    print('Some children without present')

# No gift may be handed out more than 1000 times.
most_given = gift_count.max()
if most_given > 1000:
    print('Some error in kernel: {}'.format(most_given))

### evaluate

In [26]:
import math

def lcm(a, b):
    """Compute the lowest common multiple of the integers a and b."""
    # Floor division keeps the result an exact integer for large numbers.
    return a * b // gcd(a, b)


def _rank_happiness(ranking, target, n_ranked, ratio):
    """Return ``(n_ranked - position) * ratio`` for ``target`` in ``ranking``.

    ``position`` is the index of the first occurrence of ``target``.
    Returns -1 when ``target`` does not occur in ``ranking`` at all.
    """
    positions = np.flatnonzero(np.asarray(ranking) == target)
    # Check the size explicitly: truth-testing an empty array is deprecated
    # in NumPy, and a plain scalar keeps the running totals scalar as well.
    if positions.size == 0:
        return -1
    return (n_ranked - int(positions[0])) * ratio


def avg_normalized_happiness(pred, gift, wish,
                             n_children=1000000,
                             n_gift_type=1000,
                             n_gift_quantity=1000,
                             n_gift_pref=100,
                             n_child_pref=1000):
    """Score an assignment of gifts to children.

    The score is ``(child_total / child_max) ** 3 + (gift_total / gift_max) ** 3``
    where each total sums the per-pair happiness (position-based, or -1 when
    the pair is not listed) and each max is the largest total attainable.

    Parameters
    ----------
    pred : sequence where ``pred[child]`` is the gift id given to that child.
    gift : 2-D array; row ``g`` lists child ids in gift ``g``'s order.
    wish : 2-D array; row ``c`` lists gift ids in child ``c``'s order.
    n_children : number of children to give to.
    n_gift_type : number of distinct gifts.
    n_gift_quantity : units available of each gift.
    n_gift_pref : number of gifts each child ranks.
    n_child_pref : number of children each gift ranks.

    The keyword defaults are the values this notebook was written for, so
    calling with only ``pred, gift, wish`` behaves as before.

    Returns
    -------
    float score.

    Raises
    ------
    AssertionError
        If triplets or twins do not share a gift, or an id is out of range.
    """
    # int() keeps the group sizes integral on Python 2, where ceil returns float.
    twins = int(math.ceil(0.04 * n_children / 2.)) * 2    # 4% of all population, rounded up to a multiple of 2
    triplets = int(math.ceil(0.005 * n_children / 3.)) * 3    # 0.5% of all population, rounded up to a multiple of 3
    ratio_gift_happiness = 2
    ratio_child_happiness = 2

    # check if triplets have the same gift (they occupy the first indices)
    for t1 in range(0, triplets, 3):
        assert pred[t1] == pred[t1 + 1] and pred[t1 + 1] == pred[t1 + 2]

    # check if twins have the same gift (they follow the triplets)
    for t1 in range(triplets, triplets + twins, 2):
        assert pred[t1] == pred[t1 + 1]

    max_child_happiness = n_gift_pref * ratio_child_happiness
    max_gift_happiness = n_child_pref * ratio_gift_happiness
    total_child_happiness = 0
    total_gift_happiness = np.zeros(n_gift_type)

    for child_id in range(len(pred)):
        gift_id = pred[child_id]

        # check if child_id and gift_id exist
        assert child_id < n_children
        assert gift_id < n_gift_type
        assert child_id >= 0
        assert gift_id >= 0

        total_child_happiness += _rank_happiness(
            wish[child_id], gift_id, n_gift_pref, ratio_child_happiness)
        total_gift_happiness[gift_id] += _rank_happiness(
            gift[gift_id], child_id, n_child_pref, ratio_gift_happiness)

    denominator1 = n_children * max_child_happiness
    denominator2 = n_gift_quantity * max_gift_happiness * n_gift_type
    common_denom = lcm(denominator1, denominator2)
    # Exact integer scale factors that bring both totals onto common_denom.
    # With the default sizes the gift factor is 1, as the original assumed.
    child_multiplier = common_denom // denominator1
    gift_multiplier = common_denom // denominator2

    ret = float(math.pow(total_child_happiness * child_multiplier, 3) +
                math.pow(np.sum(total_gift_happiness) * gift_multiplier, 3)) / float(math.pow(common_denom, 3))
    return ret

In [27]:
score = avg_normalized_happiness(answ, gift, wish)
print('Predicted score: {:.8f}'.format(score))

Predicted score: 0.89410213
