In [60]:
import os
import pandas as pd
import numpy as np
import math
from scipy.optimize import linear_sum_assignment
import datetime as dt
from collections import defaultdict, Counter

## Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [61]:
# Problem dimensions shared by every cell below.
N_CHILDREN = 1000000  # number of children; also the divisor used when normalising the score
N_GIFT_TYPE = 1000  # number of distinct gift ids
N_GIFT_QUANTITY = 1000  # copies of each gift id (used to expand GIFT_IDS)
N_GIFT_PREF = 1000  # normaliser for a gift's ranked list of children
N_CHILD_PREF = 100  # normaliser for a child's ranked wishlist
TRIPLETS = 5001  # submission rows [0, TRIPLETS) are treated as triplets (groups of 3)
TWINS = 45001  # rows [TRIPLETS, TWINS) are treated as twins; rows [TWINS, N_CHILDREN) as singles

In [62]:
# Load both preference tables. Column 0 of each CSV is dropped so that row r
# holds only the ranked preference list that later cells enumerate.
# `axis=1` is passed by keyword: the positional form `drop(0, 1)` is rejected
# by pandas >= 2.0, while the keyword form works on every pandas version.
CHILD_PREF = pd.read_csv('../input/child_wishlist_v2.csv', header=None).drop(0, axis=1).values
GIFT_PREF = pd.read_csv('../input/gift_goodkids_v2.csv', header=None).drop(0, axis=1).values
# Single-argument print(...) produces the same text under Python 2 and 3.
print(CHILD_PREF.shape)
print(GIFT_PREF.shape)

(1000000, 100)
(1000, 1000)


In [63]:
import sys
# Make the helper modules under ../src importable from this notebook.
sys.path.append('../src/')
from progress import ProgressBar
# Shared progress reporter: later cells call pbar.setBar(total) once and
# pbar.show(step) inside their loops.
pbar = ProgressBar()

In [64]:
GIFT_HAPPINESS = {}
pbar.setBar(N_GIFT_TYPE)
for g in range(N_GIFT_TYPE):
    pbar.show(g)
    GIFT_HAPPINESS[g] = defaultdict(lambda: -1. / (2 * N_GIFT_PREF))
    for i, c in enumerate(GIFT_PREF[g]):
        GIFT_HAPPINESS[g][c] = 1. * (N_GIFT_PREF - i) / N_GIFT_PREF

CHILD_HAPPINESS = {}
pbar.setBar(N_CHILDREN)
for c in range(N_CHILDREN):
    pbar.show(c)
    CHILD_HAPPINESS[c] = defaultdict(lambda: -1. / (2 * N_CHILD_PREF))
    for i, g in enumerate(CHILD_PREF[c]):
        CHILD_HAPPINESS[c][g] = 1. * (N_CHILD_PREF - i) / N_CHILD_PREF

%time GIFT_IDS = np.array([[g] * N_GIFT_QUANTITY for g in range(N_GIFT_TYPE)]).flatten()
print(GIFT_IDS[:20])

CPU times: user 66.1 ms, sys: 1.07 ms, total: 67.1 ms
Wall time: 66.4 ms
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [65]:
# Per-triplet happiness tables: row i sums the scores of children 3*i, 3*i+1
# and 3*i+2 for every gift id, so a triplet can be scored as a single unit.
N_TRIPLET = TRIPLETS // 3  # derived instead of hard-coding 1667

# Scores used for (child, gift) pairs that are absent from the preference
# lists; these mirror the default factories of GIFT_HAPPINESS / CHILD_HAPPINESS.
GIFT_MISS = -1. / (2 * N_GIFT_PREF)
CHILD_MISS = -1. / (2 * N_CHILD_PREF)

TRIPLET_GIFT_HAPPINESS = np.zeros((N_TRIPLET, N_GIFT_TYPE))
TRIPLET_CHILD_HAPPINESS = np.zeros((N_TRIPLET, N_GIFT_TYPE))
pbar.setBar(N_TRIPLET)
for i in range(N_TRIPLET):
    pbar.show(i)
    first = 3 * i
    for g in range(N_GIFT_TYPE):
        # .get() is used instead of [] on purpose: indexing a defaultdict stores
        # the fallback for every missing key, which would add millions of
        # entries to the happiness tables just by reading them.
        by_child = GIFT_HAPPINESS[g]
        TRIPLET_GIFT_HAPPINESS[i][g] = (by_child.get(first, GIFT_MISS)
                                        + by_child.get(first + 1, GIFT_MISS)
                                        + by_child.get(first + 2, GIFT_MISS))
        TRIPLET_CHILD_HAPPINESS[i][g] = (CHILD_HAPPINESS[first].get(g, CHILD_MISS)
                                         + CHILD_HAPPINESS[first + 1].get(g, CHILD_MISS)
                                         + CHILD_HAPPINESS[first + 2].get(g, CHILD_MISS))
print(TRIPLET_CHILD_HAPPINESS[0])
print(TRIPLET_GIFT_HAPPINESS[0])

[ 0.59  -0.015 -0.015  0.79   0.04  -0.015 -0.015 -0.015 -0.015 -0.015
 -0.015 -0.015 -0.015 -0.015  0.72  -0.015 -0.015 -0.015  0.05   0.99
 -0.015 -0.015 -0.015 -0.015  0.68  -0.015 -0.015 -0.015 -0.015  0.61
 -0.015  0.33  -0.015 -0.015  0.3   -0.015 -0.015 -0.015 -0.015  0.6    0.06
 -0.015 -0.015 -0.015 -0.015 -0.015 -0.015 -0.015  0.07   0.28  -0.015
  0.515 -0.015 -0.015  0.67  -0.015 -0.015 -0.015 -0.015  0.4   -0.015
  0.99   0.1   -0.015  0.43  -0.015  0.45  -0.015 -0.015 -0.015 -0.015
  1.525  0.44  -0.015 -0.015 -0.015  0.55  -0.015 -0.015  0.18   0.4   -0.015
  0.34  -0.015  0.395  0.68  -0.015 -0.015 -0.015 -0.015 -0.015  0.17
 -0.015  0.    -0.015  0.97   0.05  -0.015 -0.015 -0.015 -0.015 -0.015
 -0.015 -0.015  0.21  -0.015  0.81  -0.015  0.94  -0.015  0.51  -0.015
 -0.015 -0.015 -0.015  0.07  -0.015 -0.015 -0.015 -0.015  0.08   0.97
 -0.015  0.75  -0.015  0.82  -0.015  0.95  -0.015 -0.015 -0.015 -0.015
 -0.015  0.13   1.51  -0.015  0.8    0.66  -0.015 -0.015 -0.015 -0.0

In [66]:
# Per-twin-pair happiness tables: row i sums the scores of children
# TRIPLETS + 2*i and TRIPLETS + 2*i + 1 for every gift id.
N_TWIN = (TWINS - TRIPLETS) // 2  # derived instead of hard-coding 20000

# Scores used for (child, gift) pairs that are absent from the preference
# lists; these mirror the default factories of GIFT_HAPPINESS / CHILD_HAPPINESS.
GIFT_MISS = -1. / (2 * N_GIFT_PREF)
CHILD_MISS = -1. / (2 * N_CHILD_PREF)

TWIN_GIFT_HAPPINESS = np.zeros((N_TWIN, N_GIFT_TYPE))
TWIN_CHILD_HAPPINESS = np.zeros((N_TWIN, N_GIFT_TYPE))
pbar.setBar(N_TWIN)
for i in range(N_TWIN):
    pbar.show(i)
    first = TRIPLETS + i * 2
    for g in range(N_GIFT_TYPE):
        # .get() is used instead of [] on purpose: indexing a defaultdict stores
        # the fallback for every missing key, and this loop performs tens of
        # millions of mostly-missing lookups.
        by_child = GIFT_HAPPINESS[g]
        TWIN_GIFT_HAPPINESS[i][g] = (by_child.get(first, GIFT_MISS)
                                     + by_child.get(first + 1, GIFT_MISS))
        TWIN_CHILD_HAPPINESS[i][g] = (CHILD_HAPPINESS[first].get(g, CHILD_MISS)
                                      + CHILD_HAPPINESS[first + 1].get(g, CHILD_MISS))
print(TWIN_CHILD_HAPPINESS[0])
print(TWIN_GIFT_HAPPINESS[0])

[ 0.855 -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01
  0.775 -0.01  -0.01  -0.01  -0.01  -0.01  -0.01   0.585  0.645 -0.01  -0.01
  0.305 -0.01  -0.01  -0.01   0.545 -0.01  -0.01  -0.01  -0.01  -0.01  -0.01
 -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01   0.395
 -0.01  -0.01  -0.01  -0.01   0.445 -0.01  -0.01  -0.01   0.425 -0.01  -0.01
  0.695 -0.01   0.115 -0.01  -0.01  -0.01  -0.01   0.255 -0.01  -0.01
  0.625  0.355 -0.01  -0.01   0.885 -0.01  -0.01   0.135 -0.01   0.285
 -0.01  -0.01  -0.01  -0.01  -0.01  -0.01   0.005  0.035 -0.01  -0.01  -0.01
 -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01
 -0.01  -0.01  -0.01  -0.01  -0.01   0.715 -0.01  -0.01   0.895 -0.01
  0.395  0.015  1.76  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01  -0.01
 -0.01  -0.01   0.735 -0.01  -0.01  -0.01  -0.01   0.575  0.805 -0.01  -0.01
 -0.01   0.155 -0.01  -0.01  -0.01   0.715 -0.01  -0.01  -0.01  -0.01  -0.01
 -0.01  -0.01  -0.01   0.

In [67]:
# Assignment file used as the starting point. NOTE: the last cell of this
# notebook writes its result to the same path ('../src/twtr.csv'), so a full
# run overwrites its own input.
INITIAL_SUBMISSION = '../src/twtr.csv'

In [68]:
def my_avg_normalized_happiness(pred):
    """Score a complete assignment.

    Parameters
    ----------
    pred : sequence of [child_id, gift_id] pairs.

    Returns
    -------
    tuple ``(score, gift_total, child_total)`` where
    ``score = nch**3 + ngh**3`` (``nch`` / ``ngh`` being the normalised child
    and gift happiness), ``gift_total = ngh * N_CHILDREN`` and
    ``child_total = nch * N_CHILDREN``. Note that the gift term comes before
    the child term in the returned tuple.
    """
    total_child_happiness = 0.
    # One accumulator per gift id; sized from the notebook constant rather
    # than a literal 1000.
    total_gift_happiness = np.zeros(N_GIFT_TYPE)
    print("COMPUTE NORMALIZED HAPPINESS:")
    pbar.setBar(len(pred))
    for i, (c, g) in enumerate(pred):
        pbar.show(i)
        total_child_happiness += CHILD_HAPPINESS[c][g]
        total_gift_happiness[g] += GIFT_HAPPINESS[g][c]
    nch = total_child_happiness / N_CHILDREN
    # The mean over gift types divided by the copies per type equals the sum
    # divided by the total number of gifts.
    ngh = np.mean(total_gift_happiness) / N_GIFT_QUANTITY
    print('normalized child happiness', nch)
    print('normalized gift happiness', ngh)
    return nch**3. + ngh**3., ngh*N_CHILDREN, nch*N_CHILDREN

In [69]:
# Load the starting assignment and report its score.
subm = pd.read_csv(INITIAL_SUBMISSION)
initial_anh, g, c = my_avg_normalized_happiness(subm[['ChildId', 'GiftId']].values.tolist())
print(initial_anh, g, c)
# Number the children inside each gift group 0, 1, 2, ... by ChildId.
# The column is selected explicitly: ranking the whole frame returns a
# DataFrame, and assigning that to one column only works while the CSV has
# exactly one non-key column.
subm['gift_rank'] = subm.groupby('GiftId')['ChildId'].rank() - 1
# Unique id of the physical gift copy: gift type * copies per type + copy index.
subm['gift_id'] = (subm['GiftId'] * N_GIFT_QUANTITY + subm['gift_rank']).astype(np.int32)
current_gift_ids = subm['gift_id'].values

COMPUTE NORMALIZED HAPPINESS:
('normalized child happiness', 0.9659594199968226)
('normalized gift happiness', 0.00081820600000001033)
(0.90131509889959793, 818.20600000001036, 965959.4199968226)


In [70]:
from IPython.display import display
# Rich preview of the first eight rows of the working submission.
display(subm.head(8))

Unnamed: 0,ChildId,GiftId,gift_rank,gift_id
0,0,121,0.0,121000
1,1,121,1.0,121001
2,2,121,2.0,121002
3,3,292,0.0,292000
4,4,292,1.0,292001
5,5,292,2.0,292002
6,6,791,0.0,791000
7,7,791,1.0,791001


In [71]:
# Rows [0, TRIPLETS) of the submission as a plain (ChildId, GiftId) array.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0; the
# returned array is the same.
tri_tmp = subm[['ChildId', 'GiftId']][:TRIPLETS].values
print(tri_tmp.shape)
print(tri_tmp[:10])

(5001, 2)
[[  0 121]
 [  1 121]
 [  2 121]
 [  3 292]
 [  4 292]
 [  5 292]
 [  6 791]
 [  7 791]
 [  8 791]
 [  9 240]]


In [72]:
# One row per triplet: (triplet index, gift of the triplet's first child).
tri_gifts = [tri_tmp[3 * k][1] for k in range(N_TRIPLET)]
tri_mat = np.array(list(zip(range(N_TRIPLET), tri_gifts)))
print(tri_mat.shape)
# In this frame 'ChildId' holds the triplet index, not an individual child id.
tri_df = pd.DataFrame({'ChildId': tri_mat[:, 0], 'GiftId': tri_mat[:, 1]})
display(tri_df.head(10))

(1667, 2)


Unnamed: 0,ChildId,GiftId
0,0,121
1,1,292
2,2,791
3,3,240
4,4,494
5,5,487
6,6,77
7,7,542
8,8,896
9,9,302


In [73]:
# Rows [TRIPLETS, TWINS) of the submission as a plain (ChildId, GiftId) array.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0; the
# returned array is the same.
twin_tmp = subm[['ChildId', 'GiftId']][TRIPLETS: TWINS].values
print(twin_tmp.shape)
print(twin_tmp[:10])

(40000, 2)
[[5001  768]
 [5002  768]
 [5003  826]
 [5004  826]
 [5005  688]
 [5006  688]
 [5007  648]
 [5008  648]
 [5009  530]
 [5010  530]]


In [74]:
N_TWIN = 20000
# One row per twin pair: (pair index, gift of the pair's first child).
twin_gifts = [twin_tmp[2 * k][1] for k in range(N_TWIN)]
twin_mat = np.array(list(zip(range(N_TWIN), twin_gifts)))
print(twin_mat.shape)
# In this frame 'ChildId' holds the pair index, not an individual child id.
twin_df = pd.DataFrame({'ChildId': twin_mat[:, 0], 'GiftId': twin_mat[:, 1]})
display(twin_df.head(10))

(20000, 2)


Unnamed: 0,ChildId,GiftId
0,0,768
1,1,826
2,2,688
3,3,648
4,4,530
5,5,177
6,6,430
7,7,410
8,8,705
9,9,625


In [75]:
# Everything from row TWINS onward is handled one child at a time.
single_df = subm[['ChildId', 'GiftId']].iloc[TWINS:]
display(single_df.head(10))

Unnamed: 0,ChildId,GiftId
45001,45001,315
45002,45002,183
45003,45003,803
45004,45004,927
45005,45005,129
45006,45006,590
45007,45007,42
45008,45008,911
45009,45009,525
45010,45010,273


In [76]:
def tri_happiness(pred):
    """Sum the triplet-level happiness of an assignment.

    `pred` is a sequence of [triplet index, gift id] pairs; each pair is looked
    up in TRIPLET_GIFT_HAPPINESS and TRIPLET_CHILD_HAPPINESS. Both totals are
    printed and returned as ``(gift_total, child_total)``.
    """
    gift_total = 0.
    child_total = 0.
    for triplet, gift in pred:
        gift_total += TRIPLET_GIFT_HAPPINESS[triplet][gift]
        child_total += TRIPLET_CHILD_HAPPINESS[triplet][gift]
    print('triplet child happiness', child_total)
    print('triplet gift happiness', gift_total)
    return gift_total, child_total

In [77]:
# Score the current triplet assignment (one [triplet index, GiftId] row per triplet).
tr_gh, tr_ch = tri_happiness(tri_df[['ChildId', 'GiftId']].values.tolist())

('triplet child happiness', 1557.2700000000123)
('triplet gift happiness', 17.174499999999924)


In [78]:
def twin_happiness(pred):
    """Sum the twin-level happiness of an assignment.

    `pred` is a sequence of [pair index, gift id] pairs; each pair is looked up
    in TWIN_GIFT_HAPPINESS and TWIN_CHILD_HAPPINESS. Both totals are printed
    and returned as ``(gift_total, child_total)``.
    """
    gift_total = 0.
    child_total = 0.
    for pair, gift in pred:
        gift_total += TWIN_GIFT_HAPPINESS[pair][gift]
        child_total += TWIN_CHILD_HAPPINESS[pair][gift]
    print('twin child happiness', child_total)
    print('twin gift happiness', gift_total)
    return gift_total, child_total
# Score the current twin assignment (one [pair index, GiftId] row per twin pair).
tw_gh, tw_ch = twin_happiness(twin_df[['ChildId', 'GiftId']].values.tolist())

('twin child happiness', 20168.060000001948)
('twin gift happiness', 109.3194999999758)


In [79]:
def single_happiness(pred):
    """Sum gift-side and child-side happiness over individual children.

    `pred` is a sequence of [child id, gift id] pairs. Progress is reported
    through the shared `pbar`; both totals are printed and returned as
    ``(gift_total, child_total)``.
    """
    gift_total = 0.
    child_total = 0.
    print("COMPUTE SINGLE HAPPINESS...")
    pbar.setBar(len(pred))
    for position, pair in enumerate(pred):
        pbar.show(position)
        child, gift = pair
        gift_total += GIFT_HAPPINESS[gift][child]
        child_total += CHILD_HAPPINESS[child][gift]
    print('single child happiness', child_total)
    print('single gift happiness', gift_total)
    return gift_total, child_total
# Score the children in rows [TWINS, end) of the submission individually.
si_gh, si_ch = single_happiness(subm[['ChildId', 'GiftId']][TWINS:].values.tolist())

COMPUTE SINGLE HAPPINESS...
('single child happiness', 944234.0899969263)
('single gift happiness', 691.7120000092888)


In [80]:
# Recombine the single / twin / triplet totals (summed in that order) and
# cube-sum the per-child averages to get the overall score.
gh = sum([si_gh, tw_gh, tr_gh])
ch = sum([si_ch, tw_ch, tr_ch])
score = sum((total / N_CHILDREN) ** 3. for total in (gh, ch))
print("CURRENT SCORE = {0}".format(str(score)))

CURRENT SCORE = 0.9013150989


In [81]:
### Define a new entropy term
def entropy(gh, ch, g, c):
    """Cubic gain of adding g to gh and c to ch.

    Algebraically this is (gh + g)**3 - gh**3 + (ch + c)**3 - ch**3, written
    in expanded form and accumulated term by term from left to right.
    """
    gain = 3.*gh*g*(g + gh)
    gain = gain + g**3
    gain = gain + 3.*ch*c*(c + ch)
    gain = gain + c**3
    return gain
### Optimize the total entropy
def _peek_happiness(table, key):
    """Return table[key] without letting a defaultdict store its fallback."""
    if key in table:
        return table[key]
    factory = getattr(table, 'default_factory', None)
    if factory is None:
        # Plain mapping: behave exactly like a normal lookup (raises KeyError).
        return table[key]
    return factory()


def optimize_single_block(child_block, gift_block, gh, ch):
    """Re-assign the gifts of one block of children among themselves.

    Parameters
    ----------
    child_block : array of child ids.
    gift_block : array of gift ids, one per child, currently held by the block.
    gh, ch : gift / child happiness totals passed through to `entropy`.

    Returns
    -------
    ``(children, gifts)``: two aligned arrays giving the assignment chosen by
    `linear_sum_assignment` on the negated `entropy` gain of every
    (child, gift) combination in the block.
    """
    SIZE = int(len(child_block))
    C = np.zeros((SIZE, SIZE))
    for i in range(SIZE):
        c = child_block[i]
        child_scores = CHILD_HAPPINESS[c]
        for j in range(SIZE):
            g = gift_block[j]
            # Most pairs are not in either preference list. Reading them with
            # [] would store SIZE**2 fallback entries in the defaultdicts on
            # every call, so the values are peeked at instead.
            C[i, j] = -1. * entropy(gh, ch,
                                    _peek_happiness(GIFT_HAPPINESS[g], c),
                                    _peek_happiness(child_scores, g))
    row_ind, col_ind = linear_sum_assignment(C)
    return (child_block[row_ind], gift_block[col_ind])

In [82]:
def optimize_twin_block(child_block, gift_block, gh, ch):
    """Re-assign the gifts of one block of twin pairs among themselves.

    `child_block` holds twin-pair indices and `gift_block` the gift ids they
    currently hold. The cost of every (pair, gift) combination is the negated
    `entropy` gain read from the TWIN_* tables; `linear_sum_assignment` picks
    the assignment and the aligned ``(pairs, gifts)`` arrays are returned.
    """
    n = int(len(child_block))
    cost = np.zeros((n, n))
    for row, col in np.ndindex(n, n):
        pair = child_block[row]
        gift = gift_block[col]
        gain = entropy(gh, ch, TWIN_GIFT_HAPPINESS[pair][gift], TWIN_CHILD_HAPPINESS[pair][gift])
        cost[row, col] = -1. * gain
    rows, cols = linear_sum_assignment(cost)
    return (child_block[rows], gift_block[cols])

In [83]:
def optimize_triplet_block(child_block, gift_block, gh, ch):
    """Re-assign the gifts of one block of triplets among themselves.

    `child_block` holds triplet indices and `gift_block` the gift ids they
    currently hold. Each cost-matrix row is filled with the negated `entropy`
    gain read from the TRIPLET_* tables; `linear_sum_assignment` picks the
    assignment and the aligned ``(triplets, gifts)`` arrays are returned.
    """
    n = int(len(child_block))
    cost = np.zeros((n, n))
    for row in range(n):
        trio = child_block[row]
        gift_row = TRIPLET_GIFT_HAPPINESS[trio]
        child_row = TRIPLET_CHILD_HAPPINESS[trio]
        cost[row, :] = [
            -1. * entropy(gh, ch, gift_row[gift_block[col]], child_row[gift_block[col]])
            for col in range(n)
        ]
    rows, cols = linear_sum_assignment(cost)
    return (child_block[rows], gift_block[cols])

In [84]:
BLOCK_SIZE = 261
# Ceiling division, written with // so the result is an int on Python 2 and 3.
N_BLOCKS = (N_CHILDREN - TWINS + BLOCK_SIZE - 1) // BLOCK_SIZE
## optimize single
print("{0} {1}".format(gh, ch))
# Work on a private copy: the array behind a column can be a read-only view
# (pandas copy-on-write), and writing through a view would also change `subm`
# before the explicit assignment below.
single_ids = subm['GiftId'].values.copy()
# array_split tolerates a last block that is shorter than BLOCK_SIZE; np.split
# raises unless the number of single children is an exact multiple of it.
child_blocks = np.array_split(np.random.permutation(range(TWINS, N_CHILDREN)), N_BLOCKS)
# Only the first 200 random blocks are optimised per run (fewer if there are
# not that many blocks).
n_rounds = min(200, N_BLOCKS)
pbar.setBar(n_rounds)
for j in range(n_rounds):
    pbar.show(j)
    child_block = child_blocks[j]
    # single_ids is indexed by child id: position k holds child k's gift.
    gift_block = single_ids[child_block]
    # gh / ch stay fixed at the totals computed before this cell.
    cids, gids = optimize_single_block(child_block, gift_block, gh=gh, ch=ch)
    single_ids[cids] = gids
subm['GiftId'] = single_ids
si_gh, si_ch = single_happiness(subm[['ChildId', 'GiftId']][TWINS:].values.tolist())
gh = si_gh + tw_gh + tr_gh
ch = si_ch + tw_ch + tr_ch
score = (gh/N_CHILDREN)**3. + (ch/N_CHILDREN)**3.
print(score, gh, ch, si_gh, si_ch)
print("\n\n")

818.206000009 965959.419997
COMPUTE SINGLE HAPPINESS...
('single child happiness', 944235.499996926)
('single gift happiness', 691.7120000092888)
(0.90131904582390698, 818.20600000926459, 965960.82999692799, 691.7120000092888, 944235.499996926)





In [85]:
# Work on a private copy: the array behind a column can be a read-only view
# (pandas copy-on-write), and the loop below writes into it.
triplet_idx = tri_df['GiftId'].values.copy()
for j in range(1):
    # Pick up to 200 random triplets and let them exchange gifts among themselves.
    triplet_block = np.random.permutation(range(0, N_TRIPLET))[:200]
    tr_g_block = triplet_idx[triplet_block]
    print("{0} {1}".format(tr_g_block.shape, triplet_block.shape))
    cids, gids = optimize_triplet_block(triplet_block, tr_g_block, gh=gh, ch=ch)
    triplet_idx[cids] = gids
tri_df['GiftId'] = triplet_idx
tr_gh, tr_ch = tri_happiness(tri_df[['ChildId', 'GiftId']].values.tolist())
gh = si_gh + tw_gh + tr_gh
ch = si_ch + tw_ch + tr_ch
score = (gh/N_CHILDREN)**3. + (ch/N_CHILDREN)**3.
print(score, gh, ch, si_gh, si_ch)
print("\n\n")

(200,) (200,)
('triplet child happiness', 1573.1950000000124)
('triplet gift happiness', 13.806999999999924)
(0.90136362446462603, 814.83850000926452, 965976.75499692804, 691.7120000092888, 944235.499996926)





In [86]:
# Work on a private copy: the array behind a column can be a read-only view
# (pandas copy-on-write), and the loop below writes into it.
twin_idx = twin_df['GiftId'].values.copy()
for j in range(5):
    # Pick up to 200 random twin pairs and let them exchange gifts among
    # themselves. gh / ch stay fixed at the totals from before this cell.
    twin_block = np.random.permutation(range(0, N_TWIN))[:200]
    tw_g_block = twin_idx[twin_block]
    cids, gids = optimize_twin_block(twin_block, tw_g_block, gh=gh, ch=ch)
    twin_idx[cids] = gids
twin_df['GiftId'] = twin_idx
tw_gh, tw_ch = twin_happiness(twin_df[['ChildId', 'GiftId']].values.tolist())
gh = si_gh + tw_gh + tr_gh
ch = si_ch + tw_ch + tr_ch
score = (gh/N_CHILDREN)**3. + (ch/N_CHILDREN)**3.
print(score, gh, ch, si_gh, si_ch)
print("\n\n")

('twin child happiness', 20217.175000001916)
('twin gift happiness', 106.98849999997587)
(0.90150112070447541, 812.5075000092645, 966025.86999692803, 691.7120000092888, 944235.499996926)





In [88]:
# Expand the per-triplet gifts back to one entry per child (3 per triplet).
tri_list = tri_df.GiftId.tolist()
tl = []
for triplet_gift in tri_list:
    tl.extend([triplet_gift] * 3)
print(len(tl))
print(tl)

5001
[121, 121, 121, 292, 292, 292, 791, 791, 791, 240, 240, 240, 494, 494, 494, 487, 487, 487, 77, 77, 77, 542, 542, 542, 896, 896, 896, 302, 302, 302, 418, 418, 418, 632, 632, 632, 872, 872, 872, 359, 359, 359, 794, 794, 794, 467, 467, 467, 566, 566, 566, 692, 692, 692, 687, 687, 687, 804, 804, 804, 683, 683, 683, 302, 302, 302, 756, 756, 756, 46, 46, 46, 353, 353, 353, 894, 894, 894, 27, 27, 27, 455, 455, 455, 389, 389, 389, 88, 88, 88, 294, 294, 294, 671, 671, 671, 475, 475, 475, 186, 186, 186, 320, 320, 320, 894, 894, 894, 998, 998, 998, 409, 409, 409, 356, 356, 356, 872, 872, 872, 225, 225, 225, 176, 176, 176, 995, 995, 995, 950, 950, 950, 48, 48, 48, 998, 998, 998, 824, 824, 824, 692, 692, 692, 819, 819, 819, 796, 796, 796, 788, 788, 788, 791, 791, 791, 320, 320, 320, 903, 903, 903, 884, 884, 884, 46, 46, 46, 893, 893, 893, 180, 180, 180, 92, 92, 92, 437, 437, 437, 148, 148, 148, 921, 921, 921, 219, 219, 219, 171, 171, 171, 267, 267, 267, 309, 309, 309, 776, 776, 776, 640, 640, 

In [89]:
# Expand the per-pair gifts back to one entry per child (2 per twin pair).
twin_list = twin_df.GiftId.tolist()
tw = []
for pair_gift in twin_list:
    tw.extend([pair_gift] * 2)
print(len(tw))
print(tw)

40000
[768, 768, 826, 826, 688, 688, 648, 648, 530, 530, 177, 177, 430, 430, 410, 410, 705, 705, 625, 625, 193, 193, 943, 943, 336, 336, 118, 118, 815, 815, 963, 963, 13, 13, 709, 709, 784, 784, 760, 760, 614, 614, 123, 123, 42, 42, 686, 686, 199, 199, 812, 812, 684, 684, 279, 279, 154, 154, 351, 351, 681, 681, 645, 645, 800, 800, 670, 670, 272, 272, 118, 118, 272, 272, 45, 45, 144, 144, 998, 998, 136, 136, 489, 489, 970, 970, 457, 457, 409, 409, 609, 609, 654, 654, 494, 494, 163, 163, 282, 282, 933, 933, 338, 338, 494, 494, 109, 109, 907, 907, 837, 837, 978, 978, 67, 67, 734, 734, 671, 671, 412, 412, 192, 192, 824, 824, 468, 468, 300, 300, 785, 785, 273, 273, 171, 171, 102, 102, 917, 917, 813, 813, 629, 629, 599, 599, 974, 974, 573, 573, 631, 631, 304, 304, 922, 922, 10, 10, 176, 176, 220, 220, 958, 958, 460, 460, 112, 112, 500, 500, 92, 92, 433, 433, 190, 190, 150, 150, 118, 118, 270, 270, 751, 751, 150, 150, 663, 663, 875, 875, 682, 682, 240, 240, 894, 894, 997, 997, 832, 832, 999, 

In [91]:
# Stitch the three groups back together in child-id order:
# triplets, then twins, then singles.
single_list = subm.GiftId.iloc[TWINS:].tolist()
dic = {
    'ChildId': list(range(N_CHILDREN)),
    'GiftId': tl + tw + single_list,
}
output = pd.DataFrame(dic)
display(output.head(10))

Unnamed: 0,ChildId,GiftId
0,0,121
1,1,121
2,2,121
3,3,292
4,4,292
5,5,292
6,6,791
7,7,791
8,8,791
9,9,240


In [93]:
# Persist the improved assignment. This is the same path that
# INITIAL_SUBMISSION points at, so the starting file is overwritten in place.
output.to_csv('../src/twtr.csv', index=False)