In [6]:
import os
import pandas as pd
import numpy as np
import math
from scipy.optimize import linear_sum_assignment
import datetime as dt
from collections import defaultdict, Counter

## Visualization
import matplotlib.pyplot as plt
import seaborn as sns

###### Some constants

In [7]:
N_CHILDREN = 1000000
N_GIFT_TYPE = 1000
N_GIFT_QUANTITY = 1000
N_GIFT_PREF = 1000
N_CHILD_PREF = 100
TRIPLETS = 5001
TWINS = 45001

###### Load data

In [8]:
CHILD_PREF = pd.read_csv('../input/child_wishlist_v2.csv', header=None).drop(0, 1).values
GIFT_PREF = pd.read_csv('../input/gift_goodkids_v2.csv', header=None).drop(0, 1).values
print CHILD_PREF.shape
print GIFT_PREF.shape

(1000000, 100)
(1000, 1000)


###### To show ProgressBar

In [9]:
import sys
sys.path.append('../src/')
from progress import ProgressBar
pbar = ProgressBar()

###### Creating dicts to hash happiness

In [10]:
GIFT_HAPPINESS = {}
pbar.setBar(N_GIFT_TYPE)
for g in range(N_GIFT_TYPE):
    pbar.show(g)
    GIFT_HAPPINESS[g] = defaultdict(lambda: -1. / (2 * N_GIFT_PREF))
    for i, c in enumerate(GIFT_PREF[g]):
        GIFT_HAPPINESS[g][c] = 1. * (N_GIFT_PREF - i) / N_GIFT_PREF

CHILD_HAPPINESS = {}
pbar.setBar(N_CHILDREN)
for c in range(N_CHILDREN):
    pbar.show(c)
    CHILD_HAPPINESS[c] = defaultdict(lambda: -1. / (2 * N_CHILD_PREF))
    for i, g in enumerate(CHILD_PREF[c]):
        CHILD_HAPPINESS[c][g] = 1. * (N_CHILD_PREF - i) / N_CHILD_PREF

%time GIFT_IDS = np.array([[g] * N_GIFT_QUANTITY for g in range(N_GIFT_TYPE)]).flatten()
print(GIFT_IDS[:20])

CPU times: user 63.2 ms, sys: 7.03 ms, total: 70.2 ms
Wall time: 72.7 ms
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


###### To compute normalized happiness

In [11]:
def my_avg_normalized_happiness(pred):
    total_child_happiness = 0
    total_gift_happiness = np.zeros(1000)
    print "COMPUTE NORMALIZED HAPPINESS:"
    pbar.setBar(len(pred))
    for i, [c,g] in enumerate(pred):
        pbar.show(i)
        #c = pred[i][0]
        #g = pred[i][1]
        total_child_happiness += CHILD_HAPPINESS[c][g]
        total_gift_happiness[g] += GIFT_HAPPINESS[g][c]
    nch = total_child_happiness / N_CHILDREN
    ngh = np.mean(total_gift_happiness) / 1000
    print('normalized child happiness', nch)
    print('normalized gift happiness', ngh)
    return nch**3. + ngh**3., ngh*N_CHILDREN, nch*N_CHILDREN

In [12]:
### Define a new entropy term
def entropy(gh, ch, g, c):
    return 3.*gh*g*(g + gh) + g**3 + 3.*ch*c*(c + ch) + c**3
### Optimize the total entropy
def optimize_block(child_block, current_gift_ids, gh, ch):
    gift_block = current_gift_ids[child_block]
    C = np.zeros((BLOCK_SIZE, BLOCK_SIZE))
    for i in range(BLOCK_SIZE):
        c = child_block[i]
        for j in range(BLOCK_SIZE):
            g = GIFT_IDS[gift_block[j]]
            C[i, j] = -1. * entropy(gh, ch, GIFT_HAPPINESS[g][c], CHILD_HAPPINESS[c][g])
    row_ind, col_ind = linear_sum_assignment(C)
    return (child_block[row_ind], gift_block[col_ind])

###### Define Block dimenstions

In [13]:
BLOCK_SIZE = 261
INITIAL_SUBMISSION = '../src/twtr.csv'
N_BLOCKS = int((N_CHILDREN - TWINS + BLOCK_SIZE - 1) / BLOCK_SIZE)
print('Block size: {}, n_blocks {}'.format(BLOCK_SIZE, N_BLOCKS))

Block size: 261, n_blocks 3659


In [14]:
subm = pd.read_csv(INITIAL_SUBMISSION)
initial_anh, g, c = my_avg_normalized_happiness(subm[['ChildId', 'GiftId']].values.tolist())
print(initial_anh, g, c)
subm['gift_rank'] = subm.groupby('GiftId').rank() - 1
subm['gift_id'] = subm['GiftId'] * 1000 + subm['gift_rank']
subm['gift_id'] = subm['gift_id'].astype(np.int32)
current_gift_ids = subm['gift_id'].values

COMPUTE NORMALIZED HAPPINESS:
('normalized child happiness', 0.9659252349968284)
('normalized gift happiness', 0.00081807500000001033)
(0.90121941051243415, 818.07500000001028, 965925.2349968284)


###### Doing optimization

In [15]:
# number of iteration = 20
for i in range(1):
    print "Iteration #{0}".format(str(i))
    child_blocks = np.split(np.random.permutation(range(TWINS, N_CHILDREN)), N_BLOCKS)
    pbar.setBar(200)
    for j in range(200):
        pbar.show(j)
        child_block = child_blocks[j]
        cids, gids = optimize_block(child_block, current_gift_ids=current_gift_ids, gh=g, ch=c)
        current_gift_ids[cids] = gids
    subm['GiftId'] = GIFT_IDS[current_gift_ids]
    anh, g, c = my_avg_normalized_happiness(subm[['ChildId', 'GiftId']].values.tolist())
    print(i, anh, g, c)
    print "\n\n"
print(GIFT_IDS[:20])

Iteration #0
COMPUTE NORMALIZED HAPPINESS:
('normalized child happiness', 0.9659265699968284)
('normalized gift happiness', 0.0008175605000000102)
(0, 0.90122314722786245, 817.56050000001017, 965926.5699968283)



[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [16]:
from IPython.display import display
display(subm[45001: 45011])

Unnamed: 0,ChildId,GiftId,gift_rank,gift_id
45001,45001,315,10.0,315010
45002,45002,183,17.0,183017
45003,45003,803,16.0,803016
45004,45004,927,85.0,927085
45005,45005,129,39.0,129039
45006,45006,590,34.0,590034
45007,45007,42,54.0,42054
45008,45008,911,28.0,911028
45009,45009,525,41.0,525041
45010,45010,273,56.0,273056


In [23]:
child_block = np.arange(45001, 45101)
print child_block

[45001 45002 45003 45004 45005 45006 45007 45008 45009 45010 45011 45012
 45013 45014 45015 45016 45017 45018 45019 45020 45021 45022 45023 45024
 45025 45026 45027 45028 45029 45030 45031 45032 45033 45034 45035 45036
 45037 45038 45039 45040 45041 45042 45043 45044 45045 45046 45047 45048
 45049 45050 45051 45052 45053 45054 45055 45056 45057 45058 45059 45060
 45061 45062 45063 45064 45065 45066 45067 45068 45069 45070 45071 45072
 45073 45074 45075 45076 45077 45078 45079 45080 45081 45082 45083 45084
 45085 45086 45087 45088 45089 45090 45091 45092 45093 45094 45095 45096
 45097 45098 45099 45100]


In [24]:
gift_block = current_gift_ids[child_block]
print gift_block

[315010 183017 803016 927085 129039 590034  42054 911028 525041 273056
 848069 779018 791055 619090 160034 575012  49023 883029 893073 586091
 810020 192045 996028 851016 747024 648035 223022 108018 305014 274040
 891020 542029 851017 222088 205030  38010 780010 535043  17079 372058
 313029 189022 294069 600035 789046 458024 562012 192046  48023 683042
 787020 919045 400034 775043 839029 574037 709078 857030 785028 109047
 785029 704020 271029 639062 542030 439041  41082 532446 720027 969042
  18032 656081  62016  87096 332024 886022 331015 703094 481024 770016
  24022  39032 197016 728041 399022  73021 775044 502010  39033  39034
 928016 111054 184038 139026 252025 413016 513069  31048 449078 164014]


In [25]:
gidx = [GIFT_IDS[gift_block[j]] for j in range(100)]
print gidx
print len(set(gidx))

[315, 183, 803, 927, 129, 590, 42, 911, 525, 273, 848, 779, 791, 619, 160, 575, 49, 883, 893, 586, 810, 192, 996, 851, 747, 648, 223, 108, 305, 274, 891, 542, 851, 222, 205, 38, 780, 535, 17, 372, 313, 189, 294, 600, 789, 458, 562, 192, 48, 683, 787, 919, 400, 775, 839, 574, 709, 857, 785, 109, 785, 704, 271, 639, 542, 439, 41, 532, 720, 969, 18, 656, 62, 87, 332, 886, 331, 703, 481, 770, 24, 39, 197, 728, 399, 73, 775, 502, 39, 39, 928, 111, 184, 139, 252, 413, 513, 31, 449, 164]
93


In [26]:
gidx1 = subm['GiftId'].values[child_block]
print gidx1

[315 183 803 927 129 590  42 911 525 273 848 779 791 619 160 575  49 883
 893 586 810 192 996 851 747 648 223 108 305 274 891 542 851 222 205  38
 780 535  17 372 313 189 294 600 789 458 562 192  48 683 787 919 400 775
 839 574 709 857 785 109 785 704 271 639 542 439  41 532 720 969  18 656
  62  87 332 886 331 703 481 770  24  39 197 728 399  73 775 502  39  39
 928 111 184 139 252 413 513  31 449 164]


In [27]:
print gidx1 == np.array(gidx)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True]


In [29]:
C = np.zeros((100, 100))
for i in range(100):
    for j in range(100):
        c = child_block[i]
        g = gidx[j]
        C[i, j] = (GIFT_HAPPINESS[g][c] + CHILD_HAPPINESS[c][g])
row_ind, col_ind = linear_sum_assignment(C)
print row_ind
print col_ind

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99]
[ 3 99  1  2  5  4  7  6  9  8 12 10 11 14 13 16 18 19 17 20 21 15 23 24 25
 22 27 28 26 30 29 32 31 34 33 36 37 35 39 38 41 40 43 42 46 44 45 48 47 51
 49 50 53 52 56 54 55 58 57 60 59 62 61 64 63 66 65 68 67 71 69 70 73 72 77
 78 74 76 79 75 81 83 80 82 85 84 88 86 87 90 89 92 91 94 93 96 95 98 97  0]


In [32]:
x = child_block[row_ind]
y = gidx1[col_ind]
print x
print y

[45001 45002 45003 45004 45005 45006 45007 45008 45009 45010 45011 45012
 45013 45014 45015 45016 45017 45018 45019 45020 45021 45022 45023 45024
 45025 45026 45027 45028 45029 45030 45031 45032 45033 45034 45035 45036
 45037 45038 45039 45040 45041 45042 45043 45044 45045 45046 45047 45048
 45049 45050 45051 45052 45053 45054 45055 45056 45057 45058 45059 45060
 45061 45062 45063 45064 45065 45066 45067 45068 45069 45070 45071 45072
 45073 45074 45075 45076 45077 45078 45079 45080 45081 45082 45083 45084
 45085 45086 45087 45088 45089 45090 45091 45092 45093 45094 45095 45096
 45097 45098 45099 45100]
[927 164 183 803 590 129 911  42 273 525 791 848 779 160 619  49 893 586
 883 810 192 575 851 747 648 996 108 305 223 891 274 851 542 205 222 780
 535  38 372  17 189 313 600 294 562 789 458  48 192 919 683 787 775 400
 709 839 574 785 857 785 109 271 704 542 639  41 439 720 532 656 969  18
  87  62 703 481 332 331 770 886  39 728  24 197  73 399  39 775 502 928
  39 184 111 252 139 513 