In [1]:
import os
import pandas as pd
import numpy as np
import math
from scipy.optimize import linear_sum_assignment
import datetime as dt
from collections import defaultdict, Counter

## Visualization
import matplotlib.pyplot as plt
import seaborn as sns

###### Some constants

In [2]:
# --- Gift side -------------------------------------------------------------
N_GIFT_TYPE = 1000       # gift type ids run over range(N_GIFT_TYPE)
N_GIFT_QUANTITY = 1000   # each gift type id is repeated this many times in GIFT_IDS
N_GIFT_PREF = 1000       # denominator used to normalise gift-side happiness

# --- Child side ------------------------------------------------------------
N_CHILDREN = 10 ** 6     # child ids run over range(N_CHILDREN)
N_CHILD_PREF = 100       # denominator used to normalise child-side happiness

# Child-id thresholds. Only TWINS is used below: the optimisation shuffles the
# ids in range(TWINS, N_CHILDREN).
# NOTE(review): presumably ids below TRIPLETS are triplets and ids in
# [TRIPLETS, TWINS) are twins -- confirm against the data description.
TRIPLETS = 5001
TWINS = 45001

###### Load data

In [3]:
# Both files are read without a header row, so columns are labelled 0, 1, 2...
# Column 0 is dropped (presumably the row's own id -- confirm against the
# data description) and the remaining columns are kept as a plain ndarray.
# `axis=1` is passed by keyword: the positional form `.drop(0, 1)` is rejected
# by current pandas releases.
CHILD_PREF = pd.read_csv('../input/child_wishlist_v2.csv', header=None).drop(0, axis=1).values
GIFT_PREF = pd.read_csv('../input/gift_goodkids_v2.csv', header=None).drop(0, axis=1).values

# Parenthesised single-argument prints behave the same on Python 2 and 3.
print(CHILD_PREF.shape)
print(GIFT_PREF.shape)

(1000000, 100)
(1000, 1000)


###### To show ProgressBar

In [7]:
import sys

# Make the project-local `progress` module importable. The membership check
# keeps the cell idempotent: re-running it no longer stacks duplicate entries
# onto sys.path.
_SRC_DIR = '../src/'
if _SRC_DIR not in sys.path:
    sys.path.append(_SRC_DIR)
from progress import ProgressBar

# Single progress bar instance shared by the long-running loops below
# (driven through pbar.setBar(total) / pbar.show(step)).
pbar = ProgressBar()

###### Creating dicts to hash happiness

In [10]:
def _penalty_dict_type(penalty):
    """Return a dict subclass whose missing keys read as ``penalty``.

    A ``defaultdict`` stores every missing key it is asked for, so tables that
    are queried millions of times keep growing. ``__missing__`` gives the same
    ``table[key]`` read behaviour without inserting anything.
    """
    class PenaltyDict(dict):
        __slots__ = ()  # no per-instance __dict__; many instances are created

        def __missing__(self, key):
            return penalty

    return PenaltyDict


# GIFT_HAPPINESS[gift][child]: 1.0 for the first entry of GIFT_PREF[gift],
# decreasing by 1/N_GIFT_PREF per position; unlisted children read as
# -1/(2*N_GIFT_PREF).
_GiftScores = _penalty_dict_type(-1. / (2 * N_GIFT_PREF))
GIFT_HAPPINESS = {}
pbar.setBar(N_GIFT_TYPE)
for gift in range(N_GIFT_TYPE):
    pbar.show(gift)
    gift_scores = _GiftScores()
    for rank, child in enumerate(GIFT_PREF[gift]):
        gift_scores[child] = 1. * (N_GIFT_PREF - rank) / N_GIFT_PREF
    GIFT_HAPPINESS[gift] = gift_scores

# CHILD_HAPPINESS[child][gift]: same scheme on the child side, normalised by
# N_CHILD_PREF; unlisted gifts read as -1/(2*N_CHILD_PREF).
_ChildScores = _penalty_dict_type(-1. / (2 * N_CHILD_PREF))
CHILD_HAPPINESS = {}
pbar.setBar(N_CHILDREN)
for child in range(N_CHILDREN):
    pbar.show(child)
    child_scores = _ChildScores()
    for rank, gift in enumerate(CHILD_PREF[child]):
        child_scores[gift] = 1. * (N_CHILD_PREF - rank) / N_CHILD_PREF
    CHILD_HAPPINESS[child] = child_scores

%time GIFT_IDS = np.array([[g] * N_GIFT_QUANTITY for g in range(N_GIFT_TYPE)]).flatten()
print(GIFT_IDS[:20])

CPU times: user 69 ms, sys: 4.13 ms, total: 73.1 ms
Wall time: 72.2 ms
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


###### To compute normalized happiness

In [32]:
def my_avg_normalized_happiness(pred):
    """Score an assignment using sign-flipped happiness totals.

    Every per-pair happiness read from CHILD_HAPPINESS / GIFT_HAPPINESS is
    negated before it is accumulated, so all returned numbers are the
    negatives of the corresponding plain totals (a better assignment gives a
    more negative score).

    Parameters
    ----------
    pred : sequence of [child_id, gift_id] pairs
        One pair per assigned child; gift_id is a gift *type* id because it
        indexes GIFT_HAPPINESS and the per-type accumulator.

    Returns
    -------
    tuple
        ``(nch**3 + ngh**3, ngh * N_CHILDREN, nch * N_CHILDREN)`` where
        ``nch`` is the negated child total divided by N_CHILDREN and ``ngh``
        is the mean of the negated per-type gift totals divided by
        N_GIFT_QUANTITY. Note the order: the gift-side value comes before the
        child-side value.
    """
    total_child_happiness = 0
    total_gift_happiness = np.zeros(N_GIFT_TYPE)
    # Parenthesised so the cell also parses on Python 3; same output on Python 2.
    print("COMPUTE NORMALIZED HAPPINESS:")
    pbar.setBar(len(pred))
    for i, (c, g) in enumerate(pred):
        pbar.show(i)
        # Deliberate sign flip: totals hold negated happiness.
        total_child_happiness += -CHILD_HAPPINESS[c][g]
        total_gift_happiness[g] += -GIFT_HAPPINESS[g][c]
    nch = total_child_happiness / N_CHILDREN
    ngh = np.mean(total_gift_happiness) / N_GIFT_QUANTITY
    print('normalized child happiness', nch)
    print('normalized gift happiness', ngh)
    return nch**3. + ngh**3., ngh*N_CHILDREN, nch*N_CHILDREN

In [33]:
### Per-pair objective term used to build the assignment cost matrix
def entropy(gh, ch, g, c):
    """Return the cubic increment contributed by one (child, gift) pair.

    gh, ch -- running gift-side / child-side totals
    g, c   -- gift-side / child-side happiness of the candidate pair

    Algebraically this equals (gh + g)**3 - gh**3 + (ch + c)**3 - ch**3.
    The expanded form and its left-to-right summation order are kept so the
    floating-point result is unchanged.
    """
    gift_cross = 3. * gh * g * (g + gh)
    child_cross = 3. * ch * c * (c + ch)
    return gift_cross + g ** 3 + child_cross + c ** 3
### Optimize the total entropy
def optimize_block(child_block, current_gift_ids, gh, ch):
    """Re-assign the gifts currently held by ``child_block`` among those children.

    Parameters
    ----------
    child_block : 1-D integer ndarray
        Child ids; they are also used as positions into ``current_gift_ids``.
    current_gift_ids : 1-D integer ndarray
        Gift *unit* id held at each position (each value indexes GIFT_IDS).
    gh, ch : float
        Running gift-side / child-side totals forwarded to ``entropy``.

    Returns
    -------
    tuple of ndarray
        ``(child_ids, gift_unit_ids)`` paired element-wise, chosen by
        ``linear_sum_assignment`` to minimise the summed cost ``-entropy``.

    The matrix size comes from ``child_block`` itself rather than from the
    global BLOCK_SIZE, so blocks of any length are handled.
    """
    n = len(child_block)
    gift_block = current_gift_ids[child_block]
    # Gift type of every unit in the block; looked up once instead of once
    # per matrix cell.
    gift_types = GIFT_IDS[gift_block]
    C = np.zeros((n, n))
    for i, child in enumerate(child_block):
        child_scores = CHILD_HAPPINESS[child]
        for j, gift in enumerate(gift_types):
            # Negated because linear_sum_assignment minimises total cost.
            C[i, j] = -1. * entropy(gh, ch, GIFT_HAPPINESS[gift][child], child_scores[gift])
    row_ind, col_ind = linear_sum_assignment(C)
    return (child_block[row_ind], gift_block[col_ind])

###### Define Block dimensions

In [34]:
# Number of children handled by one assignment sub-problem.
# 261 divides N_CHILDREN - TWINS (954999 = 3659 * 261) exactly, which the
# np.split call in the optimisation loop relies on.
BLOCK_SIZE = 261

# Assignment used as the starting point of the optimisation.
INITIAL_SUBMISSION = '../src/twtr.csv'

# Ceiling of (N_CHILDREN - TWINS) / BLOCK_SIZE in pure integer arithmetic.
N_BLOCKS = (N_CHILDREN - TWINS + BLOCK_SIZE - 1) // BLOCK_SIZE
print('Block size: {}, n_blocks {}'.format(BLOCK_SIZE, N_BLOCKS))

Block size: 261, n_blocks 3659


In [35]:
subm = pd.read_csv(INITIAL_SUBMISSION)

# Baseline score. `g` and `c` (gift-side and child-side totals) are reused by
# the optimisation loop as its starting state.
initial_anh, g, c = my_avg_normalized_happiness(subm[['ChildId', 'GiftId']].values.tolist())
print(initial_anh, g, c)

# Give every child a distinct gift *unit*: number the children sharing a gift
# type 0, 1, 2... (ordered by ChildId) and offset by the type's first unit.
# The ChildId column is selected explicitly so the result is a Series whatever
# other columns the file carries; method='first' keeps ranks integral and
# unique even if a ChildId were repeated.
subm['gift_rank'] = subm.groupby('GiftId')['ChildId'].rank(method='first') - 1
subm['gift_id'] = subm['GiftId'] * N_GIFT_QUANTITY + subm['gift_rank']
subm['gift_id'] = subm['gift_id'].astype(np.int32)

# Copy: this array is mutated in place during optimisation and must not alias
# (or be blocked by) the DataFrame's own storage.
# NOTE(review): it is later indexed by child id, which assumes row k of the
# submission belongs to ChildId k -- confirm.
current_gift_ids = subm['gift_id'].values.copy()

COMPUTE NORMALIZED HAPPINESS:
('normalized child happiness', -0.9657394249968655)
('normalized gift happiness', -0.00083065200000001027)
(-0.90069942194483255, -830.65200000001028, -965739.4249968654)


###### Doing optimization

In [36]:
# Each outer iteration reshuffles the children with id >= TWINS into N_BLOCKS
# equal blocks, re-solves the first BLOCKS_PER_ITERATION of them, then
# rescores the whole assignment to refresh the running totals `g` and `c`.
N_ITERATIONS = 20
BLOCKS_PER_ITERATION = 200

for i in range(N_ITERATIONS):
    print("Iteration #{0}".format(str(i)))
    child_blocks = np.split(np.random.permutation(np.arange(TWINS, N_CHILDREN)), N_BLOCKS)
    # Slicing (rather than indexing 0..199) also works when fewer blocks exist.
    selected_blocks = child_blocks[:BLOCKS_PER_ITERATION]
    pbar.setBar(len(selected_blocks))
    for j, child_block in enumerate(selected_blocks):
        pbar.show(j)
        # g / c stay fixed for the whole iteration; they are refreshed below.
        cids, gids = optimize_block(child_block, current_gift_ids=current_gift_ids, gh=g, ch=c)
        current_gift_ids[cids] = gids
    # Translate gift unit ids back to gift type ids before scoring.
    subm['GiftId'] = GIFT_IDS[current_gift_ids]
    anh, g, c = my_avg_normalized_happiness(subm[['ChildId', 'GiftId']].values.tolist())
    print(i, anh, g, c)
    print("\n\n")

Iteration #0
COMPUTE NORMALIZED HAPPINESS:
('normalized child happiness', -0.9657422449968643)
('normalized gift happiness', -0.00082967450000001033)
(0, -0.90070731220715727, -829.67450000001031, -965742.2449968643)



Iteration #1
COMPUTE NORMALIZED HAPPINESS:
('normalized child happiness', -0.9657452649968624)
('normalized gift happiness', -0.00082967450000001033)
(1, -0.90071576211581472, -829.67450000001031, -965745.2649968624)



Iteration #2
COMPUTE NORMALIZED HAPPINESS:
('normalized child happiness', -0.9657484699968615)
('normalized gift happiness', -0.00082967450000001033)
(2, -0.90072472970913331, -829.67450000001031, -965748.4699968615)



Iteration #3


KeyboardInterrupt: 

In [None]:
# Write the current assignment (ChildId, GiftId only; no index column).
# NOTE: this is the same path as INITIAL_SUBMISSION, so the starting solution
# is overwritten in place.
subm.to_csv('../src/twtr.csv', columns=['ChildId', 'GiftId'], index=False)