In [1]:
# 006(together)と009(trending)と012(trending&together)のアンサンブル
# 重みは最適化で求める


EXP = '013'
FOLD = '_fold1' # '_fold1' のように指定、全データ学習時は'' を指定

import gc
import os
from scipy import optimize
import numpy as np
import pandas as pd
from tqdm import tqdm

from validation import mean_average_precision

In [2]:
sub0 = pd.read_csv(f'../submissions/006_submission{FOLD}_1item.csv').sort_values('customer_id').reset_index(drop=True)
sub1 = pd.read_csv(f'../submissions/009_submission{FOLD}.csv').sort_values('customer_id').reset_index(drop=True)
sub2 = pd.read_csv(f'../submissions/012_submission{FOLD}.csv').sort_values('customer_id').reset_index(drop=True)

sub0.shape, sub1.shape, sub2.shape

((1371980, 2), (1371980, 2), (1371980, 2))

In [3]:
# How many predictions are in common between models

print((sub0['prediction']==sub1['prediction']).mean())
print((sub0['prediction']==sub2['prediction']).mean())
print((sub1['prediction']==sub2['prediction']).mean())

7.288735987405064e-07
0.00016180993892039243
0.23518710185279668


In [4]:
sub0.columns = ['customer_id', 'prediction0']
sub0['prediction1'] = sub1['prediction']
sub0['prediction2'] = sub2['prediction']

del sub1, sub2
gc.collect()

sub0.head()

Unnamed: 0,customer_id,prediction0,prediction1,prediction2
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0909370001 0751471001 06...,0568601043 0568601006 0859416011 0745232001 07...,0568601043 0568601006 0859416011 0568597006 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0706016001 0909370001 0865799006 09...,0826211002 0739590027 0764280001 0590928022 07...,0826211002 0706016001 0739590027 0764280001 05...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0794321011 0909370001 0865799006 09...,0794321007 0852643003 0852643001 0727808007 08...,0794321007 0794321011 0852643003 0852643001 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0742079001 0732413001 0730683001 0372860001 09...,0918522001 0751471001 0751471043 0910601003 07...,0918522001 0751471001 0751471043 0910601003 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0896152002 0730683050 0927530004 0791587015 08...,0896152002 0791587015 0730683050 0927530004 08...,0896152002 0791587015 0730683050 0927530004 08...


In [5]:
def cust_blend(dt, W = [1,1,1]):
    #Global ensemble weights
    #W = [1.15,0.95,0.85]
    
    #Create a list of all model predictions
    REC = []
    REC.append(dt['prediction0'].split())
    REC.append(dt['prediction1'].split())
    
    #Create a dictionary of items recommended. 
    #Assign a weight according the order of appearance and multiply by global weights
    res = {}
    for M in range(len(REC)):
        for n, v in enumerate(REC[M]):
            if v in res:
                res[v] += (W[M]/(n+1))
            else:
                res[v] = (W[M]/(n+1))
    
    # Sort dictionary by item weights
    res = list(dict(sorted(res.items(), key=lambda item: -item[1])).keys())
    
    # Return the top 12 items only
    return ' '.join(res[:12])

def cust_blend_ls(dt, W = [1,1,1]):
    #Global ensemble weights
    #W = [1.15,0.95,0.85]
    
    #Create a list of all model predictions
    REC = []
    REC.append(dt['prediction0'].split())
    REC.append(dt['prediction1'].split())
    
    #Create a dictionary of items recommended. 
    #Assign a weight according the order of appearance and multiply by global weights
    res = {}
    for M in range(len(REC)):
        for n, v in enumerate(REC[M]):
            if v in res:
                res[v] += (W[M]/(n+1))
            else:
                res[v] = (W[M]/(n+1))
    
    # Sort dictionary by item weights
    res = list(dict(sorted(res.items(), key=lambda item: -item[1])).keys())
    
    # Return the top 12 items only
    return res[:12]

if FOLD != '':
    np.random.seed(41)

    scores = []
    weights = []
    transaction = pd.read_csv(f'../input/valid/transactions_valid{FOLD}.csv', dtype={'article_id': str}, parse_dates=['t_dat'])
    # true and pred dataframe
    true = transaction.groupby('customer_id')['article_id'].apply(lambda items: list(set(items))).reset_index().rename(columns={'article_id': 'true'})

    def loss(x):
        sub0['prediction'] = sub0.apply(cust_blend_ls, W = x, axis=1)
        sub0['prediction'] = sub0['prediction'].map(lambda l: [v[1:] for v in l])
        true_and_pred = pd.merge(true, sub0, how='left')
        # true_and_pred['prediction'] = true_and_pred['prediction'].str.split(' ').map(lambda l: [v[1:] for v in l])
        return (-1) * mean_average_precision(true_and_pred['true'].to_list(), true_and_pred['prediction'].to_list(), k=12)

    # 重みの最適化を繰り返す
    for i in tqdm(range(5)):
        initial_weight = np.random.uniform(size=3)
        result = optimize.minimize(loss, initial_weight, method='Nelder-Mead')
        scores.append(result['fun'])
        weights.append(result['x'])

    best_score = np.min(scores)
    best_weight = weights[np.argmin(scores)]
    print(f'Best score: {best_score} \n Best weight: {best_weight}')
    sub0['prediction'] = sub0.apply(cust_blend, W = best_weight, axis=1)
else:
    sub0['prediction'] = sub0.apply(cust_blend, W = [0.05268751, 0.07144168, 0.66675557], axis=1)
sub0.head()

100%|██████████| 5/5 [2:41:23<00:00, 1936.75s/it]  


Best score: -0.027427004507830834 
 Best weight: [0.05268751 0.07144168 0.66675557]


Unnamed: 0,customer_id,prediction0,prediction1,prediction2,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0909370001 0751471001 06...,0568601043 0568601006 0859416011 0745232001 07...,0568601043 0568601006 0859416011 0568597006 07...,0568601043 0568601006 0909370001 0859416011 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0706016001 0909370001 0865799006 09...,0826211002 0739590027 0764280001 0590928022 07...,0826211002 0706016001 0739590027 0764280001 05...,0826211002 0739590027 0706016001 0764280001 05...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0794321011 0909370001 0865799006 09...,0794321007 0852643003 0852643001 0727808007 08...,0794321007 0794321011 0852643003 0852643001 07...,0794321007 0852643003 0794321011 0909370001 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0742079001 0732413001 0730683001 0372860001 09...,0918522001 0751471001 0751471043 0910601003 07...,0918522001 0751471001 0751471043 0910601003 07...,0918522001 0742079001 0751471001 0751471043 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0896152002 0730683050 0927530004 0791587015 08...,0896152002 0791587015 0730683050 0927530004 08...,0896152002 0791587015 0730683050 0927530004 08...,0896152002 0730683050 0791587015 0927530004 09...


In [6]:
# How many predictions are in common with ensemble

print((sub0['prediction']==sub0['prediction0']).mean())
print((sub0['prediction']==sub0['prediction1']).mean())
print((sub0['prediction']==sub0['prediction2']).mean())

7.288735987405064e-07
0.00023178180439948105
0.00013994373095817724


# Make a submission

In [7]:
del sub0['prediction0']
del sub0['prediction1']
del sub0['prediction2']
gc.collect()

sub0.to_csv(f'../submissions/{EXP}_submission{FOLD}.csv', index=False)
sub0.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0909370001 0859416011 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0739590027 0706016001 0764280001 05...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0852643003 0794321011 0909370001 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0918522001 0742079001 0751471001 0751471043 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0896152002 0730683050 0791587015 0927530004 09...
