In [1]:
import os
import numpy as np
import pandas as pd
import gc

In [2]:
# Mount Google Drive at /content/drive so the CSV files under
# /content/drive/MyDrive can be read and written (only works inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the sample submission file.
# NOTE(review): `submissions` is not referenced by any later cell visible
# here -- confirm it is still needed before keeping this read.
submissions = pd.read_csv('/content/drive/MyDrive/Project_Data/sample_submission.csv')

In [4]:
# Read the three per-model submission files (svd, content, collaborative).
# Each frame is ordered by customer_id and re-indexed from 0 so that row i
# can be compared across the three frames.
sub0, sub1, sub2 = [
    pd.read_csv(csv_path).sort_values('customer_id').reset_index(drop=True)
    for csv_path in (
        '/content/drive/MyDrive/Project_Data/sub_svd.csv',
        '/content/drive/MyDrive/Project_Data/sub_content.csv',
        '/content/drive/MyDrive/Project_Data/sub_collaborative.csv',
    )
]

# Display the shapes as the cell output.
sub0.shape, sub1.shape, sub2.shape

((1371980, 2), (1371980, 2), (1371980, 2))

In [5]:
# Fraction of rows whose full prediction string is identical between two
# models, printed one value per line in this order:
# svd vs content, svd vs collaborative, content vs collaborative.
print(
    *(
        (left['prediction'] == right['prediction']).mean()
        for left, right in ((sub0, sub1), (sub0, sub2), (sub1, sub2))
    ),
    sep='\n',
)

0.0
0.0
0.9949241242583711


In [6]:
# The prediction columns below are attached purely by row position (all three
# frames carry a fresh 0..n-1 index), so every frame must list the same
# customers in the same order. Fail loudly here rather than silently blending
# predictions that belong to different customers.
assert sub0['customer_id'].equals(sub1['customer_id']), \
    'customer_id mismatch between sub_svd and sub_content'
assert sub0['customer_id'].equals(sub2['customer_id']), \
    'customer_id mismatch between sub_svd and sub_collaborative'

# Collect the three models' predictions side by side in sub0.
sub0.columns = ['customer_id', 'prediction0']
sub0['prediction1'] = sub1['prediction']
sub0['prediction2'] = sub2['prediction']

# The per-model frames are no longer needed; release their memory.
del sub1, sub2
gc.collect()
sub0.head()

Unnamed: 0,customer_id,prediction0,prediction1,prediction2
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0924243002 0448509014 0923758001 0751471001 07...,0'0568601043' 0'0841260003' 0'0887593002' 0'08...,0'0568601043' 0'0841260003' 0'0887593002' 0'08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243002 0448509014 0923758001 0751471001 07...,0'0826211002' 0'0599580055' 0'0599580055' 0'08...,0'0826211002' 0'0599580055' 0'0599580055' 0'08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0805000001 0924243002 0448509014 09...,0'0794321007' 0'0858883002' 0'0851400006' 0'07...,0'0794321007' 0'0858883002' 0'0851400006' 0'07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243002 0448509014 0923758001 0751471001 07...,0'0742079001' 0'0732413001' 00924243002 007514...,0'0742079001' 0'0732413001' 00924243002 007514...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243002 0448509014 0923758001 0751471001 07...,0'0896152002' 0'0730683050' 0'0927530004' 0'07...,0'0896152002' 0'0730683050' 0'0927530004' 0'07...


In [7]:
def cust_blend(dt, W=(1, 1, 1, 1)):
    """Blend the ranked recommendations of three models into one top-12 list.

    Every item gets, from each model that recommends it, a score of
    ``W[m] / rank`` where ``rank`` is the item's 1-based position in that
    model's list. Scores are summed over all occurrences and the items are
    returned from highest to lowest total score.

    Parameters
    ----------
    dt : mapping-like row (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide the keys 'prediction0', 'prediction1' and 'prediction2'.
        Each value is a whitespace-separated string of item ids. A value that
        is not a string (for example NaN from an empty CSV cell, or None) is
        treated as an empty recommendation list.
    W : sequence of numbers
        Global weight per model, indexed in column order. Only the first
        three entries are read; extra entries are ignored.

    Returns
    -------
    str
        At most 12 item ids joined by single spaces. Items with equal scores
        keep the order in which they were first seen.

    Notes
    -----
    Items are matched by their exact token text, so the same id written in
    two different ways is scored as two separate items.
    """
    model_columns = ('prediction0', 'prediction1', 'prediction2')

    # Accumulate rank-discounted, model-weighted scores per item.
    scores = {}
    for model_idx, column in enumerate(model_columns):
        raw = dt[column]
        # pandas reads an empty CSV cell as NaN (a float), which has no
        # .split(); treat any non-string value as "no recommendations".
        ranked_items = raw.split() if isinstance(raw, str) else []
        for rank, item in enumerate(ranked_items, start=1):
            scores[item] = scores.get(item, 0) + W[model_idx] / rank

    # Stable sort on the negated score: best first, ties in first-seen order.
    ranked = sorted(scores, key=lambda item: -scores[item])

    # Return the top 12 items only
    return ' '.join(ranked[:12])

In [8]:
# Blend the three model columns row by row into the final 'prediction' column.
# cust_blend reads only prediction0..prediction2, so only the first three
# weights are used; the fourth (0.85) is never read.
sub0['prediction'] = sub0.apply(
    cust_blend,
    axis='columns',
    W = [1.05,1.00,0.95,0.85],
)
sub0.head()

Unnamed: 0,customer_id,prediction0,prediction1,prediction2,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0924243002 0448509014 0923758001 0751471001 07...,0'0568601043' 0'0841260003' 0'0887593002' 0'08...,0'0568601043' 0'0841260003' 0'0887593002' 0'08...,0'0568601043' 0924243002 0'0841260003' 0'08875...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243002 0448509014 0923758001 0751471001 07...,0'0826211002' 0'0599580055' 0'0599580055' 0'08...,0'0826211002' 0'0599580055' 0'0599580055' 0'08...,0'0826211002' 0'0599580055' 0'0811835004' 0924...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0805000001 0924243002 0448509014 09...,0'0794321007' 0'0858883002' 0'0851400006' 0'07...,0'0794321007' 0'0858883002' 0'0851400006' 0'07...,0'0794321007' 0794321007 0'0858883002' 0'07504...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243002 0448509014 0923758001 0751471001 07...,0'0742079001' 0'0732413001' 00924243002 007514...,0'0742079001' 0'0732413001' 00924243002 007514...,0'0742079001' 0924243002 0'0732413001' 0092424...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243002 0448509014 0923758001 0751471001 07...,0'0896152002' 0'0730683050' 0'0927530004' 0'07...,0'0896152002' 0'0730683050' 0'0927530004' 0'07...,0'0896152002' 0924243002 0'0730683050' 0'09275...


In [9]:
# Fraction of rows whose blended prediction string is identical to each
# individual model's prediction string, printed one value per line.
print(
    *(
        (sub0['prediction'] == sub0[model_col]).mean()
        for model_col in ('prediction0', 'prediction1', 'prediction2')
    ),
    sep='\n',
)

0.0
0.0
0.0


# Make a submission

In [10]:
# Remove the per-model columns in place so that only customer_id and the
# blended prediction remain in the frame that gets written out.
sub0.drop(columns=['prediction0', 'prediction1', 'prediction2'], inplace=True)

gc.collect()

# Write the blended submission next to the input files, without the index.
sub0.to_csv('/content/drive/MyDrive/Project_Data/sub_embedded.csv', index=False)