In [1]:
import os
import numpy as np
from collections import Counter

In [2]:
# Global item-frequency tables, keyed by item id: one for items that were
# seen, one for items that were bought.  Both start out empty.
cntr_seen, cntr_buy = Counter(), Counter()

# Train set

In [32]:
# Read the whole training file into memory as one string.
with open('train.txt') as train_file:
    content = train_file.read()

In [33]:
# One entry per record.  splitlines() copes with both "\n" and "\r\n" line
# endings, so a stray "\r" can never stay glued to the purchase field (where
# it would make an empty purchase list look non-empty), and a file ending in
# a newline does not yield an empty trailing entry.
lines = content.splitlines()

In [34]:
# Build the global frequency tables from the train data and collect the
# lines that contain a purchase.  Each non-empty line is split as
# "<seen ids>;<bought ids>", with ids separated by commas; the part after the
# semicolon is empty when nothing was bought.

# Start from empty tables so that re-running this cell does not count the
# train data a second time.
cntr_seen.clear()
cntr_buy.clear()

lines_buy = []
for line in lines:
    if not line:
        continue  # skip blank entries

    seen, buy = line.split(';')

    # Counter.update() adds one count per element of the iterable.
    cntr_seen.update(seen.split(','))

    if buy:
        lines_buy.append(line)
        cntr_buy.update(buy.split(','))

## Sorting by seen freq

In [35]:
# Score the "rank by seen frequency" baseline on every line in lines_buy:
# the distinct seen items are ordered by cntr_seen (most frequent first) and
# the top 1 / top 5 are compared with the items that were bought.
prec_1 = []
rec_1 = []
prec_5 = []
rec_5 = []

for line in lines_buy:
    seen, buy = line.split(';')
    seen_arr, buy_arr = seen.split(','), buy.split(',')

    ## deduplication -- keep only the first occurrence of every seen item
    s = set()
    dedupl_seen = []
    for i in seen_arr:
        if i not in s:
            s.add(i)
            dedupl_seen.append(i)

    ## sorting -- descending by frequency.  sorted() is stable even with
    ## reverse=True, so equally frequent items keep their order of first
    ## appearance in the line.
    ranked = sorted(dedupl_seen, key=lambda item: cntr_seen[item], reverse=True)

    ## metrics
    bought = set(buy_arr)
    hits_1 = sum(1 for item in ranked[:1] if item in bought)
    hits_5 = sum(1 for item in ranked[:5] if item in bought)

    # Precision is divided by k even when fewer than k distinct items were
    # seen; recall is divided by the number of bought entries on the line.
    prec_1.append(hits_1 / 1.0)
    rec_1.append(hits_1 / float(len(buy_arr)))
    prec_5.append(hits_5 / 5.0)
    rec_5.append(hits_5 / float(len(buy_arr)))

In [36]:
# Collapse the per-line scores into one average per metric.
av_prec_1, av_rec_1 = np.mean(prec_1), np.mean(rec_1)
av_prec_5, av_rec_5 = np.mean(prec_5), np.mean(rec_5)

## Train-set metrics when sorting by seen frequency

In [37]:
# Report the averaged metrics.  The single-argument print(...) form with
# %-formatting is valid on both Python 2 and Python 3 and emits the same
# "<label> <value>" text as the print statement did.
print('Average precision@1 %s' % av_prec_1)
print('Average precision@5 %s' % av_prec_5)

print('Average recall@1 %s' % av_rec_1)
print('Average recall@5 %s' % av_rec_5)

Average precision@1 0.512195121951
Average precision@5 0.212527716186
Average recall@1 0.442634316595
Average recall@5 0.824691824713


## Sorting by buying freq

In [38]:
# Scratch check: 0.82469 is the train-set recall@5 printed for the
# seen-frequency ranking, rounded here to two decimals.
round(0.82469, 2)

0.82

In [39]:
# Score the "rank by buying frequency" baseline on every line in lines_buy:
# the distinct seen items are ordered by cntr_buy (most frequently bought
# first) and the top 1 / top 5 are compared with the items that were bought.
prec_1 = []
rec_1 = []
prec_5 = []
rec_5 = []

for line in lines_buy:
    seen, buy = line.split(';')
    seen_arr, buy_arr = seen.split(','), buy.split(',')

    ## deduplication -- keep only the first occurrence of every seen item
    s = set()
    dedupl_seen = []
    for i in seen_arr:
        if i not in s:
            s.add(i)
            dedupl_seen.append(i)

    ## sorting -- descending by frequency.  sorted() is stable even with
    ## reverse=True, so equally frequent items keep their order of first
    ## appearance in the line.
    ranked = sorted(dedupl_seen, key=lambda item: cntr_buy[item], reverse=True)

    ## metrics
    bought = set(buy_arr)
    hits_1 = sum(1 for item in ranked[:1] if item in bought)
    hits_5 = sum(1 for item in ranked[:5] if item in bought)

    # Precision is divided by k even when fewer than k distinct items were
    # seen; recall is divided by the number of bought entries on the line.
    prec_1.append(hits_1 / 1.0)
    rec_1.append(hits_1 / float(len(buy_arr)))
    prec_5.append(hits_5 / 5.0)
    rec_5.append(hits_5 / float(len(buy_arr)))


# Average the per-line scores into one figure per metric.
av_prec_1 = np.mean(prec_1)
av_prec_5 = np.mean(prec_5)
av_rec_1 = np.mean(rec_1)
av_rec_5 = np.mean(rec_5)

# print(...) with %-formatting runs on Python 2 and 3 and emits the same
# "<label> <value>" text as the print statement did.
print('Average precision@1 %s' % av_prec_1)
print('Average precision@5 %s' % av_prec_5)

print('Average recall@1 %s' % av_rec_1)
print('Average recall@5 %s' % av_rec_5)

Average precision@1 0.80376940133
Average precision@5 0.252549889135
Average recall@1 0.688449492427
Average recall@5 0.926307302423


# Test set

In [40]:
# Load the test data.  NOTE: this rebinds content, lines and lines_buy, so
# the metric cells that follow score the test lines; cntr_seen / cntr_buy are
# not modified here.
with open('test.txt') as f:
    content = f.read()

# splitlines() handles "\n" and "\r\n" alike, so no "\r" is left on the
# purchase field where it would make an empty purchase list look non-empty.
lines = content.splitlines()

# Keep only the lines whose purchase part (after ';') is non-empty.
lines_buy = []
for line in lines:
    if line:
        seen, buy = line.split(';')

        if buy:
            lines_buy.append(line)

## Sorting by seen freq

In [41]:
# Score the "rank by seen frequency" baseline on every line in lines_buy:
# the distinct seen items are ordered by cntr_seen (most frequent first) and
# the top 1 / top 5 are compared with the items that were bought.
prec_1 = []
rec_1 = []
prec_5 = []
rec_5 = []

for line in lines_buy:
    seen, buy = line.split(';')
    seen_arr, buy_arr = seen.split(','), buy.split(',')

    ## deduplication -- keep only the first occurrence of every seen item
    s = set()
    dedupl_seen = []
    for i in seen_arr:
        if i not in s:
            s.add(i)
            dedupl_seen.append(i)

    ## sorting -- descending by frequency.  sorted() is stable even with
    ## reverse=True, so equally frequent items keep their order of first
    ## appearance in the line.  Items missing from cntr_seen count as 0.
    ranked = sorted(dedupl_seen, key=lambda item: cntr_seen[item], reverse=True)

    ## metrics
    bought = set(buy_arr)
    hits_1 = sum(1 for item in ranked[:1] if item in bought)
    hits_5 = sum(1 for item in ranked[:5] if item in bought)

    # Precision is divided by k even when fewer than k distinct items were
    # seen; recall is divided by the number of bought entries on the line.
    prec_1.append(hits_1 / 1.0)
    rec_1.append(hits_1 / float(len(buy_arr)))
    prec_5.append(hits_5 / 5.0)
    rec_5.append(hits_5 / float(len(buy_arr)))


# Average the per-line scores into one figure per metric.
av_prec_1 = np.mean(prec_1)
av_prec_5 = np.mean(prec_5)
av_rec_1 = np.mean(rec_1)
av_rec_5 = np.mean(rec_5)

# print(...) with %-formatting runs on Python 2 and 3 and emits the same
# "<label> <value>" text as the print statement did.
print('Average precision@1 %s' % av_prec_1)
print('Average precision@5 %s' % av_prec_5)

print('Average recall@1 %s' % av_rec_1)
print('Average recall@5 %s' % av_rec_5)

Average precision@1 0.481309686221
Average precision@5 0.203765347885
Average recall@1 0.417332662033
Average recall@5 0.800034066354


In [42]:
# Train-set results for the seen-frequency ranking, pasted as a bare string
# literal (the cell just echoes it back) -- presumably kept for comparison
# with the test-set figures.
"""Average precision@1 0.512195121951
Average precision@5 0.212527716186
Average recall@1 0.442634316595
Average recall@5 0.824691824713"""

'Average precision@1 0.512195121951\nAverage precision@5 0.212527716186\nAverage recall@1 0.442634316595\nAverage recall@5 0.824691824713'

## Sorting by buying freq

In [43]:
# Score the "rank by buying frequency" baseline on every line in lines_buy:
# the distinct seen items are ordered by cntr_buy (most frequently bought
# first) and the top 1 / top 5 are compared with the items that were bought.
prec_1 = []
rec_1 = []
prec_5 = []
rec_5 = []

for line in lines_buy:
    seen, buy = line.split(';')
    seen_arr, buy_arr = seen.split(','), buy.split(',')

    ## deduplication -- keep only the first occurrence of every seen item
    s = set()
    dedupl_seen = []
    for i in seen_arr:
        if i not in s:
            s.add(i)
            dedupl_seen.append(i)

    ## sorting -- descending by frequency.  sorted() is stable even with
    ## reverse=True, so equally frequent items keep their order of first
    ## appearance in the line.  Items missing from cntr_buy count as 0.
    ranked = sorted(dedupl_seen, key=lambda item: cntr_buy[item], reverse=True)

    ## metrics
    bought = set(buy_arr)
    hits_1 = sum(1 for item in ranked[:1] if item in bought)
    hits_5 = sum(1 for item in ranked[:5] if item in bought)

    # Precision is divided by k even when fewer than k distinct items were
    # seen; recall is divided by the number of bought entries on the line.
    prec_1.append(hits_1 / 1.0)
    rec_1.append(hits_1 / float(len(buy_arr)))
    prec_5.append(hits_5 / 5.0)
    rec_5.append(hits_5 / float(len(buy_arr)))


# Average the per-line scores into one figure per metric.
av_prec_1 = np.mean(prec_1)
av_prec_5 = np.mean(prec_5)
av_rec_1 = np.mean(rec_1)
av_rec_5 = np.mean(rec_5)

# print(...) with %-formatting runs on Python 2 and 3 and emits the same
# "<label> <value>" text as the print statement did.
print('Average precision@1 %s' % av_prec_1)
print('Average precision@5 %s' % av_prec_5)

print('Average recall@1 %s' % av_rec_1)
print('Average recall@5 %s' % av_rec_5)

Average precision@1 0.527694406548
Average precision@5 0.210095497954
Average recall@1 0.460620166666
Average recall@5 0.820187433749


In [45]:
# Train-set results for the buying-frequency ranking, pasted as a bare string
# literal (the cell just echoes it back) -- presumably kept for comparison
# with the test-set figures.
"""Average precision@1 0.80376940133
Average precision@5 0.252549889135
Average recall@1 0.688449492427
Average recall@5 0.926307302423"""

'Average precision@1 0.80376940133\nAverage precision@5 0.252549889135\nAverage recall@1 0.688449492427\nAverage recall@5 0.926307302423'