# Getting started with #MeliDataChallenge

In [None]:
#Import modules
import gzip
import json
import gc
import math
import random

from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from pathlib import Path
from sklearn.model_selection import train_test_split


In [None]:
#Auxiliary function
def jl_to_list(fname):
    """
    Reads a gzip-compressed JSON Lines file (fname) and returns a list
    with one decoded JSON value per non-blank line, in file order.
    """
    output = []
    with gzip.open(fname, 'rb') as f:
        for line in f:
            # A blank line (e.g. a trailing newline at the end of the file)
            # is not a JSON document; skip it instead of crashing in json.loads.
            if not line.strip():
                continue
            output.append(json.loads(line))
    return output

#### Specify here the path where your data is located

In [None]:
# Directory holding the challenge files read below
# (train_dataset.jl.gz and item_data.jl.gz)
path = Path('data')

#### Load train data

In [None]:
# Number of training rows to keep (only a sample); a falsy value keeps all of them
samples = 50000
rows = jl_to_list(path / 'train_dataset.jl.gz')
rows = rows[:samples] if samples else rows

# Fixed-seed 80/20 split: the baselines "learn" on rows_train
# and are scored on rows_test
rows_train, rows_test = train_test_split(rows, test_size=0.2, random_state=42)

#### Load item metadata

In [None]:
item_data = jl_to_list(path / 'item_data.jl.gz')

# Index the item records by item_id so the metadata of an item is one lookup away
metadata = {record['item_id']: record for record in item_data}

# Every known item id; used as the pool for random filler recommendations
all_items = list(metadata)

# Different approaches to build a baseline model...

### 1) Top items of the most visited domain

Here the idea is the following: we find out which domain the user visited the most, and then we recommend the top-selling items of that domain.


First we generate a dict of the form: {'domain': {'item_id': no. of purchases } }.

This is the "learning" stage of this simple model (that's why we do it only with the train data!).


In [None]:
# Learned table: {domain_id: {item_id: number of purchases in the train rows}}
sales_x_domain = defaultdict(lambda: defaultdict(int))

for row in tqdm(rows_train):
    bought = row['item_bought']
    # Credit the purchase to the domain of the item that was bought
    sales_x_domain[metadata[bought]['domain_id']][bought] += 1

HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))




Then we define some auxiliary functions for making the predictions

In [None]:
def domains_visited(row, max_views=15):
    """
    For a given user history (row), returns a Counter
    of the domains visited by the user.

    Only 'view' events are taken into account, and only the first
    `max_views` of them, in the order they appear in row['user_history'].
    The domain of each viewed item is looked up in `metadata`.
    """
    viewed = [ev['event_info'] for ev in row['user_history']
              if ev['event_type'] == 'view']

    # Honor the caller's limit: the slice used to be hard-coded to 15,
    # which silently ignored any other `max_views` value.
    viewed = viewed[:max_views]

    return Counter(metadata[item]['domain_id'] for item in viewed)

In [None]:
# Example: domain visit counts for one of the training rows
domains_visited(rows_train[3])

Counter({'MLB-LEARNING_TOY_LAPTOPS': 1,
         'MLB-VEHICLE_LED_BULBS': 6,
         'MLB-AUTOMOTIVE_SIDE_VIEW_MIRRORS': 2,
         'MLB-MOTORCYCLE_REARVIEW_MIRRORS': 2})

In [None]:
def top_items(domain, k=10):
    """
    Returns the ids of the k best-selling items of `domain`
    according to sales_x_domain, from most to least sold.
    Fewer than k ids are returned when the domain has fewer sold items.
    """
    ranked = Counter(sales_x_domain[domain]).most_common(k)
    return [item_id for item_id, _ in ranked]

In [None]:
# Example: best-selling items of a single domain
top_items('MLB-TOWEL_SETS')

[1180390, 1595048, 2548103, 483864]

In [None]:
def top_by_best_domain(row, k=10):
    """
    Recommends the top k selling items of the domain that the user
    visited the most in the given user history (row).
    When the history has no view events, k random items are returned.
    """
    has_views = any(ev['event_type'] == 'view' for ev in row['user_history'])
    if not has_views:
        # Nothing to infer a domain from: fall back to random items
        return random.choices(all_items, k=k)

    favourite_domain, _ = domains_visited(row).most_common(1)[0]
    return top_items(favourite_domain, k=k)

In [None]:
# Example: recommendation for the first training row
top_by_best_domain(rows_train[0])

[1846525, 457610, 517507]

Now we are ready to generate our recommendations for the test rows

In [None]:
# One recommendation list per test row, in the same order as rows_test
y_pred = [top_by_best_domain(row) for row in tqdm(rows_test)]

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




We extract the target value for the test rows

In [None]:
# Ground truth: the item actually bought in each test row (same order as rows_test)
y_true = [row['item_bought'] for row in rows_test]

Measure performance

In [None]:
# ndcg_score comes from the challenge_metric module, which is not part of this notebook
from challenge_metric import ndcg_score
# Compare the recommendations with the purchased items.
# NOTE(review): n_predictions=10 presumably limits the evaluation to the
# first 10 recommendations per row - confirm in challenge_metric.
score = ndcg_score(y_true, y_pred, item_data,n_predictions=10)
print(f'Your score is: {score}')

Your score is: 0.07249757471885276


### 2) Last viewed items  

We simply recommend the last items visited by the user.


In [None]:
def last_viewed(row, k=10):
    """
    Given a user history (row) it extracts the last k unique items viewed,
    most recent first. If there are fewer than k, it fills the remaining
    spots with random items drawn from all_items.
    """
    views = [ev for ev in row['user_history'] if ev['event_type'] == 'view']
    # Most recent views first
    views.sort(key=lambda ev: ev['event_timestamp'], reverse=True)

    # dict.fromkeys removes repeated items while keeping the first
    # (i.e. most recent) occurrence of each one, in order.
    recom = list(dict.fromkeys(ev['event_info'] for ev in views))[:k]

    # Pad up to the requested size. The size used to be hard-coded to 10,
    # so any other value of `k` was silently ignored.
    missing = k - len(recom)
    if missing > 0:
        # random.choices samples with replacement, so fillers may repeat
        recom += random.choices(all_items, k=missing)

    return recom

In [None]:
# Example: recommendation for the first loaded row
last_viewed(rows[0])

[2490000,
 96755,
 96103,
 1605110,
 1098704,
 2722263,
 2850363,
 2711975,
 2293666,
 1502408]

Now we are ready to generate the recommendations

# One recommendation list per test row, in the same order as rows_test
y_pred = [last_viewed(row) for row in tqdm(rows_test)]

Measure performance

In [None]:
# ndcg_score comes from the challenge_metric module, which is not part of this notebook
from challenge_metric import ndcg_score
# Compare the recommendations with the purchased items.
# NOTE(review): n_predictions=10 presumably limits the evaluation to the
# first 10 recommendations per row - confirm in challenge_metric.
score = ndcg_score(y_true, y_pred, item_data,n_predictions=10)
print(f'Your score is: {score}')

Your score is: 0.07249757471885276


### 3) Views-purchases

The idea here is to recommend the items that other users who viewed the same items as the current user most often ended up buying.

First we build a dictionary that maps each viewed item to the items that were bought, together with their frequency.

This is the "learning" stage of this simple model (that's why we do it only with the train data!).

In [None]:
# Learned table: {viewed item_id: {bought item_id: number of view events of
# the viewed item found in train rows that ended with that purchase}}
views_purchases = defaultdict(lambda: defaultdict(int))
for row in tqdm(rows_train):
    bought = int(row['item_bought'])
    for ev in row['user_history']:
        if ev['event_type'] != 'view':
            continue
        views_purchases[int(ev['event_info'])][bought] += 1

HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))




In [None]:
def get_item_scores(row):
    """
    Given a user history (row) returns a Counter that maps each candidate
    item to its score: the sum, over every view event in the history, of the
    number of times views_purchases recorded that item as bought for the
    viewed item.
    """
    item_scores = Counter()
    for ev in row['user_history']:
        if ev['event_type'] != 'view':
            continue
        # Use .get() rather than indexing: views_purchases is a defaultdict,
        # so indexing it with an item never seen in training would insert an
        # empty entry and grow the learned table at prediction time.
        purchases = views_purchases.get(int(ev['event_info']))
        if purchases:
            item_scores.update(purchases)

    return item_scores

In [None]:
def view_purchase_recom(row, k=10):
    """
    Given a user history (row) returns the top k items purchased
    for the items viewed by the user (k defaults to 10). If there are
    fewer than k, it fills the remaining spots with random items.
    """
    scores = get_item_scores(row)
    # Highest-scoring candidates first, at most k of them
    reco = [item for item, _ in scores.most_common(k)]

    missing = k - len(reco)
    if missing > 0:
        # random.choices samples with replacement, so fillers may repeat
        reco += random.choices(all_items, k=missing)

    return reco
       

Now we are ready to generate our recommendations for the test rows

In [None]:
# One recommendation list per test row, in the same order as rows_test
y_pred = [view_purchase_recom(row) for row in tqdm(rows_test)]

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




Measure performance

In [None]:
# ndcg_score comes from the challenge_metric module, which is not part of this notebook
from challenge_metric import ndcg_score
# Compare the recommendations with the purchased items.
# NOTE(review): n_predictions=10 presumably limits the evaluation to the
# first 10 recommendations per row - confirm in challenge_metric.
score = ndcg_score(y_true, y_pred, item_data,n_predictions=10)
print(f'Your score is: {score}')

Your score is: 0.02111666387024183


#### How could these baselines be improved?

* We could combine different baselines, using them sequentially to fill the recommendations.

* We could also make the different baselines vote, and choose each vote's weight with some algorithm.

* Use these baselines as candidate generators for a more complex model, signal-vs-noise style.