# Aula 08 - Aprendendo a Ranquear - Exemplos

In [1]:
import pandas as pd
import numpy as np
import random

## Implementação do BPR

### Importar base de dados

In [2]:
# import wget
# !python3 -m wget https://github.com/mmanzato/MBABigData/raw/master/ml-20m-compact.tar.gz
# !tar -xvzf ml-20m-compact.tar.gz

In [3]:
# Read the movie catalogue and the ratings log. header=0 consumes each file's own
# header row, so the labels passed through `names` are the ones that end up in use.
movies = pd.read_csv(
    './dataset/movies_sample.csv',
    header=0,
    names=['itemId', 'title', 'genre'],
)
ratings = pd.read_csv(
    './dataset/ratings_sample.csv',
    header=0,
    names=['userId', 'itemId', 'rating', 'timestamp'],
)

# Keep the (user, item, rating) triples and attach each title through the shared itemId key.
df = ratings[['userId', 'itemId', 'rating']].merge(movies[['itemId', 'title']], on='itemId')
df

  ratings = pd.read_csv('./dataset/ratings_sample.csv', names=['userId', 'itemId', 'rating', 'timestamp'], header=0)


Unnamed: 0,userId,itemId,rating,title
0,11,7481,5.0,Enemy Mine (1985)
1,11,1046,4.5,Beautiful Thing (1996)
2,11,616,4.0,"Aristocats, The (1970)"
3,11,3535,2.0,American Psycho (2000)
4,11,5669,5.0,Bowling for Columbine (2002)
...,...,...,...,...
190616,138493,288,5.0,Natural Born Killers (1994)
190617,138493,1748,5.0,Dark City (1998)
190618,138493,616,4.0,"Aristocats, The (1970)"
190619,138493,1597,4.5,Conspiracy Theory (1997)


### Mapeamento de ids

In [4]:
# Re-index the raw user/item ids as contiguous zero-based integers, numbered in order
# of first appearance in `df`, and overwrite the id columns with the new values.
map_users = {user: idx for idx, user in enumerate(df['userId'].unique())}
map_items = {item: idx for idx, item in enumerate(df['itemId'].unique())}
df['userId'] = df['userId'].map(map_users)
df['itemId'] = df['itemId'].map(map_items)

# Title lookup keyed by the re-indexed item id. Built in a single pass over the two
# columns rather than a row-by-row iterrows() loop; as before, when an id occurs on
# several rows the title of the last such row is the one kept.
map_title = dict(zip(df['itemId'].tolist(), df['title'].tolist()))


### Funções para obter informações específicas do DataFrame

In [5]:
# Rating that a user gave to an item.
def get_rating(df, userId, itemId):
    """Return the rating stored for the (userId, itemId) pair.

    Args:
        df: DataFrame with `userId`, `itemId` and `rating` columns.
        userId: user id to look up.
        itemId: item id to look up.

    Returns:
        The `rating` value of the first matching row, or 0 when the pair
        does not occur in `df`.
    """
    pair_mask = (df['userId'] == userId) & (df['itemId'] == itemId)
    if not pair_mask.any():
        return 0
    return df.loc[pair_mask, 'rating'].iloc[0]

# Example lookup for user 1 / item 5; the displayed 0 is the "pair not found" default.
get_rating(df, 1, 5)

0

In [6]:
# All items rated by a user.
def get_item_ids(df, userId):
    """Return the ids of every item the user has a row for.

    Args:
        df: DataFrame with `userId` and `itemId` columns.
        userId: user id to look up.

    Returns:
        List of `itemId` values in row order; an empty list when the user
        does not occur in `df`.
    """
    user_rows = df['userId'] == userId
    if not user_rows.any():
        return []
    return df.loc[user_rows, 'itemId'].tolist()

# Example: item ids rated by user 0 (ids are the re-indexed ones, not the raw dataset ids).
get_item_ids(df, 0)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [7]:
# Title of an item given its id.
def get_item_title(itemId, frame=None):
    """Return the title of `itemId`, or '' when the id is unknown.

    Args:
        itemId: item id as stored in the `itemId` column.
        frame: DataFrame with `itemId` and `title` columns. When omitted, the
            notebook-level `df` is used, so existing one-argument calls keep
            working; passing it explicitly mirrors the other helpers, which
            all receive their DataFrame as an argument.

    Returns:
        The `title` of the first row whose `itemId` matches, or an empty
        string when no row matches.
    """
    if frame is None:
        frame = df
    # Filter once and reuse the result for both the emptiness check and the lookup.
    titles = frame.loc[frame['itemId'] == itemId, 'title']
    if titles.empty:
        return ''
    return titles.iloc[0]

# Example: title stored for the re-indexed item id 0.
get_item_title(0)

'Enemy Mine (1985)'

In [8]:
# All ratings given by a user.
def get_user_ratings(df, userId):
    """Return every rating value the user has given.

    Args:
        df: DataFrame with `userId` and `rating` columns.
        userId: user id to look up.

    Returns:
        List of `rating` values in row order; an empty list when the user
        does not occur in `df`.
    """
    user_rows = df['userId'] == userId
    if not user_rows.any():
        return []
    return df.loc[user_rows, 'rating'].tolist()

# Example: rating values given by user 0, in row order.
get_user_ratings(df, 0)

[5.0, 4.5, 4.0, 2.0, 5.0, 5.0, 5.0, 5.0, 3.0, 5.0, 5.0, 3.5, 5.0]

In [9]:
# Mean of the ratings given by a user.
def get_user_mean(df, userId):
    """Return the arithmetic mean of the user's ratings.

    Args:
        df: DataFrame passed straight through to `get_user_ratings`.
        userId: user id to look up.

    Returns:
        `np.mean` of the list produced by `get_user_ratings(df, userId)`.
    """
    user_scores = get_user_ratings(df, userId)
    return np.mean(user_scores)

# Example: mean rating given by user 1.
get_user_mean(df, 1)

3.9285714285714284

In [10]:
# All users who rated an item.
def get_user_ids(df, itemId):
    """Return the ids of every user that has a row for the item.

    Args:
        df: DataFrame with `userId` and `itemId` columns.
        itemId: item id to look up.

    Returns:
        List of `userId` values in row order; an empty list when the item
        does not occur in `df`.
    """
    item_rows = df['itemId'] == itemId
    if not item_rows.any():
        return []
    return df.loc[item_rows, 'userId'].tolist()

# Mean of the ratings received by an item.
def get_item_mean(df, itemId):
    """Return the arithmetic mean of the item's ratings.

    Args:
        df: DataFrame passed straight through to `get_item_ratings`.
        itemId: item id to look up.

    Returns:
        `np.mean` of the list produced by `get_item_ratings(df, itemId)`.
    """
    item_scores = get_item_ratings(df, itemId)
    return np.mean(item_scores)

# All ratings received by an item.
def get_item_ratings(df, itemId):
    """Return every rating value given to the item.

    Args:
        df: DataFrame with `itemId` and `rating` columns.
        itemId: item id to look up.

    Returns:
        List of `rating` values in row order; an empty list when the item
        does not occur in `df`.
    """
    item_rows = df['itemId'] == itemId
    if not item_rows.any():
        return []
    return df.loc[item_rows, 'rating'].tolist()

# Example: every rating received by item 0 (the whole list is displayed, one value per row of `df` for that item).
get_item_ratings(df, 0)

[5.0,
 4.0,
 3.0,
 3.0,
 3.0,
 4.0,
 5.0,
 2.0,
 3.5,
 3.0,
 2.5,
 2.5,
 2.5,
 4.0,
 3.5,
 3.0,
 3.5,
 5.0,
 5.0,
 3.5,
 3.0,
 3.5,
 3.5,
 3.5,
 4.0,
 3.0,
 2.5,
 3.0,
 4.5,
 2.5,
 4.0,
 4.0,
 3.5,
 4.0,
 4.0,
 3.0,
 4.0,
 3.0,
 3.0,
 3.0,
 3.5,
 3.0,
 4.5,
 3.5,
 2.5,
 3.0,
 3.5,
 3.0,
 3.5,
 2.0,
 0.5,
 3.5,
 5.0,
 3.5,
 2.5,
 4.0,
 3.0,
 2.5,
 4.0,
 2.5,
 4.5,
 4.0,
 1.5,
 3.0,
 3.0,
 3.5,
 3.0,
 3.5,
 2.5,
 4.5,
 3.5,
 3.5,
 4.0,
 1.5,
 3.5,
 3.5,
 4.0,
 3.0,
 2.0,
 4.0,
 1.5,
 3.5,
 2.5,
 3.5,
 5.0,
 1.5,
 3.5,
 3.5,
 1.5,
 3.0,
 3.0,
 2.5,
 5.0,
 3.5,
 2.5,
 3.5,
 3.5,
 4.0,
 4.0,
 3.0,
 3.5,
 3.0,
 2.5,
 4.0,
 4.0,
 3.0,
 3.0,
 3.0,
 2.0,
 3.5,
 4.0,
 4.0,
 4.0,
 3.0,
 4.0,
 3.0,
 5.0,
 2.0,
 5.0,
 3.5,
 4.0,
 3.0,
 4.0,
 4.0,
 3.5,
 4.5,
 3.0,
 3.5,
 3.0,
 3.0,
 5.0,
 3.5,
 4.5,
 4.5,
 4.0,
 3.5,
 4.0,
 3.5,
 2.0,
 3.0,
 3.0,
 5.0,
 5.0,
 3.0,
 4.0,
 4.0,
 4.0,
 3.5,
 3.0,
 3.0,
 3.0,
 3.0,
 3.5,
 2.0,
 3.5,
 2.5,
 1.0,
 4.0,
 3.5,
 4.0,
 5.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 3.0

### Divisão da base em treino e teste

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows of `df` as the test split; the fixed random_state keeps the split repeatable.
train, test = train_test_split(df, test_size=.2, random_state=2)

### Função para sortear um par de itens (i, j) de um dado usuário, onde i é um item conhecido (observado para o usuário no conjunto de treino) e j é um item desconhecido (não observado para o usuário no treino)

In [12]:
observed = dict()
unobserved = dict()
all_users = df['userId'].unique().tolist()  # taken from the full dataset
all_items = df['itemId'].unique().tolist()  # taken from the full dataset

# Loop-invariant pieces, computed once instead of once per user:
#  - the set of every item id;
#  - each user's training items, gathered with a single groupby pass over `train`
#    (row order inside each group is preserved) instead of filtering the whole
#    frame again for every user.
item_universe = set(all_items)
train_items_by_user = train.groupby('userId')['itemId'].apply(list).to_dict()

for u in all_users:
    # Observed items come from the training split only; a user with no training
    # rows gets an empty list.
    observed[u] = train_items_by_user.get(u, [])
    # Everything the user has not interacted with in training is a candidate "j".
    unobserved[u] = list(item_universe - set(observed[u]))

def draw(userId):
    """Sample one (i, j) item pair for a user.

    Args:
        userId: key into the notebook-level `observed` / `unobserved` dicts.

    Returns:
        Tuple `(i, j)` where `i` is picked at random from `observed[userId]`
        and `j` is picked at random from `unobserved[userId]`.
    """
    known_item = random.choice(observed[userId])
    unknown_item = random.choice(unobserved[userId])
    return known_item, unknown_item

# Example: one sampled (observed, unobserved) pair for user 2. No seed for `random` is set
# in the visible cells, so the pair is expected to differ between runs.
draw(2)


(23, 350)

### Treinamento do modelo

In [13]:
def train_bprmf(train, n_factors, lr=0.05, reg=0.02, miter=30):
    """Train a BPR matrix-factorisation model with stochastic gradient steps.

    Each iteration samples `len(train)` users (with replacement) from the users
    present in `train`, asks `draw` for an (observed i, unobserved j) item pair
    for each of them, and nudges the parameters so that i scores above j.

    Args:
        train: DataFrame with a `userId` column; supplies the users to sample
            and the number of updates per iteration.
        n_factors: number of latent factors per user / item.
        lr: learning rate.
        reg: L2 regularisation weight.
        miter: number of iterations.

    Returns:
        Tuple `(item_bias, p, q, error)`:
            item_bias: array with one bias per item id.
            p: user factor matrix, one row per user id.
            q: item factor matrix, one row per item id.
            error: list with one value per iteration, the mean BPR loss
                `-ln(sigmoid(x_uij))` over that iteration's samples
                (lower is better).

    Notes:
        Relies on the notebook-level `df` (to size the parameter arrays) and
        `draw` (to sample item pairs). Initialisation and sampling use the
        global `np.random` / `random` state.
    """
    # The arrays are sized from the full dataset `df`, not from `train`, so every
    # re-indexed user/item id has a row.
    n_users = df['userId'].max() + 1
    n_items = df['itemId'].max() + 1
    item_bias = np.zeros(n_items)
    p = np.random.normal(0, 0.1, (n_users, n_factors))
    q = np.random.normal(0, 0.1, (n_items, n_factors))

    # Loop-invariant: the pool of users to sample from and the samples per iteration.
    train_users = train['userId'].unique()
    n_samples = len(train)

    error = []
    for t in range(miter):
        print('Iter #', t)
        loss_sum = 0.0
        random_users = random.choices(train_users, k=n_samples)
        for u in random_users:
            i, j = draw(u)
            # Score margin between the observed item i and the unobserved item j.
            x_uij = item_bias[i] - item_bias[j] + (np.dot(p[u], q[i]) - np.dot(p[u], q[j]))

            # BPR loss for this sample: -ln(sigmoid(x)) == ln(1 + exp(-x)),
            # evaluated with logaddexp for numerical stability.
            loss_sum += np.logaddexp(0, -x_uij)

            # Gradient scale: sigmoid(-x_uij).
            eps = 1 / (1 + np.exp(x_uij))

            item_bias[i] += lr * (eps - reg * item_bias[i])
            item_bias[j] += lr * (-eps - reg * item_bias[j])

            # Snapshot the current factors. Plain indexing returns views into p/q,
            # so without the copies the in-place update of p[u] below would leak
            # into the q[i] and q[j] updates.
            u_f = p[u].copy()
            i_f = q[i].copy()
            j_f = q[j].copy()

            # All three updates are computed from the pre-update snapshot.
            p[u] += lr * ((i_f - j_f) * eps - reg * u_f)
            q[i] += lr * (u_f * eps - reg * i_f)
            q[j] += lr * (-u_f * eps - reg * j_f)

        error.append(loss_sum / len(random_users))

    return item_bias, p, q, error

In [None]:
# Fit the model on the training split with 4 latent factors; lr, reg and miter keep their defaults.
b, p, q, error = train_bprmf(train, 4)

Iter # 0
Iter # 1
Iter # 2
Iter # 3
Iter # 4
Iter # 5


In [None]:
import matplotlib.pyplot as plt

# Training curve: one point per iteration of train_bprmf, with labelled axes so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(error)
ax.set_xlabel('Iteração')
ax.set_ylabel('Erro médio por iteração')
plt.show()

### Gerar N recomendações para cada usuário

In [None]:
def predict(N=10):
    """Build the top-N recommendation list for every user.

    Uses the notebook-level trained parameters `b`, `p`, `q`, the user list
    `all_users` and the per-user training items in `observed`.

    Args:
        N: maximum number of items to recommend per user.

    Returns:
        DataFrame with columns `userId`, `movieId`, `score`: for each user in
        `all_users` order, up to N items the user has not observed, sorted by
        descending score (equal scores keep ascending item-id order).
    """
    # Score matrix: row = user id, column = item id.
    w = b.T + np.dot(p, q.T)
    ranking = []

    for user in all_users:
        # Index the score row by the user id itself (the same id that indexes `p`
        # during training), not by the position of the user inside all_users.
        user_scores = w[user]
        # Set membership keeps the "already observed" test O(1) per candidate.
        already_seen = set(observed[user])
        # Stable argsort on the negated scores gives descending order while ties
        # keep ascending item-id order.
        ordered_items = np.argsort(-user_scores, kind='stable')

        partial_ranking = []
        for i in ordered_items:
            # Checked before appending so that N=0 yields no recommendations.
            if len(partial_ranking) >= N:
                break
            if i not in already_seen:
                partial_ranking.append((user, int(i), user_scores[i]))

        ranking += partial_ranking

    return pd.DataFrame(ranking, columns=['userId', 'movieId', 'score'])
    

In [None]:
# Build the recommendation table with the default list size (N=10 items per user).
ranking = predict()

In [None]:
# Attach the readable title to each recommended item id, then show user 0's list.
ranking['title'] = ranking['movieId'].map(map_title)
ranking.loc[ranking['userId'] == 0]

## Case Recommender

In [None]:
# Write both splits as tab-separated text without header row or index column.
# NOTE(review): every column of the frames is written, including `title`; presumably
# Case Recommender only reads the leading user / item / rating fields — confirm.
train.to_csv('train.dat', index=False, header=False, sep='\t')
test.to_csv('test.dat', index=False, header=False, sep='\t')

In [None]:
from caserec.recommenders.item_recommendation.bprmf import BprMF

# Run Case Recommender's BprMF on the exported files.
# NOTE(review): the three positional arguments are presumably the train file, the test
# file and the output ranking file (the next cell reads 'ir_bprmf.dat') — confirm
# against the library's signature.
BprMF('train.dat', 'test.dat', 'ir_bprmf.dat').compute()

In [None]:
# Load the ranking file written by the previous cell (tab-separated, no header row).
recs = pd.read_csv(
    'ir_bprmf.dat',
    names=['userId', 'itemId', 'score'],
    sep='\t',
)
# Attach the readable title to each recommended item id, then show user 0's list.
recs['title'] = recs['itemId'].map(map_title)
recs.loc[recs['userId'] == 0]