In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from tqdm import tqdm

import torch

In [2]:
def similarity_cosine_by_index(indicies):
    return cosine_similarity(X=synop_matrix[indicies], Y=synop_matrix)

In [3]:
book = pd.read_csv('../data/ver2/Book.tsv', sep='\t')
ratings = pd.read_csv('../data/ver2/Rating.tsv', sep='\t', low_memory=False)

tfidf = TfidfVectorizer(analyzer='word', stop_words='english')
synop_matrix = tfidf.fit_transform(book['synopsis'].values.astype('U'))
synop_matrix = synop_matrix.astype('float32')

chunk_size = 10000
matrix_len = synop_matrix.shape[0]

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()


ratings['item'] = item_encoder.fit_transform(ratings['item'])
ratings['user'] = user_encoder.fit_transform(ratings['user'])

user_cnt = ratings.groupby('user').agg({'item' : 'count'})
valid_users = list(user_cnt[user_cnt['item'] < 10].index)


In [4]:
from sklearn.model_selection import train_test_split

In [12]:
gr = ratings.groupby('user').agg({'item' : 'count'})
val_user = list(gr[gr['item'] >= 5].index)
val_ratings = ratings[ratings['user'].isin(val_user)]
train_ratings, valid_ratings = train_test_split(val_ratings, stratify=val_ratings['user'], test_size=0.2)

In [7]:
train_ratings['user'].nunique()

1075591

In [8]:
train_ratings['item']

6439164     20092
9066138     27753
8827609     26827
10343734    32032
10803408    34300
            ...  
8216848     24742
739711       1792
13859507    41213
4020999     10977
2177336      5587
Name: item, Length: 6720157, dtype: int64

In [13]:
rows, cols, data = train_ratings['user'], train_ratings['item'], train_ratings['rating']
n_users = ratings['user'].nunique()
n_items = ratings['item'].nunique()

rating_matrix = sparse.csr_matrix((data, (rows, cols)),
                            dtype='float32',
                            shape=(n_users, n_items))

In [10]:
cosine_similarities = cosine_similarity(synop_matrix)

In [15]:
for st_idx in tqdm(range(0, n_users, 10000)):

    next_idx = st_idx + 10000
    end_idx = min(next_idx, n_users)
    input_data = rating_matrix[st_idx:end_idx]
    users = range(st_idx, end_idx)
    
    result = torch.tensor(input_data @ cosine_similarities)
    result[torch.nonzero(torch.FloatTensor(input_data.todense()), as_tuple=True)] = -np.inf
    
    scores, items = torch.topk(result, k=10, dim=1)
    

    users = np.tile(users, (10,1)).T
    user_list = np.concatenate([user for user in users])
    score_list = torch.cat([score for score in scores])
    item_list = torch.cat([item for item in items])

    temp_df = pd.DataFrame()
    temp_df['user'] = user_list
    temp_df['item'] = item_list.cpu().numpy()
    temp_df['score'] = score_list.cpu().numpy()

    temp_df['user'] = user_encoder.inverse_transform(temp_df['user'])
    temp_df['item'] = item_encoder.inverse_transform(temp_df['item'])

    if st_idx == 0 : inference_df = temp_df
    else : inference_df = pd.concat([inference_df, temp_df])

inference_df = inference_df.sort_values(['user', 'score'], ascending=[True, False])

  0%|          | 0/569 [00:02<?, ?it/s]


NameError: name 'k' is not defined

In [20]:
inference_df

Unnamed: 0,user,item,score
0,0,103474,1.488660
1,0,103465,1.481825
2,0,43048,1.449955
3,0,33071,1.426067
4,0,71938,1.398082
...,...,...,...
5,13,94009,0.677304
6,13,44158,0.661345
7,13,95783,0.631429
8,13,44126,0.628387
