In [None]:
!pip install pinecone-client[grpc] -q

In [2]:
import numpy as np
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch
from torch.cuda.amp import autocast
from torch.utils.data import Dataset, DataLoader
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import math

In [3]:
# Load the train and test splits from the Kaggle input directory.
# Later cells read the UserID, HotelID, Descriptions and Rating columns.
train = pd.read_csv('/kaggle/input/datahotel/train_total_by_user_38k1.csv')
test = pd.read_csv('/kaggle/input/datahotel/test_by_user_695.csv')

In [4]:
# Stack the train rows on top of the test rows (row-wise concatenation).
total = pd.concat([train, test])

In [5]:
# Number of distinct user ids across train and test combined.
n_users = np.unique(total['UserID'].values).size

In [7]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [12]:
def get_config():
    """Return the settings shared by every stage of the notebook.

    The Pinecone API key is read from the ``PINECONE_API_KEY`` environment
    variable rather than being stored in the notebook, so the credential is
    never committed or shared together with the code. Any key that was
    previously embedded here should be considered leaked and rotated.

    Returns:
        dict: configuration with the keys
            * ``model_ckp``     - checkpoint name passed to ``from_pretrained``.
            * ``pinecone_api``  - API key handed to the Pinecone client.
            * ``max_length``    - ``max_length`` given to the tokenizer.
              NOTE(review): confirm the checkpoint supports sequences this
              long; truncation only caps at this value.
            * ``topk``          - ``top_k`` used when querying similar users.
            * ``topk_items``    - ``topk`` used when looking up similar items.
            * ``pc_index_name`` - name of the Pinecone index.
            * ``ratio``         - weight of the neighbour-user average when it
              is blended with the item-based estimate.
            * ``batch_size``    - DataLoader batch size used for upserts.

    Raises:
        RuntimeError: if ``PINECONE_API_KEY`` is unset or empty.
    """
    import os  # local import keeps this cell runnable on its own

    api_key = os.environ.get('PINECONE_API_KEY')
    if not api_key:
        raise RuntimeError(
            "PINECONE_API_KEY is not set; export it (or load it from your "
            "secrets manager) before running this notebook."
        )

    config = {
        'model_ckp': 'uitnlp/visobert',
        "pinecone_api": api_key,
        'max_length': 1024,
        'topk': 11,
        'topk_items': 10,
        'pc_index_name': 'visobert-user-vectors',
        'ratio': 0.4,
        'batch_size': 256
    }
    return config

In [13]:
# Build the settings dict once; the cells below read it as a global.
config = get_config()

In [14]:
# Pinecone client authenticated with the API key taken from the config.
pc = Pinecone(api_key=config['pinecone_api'])

In [15]:
# Load the encoder named in config['model_ckp'] and move it to `device`,
# then load the tokenizer that belongs to the same checkpoint.
model = AutoModel.from_pretrained(config['model_ckp']).to(device)
tokenizer = AutoTokenizer.from_pretrained(config['model_ckp'])

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/390M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of XLMRobertaModel were not initialized from the model checkpoint at uitnlp/visobert and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


sentencepiece.bpe.model:   0%|          | 0.00/471k [00:00<?, ?B/s]

In [16]:
# Pre-processing: lowercase, strip punctuation, normalise whitespace, then embed
def preprocessing_data(data, config):
    """Clean one piece of text and return its mean-pooled encoder embedding.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        data: text to embed; it must support ``.lower()``.
        config: settings dict; only ``config['max_length']`` is read.

    Returns:
        numpy array holding the mean of the encoder's last hidden state over
        dim 1. Callers in this notebook take element ``[0]`` of the result.
    """
    cleaned = data.lower()
    # Replace every character that is neither a word character nor whitespace
    # (punctuation, symbols) with a space.
    cleaned = re.sub(r"[^\w\s]", " ", cleaned)
    # Remove redundant whitespace: collapse runs to one space and trim the ends.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    # Stop-word removal and word segmentation are intentionally left disabled:
    # cleaned = ' '.join([word for word in cleaned.split() if word not in list_stopword])
    # cleaned = word_tokenize(cleaned, format='text')

    # Tokenise (truncating at max_length) and move the tensors to the model's device.
    encoded = tokenizer(
        cleaned,
        max_length=config['max_length'],
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    encoded = encoded.to(device)

    # Inference only: no gradients are needed for the embedding.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)

    return pooled.detach().cpu().numpy()

In [17]:
# Build one profile vector per user: the rating-weighted average of the
# embeddings of every description that user rated in the training set.
# NOTE(review): this assumes UserIDs are exactly 1..n_users and that every one
# of them has at least one training row -- confirm against the data.
user_vectors = []
for user_id in tqdm(range(1, n_users + 1)):
    user_rows = train[train['UserID'] == user_id]
    descriptions = user_rows['Descriptions']
    ratings = user_rows['Rating']

    # Sum of (embedding * rating) over this user's rows.
    weighted_total = sum(
        preprocessing_data(text, config) * rating
        for text, rating in zip(descriptions, ratings)
    )
    # preprocessing_data returns a leading batch axis; [0] removes it before
    # normalising by the total rating weight.
    user_vectors.append(weighted_total[0] / sum(ratings))

100%|██████████| 6471/6471 [08:07<00:00, 13.27it/s] 


In [24]:
# Create the retriever index on first run; an existing index with the same
# name is reused as-is.
index_name = config['pc_index_name']

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # The index dimension must match the length of a user vector.
        dimension=user_vectors[0].shape[0],
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

In [25]:
# Handle to the Pinecone index; used for the upserts and queries below.
retriver = pc.Index(index_name)

## Load data into Pinecone

### Create dataset

In [40]:
class GetDataset(Dataset):
    """Expose a sequence of vectors as id/values records.

    Element ``idx`` is returned as ``{"id": str(idx + 1), "values": data[idx]}``,
    i.e. record ids are the 1-based position rendered as a string.
    """

    def __init__(self, data):
        """Keep a reference to ``data``; nothing is copied."""
        self.data = data

    def __len__(self):
        """Return the number of stored elements."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the record for position ``idx`` (0-based)."""
        record_id = str(idx + 1)  # ids start at "1", not "0"
        record_values = self.data[idx]
        return dict(id=record_id, values=record_values)

In [41]:
# Wrap the per-user vectors so a DataLoader can batch them with their ids.
train_dataset = GetDataset(user_vectors)

In [42]:
# Batch the records in their original order; the final partial batch is kept
# so that every user vector gets uploaded.
train_loader = DataLoader(
    train_dataset,
    batch_size=config['batch_size'],
    shuffle=False,
    drop_last=False,
)

In [43]:
# Upload the user vectors to Pinecone, one DataLoader batch per request.
for batch in tqdm(train_loader):
    batch_ids = batch['id']
    batch_values = batch['values']
    # Pair each id with its vector to form the records expected by upsert.
    vectors = [
        {"id": vector_id, "values": vector_values}
        for vector_id, vector_values in zip(batch_ids, batch_values)
    ]
    retriver.upsert(vectors=vectors)

100%|██████████| 26/26 [00:52<00:00,  2.03s/it]


In [46]:
# Sanity check: show the index dimension and how many vectors it now holds.
retriver.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 6471}},
 'total_vector_count': 6471}

# Find similar items

In [49]:
# One row per hotel (first occurrence in train), ordered by HotelID, keeping
# only the id and description. reset_index() renumbers the rows 0..n-1 and
# leaves the previous row labels in an extra 'index' column.
df_items = (
    train
    .drop_duplicates(subset=['HotelID'])
    .sort_values(by=['HotelID'])
    .loc[:, ['HotelID', 'Descriptions']]
    .reset_index()
)

In [50]:
# Embed every hotel description, in df_items row order; [0] drops the leading
# batch axis returned by preprocessing_data.
item_embeddings = [
    preprocessing_data(description, config)[0]
    for description in tqdm(df_items['Descriptions'])
]

100%|██████████| 4506/4506 [00:56<00:00, 79.54it/s]


In [52]:
# Pairwise cosine similarity between item embeddings; row/column i corresponds
# to row i of df_items (the order in which item_embeddings was built).
similarity_matrix = cosine_similarity(item_embeddings)

In [53]:
def get_top_similar_items(similarity_matrix, itemid, topk):
    """Return the indices of the items most similar to ``itemid``.

    The query item itself is excluded explicitly. Simply dropping the
    top-ranked entry is not safe: when another item has an identical
    similarity score (e.g. duplicated descriptions) it can outrank the query
    item, which would then be returned as its own neighbour.

    Args:
        similarity_matrix: square, row-indexable similarity matrix.
        itemid: row index of the query item in ``similarity_matrix``.
        topk: size of the neighbourhood *including* the query item, so at
            most ``topk - 1`` neighbours are returned.

    Returns:
        list[int]: neighbour indices ordered by decreasing similarity; ties
        keep ascending index order.
    """
    scores = similarity_matrix[itemid]
    # sorted() is stable even with reverse=True, so equal scores stay in index order.
    ranked = sorted(range(len(scores)), key=lambda idx: scores[idx], reverse=True)
    neighbours = [idx for idx in ranked if idx != itemid]
    return neighbours[:max(topk - 1, 0)]

In [54]:
def update_rating(similarity_matrix, itemid, topk, train):
    """Item-based estimate: mean training rating of the items similar to ``itemid``.

    Args:
        similarity_matrix: item-item similarity matrix.
        itemid: item whose rating is being estimated.
        topk: neighbourhood size forwarded to ``get_top_similar_items``.
        train: DataFrame with ``HotelID`` and ``Rating`` columns.

    Returns:
        The average of every training rating given to the similar items. If
        none of them has a rating in ``train`` the overall mean training
        rating is returned instead of dividing by zero.
    """
    top_sim_items = get_top_similar_items(similarity_matrix, itemid, topk)
    # NOTE(review): the returned values are row positions of similarity_matrix
    # and are matched against HotelID directly -- this presumes HotelIDs
    # coincide with those positions; confirm against df_items.
    # One boolean mask replaces a full scan of `train` per similar item.
    community_rating = train.loc[train['HotelID'].isin(top_sim_items), 'Rating'].to_numpy()

    if len(community_rating) == 0:
        # No neighbour was ever rated: fall back to the global average.
        return train['Rating'].mean()

    avg_community_rating = community_rating.sum() / len(community_rating)
    return avg_community_rating

# Getting data from Pinecone

In [55]:
def predict_rating(list_similarities, similarity_matrix, itemid, train, config):
    """Predict the rating of ``itemid`` from similar users and similar items.

    Args:
        list_similarities: iterable of matches for the target user; each match
            exposes the neighbour's user id under ``['id']``.
        similarity_matrix: item-item similarity matrix.
        itemid: HotelID being rated.
        train: DataFrame with ``UserID``, ``HotelID`` and ``Rating`` columns.
        config: settings dict; reads ``ratio`` and ``topk_items``.

    Returns:
        The item-based estimate when no neighbour rated ``itemid``; otherwise
        ``ratio * mean(neighbour ratings) + (1 - ratio) * item-based estimate``.
    """
    # Needed in both outcomes, so it is computed exactly once.
    item_based_rating = update_rating(similarity_matrix, itemid, config['topk_items'], train)

    # Collect every rating the neighbour users gave to this item with a single
    # mask instead of one full scan of `train` per neighbour.
    neighbour_ids = [int(user['id']) for user in list_similarities]
    rated_by_neighbours = train['UserID'].isin(neighbour_ids) & (train['HotelID'] == itemid)
    user_ratings = train.loc[rated_by_neighbours, 'Rating'].to_numpy()

    if len(user_ratings) == 0:
        return item_based_rating

    ratio = config['ratio']
    average_rating = ratio * user_ratings.sum() / len(user_ratings) + (1 - ratio) * item_based_rating
    return average_rating

In [56]:
def evaluate(retriver, train, test, similarity_matrix, config):
    """Predict a rating for every row of ``test``.

    For each (UserID, HotelID) pair the user's nearest neighbours are fetched
    from the Pinecone index and passed to ``predict_rating``.

    Args:
        retriver: Pinecone index handle holding one vector per user id.
        train: training DataFrame forwarded to ``predict_rating``.
        test: DataFrame with ``UserID`` and ``HotelID`` columns.
        similarity_matrix: item-item similarity matrix.
        config: settings dict; reads ``topk`` here.

    Returns:
        list: predicted ratings, in the same order as the rows of ``test``.
    """
    pred_ratings = []
    # `topk` counts the query user as well, so keep at most topk - 1 neighbours.
    n_neighbours = max(config['topk'] - 1, 0)

    # total= lets tqdm display a real progress bar for the row generator.
    for _, row in tqdm(test.iterrows(), total=len(test)):
        userid = row['UserID']
        itemid = row['HotelID']
        query_id = str(userid)
        matches = retriver.query(top_k=config['topk'], id=query_id, include_metadata=True)['matches']
        # Drop the query user by id rather than by rank: with tied scores the
        # user's own vector is not guaranteed to be the first match.
        list_similarities = [match for match in matches if match['id'] != query_id][:n_neighbours]
        pred_rating = predict_rating(list_similarities, similarity_matrix, itemid, train, config)
        pred_ratings.append(pred_rating)

    return pred_ratings

In [57]:
# Predict a rating for every row of the test set.
pred_ratings = evaluate(retriver, train, test, similarity_matrix, config)

695it [01:24,  8.23it/s]


In [58]:
# Ground-truth ratings, in the same row order as pred_ratings.
labels = test['Rating'].values

In [59]:

# Mean squared error between the true and the predicted ratings.
mse = mean_squared_error(labels , pred_ratings)
print('MSE =',mse)

MSE = 2.5895798506241317


In [60]:
# Root mean squared error: the square root of the MSE computed above.
rmse = math.sqrt(mse)
print('RMSE =',rmse)     

RMSE = 1.609217154589191
