In [1]:
import sys

sys.path.append("..")
import argparse
import warnings
from logging import getLogger

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch.utils.tensorboard import SummaryWriter
from mpl_toolkits.mplot3d import Axes3D
from recbole.utils import init_seed, set_color
from sklearn.decomposition import PCA

from config.configuration import Config
from data.dataset import GeneralDataset, GeneralGraphDataset
from data.utils import data_reparation
from models.embedding import (EmbeddingHelper, EmbeddingModel, EmbeddingType,
                              TemplateType)
from trainer import Trainer
from utils.logger import init_logger
from utils.utils import get_flops, get_model


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# TensorBoard writer; the embedding projector data of the cells below is logged under ./embeddings.
writer = SummaryWriter(log_dir="./embeddings")

In [None]:

# Command-line style options, so the same cell works from a script or a notebook.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--dataset",
    "-d",
    type=str,
    default="wsdream-tp",
    help="name of datasets",
)
parser.add_argument(
    "--model",
    "-m",
    type=str,
    default="XXX",
    help="name of models",
)

# parse_known_args (instead of parse_args) ignores any arguments it does not
# recognise, e.g. the ones a Jupyter kernel is launched with.
args, _ = parser.parse_known_args()

# Build the experiment configuration, then the dataset and its train/test split.
config = Config(model=args.model, dataset=args.dataset)
dataset = GeneralGraphDataset(config)
train_data, test_data = data_reparation(config, dataset)


In [None]:
# Inspect the "country" column of the user feature table (displayed as the cell output).
dataset.user_feat["country"]

In [None]:
def get_pretrained_embedding(dataset, template_type:TemplateType,
                             num_users=339, num_items=5825, embedding_dim=384):
    """Return randomly initialised user/item embedding tables (baseline).

    Despite the name, nothing pretrained is loaded here: the weights come
    straight from freshly constructed ``torch.nn.Embedding`` layers, which
    makes this a random reference to compare text-based embeddings against.

    Args:
        dataset: Unused; kept so the call signature matches the text-based
            variant of this function.
        template_type: Unused; kept for the same reason.
        num_users: Number of rows of the user table.
            NOTE(review): the default 339 presumably matches the user count
            of the default dataset - confirm.
        num_items: Number of rows of the item table.
            NOTE(review): the default 5825 presumably matches the service
            count of the default dataset - confirm.
        embedding_dim: Width of both tables.

    Returns:
        Tuple ``(user_embedding, item_embedding)`` holding the ``weight``
        parameters of the two embedding layers.
    """
    # The user table is created first so the draw order from the global RNG
    # is the same as before for a given seed.
    user_embedding = torch.nn.Embedding(
            num_embeddings=num_users, embedding_dim=embedding_dim).weight
    item_embedding = torch.nn.Embedding(
            num_embeddings=num_items, embedding_dim=embedding_dim).weight
    return user_embedding, item_embedding

# Random baseline tables, exported to the TensorBoard embedding projector.
u_embedding, i_embedding = get_pretrained_embedding(
    train_data.dataset, TemplateType.IMPROVED
)

# Each row is labelled with its positional index.
uids = list(range(u_embedding.shape[0]))
iids = list(range(i_embedding.shape[0]))

writer.add_embedding(
    u_embedding, tag="User Embeddings - DEFAULT", metadata=uids
)
writer.add_embedding(
    i_embedding, tag="Item Embeddings - DEFAULT", metadata=iids
)


In [None]:
def get_pretrained_embedding(dataset, template_type:TemplateType):
    """Embed every user and item with the BGE-small model via EmbeddingHelper.

    Args:
        dataset: Object exposing ``uids_in_inter_feat``, ``iids_in_inter_feat``
            and ``inter_data_by_type(kind, id)``; these supply the per-id
            invocation records handed to the templates.
        template_type: Which text template is rendered before embedding.

    Returns:
        Tuple ``(user_embedding, item_embedding)`` of ``torch.Tensor``.
    """
    eh = EmbeddingHelper()

    # id -> invocation records, users first and items second.
    user_invocations = {
        uid: dataset.inter_data_by_type("user", uid)
        for uid in dataset.uids_in_inter_feat
    }
    item_invocations = {
        iid: dataset.inter_data_by_type("item", iid)
        for iid in dataset.iids_in_inter_feat
    }

    def _encode(embedding_type, invocations):
        # auto_save=False: a freshly computed embedding is not written to the cache.
        vectors = eh.fit(
            embedding_type,
            template_type,
            EmbeddingModel.INSTRUCTOR_BGE_SMALL,
            invocations=invocations,
            auto_save=False,
        )
        return torch.Tensor(vectors)

    user_embedding = _encode(EmbeddingType.USER, user_invocations)
    item_embedding = _encode(EmbeddingType.ITEM, item_invocations)

    return user_embedding, item_embedding

# Text-based embeddings, exported to the projector with each row's country as its label.
u_embedding, i_embedding = get_pretrained_embedding(
    train_data.dataset, TemplateType.IMPROVED
)

# NOTE: despite their names, uids/iids hold country values here, not ids.
uids = dataset.user_feat["country"].tolist()
iids = dataset.item_feat["country"].tolist()

writer.add_embedding(
    u_embedding, tag="User Embeddings - IMPROVED_country", metadata=uids
)
writer.add_embedding(
    i_embedding, tag="Item Embeddings - IMPROVED_country", metadata=iids
)


In [None]:
import hashlib
import os
from typing import Dict, List, Tuple, Union

import numpy as np
import pandas as pd
from langchain.embeddings import (HuggingFaceEmbeddings,
                                  HuggingFaceInstructEmbeddings)

from models.embedding.template import BasicTempalte, ImprovedTemplate, StaticTemplate
from root import ORIGINAL_DATASET_DIR, RESOURCE_DIR
from utils.enums import *

# Registry: short model key -> (embedding wrapper class, Hugging Face model id).
# EmbeddingHelper.get_models looks entries up by key and instantiates the
# wrapper with the model id. Every entry uses the same wrapper class.
embedding_models = {
    key: (HuggingFaceInstructEmbeddings, model_id)
    for key, model_id in (
        ("il", "hkunlp/instructor-large"),
        ("e5", "intfloat/e5-large-v2"),
        ("ixl", "hkunlp/instructor-xl"),
        ("bge-small", "BAAI/bge-small-en"),
        ("bge-large", "BAAI/bge-large-en-v1.5"),
        ("bge-base", "BAAI/bge-base-en-v1.5"),
    )
}

class EmbeddingHelper:
    """Render user/service metadata into text, embed it and cache the result.

    The helper reads ``userlist.txt`` and ``wslist.txt`` from
    ``ORIGINAL_DATASET_DIR``, turns every row into a text template, embeds the
    texts with one of the models registered in ``embedding_models`` and stores
    the vectors as ``.npy`` files under ``RESOURCE_DIR/embedding``.
    """

    def __init__(self) -> None:
        """Set the metadata file locations and load both tables."""
        self.upath = os.path.join(ORIGINAL_DATASET_DIR, "userlist.txt")
        self.ipath = os.path.join(ORIGINAL_DATASET_DIR, "wslist.txt")
        # File extension of the cached embedding files.
        self.suffix = ".npy"
        self._load_user_and_item()

    @property
    def _user_info_header(self):
        """Column names applied to the user table."""
        return ["user_id", "ip_address", "country", "ip_number", "AS", "latitude", "longitude"]

    @property
    def _item_info_header(self):
        """Column names applied to the service (item) table."""
        return ["service_id", "wsdl_address", "provider", "ip_address", "country", "ip_number", "AS", "latitude", "longitude"]

    def _load_user_and_item(self):
        """Read both tab-separated tables into ``user_info`` / ``item_info``.

        ``header=0`` together with ``names=...`` replaces the header row of
        each file with the column names defined above.
        """
        self.user_info = pd.read_csv(
            self.upath, sep="\t", header=0, names=self._user_info_header)
        self.item_info = pd.read_csv(
            self.ipath, sep="\t", header=0, names=self._item_info_header)

    def info2template(self, type_: EmbeddingType, template_type: TemplateType, invocations: Dict[str, List]) -> List[str]:
        """Render one text per row of the user or item table.

        Args:
            type_: ``EmbeddingType.USER`` selects the user table; any other
                value selects the item table.
            template_type: Which template class renders each row.
            invocations: Mapping from id to invocation records; ids without an
                entry get an empty list. ``None`` is treated as an empty
                mapping. Not consulted for ``BasicTempalte``.

        Returns:
            ``str(template)`` for every row, in table order.

        Raises:
            ValueError: If ``template_type`` is not BASIC, IMPROVED or STATIC.
        """
        if type_ == EmbeddingType.USER:
            info = self.user_info
            id_label = "user_id"
            kind = "user"
        else:
            info = self.item_info
            id_label = "service_id"
            kind = "item"

        if template_type == TemplateType.BASIC:
            template_func = BasicTempalte
        elif template_type == TemplateType.IMPROVED:
            template_func = ImprovedTemplate
        elif template_type == TemplateType.STATIC:
            template_func = StaticTemplate
        else:
            raise ValueError(f"unsupported template type: {template_type!r}")

        # Tolerate a missing mapping so templates simply see no invocations.
        if invocations is None:
            invocations = {}

        res = []
        print(len(info))
        for row_dict in info.to_dict(orient="records"):
            id_ = row_dict[id_label]
            if issubclass(template_func, BasicTempalte):
                # BasicTempalte (and any subclass) is built from the row alone.
                template = template_func(row_dict)
            else:
                template = template_func(
                    type=kind, invocations=invocations.get(id_, []), content=row_dict)  # type: ignore
            res.append(str(template))

        return res

    @property
    def embedding_path(self):
        """Directory holding cached embeddings; created on first access."""
        embedding_path = os.path.join(RESOURCE_DIR, "embedding")
        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(embedding_path, exist_ok=True)
        return embedding_path

    def get_models(self, type_: EmbeddingModel) -> Union[HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings]:
        """Instantiate the embedding model registered under ``type_.value``."""
        model, model_name = embedding_models[type_.value]
        return model(model_name=model_name)

    def save_embedding(self, embed_data, embed_name):
        """Store ``embed_data`` as ``<embed_name>.npy`` unless that file exists.

        ``np.save`` appends ``.npy`` to names lacking it, so the existence
        check has to be made on the suffixed path - the same path
        ``load_embedding`` reads. Checking the bare name never matches the
        file that is actually written.
        """
        file_name = embed_name if embed_name.endswith(self.suffix) else embed_name + self.suffix
        saved_path = os.path.join(self.embedding_path, file_name)
        if not os.path.exists(saved_path):
            np.save(saved_path, embed_data)

    def load_embedding(self, embed_name):
        """Load ``<embed_name>.npy`` from the cache directory.

        Raises:
            FileNotFoundError: If the cache file does not exist.
        """
        saved_path = os.path.join(
            self.embedding_path, embed_name + self.suffix)
        if not os.path.exists(saved_path):
            raise FileNotFoundError(saved_path)
        return np.load(saved_path)

    def fit(self, type_: EmbeddingType, template_type: TemplateType, model_type: EmbeddingModel, auto_save=True, *arg, **kwarg):
        """Return the embeddings for the given combination, using the cache.

        The cache file name is the first six hex digits of the MD5 of
        ``"<type>_<template>_<model>"``. It does not depend on ``invocations``,
        so an existing cache file is returned whatever invocations are passed.

        Args:
            type_: Embed users or items.
            template_type: Template used to render each row.
            model_type: Key of the embedding model to use.
            auto_save: Write freshly computed embeddings to the cache.
            **kwarg: ``invocations`` (optional) - mapping from id to
                invocation records; defaults to an empty mapping.

        Returns:
            ``numpy.ndarray`` of embeddings, one row per table row, on both
            the cached and the freshly computed path.
        """
        combined_string = f"{type_.value}_{template_type.value}_{model_type.value}"
        file_name = hashlib.md5(combined_string.encode()).hexdigest()[:6]
        try:
            return self.load_embedding(file_name)
        except FileNotFoundError:
            # Cache miss: compute the embeddings below.
            pass
        model = self.get_models(model_type)
        # ``invocations`` is optional; indexing kwarg directly raised KeyError
        # for callers that omitted it.
        invocations = kwarg.get("invocations") or {}
        embeddings = np.asarray(model.embed_documents(
            self.info2template(type_, template_type, invocations)))
        if auto_save:
            self.save_embedding(embeddings, file_name)
        return embeddings

# Smoke test of the helper defined above. ``invocations`` is passed explicitly
# (empty) so a cache miss does not fail on the missing keyword, and
# ``auto_save=False`` keeps embeddings built without invocation records out of
# the shared cache.
eh = EmbeddingHelper()
eh.fit(EmbeddingType.USER, TemplateType.IMPROVED,
        EmbeddingModel.INSTRUCTOR_BGE_SMALL,
        auto_save=False, invocations={})

