In [1]:
import sys
import pickle

# TO CHANGE: relative path from this notebook to the directory that contains
# the `src` package imported in the next cell.
BASEDIR = "../../"
# Guarded so that re-running this cell does not keep prepending duplicate
# entries to sys.path.
if BASEDIR not in sys.path:
    sys.path.insert(0, BASEDIR)

In [2]:
TRIALS = 2
FIX_FILE_PATH = "./import_fix.py"
for _ in range(TRIALS):
    try:
      from src import PersonalAI, PersonalAIConfig, QAPipelineConfig, MemPipelineConfig, \
            GraphModelConfig, EmbeddingsModelConfig, EmbedderModelConfig

      from src.db_drivers import KeyValueDriverConfig, GraphDriverConfig, VectorDriverConfig
      from src.db_drivers.kv_driver import DEFAULT_INMEMORYKV_CONFIG
      from src.db_drivers.graph_driver import DEFAULT_INMEMORYGRAPH_CONFIG
      from src.db_drivers.vector_driver import VectorDBConnectionConfig

      from src.pipelines.qa.knowledge_retriever import AStarGraphSearchConfig, AStarMetricsConfig, BFSSearchConfig, MixturedGraphSearchConfig
      from src.pipelines.qa import QueryLLMParserConfig, KnowledgeComparatorConfig, KnowledgeRetrieverConfig, QALLMGeneratorConfig

      from src.pipelines.memorize import LLMExtractorConfig, LLMUpdatorConfig

      from src.utils import NodeType, Logger
    except RuntimeError as e:
        from pathlib import Path
        fix_path = Path(FIX_FILE_PATH)
        if fix_path.is_file():
            %run {fix_path} --base_dir BASEDIR
        else:
            raise e

  from tqdm.autonotebook import tqdm, trange


#### 1. Загружаем датасет с триплетами, на основе которого будет построен граф знаний

In [6]:
# Pickled triplets that the knowledge graph is built from.
# Alternative dump: '../../data/pickled_graphs/DiaasqGigachat.pickle'
PKL_GRAPH_PATH = '../../data/pickled_graphs/DiaasqGPT4omini.pickle'

# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only point PKL_GRAPH_PATH at dumps from a trusted source.
with open(PKL_GRAPH_PATH, mode='rb') as pickle_file:
    formated_triplets = pickle.load(pickle_file)

# Report how many triplets were loaded.
print(len(formated_triplets))

283268


#### 2. Задаём конфигурацию графа знаний

In [5]:
# Graph model configuration: the graph driver is set to the 'inmemory_graph'
# vendor with the project's default in-memory settings.
GRAPH_STORAGE_CONFIG = GraphDriverConfig(
    db_vendor='inmemory_graph',
    db_config=DEFAULT_INMEMORYGRAPH_CONFIG,
)
GRAPH_MODEL_CONFIG = GraphModelConfig(
    driver_config=GRAPH_STORAGE_CONFIG,
)

In [6]:
# Vector model configuration
# Locations of the two vector databases (one for nodes, one for triplets).
NODES_DB_PATH = '../../data/graph_structures/vectorized_nodes/testing' # TO CHANGE
TRIPLETS_DB_PATH = '../../data/graph_structures/vectorized_triplets/testing' # TO CHANGE
# Forwarded as `need_to_clear` to both connections
# (presumably wipes existing content on connect -- confirm in VectorDBConnectionConfig).
NEED_TO_CLEAR = True

VECTOR_NODES_STORAGE_CONFIG = VectorDriverConfig(
    db_config=VectorDBConnectionConfig(path=NODES_DB_PATH, need_to_clear=NEED_TO_CLEAR))
VECTOR_TRIPLETS_STORAGE_CONFIG = VectorDriverConfig(
    db_config=VectorDBConnectionConfig(path=TRIPLETS_DB_PATH, need_to_clear=NEED_TO_CLEAR))
# Backward-compatible alias for the previous, misspelled name.
VECTOR_TRIPLETS_STIRAGE_CONFIG = VECTOR_TRIPLETS_STORAGE_CONFIG

# Embedder used to vectorize nodes and triplets.
DEVICE = 'cuda' # TO CHANGE
EMBEDDER_MODEL_PATH = '../../models/intfloat/multilingual-e5-small' # TO CHANGE
EMBEDDER_MODEL_CONFIG = EmbedderModelConfig(model_name_or_path=EMBEDDER_MODEL_PATH, device=DEVICE)

VECTOR_MODEL_CONFIG = EmbeddingsModelConfig(
    nodesdb_driver_config=VECTOR_NODES_STORAGE_CONFIG,
    tripletsdb_driver_config=VECTOR_TRIPLETS_STORAGE_CONFIG,
    embedder_config=EMBEDDER_MODEL_CONFIG)

In [7]:
# QA-pipeline retrieve stage: a mixture retriever assembled from an A* search
# configuration and a BFS search configuration.

# Key-value storage handed to the A* metrics configuration.
KV_STORAGE_CONFIG = KeyValueDriverConfig(
    db_vendor='inmemory_kv',
    db_config=DEFAULT_INMEMORYKV_CONFIG,
)

ASTAR_RETRIEVER_CONFIG = AStarGraphSearchConfig(
    metrics_config=AStarMetricsConfig(
        h_metric_name='ip',  # TO CHANGE
        kvdriver_config=KV_STORAGE_CONFIG,
    ),
    max_depth=20,  # TO CHANGE
    max_passed_nodes=1000,  # TO CHANGE
    accepted_node_types=[  # TO CHANGE
        NodeType.object,
        NodeType.hyper,
        NodeType.episodic,
    ],
)

BFS_RETRIEVER_CONFIG = BFSSearchConfig(
    strict_filter=True,  # TO CHANGE
    hyper_episodic_num=15,  # TO CHANGE
    chain_triplets_num=25,  # TO CHANGE
    other_triplets_num=6,  # TO CHANGE
)

# Retriever method name and its matching configuration; both are passed to
# KnowledgeRetrieverConfig when the QA pipeline is configured.
RETRIEVER_NAME = 'mixture'
RETRIEVER_CONFIG = MixturedGraphSearchConfig(
    astar_config=ASTAR_RETRIEVER_CONFIG,
    bfs_config=BFS_RETRIEVER_CONFIG,
)

In [8]:
# TO CHANGE: language passed as `lang` to the LLM-based pipeline stages.
# Accepted values: 'ru' | 'en' | 'auto'
LANGUAGE = "en"

In [None]:
# QA-pipeline configuration: query parsing, knowledge comparison, knowledge
# retrieval (using the retriever selected earlier) and answer generation.
QA_PIPELINE_CONFIG = QAPipelineConfig(
    query_parser_config=QueryLLMParserConfig(lang=LANGUAGE),
    knowledge_comparator_config=KnowledgeComparatorConfig(),
    knowledge_retriever_config=KnowledgeRetrieverConfig(
        retriever_method=RETRIEVER_NAME,
        retriever_config=RETRIEVER_CONFIG,
    ),
    answer_generator_config=QALLMGeneratorConfig(lang=LANGUAGE),
)

# Memorize-pipeline configuration: LLM extractor and LLM updator, both
# configured with the same language.
MEM_PIPELINE_CONFIG = MemPipelineConfig(
    extractor_config=LLMExtractorConfig(lang=LANGUAGE),
    updator_config=LLMUpdatorConfig(lang=LANGUAGE),
)

# Top-level configuration bundling the graph model, the embeddings model,
# both pipelines and the logger.
PERSONALAI_CONFIG = PersonalAIConfig(
    graph_struct_config=GRAPH_MODEL_CONFIG,
    embedds_struct_config=VECTOR_MODEL_CONFIG,
    qa_pipeline_config=QA_PIPELINE_CONFIG,
    mem_pipeline_config=MEM_PIPELINE_CONFIG,
    log=Logger('log/main'),
)

#### 3. Инициализируем граф знаний

In [9]:
# Build the PersonalAI instance from the configuration assembled above.
personalai = PersonalAI(
    config=PERSONALAI_CONFIG,
)

No sentence-transformers model found with name ../../models/intfloat/multilingual-e5-small. Creating a new one with mean pooling.


#### 4. Добавляем в граф загруженные триплеты

In [None]:
# Insert the loaded triplets into the graph structure of the knowledge-graph
# model and keep whatever create_triplets returns.
print("uploading data to graph-storage")
graph_info = personalai.kg_model.graph_struct.create_triplets(
    formated_triplets,
)

In [None]:
# Insert the same triplets into the embeddings structure of the
# knowledge-graph model and keep whatever create_triplets returns.
print("uploading data to vector-storage")
vector_info = personalai.kg_model.embeddings_struct.create_triplets(
    formated_triplets,
)

#### 5. Q&A

In [None]:
# N = 1000
# QUESTION_NUM = 20
# episodic_text = []
# for i in range(N):
#     if formated_triplets[i].end_node.type.value == 'episodic':
#         episodic_text.append(formated_triplets[i].end_node.name)
# unique_episodic_texts = list(set(episodic_text))
# print(len(episodic_text), len(unique_episodic_texts))

# llm_agent = GigaChatAgent()

# QUESTION_GEN_PROMPT = "Generate one question based on given dialogue below. Generate question in English.\n\nDialogue:\n{d}\n\nQuestion:\n"
# ANSWER_GEN_PROMPT = "Generate answer for the question based on given dialogue below. Generate answer in English. Answer needs to be generated in a short format: in several phrases; dont generate full sentence as an answer.\n\nDialogue:\n{d}\n\nQuestion:{q}\n\nAnswer:\n"

# qa_examples = []
# for text in tqdm(unique_episodic_texts[:QUESTION_NUM]):
#     question = llm_agent.generate(user_prompt=QUESTION_GEN_PROMPT.format(d=text)).strip()
#     answer = llm_agent.generate(user_prompt=ANSWER_GEN_PROMPT.format(d=text, q=question)).strip()
#     print("Q: ", question)
#     print("A: ", answer)
#     qa_examples.append((question, answer))

In [10]:
# Evaluation examples as (question, reference answer) pairs.
qa_examples = [
    (
        "What is Lauren's opinion about the 13promax battery compared to other batteries in terms of battery life?",
        "Lauren thinks the 13promax battery has better battery life than other batteries in mobile phones.",
    ),
    (
        "What does Emily think about the performance of her brother's GT2PRO and IQOO7 smartphones during voice calls.",
        "Emily thinks GT2PRO is not as hot during voice calls compared to IQOO7.",
    ),
    (
        "What is the difference between the heat dissipation performance of the IQOO9 and Xiaomi Mi 12Pro smartphones according to Laura's experience?",
        "Laura finds the heat dissipation performance of the IQOO9 superior to the Xiaomi Mi 12Pro, resulting in less heating while using it for long periods.",
    ),
    (
        "What are the reasons behind Rodrigo's dislike for Xiaomi's MIUI software, specifically mentioning the frequent bugs he has experienced over the past two years?",
        "Rodrigo dislikes Xiaomi's MIUI software due to frequent bugs he has experienced over the past two years, including small bugs that affect usage, such as force restarts.",
    ),
    (
        "What is Margaret's main concern regarding her MIX4 smartphone?",
        "problem with the photography camera module and its poor performance.",
    ),
    (
        "What are the advantages of the Xiaomi Mi 10pro mentioned by Horace and Jesse, especially in comparison to other devices?",
        "Advantages of Xiaomi Mi 10pro: splitscreen capability, fast charging, screen durability, waterproofing, good signal strength, fast gaming performance. Comparatively better than other devices (especially Apple) in terms of battery life and signal strength.",
    ),
    (
        "What issues have James and Hailey experienced with their MIX4 devices, especially regarding the photography camera module?",
        "James experienced multiple crashes in a short period, while Hailey mentioned her device being used many times.",
    ),
    (
        "What was Kevin's experience with his brother's GT2PRO and Xiaomi 10Pro?",
        "Kevin found his brother's GT2PRO less hot than expected during voice calls, while Xiaomi 10Pro didn't heat up when recording videos.",
    ),
    (
        "What mobile phone does Kayla's brother have that doesn't get hot during voice calls, according to her?",
        "IQOO7",
    ),
]

In [11]:
# Ask the second-to-last example question; answer_question returns a pair,
# of which only the answer is printed here.
answer, info = personalai.answer_question(
    qa_examples[-2][0],
)
print(answer)

"Kevin's experiences with his brother's GT2PRO and Xiaomi 10Pro included observations about the devices' relative performance compared to Apple products, particularly regarding signal strength and battery life. He expressed concern about signal issues with Xiaomi and hesitation in switching to Apple due to signal and electrical issues."