In [1]:
import json
import os
import re
import pandas as pd
import numpy as np

## 1. 读取本地元数据文件

In [2]:
# Paths of the pickled metadata tables, keyed by metadata kind. All dumps live
# in the same directory and share the same date suffix, so the paths are
# derived from the kind names.
df_fpath = {
    kind: f'../data/dumps/{kind}_241201.pkl'
    for kind in ('tag', 'character', 'group', 'parody', 'artist')
}

In [3]:
# Filled by the loading cell: metadata kind -> loaded table.
meta_df_map = dict()

In [5]:
# Read every metadata dump and store it under its metadata kind.
# NOTE: unpickling can execute arbitrary code; only load dumps you trust.
for kind, pkl_path in df_fpath.items():
    meta_df_map[kind] = pd.read_pickle(pkl_path)

## 2. 读取本地漫画节点数据

In [6]:
# Load the pickled manga table (one row per manga; its columns are listed by
# the `manga_df.columns` cell further down).
# NOTE: unpickling can execute arbitrary code; only load dumps you trust.
manga_df = pd.read_pickle('../data/dumps/manga_241130.pkl')

## 3. 建立neo4j连接

In [7]:
import neo4j
from neo4j import GraphDatabase

# Connection settings for the Neo4j server. Each value can be overridden
# through an environment variable so that real credentials do not have to be
# stored in the notebook; the fallbacks are the original local values.
# `os` is imported in the first cell of the notebook.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")

# (user, password) pair passed as `auth=` to GraphDatabase.driver.
AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "11111111"),
)

### 3.1 插入元数据节点

In [8]:
# Create nodes in bulk.
def create_nodes_in_batch(tx, meta_name , nodes):
    """Create one node labelled ``meta_name`` for every entry of ``nodes``.

    Args:
        tx: Object exposing ``run(query, **params)``; here the transaction
            handed in by ``session.execute_write``.
        meta_name: Node label. It is interpolated directly into the Cypher
            text, so it must be a trusted identifier.
        nodes: List of dicts; the query reads the keys ``name``, ``count``,
            ``link`` (stored as ``href``) and ``id`` from each one.

    Returns:
        None.
    """
    # Labels cannot be passed as query parameters, hence the interpolation;
    # the row data itself goes through the $nodes parameter.
    property_map = "{name: node.name, count: node.count, href: node.link, id:node.id}"
    cypher = f"UNWIND $nodes AS node CREATE (t:{meta_name} {property_map})"
    tx.run(cypher, nodes=nodes)

# def update_nodes_in_batch(tx, meta_name , nodes):
#     query = f" UNWIND $nodes AS node MATCH (u:{meta_name}) WHERE u.name = node.name SET u.count = node.count "
#     tx.run(query, nodes=nodes)

In [12]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:

    # Check that the server can be reached before doing any work.
    driver.verify_connectivity()

    for key in meta_df_map:
        node_df = meta_df_map[key]

        # One dict per row, keyed by column name. `to_dict('records')` avoids
        # transposing the frame, which upcasts every column to object and
        # silently drops rows whose index label is duplicated.
        nodes_to_insert = node_df.to_dict('records')

        with driver.session() as session:

            # The metadata kind doubles as the node label.
            session.execute_write(create_nodes_in_batch, key, nodes_to_insert)
            

### 3.2 插入漫画节点

In [13]:
# Show the available columns; they are split below into node properties and
# relationship columns.
manga_df.columns

Index(['id', 'title', 'scanlator', 'url', 'api', 'media_id', 'epos',
       'num_favorites', 'tag', 'group', 'parody', 'character', 'language',
       'artist', 'category', 'cover', 'thumbnail', 'image_urls', 'num_pages'],
      dtype='object')

In [14]:
# Columns of manga_df that are stored as properties on each manga node.
node_property = [
    'id',
    'title',
    'url',
    'api',
    'media_id',
    'epos',
    'num_favorites',
    'category',
    'language',
    'cover',
    'thumbnail',
    'image_urls',
    'num_pages',
]

In [15]:
# Columns of manga_df that become relationships to metadata nodes; each name
# is also a key of the metadata tables loaded above.
relation_property = [
    'tag',
    'group',
    'parody',
    'character',
    'artist',
]

In [16]:
def create_manga_nodes_in_batch(tx , nodes, properties=None):
    """Create one ``manga`` node for every entry of ``nodes``.

    Args:
        tx: Object exposing ``run(query, **params)``; here the transaction
            handed in by ``session.execute_write``.
        nodes: List of dicts, one per manga, holding a value for every
            property name being written.
        properties: Names of the properties to copy from each dict onto the
            node. Defaults to the notebook-level ``node_property`` list, so
            existing calls behave as before. The names are interpolated into
            the Cypher text and must therefore be trusted identifiers.

    Returns:
        None.
    """
    if properties is None:
        # Resolved at call time so the function follows later edits to the
        # notebook-level list.
        properties = node_property

    # Builds "{id:node.id,title:node.title,...}" for the CREATE clause.
    in_query = '{' + ','.join(f'{key}:node.{key}' for key in properties) + '}'
    query = f"UNWIND $nodes AS node CREATE (t:manga {in_query})"

    tx.run(query, nodes=nodes)

In [17]:
# One dict per manga row, restricted to the node-property columns.
# `to_dict('records')` avoids transposing the frame, which upcasts every column
# to object and silently drops rows whose index label is duplicated.
nodes_to_insert = manga_df[node_property].to_dict('records')

In [18]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    # Check that the server can be reached before doing any work.
    driver.verify_connectivity()

    with driver.session() as session:
        # Write the manga nodes in fixed-size chunks, one execute_write call
        # per chunk.
        batch_size = 256
        for start in range(0, len(nodes_to_insert), batch_size):
            session.execute_write(
                create_manga_nodes_in_batch,
                nodes_to_insert[start:start + batch_size],
            )

# The row dicts are no longer needed once they have been written.
del nodes_to_insert

### 3.3 插入节点关系

```python
relationships_to_insert = [
    {"start_node": "A", "end_node": "B", "relationship_type": "RELATIONSHIP_1", "properties": {"prop1": "value1"}},
    {"start_node": "A", "end_node": "B", "relationship_type": "RELATIONSHIP_2", "properties": {"prop2": "value2"}},
    # 更多关系数据...
]
```

In [20]:
# Lookup used when building relationships: rel_id_map[kind][name] -> id,
# built from the `name` and `id` columns of each metadata table. If a name
# occurs more than once in a table, the last row wins.
rel_id_map = {
    key: dict(zip(meta_df['name'], meta_df['id']))
    for key, meta_df in meta_df_map.items()
}

In [24]:
# Spot-check the lookup: resolve one tag name to its id.
rel_id_map['tag']['handjob']

1033

In [57]:
def create_relation_batch(tx,relationships):
    """Create manga -> metadata relationships for a batch of records.

    Each record is a dict with the keys ``start_node`` (id of a ``manga``
    node), ``end_node`` (id of the target node) and ``relationship_type``.
    The target label is the relationship type with a leading ``has_``
    removed, e.g. ``has_tag`` points at ``tag`` nodes.

    Records are grouped by relationship type and one query is run per type,
    so a batch of a single type results in exactly one query, a mixed batch
    gets the correct type and label for every record, and an empty batch runs
    nothing instead of failing on ``relationships[0]``.

    Args:
        tx: Object exposing ``run(query, **params)``; here the transaction
            handed in by ``session.execute_write``.
        relationships: List of relationship records as described above.

    Returns:
        None.
    """
    # Relationship types and labels cannot be query parameters, so records are
    # bucketed by type and the type is interpolated once per bucket.
    grouped = {}
    for rel in relationships:
        grouped.setdefault(rel['relationship_type'], []).append(rel)

    for rel_type, rels in grouped.items():
        schema = rel_type.removeprefix('has_')
        query = f'''UNWIND $relationships AS rel
    MATCH (a:manga where a.id=rel.start_node), (b:{schema} where b.id=rel.end_node)
    CREATE (a)-[:{rel_type}]->(b)
    '''
        tx.run(query, relationships=rels)

In [37]:
# Preview one relationship record.
# NOTE(review): `relation_lst` is only created (and deleted again) by the
# relationship-building cell further down, so this cell raises NameError on a
# fresh kernel run from top to bottom.
relation_lst[0]

{'start_node': 228132, 'end_node': 1033, 'relationship_type': 'has_tag'}

In [None]:
# Build the relationships for one relation type at a time, insert them, then
# free the list before moving on to the next type.
for rel_type in relation_property:
    name_to_id = rel_id_map[rel_type]
    relationship_type = f'has_{rel_type}'

    # One record per (manga, related name) pair. Iterating the two columns
    # directly avoids building a Series for every row as iterrows() does.
    # A name missing from the metadata lookup raises KeyError.
    relation_lst = [
        {'start_node': manga_id, 'end_node': name_to_id[name], 'relationship_type': relationship_type}
        for manga_id, names in zip(manga_df['id'], manga_df[rel_type])
        for name in names
    ]

    # insert relationship
    with GraphDatabase.driver(URI, auth=AUTH) as driver:

        # Check that the server can be reached before doing any work.
        driver.verify_connectivity()

        with driver.session() as session:

            # Fixed-size chunks, one execute_write call per chunk.
            batch_size = 256
            for i in range(0,len(relation_lst),batch_size):
                relations_batch = relation_lst[i:i+batch_size]

                session.execute_write(create_relation_batch, relations_batch)

    print(f'{rel_type} insert successful. Total volume:{len(relation_lst)}')
    del relation_lst