In [1]:
import json
import os
import re
import pandas as pd
import numpy as np

## 1. 读取本地元数据文件

In [2]:
# Pickle files holding the metadata tables, keyed by metadata kind.
df_fpath = dict(
    tag='../data/dumps/tag_241201.pkl',
    character='../data/dumps/character_241201.pkl',
    group='../data/dumps/group_241201.pkl',
    parody='../data/dumps/parody_241201.pkl',
    artist='../data/dumps/artist_241201.pkl',
)

In [3]:
# Will hold one loaded metadata DataFrame per metadata kind.
meta_df_map = dict()

In [4]:
# Load every metadata pickle into meta_df_map under its metadata kind.
for meta_kind, meta_fpath in df_fpath.items():
    meta_df_map[meta_kind] = pd.read_pickle(meta_fpath)

## 2. 读取数据

In [5]:
# Load the manga table from its pickle dump.
# NOTE(review): unpickling executes code stored in the file — only load trusted dumps.
manga_df = pd.read_pickle(filepath_or_buffer='../data/dumps/manga_241130.pkl')

In [15]:
# Rebuild meta_df_map in one expression; this reads the same files listed in df_fpath.
meta_df_map = {
    meta_kind: pd.read_pickle(meta_fpath)
    for meta_kind, meta_fpath in df_fpath.items()
}

### 2.1 构建数据

In [16]:
# Names of the manga columns treated as plain node properties.
node_property = [
    'id',
    'title',
    'url',
    'api',
    'media_id',
    'epos',
    'num_favorites',
    'category',
    'language',
    'cover',
    'thumbnail',
    'image_urls',
    'num_pages',
]

In [17]:
# Manga columns that are turned into relationships towards metadata nodes.
relation_property = [
    'tag',
    'group',
    'parody',
    'character',
    'artist',
]

In [18]:
# For every metadata kind, map each entity name to the value of its
# '<kind>_id' column. The two columns are zipped directly instead of walking
# the frame with iterrows(), which builds a Series object for every row.
# If a name occurs more than once, the last row wins (as before).
rel_id_map = {
    key: dict(zip(meta_df['name'], meta_df[f'{key}_id']))
    for key, meta_df in meta_df_map.items()
}

In [19]:
# Spot check: display the id stored in rel_id_map for one tag name.
rel_id_map['tag']['handjob']

1033

## 3. 建立neo4j连接

远程连接、本地连接均可。

In [74]:
import neo4j
from neo4j import GraphDatabase

# Connection settings can be overridden with environment variables so real
# credentials do not have to be written into the notebook. The fallbacks are
# the values this notebook used before, so it still runs unchanged locally.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")

AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "11111111"),
)

# Check up front that the driver can reach the server with these settings.
with GraphDatabase.driver(URI, auth=AUTH) as driver:

    driver.verify_connectivity()

### 3.1 更新meta节点值

In [57]:
# Create nodes in bulk
def create_nodes_in_batch(tx, meta_name , nodes):
    """Create one node per property map in ``nodes``, labelled ``meta_name``.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        meta_name: Label given to every created node.
        nodes: List of property maps, one per node to create.
    """
    # The label is sent as a query parameter instead of being formatted into
    # the Cypher text, so a label containing quotes cannot break the statement.
    query = '''
        CALL apoc.create.nodes($labels,$nodes)
    '''
    tx.run(query, labels=[meta_name], nodes=nodes)

# def update_nodes_in_batch(tx, meta_name , nodes):
#     query = f" UNWIND $nodes AS node MATCH (u:{meta_name}) WHERE u.name = node.name SET u.count = node.count "
#     tx.run(query, nodes=nodes)

In [58]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:

    driver.verify_connectivity()

    # One session is reused for every metadata kind.
    with driver.session() as session:
        for key, node_df in meta_df_map.items():
            # One property map per row. to_dict('records') emits a dict for
            # every row; going through a transposed frame keyed rows by index
            # label, so rows sharing a label collapsed into a single entry.
            nodes_to_insert = node_df.to_dict('records')

            session.execute_write(create_nodes_in_batch, key, nodes_to_insert)
            

### 3.2 更新漫画节点值

In [59]:
def create_manga_nodes_in_batch(tx, nodes):
    """Run the APOC bulk-create call that makes one ``manga`` node per map in ``nodes``.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        nodes: List of property maps, one per node to create.
    """
    cypher = '''
        CALL apoc.create.nodes(["manga"],$nodes)
    '''
    tx.run(cypher, nodes=nodes)

In [62]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:

    driver.verify_connectivity()

    # One property map per row. to_dict('records') avoids transposing the whole
    # frame and keeps every row even when index labels repeat.
    nodes_to_insert = manga_df.to_dict('records')
    batch_size = 1024

    with driver.session() as session:
        # Insert in slices of batch_size rows, one write transaction per slice.
        for i in range(0, len(nodes_to_insert), batch_size):
            batch_to_insert = nodes_to_insert[i:i + batch_size]
            session.execute_write(create_manga_nodes_in_batch, batch_to_insert)
        

### 3.3 构建索引

将对应id构建为索引

In [64]:
def create_range_index(tx,meta_name):
    """Create an index on the ``<meta_name>_id`` property of ``meta_name`` nodes.

    Args:
        tx: Transaction-like object exposing ``run(query)``.
        meta_name: Node label; also the prefix of the indexed ``*_id`` property.
    """
    # IF NOT EXISTS lets the cell be re-run after the index has been created.
    # The label and property name are formatted into the statement text here.
    query = f'''
    CREATE INDEX IF NOT EXISTS FOR (n:{meta_name}) ON (n.{meta_name}_id)
    '''
    tx.run(query)

In [63]:
# Labels that get an index on their '<label>_id' property.
indeces_name = [
    'tag',
    'parody',
    'character',
    'manga',
    'artist',
    'group',
]

In [65]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

    # One write transaction per label listed in indeces_name.
    with driver.session() as session:
        for label in indeces_name:
            session.execute_write(create_range_index, label)
        

### 3.4 构建关系

关系总数远大于节点数时首先建立关系，之后更新节点的properties

```python
relationships_to_insert = [
    {"start_node": "A", "end_node": "B", "relationship_type": "RELATIONSHIP_1", "properties": {"prop1": "value1"}},
    {"start_node": "A", "end_node": "B", "relationship_type": "RELATIONSHIP_2", "properties": {"prop2": "value2"}},
    # 更多关系数据...
]
```

In [100]:
def create_relation_batch(tx,relationships):
    """MERGE one relationship per entry of ``relationships`` between existing nodes.

    The relationship type is read from the first entry and applied to the whole
    batch; the target label is that type with its ``has_`` prefix removed.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        relationships: List of dicts with the keys ``start_node`` (matched
            against ``manga.manga_id``), ``end_node`` (matched against
            ``<label>.<label>_id``) and ``relationship_type``.
    """
    # Nothing to insert; indexing relationships[0] below would raise IndexError.
    if not relationships:
        return
    rel_type = relationships[0]['relationship_type']
    schema = rel_type.removeprefix('has_')
    # NOTE(review): the row returned by the procedure is not inspected here —
    # confirm whether failed batches need to be surfaced to the caller.
    query = f'''CALL apoc.periodic.iterate(
        "UNWIND $relationships AS rel
        match (a:manga {{manga_id:rel.start_node}})
        match (b:{schema} {{{schema}_id:rel.end_node}})
        return a as a,b as b
        ",
        "MERGE (a)-[:{rel_type}]->(b)",
        {{batchSize: 20480, iterateList:true, parallel:false,
        params: {{relationships: $relationships}}}}
    )
    '''
    tx.run(query, relationships=relationships)

In [101]:
batch_size = 204800
# Build the relationships of one metadata kind, insert them, then drop the list
# before moving on to the next kind.
for rel_type in relation_property:
    name_to_id = rel_id_map[rel_type]
    relationship_type = f'has_{rel_type}'
    # Zip the id column with the column of names instead of using iterrows():
    # no Series and no lambda have to be created for every row.
    # assumes every cell of manga_df[rel_type] is an iterable of names — TODO confirm
    relation_lst = [
        {'start_node': manga_id,
         'end_node': name_to_id[name],
         'relationship_type': relationship_type}
        for manga_id, names in zip(manga_df['manga_id'], manga_df[rel_type])
        for name in names
    ]

    print(f'{rel_type} data build successful. Total volume:{len(relation_lst)}')
    # insert relationship
    with GraphDatabase.driver(URI, auth=AUTH,max_connection_lifetime=600) as driver:

        driver.verify_connectivity()

        with driver.session() as session:
            # One write transaction per slice of batch_size relationships.
            for i in range(0,len(relation_lst),batch_size):
                batch_to_relation = relation_lst[i:i+batch_size]
                print(f'Round:{i//batch_size}, Building relationship on batchsize:{batch_size}')
                session.execute_write(create_relation_batch, batch_to_relation)

    print(f'{rel_type} insert successful. Total volume:{len(relation_lst)}')
    del relation_lst

tag data build successful. Total volume:3876841
Round:0, Building relationship on batchsize:204800
Round:1, Building relationship on batchsize:204800
Round:2, Building relationship on batchsize:204800
Round:3, Building relationship on batchsize:204800
Round:4, Building relationship on batchsize:204800
Round:5, Building relationship on batchsize:204800
Round:6, Building relationship on batchsize:204800
Round:7, Building relationship on batchsize:204800
Round:8, Building relationship on batchsize:204800
Round:9, Building relationship on batchsize:204800
Round:10, Building relationship on batchsize:204800
Round:11, Building relationship on batchsize:204800
Round:12, Building relationship on batchsize:204800
Round:13, Building relationship on batchsize:204800
Round:14, Building relationship on batchsize:204800
Round:15, Building relationship on batchsize:204800
Round:16, Building relationship on batchsize:204800
Round:17, Building relationship on batchsize:204800
Round:18, Building relatio

### 3.2 插入漫画节点

### 3.3 插入节点关系