In [1]:
import json
import os
import re
import pandas as pd
import numpy as np

## 1. 读取本地元数据文件

In [3]:
json_fpath = {
    'tag':'../data/meta/tags-id.json',
    'character':'../data/meta/characters-id.json',
    'group':'../data/meta/groups-id.json',
    'parody':'../data/meta/parodies-id.json',
    'artist':'../data/meta/artists-id.json'
}

In [4]:
meta_json_map = {
    k:json.load(open(json_fpath[k],'r')) for k in json_fpath
}

### 1.1 存储为dataframe

In [5]:
meta_df_map = {k:pd.DataFrame(data=meta_json_map[k]) for k in meta_json_map}

In [6]:
num_to_int = lambda x:int(x) if not 'K' in x else int(x.removesuffix('K'))*1000

In [7]:
for k in meta_df_map:
    meta_df_map[k]['count'] = meta_df_map[k]['count'].apply(num_to_int)

In [8]:
for k in meta_df_map:
    meta_df_map[k].to_pickle(f'../data/dumps/{k}_241201.pkl')

## 2. 读取本地漫画节点数据

In [8]:
all_manga_data = []
empty_fpath = []
for fpath in os.listdir('../data/manga/'):
    fstr = open(f'../data/manga/{fpath}','r',encoding='utf-8').read()
    if not fstr=='':
        all_manga_data += json.loads(fstr)
    else:
        empty_fpath.append(fpath)

In [9]:
len(empty_fpath)

3042

In [10]:
with open('../data/empty_file.txt','w',encoding='utf-8') as file:
    file.write('\n'.join(empty_fpath))

In [11]:
manga_df = pd.DataFrame(data=all_manga_data)

### 2.1 存储文件

In [12]:
manga_df.to_pickle('../data/dumps/manga_241130.pkl')

## 3. 建立neo4j链接

In [13]:
import neo4j
from neo4j import GraphDatabase

URI = "bolt://localhost:7687"

AUTH = ("neo4j", "11111111")

### 3.1 插入元数据节点

In [14]:
# 批量创建节点
def create_nodes_in_batch(tx, meta_name , nodes):
    query = f"UNWIND $nodes AS node CREATE (t:{meta_name} {{name: node.name, count: node.count, href: node.link}})"
    tx.run(query, nodes=nodes)

# def update_nodes_in_batch(tx, meta_name , nodes):
#     query = f" UNWIND $nodes AS node MATCH (u:{meta_name}) WHERE u.name = node.name SET u.count = node.count "
#     tx.run(query, nodes=nodes)

In [17]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    
    driver.verify_connectivity()
    
    # for key in meta_df_map:
    #     node_df = meta_df_map[key]

    #     # 示例：批量插入数据
    #     nodes_to_insert = list(node_df.T.to_dict().values())
    
    #     with driver.session() as session:
            
    #         session.execute_write(create_nodes_in_batch, key, nodes_to_insert)
    #         # session.execute_write(update_nodes_in_batch, key , nodes_to_insert)
            

ServiceUnavailable: Couldn't connect to localhost:7687 (resolved to ()):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [Errno 61] Connection refused)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [Errno 61] Connection refused)

### 3.2 插入漫画节点

In [52]:
manga_df.columns

Index(['id', 'title', 'scanlator', 'url', 'api', 'media_id', 'epos',
       'num_favorites', 'tag', 'group', 'parody', 'character', 'language',
       'artist', 'category', 'cover', 'thumbnail', 'image_urls', 'num_pages'],
      dtype='object')

In [149]:
node_property = ['id','title','url','api','media_id','epos','num_favorites','category',
                 'language','cover','thumbnail','image_urls','num_pages']

In [150]:
relation_property = ['tag', 'group', 'parody', 'character','artist']

In [151]:
def create_manga_nodes_in_batch(tx , nodes):
    in_query = '{' + ','.join([f'{key}:node.{key}' for key in node_property]) + '}'
    query = f"UNWIND $nodes AS node CREATE (t:manga {in_query})"
    
    tx.run(query, nodes=nodes)

In [152]:
# 示例：批量插入数据
nodes_to_insert = list(manga_df[node_property].T.to_dict().values())

In [154]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    
    driver.verify_connectivity()

    with driver.session() as session:

        batch_size = 256
        for i in range(0,len(nodes_to_insert),batch_size):
            nodes_batch = nodes_to_insert[i:i+batch_size]
        
            session.execute_write(create_manga_nodes_in_batch , nodes_batch)
del nodes_to_insert

ServiceUnavailable: Couldn't connect to localhost:7687 (resolved to ()):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [Errno 61] Connection refused)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [Errno 61] Connection refused)

### 3.3 插入节点关系

In [146]:
def create_relation(tx,mid,tag_key,relation_lst):
    for tag_name in relation_lst:
        query = f'MATCH (a:manga where a.id="{mid}"), (b:{tag_key} where b.name="{tag_name}") CREATE (a)-[r:has_{tag_key}]->(b)'
        tx.run(query)

In [147]:
# def create_relation_batch(tx,mid,tag_key,relation_lst):
#     query = f'UNWIND $relation_lst as row \
#     MATCH (s:manga where s.id="{mid}"), (e:{tag_key} where e.name=row.name) CREATE (s)-[r:has_{tag_key}]->(e)'
#     tx.run(query, relation_lst)

In [148]:
for idx,(_,tpl) in enumerate(manga_df.iterrows()):
    if idx%1000==0:
        print(f'Building relation on {relkey} on {idx}')
    manga_id = tpl['id']
    
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        
        driver.verify_connectivity()
    
        with driver.session() as session:
    
            for relkey in relation_property:
                relation_lst = tpl[relkey]
                
                message = session.execute_write(create_relation, manga_id, relkey, relation_lst)
                print(message)
    break

Building relation on tag on 0
MATCH (a:manga where a.id="228132"), (b:tag where b.name="handjob") CREATE (a)-[r:has_tag]->(b)
MATCH (a:manga where a.id="228132"), (b:tag where b.name="big breasts") CREATE (a)-[r:has_tag]->(b)
MATCH (a:manga where a.id="228132"), (b:tag where b.name="full color") CREATE (a)-[r:has_tag]->(b)
MATCH (a:manga where a.id="228132"), (b:tag where b.name="paizuri") CREATE (a)-[r:has_tag]->(b)
MATCH (a:manga where a.id="228132"), (b:tag where b.name="inverted nipples") CREATE (a)-[r:has_tag]->(b)
MATCH (a:manga where a.id="228132"), (b:tag where b.name="sole female") CREATE (a)-[r:has_tag]->(b)
MATCH (a:manga where a.id="228132"), (b:tag where b.name="sole male") CREATE (a)-[r:has_tag]->(b)
None
MATCH (a:manga where a.id="228132"), (b:group where b.name="shimoyakedou") CREATE (a)-[r:has_group]->(b)
None
MATCH (a:manga where a.id="228132"), (b:parody where b.name="touhou project") CREATE (a)-[r:has_parody]->(b)
None
MATCH (a:manga where a.id="228132"), (b:charact

In [134]:
relation_lst

['ouma tokiichi']