In [27]:
from py2neo import Graph, Node, Relationship, Subgraph,NodeMatcher,RelationshipMatcher

In [40]:
g = Graph('http://localhost:7474',auth=("neo4j", "123456"))
#在任何一个cell运行时必须得有该语句
tx = g.begin() 

jiazhen = Node("Person", name="陈家珍", age=66)
fugui = Node("Person", name='徐福贵', age=67)
youqian = Node("Person", name="徐有钱")
renxing = Node("Person", name="徐任性")
cat = Node("Person", name='cat')
dog = Node("Person", name='dog')

wife = Relationship(fugui, "WIFE", jiazhen)
brother_1 = Relationship(fugui, "BROTHER", youqian)
brother_2 = Relationship(fugui, "BROTHER", renxing)
hus = Relationship(jiazhen, 'HUS', fugui)
know = Relationship(cat, 'KNOWS', dog)

#先构建Subgraph实例，再创建图
relation_list = Subgraph(relationships=[wife, brother_2, brother_1, hus,know])

tx.create(relation_list)
#类似spark必须要有action才会进行操作，neo4j也需要用commit来执行操作
g.commit(tx)

## query

所有节点

In [31]:
nodes = g.nodes.match()

for node in nodes:
    print(node)
print(node.items())
print(node.labels)

(_0:Person {name: '\u2b50Alice'})
(_1:Person {name: '\u2b50Bob'})
(_2:Person {age: 67, name: '\u5f90\u798f\u8d35'})
(_3:Person {age: 66, name: '\u2b50\u9648\u5bb6\u73cd'})
(_4:Person {name: '\u2b50dog'})
(_5:Person {name: '\u2b50\u5f90\u4efb\u6027'})
(_6:Person {name: '\u5f90\u6709\u94b1'})
(_7:Person {name: '\u2b50cat'})
(_8:Person {name: '\u5f90\u4efb\u6027'})
(_9:Person {age: 66, name: '\u9648\u5bb6\u73cd'})
(_10:Person {name: 'cat'})
(_11:Person {name: 'dog'})
dict_items([('name', 'dog')])
:Person


限定条件

In [32]:
# 用来查找节点的对象
matcher = NodeMatcher(g)

# first 返回第一个符合条件的节点
node1 = matcher.match('Person', name='徐有钱').first()
print(node1)
print(node1['name'])

# all 返回所有符合条件的节点
nodes = matcher.match('Person').all()

for node in nodes:
    print(node['name'])
    print(node['age'])

nodes = matcher.match('Person', age=66).all()
print('*' * 25 + '年龄66的节点' + '*' * 25)
for node in nodes:
    print(node['name'])
    print(node['age'])
# 模糊匹配 要用Cypher
nodes = matcher.match("Person").where("_.name =~ '徐.*'").all()

print('*' * 25 + '姓徐的节点' + '*' * 25)
for node in nodes:
    print(node['name'])
    print(node['age'])

(_6:Person {name: '\u5f90\u6709\u94b1'})
徐有钱
⭐Alice
None
⭐Bob
None
徐福贵
67
⭐陈家珍
66
⭐dog
None
⭐徐任性
None
徐有钱
None
⭐cat
None
徐任性
None
陈家珍
66
cat
None
dog
None
*************************年龄66的节点*************************
⭐陈家珍
66
陈家珍
66
*************************姓徐的节点*************************
徐福贵
67
徐有钱
None
徐任性
None


## 更新 Update
更新先要找出Nodes，再使用事务的push更新

In [33]:
tx = g.begin()
# 找到你要找的Nodes
matcher = NodeMatcher(g)

# 修改单个节点
# init_node = matcher.match("Person", name="福贵")
# new_node = init_node.first()
# new_node['name'] = "徐福贵"
# sub = Subgraph(nodes=[new_node])
# tx.push(sub)
# tx.commit()

# 修改多个节点
init_node = matcher.match("Person")
new_nodes = []
for node in init_node.all():
    node['name'] = '⭐'+node['name']
    new_nodes.append(node)

sub = Subgraph(nodes=new_nodes)
tx.push(sub)
g.commit(tx)

  tx.commit()


两个节点新加关系

In [34]:
matcher = NodeMatcher(g)

fugui = matcher.match('Person', name='⭐徐福贵').first()
youqian = matcher.match('Person', name='⭐徐有钱').first()

relation = Relationship(fugui, 'Brother', youqian)

g.create(relation)

## 删除
删除关系链
delete方法

注意！！这一方法会同时把节点给删除！！！

In [35]:
matcher = NodeMatcher(g)
r_matcher = RelationshipMatcher(g)
fugui = matcher.match('Person', name='⭐徐福贵').first()
youqian = matcher.match('Person', name='⭐徐有钱').first()

relation = r_matcher.match(nodes=[fugui, youqian]).first()
print(relation)
g.delete(relation)

(⭐徐福贵)-[:Brother {}]->(⭐徐有钱)


只删除关系

sepatate 方法 \
很明显这才符合常理

In [36]:
matcher = NodeMatcher(g)
r_matcher = RelationshipMatcher(g)
cat = matcher.match('Person', name='⭐cat').first()
dog = matcher.match('Person', name='⭐dog').first()

relation = r_matcher.match(nodes=[cat, dog]).first()
print(relation)
g.separate(relation)

(⭐cat)-[:KNOWS {}]->(⭐dog)


## 批处理
对于大量的插入一般是很费时的，首先我们可以使用事务，加快一定速度，\
而插入的方法一样重要，我们很多时候是遍历一个文件然后生成图，例子中我们生成每个Node后,  \
先把他们放入一个List中，再变为Subgraph实例,然后再create(),耗时比一条条插入至少快10倍以上

创建多个节点

In [38]:
tx = g.begin()
node_list = [Node("Num", name=str(i)) for i in range(4)]

node_list = Subgraph(nodes=node_list)

tx.create(node_list)
tx.commit()

  tx.commit()


删除所有的关系

In [41]:
matcher = RelationshipMatcher(g)
tx = g.begin()
relationship_list = matcher.match().all()

node_list = Subgraph(relationships=relationship_list)

tx.separate(node_list)
g.commit(tx)

In [42]:
import pandas as pd
import os
import numpy as np

file_name = 'ownthink_v2.csv'
data = pd.read_csv(file_name, chunksize=1000)  # chunksize用于指定每次提取的行数

In [43]:
def save_signal_chunk(index, data, base_path):
    if not os.path.exists(base_path):
        os.mkdir(base_path)

    line_count = data.shape[0]
    entity_df = pd.DataFrame()
    relationship_df = pd.DataFrame()

    for _, line in data.iterrows():
        # 不要用data.loc[0] 会报错
        entity0 = line.实体
        label = line.属性
        entity1 = line.值

        entity_df = entity_df.append({
            ":ID": hash(entity0),
            "name": entity0,
            ":LABEL": 'ENTITY'
        }, ignore_index=True)

        entity_df = entity_df.append({
            ":ID": hash(entity1),
            "name": entity1,
            ":LABEL": 'ENTITY'
        }, ignore_index=True)

        relationship_df = relationship_df.append({
            ':START_ID': hash(entity0),
            'name': label,
            ':END_ID': hash(entity1),
            ':TYPE': 'RELATIONSHIP'
        }, ignore_index=True)

    entity_df.to_csv('{}/entity{}.csv'.format(base_path, index), index=None)
    relationship_df.to_csv('{}/relationship{}.csv'.format(base_path, index), index=None)

In [46]:
for index, d in enumerate(data):
    save_signal_chunk(index, d, 'out')
    if index == 4:
        break

In [47]:
import pandas as pd
import os

from joblib import Parallel, delayed


def save_signal_chunk(index, data, base_path):
    if not os.path.exists(base_path):
        os.mkdir(base_path)

    entity_df = pd.DataFrame()
    relationship_df = pd.DataFrame()

    for _, line in data.iterrows():
        # 不要用data.loc[0] 会报错
        entity0 = line.实体
        label = line.属性
        entity1 = line.值

        entity_df = entity_df.append({
            ":ID": hash(entity0),
            "name": entity0,
            ":LABEL": 'ENTITY'
        }, ignore_index=True)

        entity_df = entity_df.append({
            ":ID": hash(entity1),
            "name": entity1,
            ":LABEL": 'ENTITY'
        }, ignore_index=True)

        relationship_df = relationship_df.append({
            ':START_ID': hash(entity0),
            'name': label,
            ':END_ID': hash(entity1),
            ':TYPE': 'RELATIONSHIP'
        }, ignore_index=True)

    # 实体要去重
    entity_df = entity_df.drop_duplicates(subset=':ID')
    entity_df = entity_df[[':ID', 'name', ':LABEL']]

    entity_df.to_csv('{}/entity{}.csv'.format(base_path, index), index=None)
    relationship_df.to_csv('{}/relationship{}.csv'.format(base_path, index), index=None)

    if index <= 4:
        print('index:', index, 'time:', round(time.time() - star, 2))




In [None]:
'''
并行前
for index, d in enumerate(data):
    save_signal_chunk(index, d, 'out')
'''
import time

star = time.time()

Parallel(n_jobs=5)(delayed(save_signal_chunk)(index, d, 'out') for index, d in enumerate(data))

In [50]:
import pandas as pd
tmp = [pd.read_csv('out/relationship'+str(i)+'.csv') for i in range(10)]
tmp = pd.concat(tmp)
tmp[':TYPE'] = tmp['name']
tmp.to_csv('out/relationship.csv', index=False)

In [52]:
from tqdm import tqdm
tmp = pd.read_csv('out/relationship.csv')
max_len = 65535-1
counts = {}
for i in tqdm(tmp[':TYPE']):
    if i in counts:
        counts[i] += 1
    else:
        counts[i] = 1
counts = sorted(counts.items(), key=lambda counts:counts[1], reverse=True)
no_name = [i[0] for i in counts[max_len:]]
new = tmp[tmp[':TYPE'].isin(no_name)]
new[':TYPE'] = 'RELATIONSHIP'
tmp[tmp[':TYPE'].isin(no_name)] = new
tmp.to_csv('out/relationship.csv', index=False)
tmp,len(set(tmp[':TYPE']))

100%|███████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 2000335.75it/s]


(           :END_ID     :START_ID :TYPE  name
 0     2.145722e+18  1.450828e+18   中文名   中文名
 1     3.544093e+18  1.450828e+18    别称    别称
 2    -8.656026e+18  1.450828e+18  形成时期  形成时期
 3    -5.112965e+18  1.450828e+18  流行时期  流行时期
 4    -8.111141e+18  1.450828e+18  代表诗人  代表诗人
 ...            ...           ...   ...   ...
 9995  5.196552e+18  7.076521e+18     目     目
 9996  4.312501e+18  7.076521e+18    亚目    亚目
 9997  2.799001e+18  7.076521e+18     科     科
 9998  4.255932e+18  7.076521e+18    亚科    亚科
 9999 -3.854866e+17  7.076521e+18     属     属
 
 [10000 rows x 4 columns],
 1528)

## 用py2neo进行CSV文件导入

In [2]:
from py2neo import Graph,Node,Relationship

In [30]:
graph = Graph('http://localhost:7474',auth=("neo4j", "123456"))

删除所有已知节点

In [46]:
graph.run('MATCH (r) DETACH DELETE r')

导入节点数据

注意要提前把节点和关系文件放入该数据库的import文件夹内

In [42]:
graph.run('using periodic commit 10000 load csv with headers from "file:/node_gzh.csv" \
as line with line create (:gzh {item:line.item,  trans_amount_sum:line.trans_amount_sum,\
trans_cnt:line.trans_cnt,  type:line.type});')

In [43]:
graph.run('CREATE INDEX ON :gzh(item)')

ClientError: [Schema.EquivalentSchemaRuleAlreadyExists] An equivalent index already exists, 'Index( id=1, name='index_70d15b32', type='GENERAL BTREE', schema=(:gzh {item}), indexProvider='native-btree-1.0' )'.

In [44]:
graph.run('using periodic commit 10000 load csv with headers from "file:/node_rela_gzh.csv" as line \
match (from:gzh {item: line.item_l}),(to:gzh {item:line.item_r}) merge (from)-[c:gzh{relation:line.relation}]->(to)')

以函数的形式进行批量输入

In [None]:
#读取节点数据
def LoacCsvNodes(num,files_csv,list_label):
    '''
    num: 每次导入的行数，用来防止内存溢出，str类型
    files_csv: csv文件名列表
    list_label: 节点标签列表，和file_csv同长度
    '''
    for i  in range(len(files_csv)):
        # graph.run ()中为cypher 语句
        graph.run('USING PERIODIC COMMIT '+num+' \
                    LOAD CSV  WITH HEADERS FROM "file:///'+files_csv[i]+'" AS line \
                    merge (n:'+list_label[i][0]+ ':'+list_label[i][1]+ \
                  '{ID:line.ID,name:line.name,card:line.card,IP:line.IP})') 
LoacCsvNodes("3",files_node_csv,list_label)

In [None]:
# 读取关系数据
files_rel_csv = list(filter(lambda x: x[-8:]=='_rel.csv' , dirs))
print("the csv files under path1:\n",files_rel_csv)

# 导入关系数据函数
def LoadCsvRel(num,files_csv,list_rel_label,node_label1="",node_label2=""):
    '''
    num: 每次导入的行数，防止内存溢出，str类型
    files_csv:文件名列表
    list_label: 节点标签列表，和file_csv同长度
    '''
    for i  in range(len(files_csv)):
        graph.run('USING PERIODIC COMMIT '+num+' \
                    LOAD CSV  FROM "file:///'+files_csv[i]+'" AS line \
                    MATCH (a'+node_label1+'{ID:line[0]}),(b'+node_label2+'{ID:line[2]}) \
                    merge (a)-[r:'+list_rel_label[i]+'{prop:line[1]}]-> (b) '
                 )

list_rel_label=["contacter","IP_equal","phone_equal"]# 和file_rel_csv 对应
LoadCsvRel("3",files_rel_csv,list_rel_label)