In [None]:
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

# Database connection settings.
database_username = ''
database_password = ''
database_ip = '127.0.0.1'
database_name = 'myemail'

# Percent-encode the credentials before embedding them in the URL so that
# characters such as '@', ':' or '/' in a username or password cannot be
# mistaken for URL delimiters. Empty credentials encode to empty strings.
_encoded_username = quote(database_username, safe='')
_encoded_password = quote(database_password, safe='')
database_connection = (
    f'mysql+pymysql://{_encoded_username}:{_encoded_password}'
    f'@{database_ip}/{database_name}'
)

# Create the database engine.
engine = create_engine(database_connection)

try:
    # Load the whole 'emails2' table into a DataFrame and report its shape.
    df = pd.read_sql_table('emails2', con=engine)
    print(df.shape)
finally:
    # Dispose of the engine even when the read fails, so the cleanup is not
    # skipped on error.
    engine.dispose()

In [None]:
# The sample holds 1/100 of the rows of the original DataFrame
# (fractional part truncated).
total_rows = len(df)
sampled_size = int(total_rows / 100)

# Draw that many rows at random; random_state is fixed at 42.
df_sampled = df.sample(random_state=42, n=sampled_size)

# Show the size of the sampled DataFrame and its first few rows.
sampled_shape = df_sampled.shape
print("新DataFrame的行数:", sampled_shape[0])
print(sampled_shape)
print(df_sampled.head())

In [None]:
from neo4j import GraphDatabase


# Connection settings for the Neo4j database.
uri = "bolt://localhost:7687"
user, password = "neo4j", ""

# Build the driver object used by the cells below.
driver = GraphDatabase.driver(
    uri,
    auth=(user, password),
)

def add_email(tx, message_id, sender, recipient, subject, content, date):
    """Record one email as a SENDS relationship between two Person nodes.

    The sender and recipient are merged as ``Person`` nodes keyed by their
    ``email`` property, and a single ``SENDS`` relationship is merged between
    them.

    * When the relationship is created, it stores the message id, subject,
      content and date of this email and starts with ``weight = 1``.
    * When the relationship already exists, ``weight`` is incremented and the
      subject and content are appended to the stored text (subjects separated
      by ``'; '``, contents by a blank line). The stored ``message_id`` and
      ``date`` are left as they were.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        message_id: Identifier of the email.
        sender: Email address of the sender.
        recipient: Email address of the recipient.
        subject: Subject line; may be ``None``.
        content: Body text; may be ``None``.
        date: Date value stored on the relationship when it is created.

    Returns:
        None.
    """
    # In Cypher, concatenating with null yields null, and SET prop = null
    # removes the property. The incoming $subject / $content are therefore
    # wrapped in coalesce(..., '') in the ON MATCH branch so that one email
    # with a missing subject or body cannot wipe the accumulated text.
    query = """
    MERGE (p1:Person {email: $sender})
    MERGE (p2:Person {email: $recipient})
    MERGE (p1)-[r:SENDS]->(p2)
    ON CREATE SET r.message_id = $message_id, r.subject = $subject, r.content = $content, r.date = $date, r.weight = 1
    ON MATCH SET r.weight = r.weight + 1, r.subject = coalesce(r.subject, '') + '; ' + coalesce($subject, ''), r.content = coalesce(r.content, '') + '\\n\\n' + coalesce($content, '')
    """
    tx.run(
        query,
        message_id=message_id,
        sender=sender,
        recipient=recipient,
        subject=subject,
        content=content,
        date=date,
    )

# Write every sampled email into Neo4j, calling add_email once per row.
try:
    with driver.session() as session:
        for _, row in df_sampled.iterrows():
            # Values are looked up by the DataFrame's exact column names.
            session.execute_write(
                add_email,
                row['Message-ID'],
                row['From'],
                row['To'],
                row['Subject'],
                row['content'],
                row['Date'],
            )
finally:
    # Close the driver even if a write fails part-way through, so the
    # cleanup is not skipped on error.
    driver.close()