In [1]:
import pandas as pd
import numpy as np
from clickhouse_driver import Client
import networkx as nx
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# # 读取数据
# npm_packages_df = pd.read_csv('/OpenSource/npm_data/2024-05-13-16-54-33_EXPORT_CSV_13529018_459_0.csv')
# npm_dependencies_df = pd.read_csv('/OpenSource/npm_data/2024-05-13-17-23-22_EXPORT_CSV_13529969_552_0.csv')
# print(npm_packages_df.shape)
# print(npm_dependencies_df.shape)

In [2]:
# 从clickhouse读取数据
client = Client(host='cc-uf6764sn662413tc9.public.clickhouse.ads.aliyuncs.com',
                user='liangchen',
                password='Liangchen123',
                database='supply_chain')
tables = client.execute('SHOW TABLES')

npm_record_query = 'SELECT * FROM npm_records limit 1000000'
npm_record_result = client.execute(npm_record_query)
npm_dependencies_query = 'SELECT * FROM npm_dependencies limit 1000000'
npm_dependencies_result = client.execute(npm_dependencies_query)

npm_packages_df = pd.DataFrame(npm_record_result, columns=['package_id', 'name', 'version', 'description', 'repository_type', 'repository_url', 'license', 'homepage', 'time'])
npm_dependencies_df = pd.DataFrame(npm_dependencies_result, columns=['package_id', 'dependency_name', 'dependency_verison', 'type'])

In [3]:
# # 构建一个包id到名称的映射字典
# package_id_to_name = pd.Series(npm_packages_df.name.values, index=npm_packages_df.package_id).to_dict()

# # 创建一个有向图
# G = nx.DiGraph()

# for package_id, package_name in package_id_to_name.items():
#     G.add_node(package_name)

# # 添加边（依赖关系）
# for index, row in npm_dependencies_df.iterrows():
#     package_id = row['package_id']
#     dependency_name = row['dependency_name']

#     if package_id in package_id_to_name.keys():
#         package_name = package_id_to_name[package_id]
#         if dependency_name in package_id_to_name.values():
#             G.add_edge(package_name, dependency_name)

# print("Add edges over.")

# num_edges = G.number_of_edges()
# print("Number of edges:", num_edges)

# isolated_nodes = list(nx.isolates(G))
# G.remove_nodes_from(isolated_nodes)
# num_nodes = G.number_of_nodes()
# print("Number of nodes:", num_nodes)

Add edges over.
Number of edges: 6180
Number of nodes: 2709


In [None]:
############### 并行化 + 持久化 
from concurrent.futures import ProcessPoolExecutor
import json
  
# 假设npm_packages_df和npm_dependencies_df已经加载和定义  
  
# 构建包id到名称的映射  
package_id_to_name = pd.Series(npm_packages_df.name.values, index=npm_packages_df.package_id).to_dict()  
  
# 创建一个有向图  
G = nx.DiGraph()  
  
# 添加所有节点  
for package_id, package_name in package_id_to_name.items():  
    G.add_node(package_name)  
  
# 定义一个函数来并行处理依赖关系  
def add_dependencies(row):  
    package_id = row['package_id']  
    dependency_name = row['dependency_name']  
    if package_id in package_id_to_name and dependency_name in package_id_to_name.values():  
        # 注意：这里我们不直接修改G，而是返回需要添加的边  
        package_name = package_id_to_name[package_id]  
        # 假设dependency_name直接对应一个具体的package_name，这里需要转换  
        # 这可能需要额外的逻辑，因为dependency_name可能不在package_id_to_name的keys中  
        dependency_package_name = next((name for name, id_ in package_id_to_name.items() if name == dependency_name), None)  
        if dependency_package_name:  
            return (package_name, dependency_package_name)  
    return None  
  
# 使用ProcessPoolExecutor并行处理  
edges_to_add = []  
with ProcessPoolExecutor() as executor:  
    future_to_row = {executor.submit(add_dependencies, row): row for index, row in npm_dependencies_df.iterrows()}  
    for future in concurrent.futures.as_completed(future_to_row):  
        edge = future.result()  
        if edge:  
            edges_to_add.append(edge)  
  
# 在主进程中添加边  
for src, dest in edges_to_add:  
    G.add_edge(src, dest)  
  
print("Add edges over.")  

## 持久化...
json_data = nx.readwrite.json_graph.node_link_data(G)
with open('./npm_graph.json', 'w') as file:
    json.dump(json_data, file)

num_edges = G.number_of_edges()  
print("Number of edges:", num_edges)  
  
isolated_nodes = list(nx.isolates(G))  
G.remove_nodes_from(isolated_nodes)  
num_nodes = G.number_of_nodes()  
print("Number of nodes:", num_nodes)

In [10]:
# 可视化依赖网络
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G, k=0.1)
nx.draw(G, pos, with_labels=False, node_size=5, font_size=8, node_color='lightblue', edge_color='gray', arrows=True, alpha=0.6)
plt.title('npm Dependency Network')
plt.savefig('./network.svg')
plt.savefig('./network.png')
plt.close()

In [6]:
# 分析网络
degree_centrality = nx.degree_centrality(G)
clustering_coefficient = nx.clustering(G.to_undirected())
average_path_length = nx.average_shortest_path_length(G) if nx.is_weakly_connected(G) else None
diameter = nx.diameter(G) if nx.is_weakly_connected(G) else None

In [7]:
print("Degree Centrality: ", degree_centrality)
print("Clustering Coefficient: ", clustering_coefficient)
print("Average Path Length: ", average_path_length)
print("Diameter: ", diameter)

Degree Centrality:  {'0x-hunter-constracts': 0.00036927621861152144, '0x-hunter-core': 0.00036927621861152144, '0x.js': 0.005908419497784343, '0x81-utils': 0.00036927621861152144, '0x81_string': 0.00036927621861152144, '1k-tasks': 0.004062038404726736, '3box': 0.0025849335302806503, '3d-force-graph': 0.0007385524372230429, '3d-force-graph-ar': 0.00036927621861152144, '3d-force-graph-vr': 0.00036927621861152144, '3dio': 0.0014771048744460858, '3h-ast': 0.0011078286558345643, '3h-test': 0.00036927621861152144, '3oilerpl4te': 0.0033234859675036928, '42-cent-base': 0.00036927621861152144, '42-cent-model': 0.00036927621861152144, '42-cent-stripe': 0.0011078286558345643, '42-cent-util': 0.00036927621861152144, '4all-ui': 0.00036927621861152144, 'gulp-ice-builder': 0.00036927621861152144, 'gulp-iconfont': 0.0029542097488921715, 'gulp-if': 0.0018463810930576072, 'gulp-image-resize': 0.00036927621861152144, 'gulp-imagemin': 0.0018463810930576072, 'gulp-include-sources': 0.00036927621861152144, 

In [12]:
##### 为G做社区发现
import community

# 使用 Louvain 算法进行社区发现
partition = community.best_partition(G)

# 对这个社区发现进行可视化
pos = nx.spring_layout(G)# 创建布局

cmap = plt.cm.get_cmap('viridis', max(partition.values()) + 1)# 绘制节点，根据社区着色
nx.draw_networkx_nodes(G, pos, node_color=list(partition.values()), cmap=cmap, node_size=50, alpha=0.8)

nx.draw_networkx_edges(G, pos, alpha=0.5)# 绘制边
plt.show()

TypeError: Bad graph type, use only non directed graph