# 网络数据处理与分析完整流程

本 notebook 将 util 文件夹中的各个步骤串联在一起，实现从原始专利数据到网络分析的完整流程。

## 流程概述

1. **数据预处理**
   - 清洗专利数据
   - 去除个人申请

2. **网络构建**
   - 知识网络构建
   - 技术网络构建
   - 协作研发网络构建
   - 跨层耦合网络构建

3. **网络分析**
   - 网络层权重计算
   - 结构洞耦合分析
   - 关键性指数计算
   - 中心性指数计算
   - 综合数据库构建

## 环境设置和导入

In [1]:
import sys
import os
import warnings
import importlib.util
warnings.filterwarnings('ignore')

# 添加 util 目录到 Python 路径
sys.path.append('./util')

print("正在导入模块...")

正在导入模块...


In [4]:
# Import the entry-point functions for the basic steps
# (step 1 data preprocessing and step 2 single-layer network construction).
try:
    from util.step_1_clean_patent_data import clean_patent_data
    from util.step_1_remove_personal_application import remove_personal_applications
    from util.step_2_knowledge_network_construction import construct_knowledge_network
    from util.step_2_technology_network_construction import construct_technology_network
    from util.step_2_collaborative_RD_network_construction import construct_collaborative_network
    print("✓ 基础模块导入成功")
except ImportError as e:
    # The failure is only reported; the failing import and every import after
    # it in this try block leave their names unbound.
    print(f"✗ 基础模块导入失败: {e}")

✓ 基础模块导入成功


In [8]:
# 动态导入带连字符的模块
def import_module_with_dash(module_path, function_name):
    """Load a source file by path and return one attribute defined in it.

    Used for modules whose file names cannot be written in an ``import``
    statement. The file is executed under the module name "temp_module".

    Args:
        module_path: Path of the Python source file to execute.
        function_name: Name of the attribute to fetch from the loaded module.

    Returns:
        The requested attribute, or None when loading the file or looking up
        the attribute raises; the error is printed rather than propagated.
    """
    try:
        module_spec = importlib.util.spec_from_file_location("temp_module", module_path)
        loaded = importlib.util.module_from_spec(module_spec)
        module_spec.loader.exec_module(loaded)
        target = getattr(loaded, function_name)
    except Exception as exc:
        # Best-effort import: report the problem and signal failure with None.
        print(f"导入 {module_path} 失败: {exc}")
        target = None
    return target

# Load the three cross-layer network builders from their hyphenated source
# files; each name is bound to None when its import fails.
construct_knowledge_technology_network = import_module_with_dash(
    'util/step_2_knowledge-technology_network_construction.py',
    'construct_knowledge_technology_network',
)

construct_technology_collaborative_network = import_module_with_dash(
    'util/step_2_technology-collaborative_RD_network_construction.py',
    'construct_tech_collaborative_network',
)

construct_knowledge_collaborative_network = import_module_with_dash(
    'util/step_2_knowledge-collaborative_RD_network_construction.py',
    'construct_knowledge_collaborative_network',
)

# Report success only when every builder was loaded.
if not (construct_knowledge_technology_network
        and construct_technology_collaborative_network
        and construct_knowledge_collaborative_network):
    print("✗ 部分跨层网络模块导入失败")
else:
    print("✓ 跨层网络模块导入成功")

✓ 跨层网络模块导入成功


In [11]:
# Import the entry-point functions for the analysis steps (steps 3 to 6).
# The names bound here are the ones the later step cells have to call.
try:
    from util.step_3_network_layer_weights import calculate_network_weights
    from util.step_4_structural_hole_coupling_calculation import calculate_structural_hole
    from util.step_4_structural_hole_coupling_database_construction import build_structural_hole_database
    from util.step_4_criticality_index_calculation import calculate_criticality
    from util.step_5_centrality_coupling_calculation import calculate_centrality_coupling
    from util.step_5_centrality_coupling_database_construction import build_centrality_coupling_database
    from util.step_5_centrality_index_calculation import calculate_centrality_index
    from util.step_6_criticality_and_centrality_database_construction import build_criticality_centrality_database
    print("✓ 分析模块导入成功")
except ImportError as e:
    # NOTE(review): the first failing import skips every import after it in
    # this block, so all of those names remain unbound.
    print(f"✗ 分析模块导入失败: {e}")

# Printed unconditionally, including after an import failure above.
print("\n所有模块导入完成！")

✗ 分析模块导入失败: cannot import name 'construct_structural_hole_database' from 'util.step_4_structural_hole_coupling_database_construction' (/Users/zhangxudong/Gits/network/util/step_4_structural_hole_coupling_database_construction.py)

所有模块导入完成！


## 第一步：数据预处理

### 1.1 清洗专利数据

In [None]:
# Step 1.1: run the patent-data cleaning stage and keep its outcome.
print("=== 步骤 1.1: 清洗专利数据 ===")
try:
    result_1_1 = clean_patent_data()
    print(f"结果: {result_1_1}")
except Exception as exc:
    # Keep the failure text so the summary cell can still report this step.
    result_1_1 = f"执行失败: {exc}"
    print(result_1_1)
# Separator between step outputs.
print(f"\n{'=' * 50}\n")

### 1.2 去除个人申请

In [None]:
# Step 1.2: remove applications filed by individuals and keep the outcome.
print("=== 步骤 1.2: 去除个人申请 ===")
try:
    result_1_2 = remove_personal_applications()
    print(f"结果: {result_1_2}")
except Exception as exc:
    # Keep the failure text so the summary cell can still report this step.
    result_1_2 = f"执行失败: {exc}"
    print(result_1_2)
# Separator between step outputs.
print(f"\n{'=' * 50}\n")

## 第二步：网络构建

### 2.1 单层网络构建

In [None]:
# Step 2.1: build the knowledge network and keep the outcome.
print("=== 步骤 2.1: 知识网络构建 ===")
try:
    result_2_1 = construct_knowledge_network()
    print(f"结果: {result_2_1}")
except Exception as exc:
    # Keep the failure text so the summary cell can still report this step.
    result_2_1 = f"执行失败: {exc}"
    print(result_2_1)
# Separator between step outputs.
print(f"\n{'=' * 50}\n")

In [None]:
# Step 2.2: build the technology network and keep the outcome.
print("=== 步骤 2.2: 技术网络构建 ===")
try:
    result_2_2 = construct_technology_network()
    print(f"结果: {result_2_2}")
except Exception as exc:
    # Keep the failure text so the summary cell can still report this step.
    result_2_2 = f"执行失败: {exc}"
    print(result_2_2)
# Separator between step outputs.
print(f"\n{'=' * 50}\n")

In [None]:
# Step 2.3: build the collaborative R&D network and keep the outcome.
print("=== 步骤 2.3: 协作研发网络构建 ===")
try:
    result_2_3 = construct_collaborative_network()
    print(f"结果: {result_2_3}")
except Exception as exc:
    # Keep the failure text so the summary cell can still report this step.
    result_2_3 = f"执行失败: {exc}"
    print(result_2_3)
# Separator between step outputs.
print(f"\n{'=' * 50}\n")

### 2.2 跨层耦合网络构建

In [None]:
# Step 2.4: build the knowledge-technology coupling network.
print("=== 步骤 2.4: 知识-技术耦合网络构建 ===")
try:
    if not construct_knowledge_technology_network:
        # The dynamic import produced no callable, so there is nothing to run.
        result_2_4 = "函数导入失败"
        print(result_2_4)
    else:
        result_2_4 = construct_knowledge_technology_network()
        print(f"结果: {result_2_4}")
except Exception as exc:
    # Keep the failure text so the summary cell can still report this step.
    result_2_4 = f"执行失败: {exc}"
    print(result_2_4)
# Separator between step outputs.
print(f"\n{'=' * 50}\n")

In [None]:
# Step 2.5: build the technology-collaborative R&D coupling network.
print("=== 步骤 2.5: 技术-协作研发耦合网络构建 ===")
try:
    if not construct_technology_collaborative_network:
        # The dynamic import produced no callable, so there is nothing to run.
        result_2_5 = "函数导入失败"
        print(result_2_5)
    else:
        result_2_5 = construct_technology_collaborative_network()
        print(f"结果: {result_2_5}")
except Exception as exc:
    # Keep the failure text so the summary cell can still report this step.
    result_2_5 = f"执行失败: {exc}"
    print(result_2_5)
# Separator between step outputs.
print(f"\n{'=' * 50}\n")

In [None]:
# Step 2.6: build the knowledge-collaborative R&D coupling network.
print("=== 步骤 2.6: 知识-协作研发耦合网络构建 ===")
try:
    if not construct_knowledge_collaborative_network:
        # The dynamic import produced no callable, so there is nothing to run.
        result_2_6 = "函数导入失败"
        print(result_2_6)
    else:
        result_2_6 = construct_knowledge_collaborative_network()
        print(f"结果: {result_2_6}")
except Exception as exc:
    # Keep the failure text so the summary cell can still report this step.
    result_2_6 = f"执行失败: {exc}"
    print(result_2_6)
# Separator between step outputs.
print(f"\n{'=' * 50}\n")

## 第三步：网络层权重计算

In [None]:
# Step 3: compute the network layer weights and keep the outcome.
print("=== 步骤 3: 网络层权重计算 ===")
try:
    result_3 = calculate_network_weights()
    print(f"结果: {result_3}")
except Exception as exc:
    # Keep the failure text so the summary cell can still report this step.
    result_3 = f"执行失败: {exc}"
    print(result_3)
# Separator between step outputs.
print(f"\n{'=' * 50}\n")

## 第四步：结构洞耦合分析和关键性指数计算

### 4.1 结构洞耦合计算

In [None]:
# Step 4.1: compute the structural-hole coupling and keep the outcome.
print("=== 步骤 4.1: 结构洞耦合计算 ===")
try:
    result_4_1 = calculate_structural_hole()
    print(f"结果: {result_4_1}")
except Exception as exc:
    # Keep the failure text so the summary cell can still report this step.
    result_4_1 = f"执行失败: {exc}"
    print(result_4_1)
# Separator between step outputs.
print(f"\n{'=' * 50}\n")

### 4.2 结构洞耦合数据库构建

In [None]:
# Step 4.2: build the structural-hole coupling database.
print("=== 步骤 4.2: 结构洞耦合数据库构建 ===")
try:
    # Call the name the analysis-import cell actually binds
    # (build_structural_hole_database); construct_structural_hole_database is
    # never defined in this notebook and always raised NameError here.
    result_4_2 = build_structural_hole_database()
    print(f"结果: {result_4_2}")
except Exception as e:
    # Keep the failure text so the summary cell can still report this step.
    print(f"执行失败: {e}")
    result_4_2 = f"执行失败: {e}"
# Separator between step outputs.
print("\n" + "="*50 + "\n")

### 4.3 关键性指数计算

In [None]:
# Step 4.3: compute the criticality index and keep the outcome.
print("=== 步骤 4.3: 关键性指数计算 ===")
try:
    result_4_3 = calculate_criticality()
    print(f"结果: {result_4_3}")
except Exception as exc:
    # Keep the failure text so the summary cell can still report this step.
    result_4_3 = f"执行失败: {exc}"
    print(result_4_3)
# Separator between step outputs.
print(f"\n{'=' * 50}\n")

## 第五步：中心性耦合分析和中心性指数计算

### 5.1 中心性耦合计算

In [None]:
# Step 5.1: compute the centrality coupling and keep the outcome.
print("=== 步骤 5.1: 中心性耦合计算 ===")
try:
    result_5_1 = calculate_centrality_coupling()
    print(f"结果: {result_5_1}")
except Exception as exc:
    # Keep the failure text so the summary cell can still report this step.
    result_5_1 = f"执行失败: {exc}"
    print(result_5_1)
# Separator between step outputs.
print(f"\n{'=' * 50}\n")

### 5.2 中心性耦合数据库构建

In [None]:
# Step 5.2: build the centrality coupling database.
print("=== 步骤 5.2: 中心性耦合数据库构建 ===")
try:
    # Call the name the analysis-import cell actually binds
    # (build_centrality_coupling_database); construct_centrality_database is
    # never defined in this notebook and always raised NameError here.
    result_5_2 = build_centrality_coupling_database()
    print(f"结果: {result_5_2}")
except Exception as e:
    # Keep the failure text so the summary cell can still report this step.
    print(f"执行失败: {e}")
    result_5_2 = f"执行失败: {e}"
# Separator between step outputs.
print("\n" + "="*50 + "\n")

### 5.3 中心性指数计算

In [None]:
# Step 5.3: compute the centrality index and keep the outcome.
print("=== 步骤 5.3: 中心性指数计算 ===")
try:
    result_5_3 = calculate_centrality_index()
    print(f"结果: {result_5_3}")
except Exception as exc:
    # Keep the failure text so the summary cell can still report this step.
    result_5_3 = f"执行失败: {exc}"
    print(result_5_3)
# Separator between step outputs.
print(f"\n{'=' * 50}\n")

## 第六步：综合数据库构建

In [None]:
# Step 6: build the combined criticality and centrality database.
print("=== 步骤 6: 关键性和中心性综合数据库构建 ===")
try:
    # Call the name the analysis-import cell actually binds
    # (build_criticality_centrality_database); construct_final_database is
    # never defined in this notebook and always raised NameError here.
    result_6 = build_criticality_centrality_database()
    print(f"结果: {result_6}")
except Exception as e:
    # Keep the failure text so the summary cell can still report this step.
    print(f"执行失败: {e}")
    result_6 = f"执行失败: {e}"
# Separator between step outputs.
print("\n" + "="*50 + "\n")

## 流程总结和结果展示

### 执行结果汇总

In [None]:
def summarize_result(name, namespace=None):
    """Return the first line of a stored step result.

    Args:
        name: Variable name of the step result, e.g. "result_1_1".
        namespace: Mapping to look the name up in; defaults to this module's
            (the notebook's) global namespace.

    Returns:
        '未执行' when the name is not present; otherwise the first line of the
        value's string form. Non-string results (for example None) are
        converted with str() instead of failing on a missing ``.split``.
    """
    scope = globals() if namespace is None else namespace
    if name not in scope:
        return '未执行'
    # The split happens here rather than inside an f-string replacement field:
    # a backslash there is a syntax error on Python versions before 3.12.
    return str(scope[name]).split('\n')[0]


def print_pipeline_summary(namespace=None):
    """Print the one-line outcome of every pipeline step, grouped by stage.

    Args:
        namespace: Mapping holding the ``result_*`` variables; defaults to the
            global namespace (see summarize_result).
    """
    sections = [
        ("\n1. 数据预处理:", [
            ("1.1 数据清洗", "result_1_1"),
            ("1.2 去除个人申请", "result_1_2"),
        ]),
        ("\n2. 网络构建:", [
            ("2.1 知识网络", "result_2_1"),
            ("2.2 技术网络", "result_2_2"),
            ("2.3 协作研发网络", "result_2_3"),
            ("2.4 知识-技术耦合", "result_2_4"),
            ("2.5 技术-协作耦合", "result_2_5"),
            ("2.6 知识-协作耦合", "result_2_6"),
        ]),
        ("\n3. 网络分析:", [
            ("3.1 网络权重计算", "result_3"),
            ("4.1 结构洞耦合", "result_4_1"),
            ("4.2 结构洞数据库", "result_4_2"),
            ("4.3 关键性指数", "result_4_3"),
            ("5.1 中心性耦合", "result_5_1"),
            ("5.2 中心性数据库", "result_5_2"),
            ("5.3 中心性指数", "result_5_3"),
            ("6.1 综合数据库", "result_6"),
        ]),
    ]

    print("=== 网络分析流程执行完成 ===")
    print("\n各步骤执行结果汇总:")
    for heading, entries in sections:
        print(heading)
        for label, var_name in entries:
            print(f"   {label}: {summarize_result(var_name, namespace)}")

    print("\n=== 流程执行完毕 ===")


print_pipeline_summary()

### 数据文件结构查看

In [None]:
import pandas as pd
import os

def show_data_structure(data_dir='./data'):
    """Print the directory tree under *data_dir* with table sizes.

    Every directory is listed. Files ending in .xlsx or .csv are loaded with
    pandas and shown with their row and column counts; .txt files are listed
    by name only; files with any other extension are skipped. Directories and
    files are visited in sorted order so the listing is reproducible.

    Args:
        data_dir: Root directory to inspect. Defaults to './data'.

    Returns:
        None. All output is printed.
    """
    if not os.path.isdir(data_dir):
        print("数据目录不存在")
        return

    print("=== 数据文件结构 ===")

    # Normalise once so a trailing separator or leading './' cannot distort
    # the depth calculation or the printed name of the root directory.
    base_dir = os.path.normpath(data_dir)

    for root, dirs, files in os.walk(base_dir):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()

        # Depth comes from the path relative to the root, not from string
        # replacement, which miscounts when the root text recurs in the path.
        rel_path = os.path.relpath(root, base_dir)
        level = 0 if rel_path == os.curdir else rel_path.count(os.sep) + 1
        indent = ' ' * 2 * level
        print(f"{indent}{os.path.basename(root)}/")

        subindent = ' ' * 2 * (level + 1)
        for file in sorted(files):
            if not file.endswith(('.xlsx', '.csv', '.txt')):
                continue
            file_path = os.path.join(root, file)
            try:
                if file.endswith('.txt'):
                    print(f"{subindent}{file}")
                else:
                    reader = pd.read_excel if file.endswith('.xlsx') else pd.read_csv
                    df = reader(file_path)
                    print(f"{subindent}{file} ({len(df)} 行, {len(df.columns)} 列)")
            except Exception as e:
                # An unreadable file must not abort the listing; show a short reason.
                print(f"{subindent}{file} (读取失败: {str(e)[:50]}...)")

show_data_structure()

### 关键结果文件预览

In [None]:
def preview_key_results():
    """Print a short preview of each key output file of the pipeline.

    For every path in a fixed list: an .xlsx file is loaded with pandas and
    its shape, column names and first five rows are printed; a .txt file is
    printed in full. Missing files and read errors are reported, not raised.
    """
    targets = (
        './data/step6_output/criticality_and_centrality_database.xlsx',
        './data/step5_output/centrality_coupling_database.xlsx',
        './data/step4_output/structural_hole_coupling_database.xlsx',
        './data/step3_output/network_layer_weights.txt',
    )

    for target in targets:
        # Guard clause: nothing to preview when the file was never produced.
        if not os.path.exists(target):
            print(f"\n文件不存在: {target}")
            continue

        print(f"\n=== {os.path.basename(target)} ===")
        try:
            if target.endswith('.xlsx'):
                frame = pd.read_excel(target)
                print(f"数据形状: {frame.shape}")
                print(f"列名: {list(frame.columns)}")
                print("前5行数据:")
                print(frame.head())
            elif target.endswith('.txt'):
                # NOTE(review): opened with the platform default encoding.
                with open(target, 'r') as handle:
                    content = handle.read()
                print(f"文件内容: {content}")
        except Exception as exc:
            print(f"读取失败: {str(exc)}")

preview_key_results()

## 可选：单独执行某个步骤

如果需要单独执行某个步骤，可以使用下面的代码块：

In [None]:
# Example: run a single step on its own.
# Uncomment the relevant call below to execute that step.

# Run data cleaning
# clean_patent_data()

# Run knowledge network construction
# construct_knowledge_network()

# Run the layer weight calculation
# calculate_network_weights()

print("可以根据需要取消注释上面的代码来单独执行特定步骤")

## 批量执行所有步骤

如果想要一次性执行所有步骤，可以运行下面的代码：

In [None]:
def run_full_pipeline():
    """Run every pipeline step in order and report a status for each.

    Step functions are looked up by name in the global namespace when the
    pipeline runs. A step whose function was never imported (or is bound to
    None) is therefore reported as "函数未导入" instead of raising NameError
    while the step list is being built. A failing step does not stop the
    steps that follow it.

    Returns:
        dict mapping each step label to "成功", "函数未导入", or
        "失败: <message>" with the message cut to 100 characters.
    """
    # (label, global name of the step function). The analysis names match the
    # ones bound by the import cell: calculate_structural_hole and the
    # build_*_database functions.
    steps = [
        ("1.1 数据清洗", "clean_patent_data"),
        ("1.2 去除个人申请", "remove_personal_applications"),
        ("2.1 知识网络构建", "construct_knowledge_network"),
        ("2.2 技术网络构建", "construct_technology_network"),
        ("2.3 协作研发网络构建", "construct_collaborative_network"),
        ("2.4 知识-技术耦合网络", "construct_knowledge_technology_network"),
        ("2.5 技术-协作耦合网络", "construct_technology_collaborative_network"),
        ("2.6 知识-协作耦合网络", "construct_knowledge_collaborative_network"),
        ("3.1 网络权重计算", "calculate_network_weights"),
        ("4.1 结构洞耦合计算", "calculate_structural_hole"),
        ("4.2 结构洞数据库构建", "build_structural_hole_database"),
        ("4.3 关键性指数计算", "calculate_criticality"),
        ("5.1 中心性耦合计算", "calculate_centrality_coupling"),
        ("5.2 中心性数据库构建", "build_centrality_coupling_database"),
        ("5.3 中心性指数计算", "calculate_centrality_index"),
        ("6.1 综合数据库构建", "build_criticality_centrality_database"),
    ]

    namespace = globals()
    results = {}

    for step_name, func_name in steps:
        print(f"\n=== 执行步骤: {step_name} ===")
        step_func = namespace.get(func_name)
        if step_func is None:
            results[step_name] = "函数未导入"
            print(f"✗ {step_name} 函数未导入")
            continue
        try:
            step_func()
        except Exception as e:
            results[step_name] = f"失败: {str(e)[:100]}"
            print(f"✗ {step_name} 执行失败: {e}")
        else:
            results[step_name] = "成功"
            print(f"✓ {step_name} 执行成功")

    print("\n=== 执行结果汇总 ===")
    for step_name, result in results.items():
        status = "✓" if result == "成功" else "✗"
        print(f"{status} {step_name}: {result}")

    return results

# Uncomment the next line to run the complete pipeline
# pipeline_results = run_full_pipeline()

print("取消注释上面的代码来执行完整的分析流程")

## 注意事项

1. **数据文件路径**: 确保原始数据文件 `original_patent_data.xlsx` 位于 `./data/input/` 目录下

2. **依赖关系**: 各步骤之间存在依赖关系，建议按顺序执行

3. **错误处理**: 如果某个步骤执行失败，请检查:
   - 输入文件是否存在
   - 数据格式是否正确
   - 必要的列是否存在

4. **性能考虑**: 某些步骤（如去除个人申请）可能需要较长时间，请耐心等待

5. **结果文件**: 所有结果文件将保存在 `./data/` 目录的相应子文件夹中

6. **模块导入**: 部分模块的文件名包含连字符（`-`），无法用普通的 `import` 语句导入，因此通过 `importlib` 按文件路径动态加载

7. **环境要求**: 确保已安装所有必要的 Python 包（pandas, numpy, networkx, requests, beautifulsoup4, googletrans, tqdm 等）