In [10]:
import pandas as pd

df = pd.read_csv("获奖论文_机构信息.csv")

# 确保机构名称和领域列存在
required_cols = ['Affiliation', 'Field']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise ValueError(f"数据缺少必要的列: {missing_cols}")

# 统计每个机构在各领域的出现次数
grouped = df.groupby(['Affiliation', 'Field']).size().reset_index(name='count')

# 转换为透视表（行：机构，列：领域，值：次数）
pivot_df = grouped.pivot_table(index='Affiliation', columns='Field', values='count', fill_value=0)

# 将索引重置为普通列
pivot_df = pivot_df.reset_index()

# 只对数值列求和
numeric_cols = pivot_df.select_dtypes(include=['int64', 'float64']).columns
pivot_df['total'] = pivot_df[numeric_cols].sum(axis=1)

# 按总获奖次数排序，取前10名
top_10 = pivot_df.sort_values('total', ascending=False).head(10).drop(columns='total')

# 调试输出：查看排序后的前10名
print("\n排序后的前10名机构:")
print(top_10.to_csv(sep='\t', na_rep='nan'))

# 确保 Affiliation 列存在且正确处理
if 'Affiliation' in top_10.columns:
    affiliation_col = top_10['Affiliation'].copy()
    
    #分离非数值列和数值列
    non_numeric_cols = ['Affiliation']
    numeric_data = top_10.drop(columns=non_numeric_cols)
    
    #将所有非数值转换为0并转为整数
    numeric_data = numeric_data.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
    
    #合并回 Affiliation 列
    top_10 = pd.concat([affiliation_col, numeric_data], axis=1)
    
    print("\n处理后的前10名数据:")
    print(top_10.to_csv(sep='\t', na_rep='nan'))
    
    # 过滤掉机构名称为'Unknown'的记录（如果需要）
    original_count = len(top_10)
    top_10 = top_10[top_10['Affiliation'] != 'Unknown']
    filtered_count = len(top_10)
    
    print(f"\n过滤前记录数: {original_count}")
    print(f"过滤后记录数: {filtered_count}")
    
    if filtered_count == 0:
        print("警告: 过滤后没有剩余记录!")
else:
    print("警告: 数据中没有'Affiliation'列!")

#存为JSON格式
top_10.to_json("top10_institutions.json", orient='records', force_ascii=False)
print("\n修正后的数据已保存至 top10_institutions.json")

print("\n最终结果基本信息:")
top_10.info()

print("\n最终结果的列名:", top_10.columns.tolist())

print("\nJSON内容预览:")
print(top_10.head().to_json(orient='records', force_ascii=False, indent=2))


排序后的前10名机构:
	Affiliation	Chemistry	Medicine	Physics
146	unknown	99.0	112.0	84.0
40	harvard university	12.0	15.0	7.0
93	stanford university	8.0	5.0	13.0
67	max planck society	11.0	7.0	5.0
87	rockefeller university	5.0	18.0	0.0
13	california institute of technology	4.0	4.0	10.0
66	massachusetts institute of technology	3.0	2.0	11.0
23	columbia university	1.0	8.0	5.0
74	national institutes of health	1.0	12.0	0.0
113	university of cambridge	1.0	5.0	7.0


处理后的前10名数据:
	Affiliation	Chemistry	Medicine	Physics
146	unknown	99	112	84
40	harvard university	12	15	7
93	stanford university	8	5	13
67	max planck society	11	7	5
87	rockefeller university	5	18	0
13	california institute of technology	4	4	10
66	massachusetts institute of technology	3	2	11
23	columbia university	1	8	5
74	national institutes of health	1	12	0
113	university of cambridge	1	5	7


过滤前记录数: 10
过滤后记录数: 10

修正后的数据已保存至 top10_institutions.json

最终结果基本信息:
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 146 to 113
Data columns (