In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.font_manager as fm
from scipy import stats
import numpy as np
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan
import statsmodels.api as sm

In [None]:
def setup_chinese_font(font_path='/usr/share/fonts/truetype/wqy/wqy-microhei.ttc'):
    """Reset matplotlib's rcParams and make a CJK-capable font the global default.

    Args:
        font_path: Path to a TrueType/OpenType font file containing Chinese
            glyphs. The default points at wqy-microhei.ttc; pass another
            file to use a different font.

    Returns:
        A matplotlib ``FontProperties`` bound to ``font_path``, for text
        elements that need the font passed explicitly.

    Raises:
        FileNotFoundError: If ``font_path`` does not exist.
    """
    import os

    # Fail before touching any global state, with a message that says what to do.
    if not os.path.isfile(font_path):
        raise FileNotFoundError(
            f"Font file not found: {font_path!r}. "
            "Pass font_path= pointing at a font that contains Chinese glyphs."
        )

    # Start from matplotlib's defaults so earlier styling does not leak in.
    plt.rcParams.update(plt.rcParamsDefault)

    # Load the font explicitly from the file.
    chinese_font = fm.FontProperties(fname=font_path)

    # Register the file with the font manager so it can be looked up by family name.
    fm.fontManager.addfont(font_path)

    # Take the family name from the font file itself rather than hard-coding it,
    # so that swapping font_path actually changes the global font.
    plt.rcParams['font.family'] = [chinese_font.get_name()]

    return chinese_font

# Configure the global font once and keep the FontProperties handle; the
# plotting functions below take it as an argument.
chinese_font = setup_chinese_font()

# 1. 简单Prompt

In [None]:
def process_data(data):
    """Flatten evaluated-resume records into a DataFrame with a 'score' column.

    Args:
        data: Iterable of records, each with a 'metadata' mapping and an
            'evaluation' mapping.

    Returns:
        A DataFrame with one row per record: the metadata fields plus
        'score', read from evaluation['score'] (None when that key is absent).
    """
    def as_row(entry):
        # Work on a copy so the caller's metadata dict is left untouched.
        record = entry['metadata'].copy()
        record.update(score=entry['evaluation'].get('score'))
        return record

    return pd.DataFrame([as_row(entry) for entry in data])

# Load the evaluation results stored in output/simple_evaluated_resumes.json.
with open('output/simple_evaluated_resumes.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# One row per record: metadata columns plus 'score'.
df = process_data(data)

In [None]:
def print_statistics(df):
    """Show descriptive statistics of 'score' for each grouping column.

    For each of 'gender', 'marriage', 'hukou', 'political' and 'age', prints
    a heading and displays the per-group ``describe()`` table of 'score'.

    Args:
        df: DataFrame containing 'score' and the grouping columns above.
    """
    for column in ('gender', 'marriage', 'hukou', 'political', 'age'):
        print(f"\n{column}的统计信息:")
        display(df.groupby(column)['score'].describe())
# Show the per-group score summaries for the DataFrame loaded above.
print_statistics(df)

In [None]:
# 创建箱线图
def create_boxplots(df, chinese_font):
    """Draw one score box plot per demographic variable, stacked vertically.

    Every box is annotated in red with the mean score of its group.

    Args:
        df: DataFrame with a 'score' column and the grouping columns
            'gender', 'marriage', 'hukou', 'age' and 'disability'.
        chinese_font: matplotlib FontProperties applied to every text element
            so the Chinese titles and labels render.

    Returns:
        The matplotlib Figure that holds the subplots.
    """
    variables = ['gender', 'marriage', 'hukou', 'age', 'disability']
    variable_names = {
        'gender': '性别',
        'marriage': '婚姻状况',
        'hukou': '户口',
        'age': '年龄',
        'disability': '身体状况'
    }

    # One row per variable. Deriving the row count from the list keeps the
    # axes in step with the variables (a fixed 4 rows cannot hold 5 plots).
    n_rows = len(variables)
    fig, axes = plt.subplots(n_rows, 1, figsize=(12, 5 * n_rows), squeeze=False)
    axes = axes[:, 0]
    fig.suptitle('不同变量的分数分布', fontsize=16, fontproperties=chinese_font)

    for ax, var in zip(axes, variables):
        # Mean score per group; groupby returns the groups in sorted order.
        means = df.groupby(var)['score'].mean()

        # Pin the box order to the groupby order. Left to itself seaborn
        # orders string categories by first appearance, which would put the
        # mean labels on the wrong boxes.
        sns.boxplot(x=var, y='score', data=df, order=list(means.index), ax=ax)

        # Annotate each box with its group mean.
        for position, mean_val in enumerate(means):
            if pd.isna(mean_val):
                # A group without any valid score has no mean to show.
                continue
            ax.text(position, mean_val, f'{mean_val:.1f}',
                    horizontalalignment='center',
                    verticalalignment='bottom',
                    fontproperties=chinese_font,
                    fontsize=10,
                    color='red')

        # Title and axis labels.
        ax.set_title(f'{variable_names[var]}的分数分布',
                     fontproperties=chinese_font,
                     pad=15)
        ax.set_xlabel(variable_names[var], fontproperties=chinese_font)
        ax.set_ylabel('分数', fontproperties=chinese_font)

        # Age has many levels; rotate its tick labels so they do not overlap.
        if var == 'age':
            ax.tick_params(axis='x', rotation=45)

        # Tick labels do not pick up the font automatically; set it explicitly.
        for label in ax.get_xticklabels():
            label.set_fontproperties(chinese_font)
        for label in ax.get_yticklabels():
            label.set_fontproperties(chinese_font)

    # Leave room at the top for the figure title.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

    return fig

# Build the box-plot figure for the current df and render it.
fig = create_boxplots(df, chinese_font)
plt.show()

In [None]:
def perform_statistical_tests(df):
    """Test whether the mean score differs across the levels of each variable.

    For each of 'gender', 'marriage', 'hukou' and 'age' the rows are grouped
    by the variable's distinct non-missing values. Exactly two groups are
    compared with an independent-samples t-test (``scipy.stats.ttest_ind``);
    any other number of groups goes through a one-way ANOVA
    (``scipy.stats.f_oneway``). Rows with a missing score are excluded from
    the test, the reported means and the reported sample sizes.

    A test that raises or yields NaN is reported as invalid; it is not
    presented as a non-significant result.

    Args:
        df: DataFrame with a numeric 'score' column and the four grouping
            columns listed above.

    Returns:
        None. The report is printed to stdout.
    """
    variables = ['gender', 'marriage', 'hukou', 'age']
    variable_names = {
        'gender': '性别',
        'marriage': '婚姻状况',
        'hukou': '户口',
        'age': '年龄'
    }
    alpha = 0.05

    results = {}

    for var in variables:
        # Missing labels can never match `==`, and mixing NaN with strings
        # makes sorted() raise, so drop them before ordering the groups.
        unique_groups = sorted(df[var].dropna().unique())

        # Scores per group with missing values removed. The test, the means
        # and the sizes below are all computed from these same samples.
        samples = {group: df.loc[df[var] == group, 'score'].dropna()
                   for group in unique_groups}

        if len(unique_groups) == 2:  # two groups: independent-samples t-test
            test_type, stat_key = 't-test', 't_statistic'
            invalid_label, error_label = 't检验', 't检验'
            run_test = stats.ttest_ind
        else:  # any other group count: one-way ANOVA
            test_type, stat_key = 'ANOVA', 'f_statistic'
            invalid_label, error_label = 'ANOVA', 'ANOVA检验'
            run_test = stats.f_oneway

        try:
            statistic, p_value = run_test(*(s.values for s in samples.values()))
            valid = not (np.isnan(statistic) or np.isnan(p_value))
            if not valid:
                print(f"警告：{var}的{invalid_label}结果无效")
        except Exception as e:
            # Best effort: one failing variable must not abort the whole report.
            print(f"警告：{var}的{error_label}出错: {str(e)}")
            statistic, p_value, valid = np.nan, np.nan, False

        results[var] = {
            'test_type': test_type,
            stat_key: statistic,
            'p_value': p_value,
            'valid': valid,
            'groups': unique_groups,
            'group_means': {group: s.mean() for group, s in samples.items()},
            'group_sizes': {group: len(s) for group, s in samples.items()}
        }

    # Print the report.
    for var, result in results.items():
        print(f"\n{variable_names[var]}的统计检验结果：")
        print(f"检验方法：{result['test_type']}")

        if result['test_type'] == 't-test':
            print(f"t统计量：{result['t_statistic']:.4f}")
        else:
            print(f"F统计量：{result['f_statistic']:.4f}")

        print(f"p值：{result['p_value']:.4f}")
        print("各组平均分和样本量：")
        for group in result['groups']:
            mean = result['group_means'][group]
            size = result['group_sizes'][group]
            print(f"  {group}: 平均分={mean:.2f}, n={size}")

        # Significance conclusion; an invalid test gets no verdict.
        if not result['valid']:
            print(f"结论：检验结果无效，无法判断{variable_names[var]}对分数是否有显著影响")
        elif result['p_value'] < alpha:
            print(f"结论：在{alpha}显著性水平下，{variable_names[var]}对分数有显著影响")
        else:
            print(f"结论：在{alpha}显著性水平下，{variable_names[var]}对分数没有显著影响")
        print("-" * 80)  # separator between variables

# Run the significance tests on the current df and print the report.
perform_statistical_tests(df)

# 2. 复杂Prompt

In [None]:
def process_data(data):
    """Flatten evaluated-resume records into a DataFrame with a 'score' column.

    The score is read from evaluation['scores']['total']['mean']. If any
    level of that path is missing or is not a mapping, the score is None
    (NaN in the resulting column), so one incomplete record does not abort
    loading.

    Args:
        data: Iterable of records, each with a 'metadata' mapping and an
            'evaluation' entry.

    Returns:
        A DataFrame with one row per record: the metadata fields plus 'score'.
    """
    processed_data = []
    for item in data:
        row = item['metadata'].copy()  # copy so the input records are not mutated

        # Walk the nested path one level at a time; chaining .get() directly
        # raises AttributeError as soon as one level is absent or null.
        node = item['evaluation']
        for key in ('scores', 'total', 'mean'):
            node = node.get(key) if isinstance(node, dict) else None
        row['score'] = node

        processed_data.append(row)

    return pd.DataFrame(processed_data)

# Load the evaluation results stored in output/evaluated_resumes.json.
with open('output/evaluated_resumes.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Rebinds df: from here on it holds the records from this file.
df = process_data(data)

In [None]:
def print_statistics(df):
    """Show descriptive statistics of 'score' for each grouping column.

    For each of 'gender', 'marriage', 'hukou' and 'age', prints a heading
    and displays the per-group ``describe()`` table of 'score'.

    Args:
        df: DataFrame containing 'score' and the grouping columns above.
    """
    for column in ('gender', 'marriage', 'hukou', 'age'):
        print(f"\n{column}的统计信息:")
        display(df.groupby(column)['score'].describe())
# Show the per-group score summaries for the current df.
print_statistics(df)

In [None]:
# 创建箱线图
def create_boxplots(df, chinese_font):
    """Draw one score box plot per demographic variable, stacked vertically.

    Every box is annotated in red with the mean score of its group.

    Args:
        df: DataFrame with a 'score' column and the grouping columns
            'gender', 'marriage', 'hukou' and 'age'.
        chinese_font: matplotlib FontProperties applied to every text element
            so the Chinese titles and labels render.

    Returns:
        The matplotlib Figure that holds the subplots.
    """
    variables = ['gender', 'marriage', 'hukou', 'age']
    variable_names = {
        'gender': '性别',
        'marriage': '婚姻状况',
        'hukou': '户口',
        'age': '年龄'
    }

    # One row per variable; the row count follows the list so the two cannot
    # drift apart.
    n_rows = len(variables)
    fig, axes = plt.subplots(n_rows, 1, figsize=(12, 5 * n_rows), squeeze=False)
    axes = axes[:, 0]
    fig.suptitle('不同变量的分数分布', fontsize=16, fontproperties=chinese_font)

    for ax, var in zip(axes, variables):
        # Mean score per group; groupby returns the groups in sorted order.
        means = df.groupby(var)['score'].mean()

        # Pin the box order to the groupby order. Left to itself seaborn
        # orders string categories by first appearance, which would put the
        # mean labels on the wrong boxes.
        sns.boxplot(x=var, y='score', data=df, order=list(means.index), ax=ax)

        # Annotate each box with its group mean.
        for position, mean_val in enumerate(means):
            if pd.isna(mean_val):
                # A group without any valid score has no mean to show.
                continue
            ax.text(position, mean_val, f'{mean_val:.1f}',
                    horizontalalignment='center',
                    verticalalignment='bottom',
                    fontproperties=chinese_font,
                    fontsize=10,
                    color='red')

        # Title and axis labels.
        ax.set_title(f'{variable_names[var]}的分数分布',
                     fontproperties=chinese_font,
                     pad=15)
        ax.set_xlabel(variable_names[var], fontproperties=chinese_font)
        ax.set_ylabel('分数', fontproperties=chinese_font)

        # Age has many levels; rotate its tick labels so they do not overlap.
        if var == 'age':
            ax.tick_params(axis='x', rotation=45)

        # Tick labels do not pick up the font automatically; set it explicitly.
        for label in ax.get_xticklabels():
            label.set_fontproperties(chinese_font)
        for label in ax.get_yticklabels():
            label.set_fontproperties(chinese_font)

    # Leave room at the top for the figure title.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

    return fig

# Build the box-plot figure for the current df and render it.
fig = create_boxplots(df, chinese_font)
plt.show()

In [None]:
def perform_statistical_tests(df):
    """Test whether the mean score differs across the levels of each variable.

    For each of 'gender', 'marriage', 'hukou' and 'age' the rows are grouped
    by the variable's distinct non-missing values. Exactly two groups are
    compared with an independent-samples t-test (``scipy.stats.ttest_ind``);
    any other number of groups goes through a one-way ANOVA
    (``scipy.stats.f_oneway``). Rows with a missing score are excluded from
    the test, the reported means and the reported sample sizes.

    A test that raises or yields NaN is reported as invalid; it is not
    presented as a non-significant result.

    Args:
        df: DataFrame with a numeric 'score' column and the four grouping
            columns listed above.

    Returns:
        None. The report is printed to stdout.
    """
    variables = ['gender', 'marriage', 'hukou', 'age']
    variable_names = {
        'gender': '性别',
        'marriage': '婚姻状况',
        'hukou': '户口',
        'age': '年龄'
    }
    alpha = 0.05

    results = {}

    for var in variables:
        # Missing labels can never match `==`, and mixing NaN with strings
        # makes sorted() raise, so drop them before ordering the groups.
        unique_groups = sorted(df[var].dropna().unique())

        # Scores per group with missing values removed. The test, the means
        # and the sizes below are all computed from these same samples.
        samples = {group: df.loc[df[var] == group, 'score'].dropna()
                   for group in unique_groups}

        if len(unique_groups) == 2:  # two groups: independent-samples t-test
            test_type, stat_key = 't-test', 't_statistic'
            invalid_label, error_label = 't检验', 't检验'
            run_test = stats.ttest_ind
        else:  # any other group count: one-way ANOVA
            test_type, stat_key = 'ANOVA', 'f_statistic'
            invalid_label, error_label = 'ANOVA', 'ANOVA检验'
            run_test = stats.f_oneway

        try:
            statistic, p_value = run_test(*(s.values for s in samples.values()))
            valid = not (np.isnan(statistic) or np.isnan(p_value))
            if not valid:
                print(f"警告：{var}的{invalid_label}结果无效")
        except Exception as e:
            # Best effort: one failing variable must not abort the whole report.
            print(f"警告：{var}的{error_label}出错: {str(e)}")
            statistic, p_value, valid = np.nan, np.nan, False

        results[var] = {
            'test_type': test_type,
            stat_key: statistic,
            'p_value': p_value,
            'valid': valid,
            'groups': unique_groups,
            'group_means': {group: s.mean() for group, s in samples.items()},
            'group_sizes': {group: len(s) for group, s in samples.items()}
        }

    # Print the report.
    for var, result in results.items():
        print(f"\n{variable_names[var]}的统计检验结果：")
        print(f"检验方法：{result['test_type']}")

        if result['test_type'] == 't-test':
            print(f"t统计量：{result['t_statistic']:.4f}")
        else:
            print(f"F统计量：{result['f_statistic']:.4f}")

        print(f"p值：{result['p_value']:.4f}")
        print("各组平均分和样本量：")
        for group in result['groups']:
            mean = result['group_means'][group]
            size = result['group_sizes'][group]
            print(f"  {group}: 平均分={mean:.2f}, n={size}")

        # Significance conclusion; an invalid test gets no verdict.
        if not result['valid']:
            print(f"结论：检验结果无效，无法判断{variable_names[var]}对分数是否有显著影响")
        elif result['p_value'] < alpha:
            print(f"结论：在{alpha}显著性水平下，{variable_names[var]}对分数有显著影响")
        else:
            print(f"结论：在{alpha}显著性水平下，{variable_names[var]}对分数没有显著影响")
        print("-" * 80)  # separator between variables

# Example usage: run the significance tests on the current df and print the report.
perform_statistical_tests(df)