In [1]:
"""
可视化模块：将分析结果以图形方式展示
"""

from config import RESULTS_DIR, VISUALIZATION_CONFIG
from datetime import datetime
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from matplotlib.colors import Normalize
import matplotlib.cm as cm
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
import os
import json
import logging
import pandas as pd
import numpy as np
import matplotlib
import logging
from typing import *
matplotlib.use('Agg')  # Switch to the Agg backend so no figure windows are shown
# Logger used by DataVisualizer for its progress, warning and error messages.
logger = logging.getLogger('Visualizer')

In [2]:
class DataVisualizer:
    """Turn the saved analysis results into chart files.

    The analysis results are read from a JSON file (by default
    ``<RESULTS_DIR>/analysis_results.json``) and every generated figure is
    written below ``<RESULTS_DIR>/visualizations``.
    """

    def __init__(self, results_file_path=None):
        """Load the analysis results and set up plotting defaults.

        Args:
            results_file_path: Path of the analysis JSON file. Falls back to
                ``<RESULTS_DIR>/analysis_results.json`` when omitted or empty.

        When the results cannot be loaded an error is logged and
        ``analysis_results`` becomes an empty dict, so the ``generate_*``
        methods return empty results instead of failing later on.
        """
        self.results_dir = RESULTS_DIR

        results_file_path = results_file_path or os.path.join(
            self.results_dir, "analysis_results.json")

        loaded = self._load_analysis_results(
            analysis_json_path=results_file_path)
        if not loaded:
            # __init__ must return None (returning a dict raises TypeError),
            # so fall back to an empty result set and finish initialisation.
            logger.error("无法获取分析结果，无法生成可视化")
            loaded = {}
        self.analysis_results = loaded

        # Plot style
        plt.style.use('seaborn-v0_8-whitegrid')

        # Make sure the output directory exists
        self.viz_dir = os.path.join(self.results_dir, "visualizations")
        os.makedirs(self.viz_dir, exist_ok=True)

        # Font configuration
        plt.rcParams['font.family'] = VISUALIZATION_CONFIG['font_family']

        configured_font = VISUALIZATION_CONFIG['font_path']
        if os.path.exists(configured_font):
            self.font_path = configured_font
        else:
            logger.info(f"字体{configured_font} 不存在，回退至默认字体")
            # Always define the attribute; None lets WordCloud pick its
            # built-in default font.
            self.font_path = None

        # Colour scheme per country
        self.colors = {
            'france': VISUALIZATION_CONFIG['colors']['france'],
            'china': VISUALIZATION_CONFIG['colors']['china'],
        }

    def visualize_all(self):
        """Generate the word clouds and topic charts and persist a summary.

        Returns:
            dict with the keys ``word_clouds``, ``topic_visualizations`` and
            ``visualization_date`` (ISO timestamp). The same dict is written
            to ``visualization_results.json``.
        """
        logger.info("开始生成所有可视化")

        word_cloud_results = self.generate_word_clouds()
        topic_results = self.generate_topic_visualizations()

        # The cultural radar chart is not part of the default run; call
        # generate_cultural_radar_charts() explicitly when it is needed.
        visualization_results = {
            'word_clouds': word_cloud_results,
            'topic_visualizations': topic_results,
            'visualization_date': datetime.now().isoformat()
        }

        self._save_visualization_results(visualization_results)

        logger.info("所有可视化生成完成")

        return visualization_results

    def _load_analysis_results(self, analysis_json_path=None):
        """Read the analysis results JSON.

        Args:
            analysis_json_path: File to read; defaults to
                ``<results_dir>/analysis_results.json``.

        Returns:
            The parsed JSON content, or ``None`` when the file is missing or
            cannot be read/parsed (the problem is logged).
        """
        if analysis_json_path is None:
            results_file = os.path.join(
                self.results_dir, "analysis_results.json")
        else:
            results_file = analysis_json_path

        if not os.path.exists(results_file):
            logger.warning(f"分析结果文件不存在: {results_file}")
            return None

        try:
            with open(results_file, 'r', encoding='utf-8') as f:
                results = json.load(f)
            logger.info("成功加载分析结果")
            return results
        except Exception as e:
            logger.error(f"加载分析结果时出错: {str(e)}")
            return None

    def _save_visualization_results(self, results):
        """Write ``results`` to ``<results_dir>/visualization_results.json``.

        Failures are logged rather than raised (best effort).
        """
        results_file = os.path.join(
            self.results_dir, "visualization_results.json")

        try:
            with open(results_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
            logger.info(f"可视化结果已保存至: {results_file}")
        except Exception as e:
            logger.error(f"保存可视化结果时出错: {str(e)}")

    def load_visualization_results(self):
        """Load the previously saved visualization summary.

        Returns:
            The parsed content of ``<results_dir>/visualization_results.json``.

        Raises:
            FileNotFoundError: If the summary file does not exist.
        """
        logger.info("从文件加载可视化结果")

        results_file = os.path.join(
            self.results_dir, "visualization_results.json")

        if not os.path.exists(results_file):
            raise FileNotFoundError(
                f"可视化结果文件不存在: {results_file}")

        with open(results_file, 'r', encoding='utf-8') as f:
            results = json.load(f)
        logger.info("成功加载可视化结果")
        # Hand the loaded data back to the caller (it used to be dropped).
        return results

    def _country_dir(self, country):
        """Return the output directory for ``country``, creating it if needed."""
        country_dir = os.path.join(self.viz_dir, country)
        os.makedirs(country_dir, exist_ok=True)
        return country_dir

    def generate_word_clouds(self):
        """Render one word cloud per country/sector from the TF-IDF results.

        Reads ``analysis_results['tf_idf'][country][sector]['top_words']``,
        a list of ``{'word': ..., 'score': ...}`` items.

        Returns:
            ``{country: {sector: svg_path}}``; empty when no TF-IDF results
            are available. Sectors without word data are skipped.
        """
        logger.info("生成词云可视化")
        results = {}

        if 'tf_idf' not in self.analysis_results:
            logger.warning("找不到TF-IDF分析结果，无法生成词云")
            return results

        tf_idf_results = self.analysis_results['tf_idf']

        for country, sectors in tf_idf_results.items():
            country_dir = self._country_dir(country)
            is_france = country == 'france'
            results[country] = {}

            for sector, sector_data in sectors.items():
                logger.info(f"生成{country}的{sector}领域词云")

                if 'top_words' not in sector_data or not sector_data['top_words']:
                    logger.warning(f"{country}的{sector}领域没有词频数据")
                    continue

                # word -> weight mapping consumed by WordCloud
                word_weights = {item['word']: item['score']
                                for item in sector_data['top_words']}

                wc = WordCloud(
                    width=800,
                    height=400,
                    background_color='white',
                    colormap='Blues' if is_france else 'Reds',
                    max_words=100,
                    contour_width=1,
                    contour_color='steelblue' if is_france else 'firebrick',
                    font_path=self.font_path)
                wc.generate_from_frequencies(word_weights)

                output_path = os.path.join(
                    country_dir, f"wordcloud_{sector}.svg")

                fig, ax = plt.subplots(figsize=(10, 6))
                try:
                    ax.imshow(wc, interpolation='bilinear')
                    ax.axis('off')
                    ax.set_title(
                        f"{country.capitalize()} - {sector.capitalize()} Sector",
                        fontsize=16)
                    fig.tight_layout()
                    fig.savefig(output_path, dpi=300,
                                bbox_inches='tight', format='svg')
                finally:
                    # Release the figure even when rendering or saving fails.
                    plt.close(fig)

                results[country][sector] = output_path

        logger.info("词云可视化生成完成")

        return results

    def generate_topic_visualizations(self):
        """Render the top words of every topic as horizontal bar charts.

        Reads ``analysis_results['topics'][country][sector]['topics']``, a
        list of ``{'words': [...], 'weights': [...]}`` items, and draws one
        subplot per topic showing at most the ten heaviest words.

        Returns:
            ``{country: {sector: svg_path}}``; empty when no topic results
            are available. Sectors without topics are skipped.
        """
        logger.info("生成主题模型可视化")

        results = {}

        if 'topics' not in self.analysis_results:
            logger.warning("找不到主题分析结果，无法生成可视化")
            return results

        topic_results = self.analysis_results['topics']

        for country, sectors in topic_results.items():
            country_dir = self._country_dir(country)
            # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which was
            # removed in Matplotlib 3.9.
            cmap = plt.get_cmap('Blues' if country == 'france' else 'Reds')
            results[country] = {}

            for sector, sector_data in sectors.items():
                logger.info(f"生成{country}的{sector}领域主题可视化")

                if 'topics' not in sector_data or not sector_data['topics']:
                    logger.warning(f"{country}的{sector}领域没有主题数据")
                    continue

                topics = sector_data['topics']
                n_topics = len(topics)

                # One shared colour scale across all topics of the sector;
                # topics with no weights are ignored instead of crashing max().
                heaviest = max(
                    (max(topic['weights'])
                     for topic in topics if topic['weights']),
                    default=0)
                norm = Normalize(vmin=0, vmax=heaviest)

                output_path = os.path.join(country_dir, f"topics_{sector}.svg")

                # squeeze=False always yields a 2-D array, so the single-topic
                # case needs no special handling.
                fig, axes = plt.subplots(
                    1, n_topics, figsize=(n_topics * 4, 5), squeeze=False)
                try:
                    for i, (ax, topic) in enumerate(zip(axes[0], topics)):
                        ranked = sorted(
                            zip(topic['words'], topic['weights']),
                            key=lambda pair: pair[1], reverse=True)
                        # Keep the ten heaviest words and flip the order so
                        # the most important word ends up at the top.
                        top_pairs = ranked[:10][::-1]
                        words = [word for word, _ in top_pairs]
                        weights = [weight for _, weight in top_pairs]

                        bar_colors = [cmap(norm(weight)) for weight in weights]
                        ax.barh(words, weights, color=bar_colors)

                        ax.set_title(f"Topic {i+1}", fontsize=14)
                        ax.set_xlabel('Weight', fontsize=12)
                        ax.grid(axis='x', linestyle='--', alpha=0.7)

                    fig.suptitle(
                        f"{country.capitalize()} - {sector.capitalize()} Sector: Topic Model",
                        fontsize=16)
                    fig.tight_layout()
                    fig.savefig(output_path, dpi=300,
                                bbox_inches='tight', format='svg')
                finally:
                    # Release the figure even when rendering or saving fails.
                    plt.close(fig)

                results[country][sector] = output_path

        logger.info("主题模型可视化生成完成")

        return results

    def generate_cultural_radar_charts(self, save_path: str = None):
        """Draw the China vs. France cultural-dimension radar chart.

        Reads ``analysis_results['cultural_dimensions'][country][dim]['avg_value']``
        for the countries ``china`` and ``france``; the dimension list is
        taken from the ``china`` entry.

        Args:
            save_path: Output file. Defaults to
                ``<viz_dir>/cultural_dimensions.svg``. The image format is
                derived from the file extension.

        Returns:
            ``{'radar_chart': output_path}`` on success, or an empty dict when
            the required data is missing.
        """
        cultural_results = self.analysis_results.get("cultural_dimensions")
        if not cultural_results:
            logger.warning("没有文化维度数据")
            return {}
        if "china" not in cultural_results or "france" not in cultural_results:
            logger.warning("文化维度数据缺少 china 或 france，无法生成雷达图")
            return {}

        labels = list(cultural_results["china"].keys())
        if not labels:
            logger.warning("没有文化维度数据")
            return {}

        china_scores = [cultural_results["china"][dim]["avg_value"] for dim in labels]
        france_scores = [cultural_results["france"][dim]["avg_value"] for dim in labels]

        # Close the polygons by repeating the first point at angle 2*pi.
        china_closed = china_scores + china_scores[:1]
        france_closed = france_scores + france_scores[:1]
        angles = np.linspace(0, 2 * np.pi, len(labels) + 1, endpoint=True)

        output_path = save_path or os.path.join(
            self.viz_dir, "cultural_dimensions.svg")
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

        fig, ax = plt.subplots(figsize=(8, 6), subplot_kw=dict(polar=True))
        try:
            ax.plot(angles, china_closed, label="China", linewidth=2)
            ax.fill(angles, china_closed, alpha=0.25)

            ax.plot(angles, france_closed, label="France", linewidth=2)
            ax.fill(angles, france_closed, alpha=0.25)

            # Label each dimension once; the closing point at 360 degrees
            # coincides with 0 degrees and must not get a second label.
            ax.set_thetagrids(np.degrees(angles[:-1]), labels)
            ax.set_title("中法文化维度投影对比", fontsize=14)
            ax.legend(loc="upper right", bbox_to_anchor=(1.2, 1.1))
            ax.set_rlabel_position(0)
            ax.grid(True)

            # Radial range tuned to the current data range [-0.1, 0.1]
            ax.set_ylim(-0.1, 0.1)

            fig.savefig(output_path, dpi=300, bbox_inches='tight')
        finally:
            # Release the figure even when rendering or saving fails.
            plt.close(fig)

        return {'radar_chart': output_path}

In [3]:
# Explanations of the cultural dimensions, keyed by dimension identifier.
HOFSTEDE_DESCRIPTIONS = dict(
    power_distance="权力距离：社会中权力的不平等分配的接受程度",
    individualism="个人主义：个人对团体的独立程度",
    masculinity="男性气质：成就、英雄主义、果断和物质成功的偏好",
    uncertainty_avoidance="不确定性规避：社会对不确定性和模糊性的不适程度",
    long_term_orientation="长期导向：注重长期规划和传统的程度",
    indulgence="放纵度：控制自身欲望和冲动的程度",
)

In [None]:
# Build a visualizer from the default analysis results file.
visualizer = DataVisualizer()
# Generate the word clouds and topic charts; the returned dict holds the
# output paths plus a generation timestamp.
visualization_results = visualizer.visualize_all()
print("可视化结果：", visualization_results)