In [1]:
# ==============================================================================
# CELL 1: 导入所有库并定义包含所有功能的最终版主分析函数
# ==============================================================================
import os
import pandas as pd
import numpy as np

# 解决 KMeans 内存泄漏警告 (必须在导入 KMeans 之前设置)
os.environ['OMP_NUM_THREADS'] = '1'

# 分析库
from statsmodels.tsa.arima.model import ARIMA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input

# 可视化库
import plotly.graph_objects as go
import plotly.express as px

# NLP & 交互库
from deep_translator import GoogleTranslator
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import ipywidgets as widgets
from IPython.display import display, clear_output


# --- (关键修复) 这是一个修复了语法错误的、标准的多行函数 ---
def sentiment_to_rating(sentiment):
    if sentiment >= 0.5:
        return 5
    elif sentiment >= 0.05:
        return 4
    elif sentiment > -0.05:
        return 3
    elif sentiment > -0.5:
        return 2
    else:
        return 1

# --- 将所有分析步骤封装到主函数中 ---
def run_analysis(sales_file, unesco_file, reviews_file):
    clear_output(wait=True)
    print(f"--- 正在使用销售文件: '{sales_file}' ---")
    if unesco_file: print(f"--- 正在使用UNESCO文件: '{unesco_file}' ---")
    if reviews_file: print(f"--- 正在使用评论文件: '{reviews_file}' ---")
    print("\n" + "="*50 + "\n")

    # ==================== 1. 数据加载与清洗 ====================
    try:
        print("--- 正在加载与清洗数据 ---")
        amazon_df = pd.read_csv(sales_file, dtype={23: str}, on_bad_lines='skip')
        if unesco_file:
            unesco_df = pd.read_csv(unesco_file, encoding="utf-8-sig", on_bad_lines='skip')
        
        # 智能重命名
        if 'Total Sales' in amazon_df.columns: amazon_df.rename(columns={'Total Sales': 'Amount'}, inplace=True)
        if 'Product' in amazon_df.columns: amazon_df.rename(columns={'Product': 'SKU'}, inplace=True)
        if 'Qty' not in amazon_df.columns and 'Quantity' in amazon_df.columns: amazon_df.rename(columns={'Quantity': 'Qty'}, inplace=True)
        if 'Order ID' not in amazon_df.columns and 'Order_ID' in amazon_df.columns: amazon_df.rename(columns={'Order_ID': 'Order ID'}, inplace=True)
        
        required_cols = ["Amount", "Category", "Date", "Status", "SKU", "Order ID", "Qty"]
        if any(col not in amazon_df.columns for col in required_cols): raise ValueError(f"文件 '{sales_file}' 缺少必需的列。")
        
        amazon_df.dropna(subset=["Amount", "Category", "Date"], inplace=True)
        try:
            amazon_df["Date"] = pd.to_datetime(amazon_df["Date"], format='%m-%d-%y')
        except ValueError:
            amazon_df["Date"] = pd.to_datetime(amazon_df["Date"], errors='coerce')
            
        amazon_df["Amount"] = pd.to_numeric(amazon_df["Amount"], errors='coerce')
        valid_statuses = ["Shipped", "Shipped - Delivered to Buyer", "Completed", "Pending", "Cancelled"]
        amazon_df = amazon_df[amazon_df["Status"].isin(valid_statuses)]
        amazon_df.dropna(subset=['Date', 'Amount', 'SKU', 'Order ID', 'Qty'], inplace=True)
        
        all_categories = amazon_df['Category'].unique()
        non遗_products = amazon_df[amazon_df['Category'].str.contains('|'.join(all_categories), case=False, na=False)]
        
        print("✅ 数据加载和清洗完成！\n")
    except Exception as e:
        print(f"❌ 处理数据时出错: {e}"); return

    # ==================== 2. 过滤与映射 ====================
    print("\n--- 正在进行过滤与映射 ---")
    if unesco_file:
        keywords = ['craft', 'textile', 'embroidery', 'weaving', 'costume', 'dress', 'heritage product', 'handicraft']
        relevant_unesco = unesco_df[unesco_df['Description EN'].str.contains('|'.join(keywords), case=False, na=False)]
        print(f"相关非遗数量: {len(relevant_unesco)}")
    print(f"潜在非遗产品订单: {len(non遗_products)}\n")

    # ==================== 3. 时间序列预测 (LSTM) ====================
    print("\n--- 🧠 正在进行 LSTM 深度学习预测 ---")
    try:
        sales_ts = amazon_df.groupby('Date')['Amount'].sum().asfreq('D', fill_value=0)
        sales_values = sales_ts.values.reshape(-1, 1)
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled_values = scaler.fit_transform(sales_values)

        def create_dataset(data, look_back=7):
            X, y = [], []
            for i in range(len(data) - look_back):
                X.append(data[i:(i + look_back), 0])
                y.append(data[i + look_back, 0])
            return np.array(X), np.array(y)

        look_back = 7
        X, y = create_dataset(scaled_values, look_back)
        X = np.reshape(X, (X.shape[0], X.shape[1], 1))

        model = Sequential([Input(shape=(look_back, 1)), LSTM(50), Dense(1)])
        model.compile(loss='mean_squared_error', optimizer='adam')
        
        print("正在训练模型...")
        model.fit(X, y, epochs=20, batch_size=32, verbose=0)

        print("正在预测未来...")
        last_days_scaled = scaled_values[-look_back:]
        current_input = np.reshape(last_days_scaled, (1, look_back, 1))
        future_predictions_scaled = []
        for _ in range(30):
            next_pred_scaled = model.predict(current_input, verbose=0)
            future_predictions_scaled.append(next_pred_scaled[0, 0])
            new_pred_reshaped = np.reshape(next_pred_scaled, (1, 1, 1))
            current_input = np.append(current_input[:, 1:, :], new_pred_reshaped, axis=1)
        
        future_predictions = scaler.inverse_transform(np.array(future_predictions_scaled).reshape(-1, 1))
        
        last_date = sales_ts.index[-1]
        future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=30)
        
        fig_lstm = go.Figure()
        fig_lstm.add_trace(go.Scatter(x=sales_ts.index, y=sales_ts.values, name='历史销售额', line=dict(color='royalblue', width=2), fill='tozeroy', fillcolor='rgba(65, 105, 225, 0.2)'))
        fig_lstm.add_trace(go.Scatter(x=future_dates, y=future_predictions.flatten(), name='LSTM 预测销售额', line=dict(color='darkorange', dash='dash', width=2), fill='tozeroy', fillcolor='rgba(255, 140, 0, 0.2)'))
        fig_lstm.update_layout(title='未来30天销售额深度学习预测 (LSTM模型)')
        fig_lstm.show()
        
    except Exception as e:
        print(f"❌ LSTM 预测失败: {e}")

    # ==================== 4. 类别销售可视化 ====================
    print("\n--- 🛍️ 正在生成品类表现图 ---")
    category_means = non遗_products.groupby('Category')['Amount'].mean().sort_values(ascending=False).reset_index()
    fig_bar = px.bar(category_means, x='Category', y='Amount', color='Category', text_auto='.2f', title='各产品类别平均销售额对比')
    fig_bar.update_layout(width=800, height=500, showlegend=False); fig_bar.show()

    # ==================== 5. 商品聚类分析 ====================
    print("\n--- 🔥 正在进行商品聚类分析 ---")
    try:
        product_agg_df = amazon_df.groupby('SKU').agg(total_amount=('Amount', 'sum'), total_qty=('Qty', 'sum'), order_count=('Order ID', 'nunique')).reset_index()
        features_to_cluster = ['total_amount', 'total_qty', 'order_count']; features = product_agg_df[features_to_cluster]
        scaler = StandardScaler(); features_scaled = scaler.fit_transform(features)
        kmeans = KMeans(n_clusters=3, n_init=10, random_state=42); product_agg_df.loc[:, 'cluster'] = kmeans.fit_predict(features_scaled)
        cluster_summary = product_agg_df.groupby('cluster')[features_to_cluster].mean().sort_values(by='total_amount', ascending=False)
        hot_product_cluster_id = cluster_summary.index[0]; hot_products = product_agg_df[product_agg_df['cluster'] == hot_product_cluster_id].sort_values(by='total_amount', ascending=False)
        
        print("\n每个商品簇的特征均值:")
        display(cluster_summary)
        print("\n排名前10的热销商品:")
        display(hot_products.head(10))
    except Exception as e:
        print(f"❌ 商品聚类失败: {e}")

    # ==================== 6. 情感分析 ====================
    if reviews_file:
        print("\n--- 💬 正在进行情感分析 ---")
        try:
            def find_review_column(df):
                priority_cols = ['reviews.text', 'review_text', 'content', 'comment', 'review']
                for p_col in priority_cols:
                    if p_col in df.columns and df[p_col].dropna().astype(str).str.strip().any(): return p_col
                object_cols = df.select_dtypes(include=['object']).columns
                if not object_cols.empty:
                    return max(object_cols, key=lambda col: df[col].dropna().astype(str).str.len().mean())
                return None
            
            reviews_df = pd.read_csv(reviews_file)
            review_column_name = find_review_column(reviews_df)
            if review_column_name is None: raise ValueError("未能自动检测到文本列。")
            reviews_df.rename(columns={review_column_name: 'review_text'}, inplace=True)
            reviews_df.dropna(subset=['review_text'], inplace=True)
            analyzer = SentimentIntensityAnalyzer()
            reviews_df['sentiment'] = reviews_df['review_text'].apply(lambda text: analyzer.polarity_scores(str(text))['compound'])
            if 'rating' not in reviews_df.columns:
                reviews_df['rating'] = reviews_df['sentiment'].apply(sentiment_to_rating)
            
            print("\n情感分析结果预览:")
            display(reviews_df.head())
            print("\n高分(>=4星)与低分(<=2星)评论对比:")
            display(reviews_df[reviews_df['rating'] >= 4].head(3))
            display(reviews_df[reviews_df['rating'] <= 2].head(3))
        except Exception as e:
            print(f"❌ 情感分析失败: {e}")

    # ==================== 7. 多语言翻译 ====================
    if unesco_file:
        print("\n--- 🌍 正在进行非遗描述翻译 (仅前5条作为演示) ---")
        try:
            def translate_text(text, target_lang):
                if not isinstance(text, str) or not text.strip(): return ""
                try: return GoogleTranslator(source='auto', target=target_lang).translate(text)
                except: return text
            target_languages = ['de', 'fr', 'zh-cn']; unesco_translated_df = unesco_df.head(5).copy()
            for lang in target_languages:
                column_name = f'Description_{lang.upper()}'
                unesco_translated_df[column_name] = unesco_translated_df['Description EN'].apply(lambda x: translate_text(x, lang))
            print("\n翻译完成后的数据预览："); display(unesco_translated_df)
        except Exception as e:
            print(f"❌ 翻译失败: {e}")
            
    print("\n--- ✨ 分析全部完成 ---")

In [2]:
# ==============================================================================
# CELL 2: 创建并显示交互式界面
# ==============================================================================
import os
from ipywidgets import interactive_output, Dropdown, VBox

# 扫描文件并创建下拉菜单
csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]
sales_report_options = [f for f in csv_files if 'amazon' in f.lower() or 'sales' in f.lower()]
unesco_options = [f for f in csv_files if 'ich' in f.lower() or 'unesco' in f.lower()]
reviews_options = [None] + [f for f in csv_files if 'review' in f.lower()]

if not sales_report_options or not unesco_options:
    print("错误：请确保销售文件和UNESCO文件都在当前目录中。")
else:
    # 创建所有控件
    sales_dropdown = Dropdown(options=sales_report_options, description='选择销售文件:')
    unesco_dropdown = Dropdown(options=unesco_options, description='选择UNESCO文件:')
    reviews_dropdown = Dropdown(options=reviews_options, description='选择评论文件 (可选):')

    # 使用 interactive_output 将控件与函数输出分离
    output_area = widgets.Output()

    def on_value_change(change):
        with output_area:
            run_analysis(sales_dropdown.value, unesco_dropdown.value, reviews_dropdown.value)

    # 监听控件值的变化
    sales_dropdown.observe(on_value_change, names='value')
    unesco_dropdown.observe(on_value_change, names='value')
    reviews_dropdown.observe(on_value_change, names='value')

    # 将控件和输出区域一起显示出来
    display(VBox([sales_dropdown, unesco_dropdown, reviews_dropdown, output_area]))
    
    # 首次手动触发运行
    on_value_change(None)

VBox(children=(Dropdown(description='选择销售文件:', options=('Amazon Sale Report.csv', 'amazon-fashion-800k+-user-r…