In [1]:
# ==============================================================================
# 大型评论文件交互式预处理脚本 (最终版)
# ==============================================================================
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import os
import time
import ipywidgets as widgets
from IPython.display import display, clear_output

# --- 导入并初始化 tqdm，以启用 progress_apply ---
from tqdm.auto import tqdm
tqdm.pandas()

# ==============================================================================
# Helper functions
# ==============================================================================
def find_review_column(df):
    """Pick the column of *df* most likely to hold free-text reviews.

    Well-known review column names are tried first, in priority order; a
    candidate is accepted only if it holds at least one value that is
    non-blank after conversion to ``str`` and stripping whitespace.
    Otherwise the object-dtype column whose non-null values have the
    greatest mean string length is chosen.

    Args:
        df: The DataFrame to inspect.

    Returns:
        The name of the selected column, or ``None`` when no priority
        column qualifies and no object-dtype column has a non-null value.
    """
    priority_cols = ['reviews.text', 'review_text', 'content', 'comment', 'review']
    for p_col in priority_cols:
        if p_col not in df.columns:
            continue
        stripped = df[p_col].dropna().astype(str).str.strip()
        if (stripped.str.len() > 0).any():
            return p_col

    best_col = None
    best_mean_len = None
    for col in df.select_dtypes(include=['object']).columns:
        lengths = df[col].dropna().astype(str).str.len()
        # A column with no non-null values has a NaN mean length; NaN
        # compares false against everything, which would make the choice
        # depend on column order. Skip such columns instead.
        if lengths.empty:
            continue
        mean_len = lengths.mean()
        # Strict ">" keeps the first column on ties.
        if best_mean_len is None or mean_len > best_mean_len:
            best_col, best_mean_len = col, mean_len
    return best_col

def sentiment_to_rating(sentiment):
    """Map a sentiment score to an estimated star rating from 1 to 5.

    Thresholds: ``>= 0.5`` -> 5, ``>= 0.05`` -> 4, ``> -0.05`` -> 3,
    ``> -0.5`` -> 2, anything else -> 1.
    """
    if sentiment >= 0.5:
        return 5
    if sentiment >= 0.05:
        return 4
    if sentiment > -0.05:
        return 3
    return 2 if sentiment > -0.5 else 1

# ==============================================================================
# 主处理函数
# ==============================================================================
def preprocess_reviews(input_csv_file):
    """Run sentiment analysis on a review CSV and save the result as Parquet.

    The review text column is detected with ``find_review_column``, each
    review is scored with VADER's compound score, a 1-5 rating is derived
    from that score with ``sentiment_to_rating``, and the columns
    ``rating``, ``review_text`` and ``sentiment`` are written to
    ``reviews_processed.parquet`` in the current working directory.

    Args:
        input_csv_file: Path of the CSV file to process. A falsy value only
            prints a prompt and returns.

    Returns:
        None. Progress, a summary and a preview of the result are printed /
        displayed. Any exception raised during processing is caught and
        reported as a printed message instead of propagating.
    """
    # Clear the output of the previous run.
    clear_output(wait=True)

    if not input_csv_file:
        print("请选择一个文件进行处理。")
        return

    output_parquet_file = 'reviews_processed.parquet'
    print(f"--- 开始预处理文件: '{input_csv_file}' ---")
    start_time = time.time()

    try:
        print(f"正在加载 {input_csv_file}...")
        reviews_df = pd.read_csv(input_csv_file, on_bad_lines='skip')

        review_column_name = find_review_column(reviews_df)
        if review_column_name is None:
            raise ValueError("未能在CSV文件中找到有效的文本评论列。")

        print(f"自动检测到评论列为: '{review_column_name}'")
        # Build the working frame from the detected column only. Renaming it
        # to 'review_text' inside the full frame would create duplicate
        # column labels whenever the CSV already has a 'review_text' column,
        # and every later ['review_text'] lookup would return a DataFrame.
        review_text = reviews_df[review_column_name].dropna().astype(str)
        # Only the text is needed from here on; release the wide source frame.
        del reviews_df
        processed_df = pd.DataFrame({'review_text': review_text})

        analyzer = SentimentIntensityAnalyzer()

        print("正在进行情感分析 (这可能需要几分钟)...")
        # progress_apply is provided by tqdm's pandas integration.
        processed_df['sentiment'] = processed_df['review_text'].progress_apply(
            lambda text: analyzer.polarity_scores(text)['compound']
        )

        print("正在根据情感分数估算星级...")
        processed_df['rating'] = processed_df['sentiment'].apply(sentiment_to_rating)

        # Fixed output column order.
        final_df = processed_df[['rating', 'review_text', 'sentiment']]

        print(f"\n正在将结果保存到 '{output_parquet_file}'...")
        final_df.to_parquet(output_parquet_file, index=False)

        end_time = time.time()

        print("\n" + "="*50)
        print("✅ 预处理成功完成！")
        print(f"总耗时: {end_time - start_time:.2f} 秒")
        print(f"处理了 {len(final_df)} 条评论。")
        print(f"现在您可以在 Streamlit 应用中直接使用 '{output_parquet_file}' 文件了。")
        print("="*50)

        print("\n处理后数据预览:")
        display(final_df.head())

    except Exception as e:
        # Top-level boundary of an interactive callback: report the failure
        # to the user instead of surfacing a traceback.
        print(f"\n处理过程中发生错误: {e}")

# ==============================================================================
# 创建交互式控件
# ==============================================================================
# Scan the current directory for CSV files whose name contains "review"
# (case-insensitive) and offer them in a dropdown.
csv_files = [name for name in os.listdir('.') if name.endswith('.csv')]
reviews_options = [name for name in csv_files if 'review' in name.lower()]

if reviews_options:
    # Bind the dropdown to the main processing function via `interactive`.
    file_dropdown = widgets.Dropdown(
        options=reviews_options,
        description='选择评论文件:',
    )
    interactive_preprocessor = widgets.interactive(
        preprocess_reviews,
        input_csv_file=file_dropdown,
    )

    # Show the control together with its output area.
    display(interactive_preprocessor)
else:
    print("错误：在当前目录中未找到任何评论文件 (文件名应包含 'review')。")

interactive(children=(Dropdown(description='选择评论文件:', options=('amazon-fashion-800k+-user-reviews-dataset.csv'…