# In [None]:
import os
import time
import httpx
import pandas as pd
import json
from openai import OpenAI
from tqdm import tqdm
from typing import List, Dict, Any, Optional, Union

# In [None]:
# Publish the API key through the process environment. No api_key argument is
# passed to OpenAI() below, so the client presumably picks it up from here.
# NOTE(review): this assignment overwrites any OPENAI_API_KEY that is already
# exported, so the placeholder must be replaced with a real key before running.
os.environ["OPENAI_API_KEY"] = "Put Your GPT API Key Code"

# Module-wide client shared by every request; the 120-second timeout replaces
# the library default to give long completions more room.
client = OpenAI(timeout=httpx.Timeout(120.0))

def get_completion_with_retry(
    prompt: str,
    model: str = "gpt-4o",
    temperature: float = 0.2,
    max_retries: int = 5,
    initial_retry_delay: float = 5,
    max_tokens: Optional[int] = 1000,
    presence_penalty: float = 0,
    frequency_penalty: float = 0
) -> str:
    """
    Send a single-turn chat request to the OpenAI API with retry logic.

    The prompt is sent as one user message through the module-level `client`.
    Any exception raised by the request is printed and the call is retried
    with exponential backoff (initial_retry_delay * 2 ** attempt seconds).

    Parameters:
        prompt: Text sent as the user message
        model: Model name passed to the API
        temperature: Sampling temperature passed to the API
        max_retries: Number of retries after the first attempt; values below
            0 are treated as 0 so at least one attempt is always made
        initial_retry_delay: Delay in seconds before the first retry
        max_tokens: Completion token limit passed to the API
        presence_penalty: Presence penalty passed to the API
        frequency_penalty: Frequency penalty passed to the API

    Returns:
        The text of the first choice, or the string "{}" when every attempt
        failed or the response carried no text content. This function never
        raises for request errors.
    """
    messages = [{"role": "user", "content": prompt}]

    # A negative budget used to skip the loop entirely and fall through to an
    # implicit None, violating the declared str return type.
    max_retries = max(max_retries, 0)

    for attempt in range(max_retries + 1):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                presence_penalty=presence_penalty,
                frequency_penalty=frequency_penalty
            )

            content = response.choices[0].message.content
            if content is None:
                # The message content is optional in the API response; use
                # the same empty-JSON sentinel as the exhausted-retries path
                # so callers always receive a string.
                print("Response contained no text content. Returning empty result and continuing.")
                return "{}"
            return content

        except Exception as e:
            print(f"Attempt {attempt + 1} failed, error: {type(e).__name__}: {str(e)}")

            if attempt >= max_retries:
                print("Maximum retry attempts reached. Returning empty result and continuing.")
                return "{}"

            # Exponential backoff: the delay doubles after every failure.
            wait_time = initial_retry_delay * (2 ** attempt)
            print(f"Retrying in {wait_time} seconds...")

            time.sleep(wait_time)

def _default_analysis(keyword: str, summary: str) -> Dict:
    """Build the placeholder result used when a review cannot be analyzed."""
    return {
        "sentiment_score": 5,
        "emotion_keywords": [keyword],
        "primary_emotion": "unknown",
        "review_focus": "unknown",
        "bias_analysis": "analysis_failed",
        "summary": summary
    }


def _parse_json_object(text: str) -> Optional[Dict]:
    """Return the JSON object found in `text`, or None if there is none."""
    candidates = [text]

    # Fall back to the outermost {...} span in case the JSON is wrapped in
    # extra text such as a code fence or an explanation.
    start = text.find('{')
    end = text.rfind('}')
    if start != -1 and end > start:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Callers read the result with .get(), so only a JSON object is
        # acceptable; a bare list, number or string is not.
        if isinstance(parsed, dict):
            return parsed

    return None


def analyze_movie_review(
    review_text: str, 
    movie_title: str, 
    director: str,
    writers: str,
    release_year: int,
    budget: float,
    gross: float,
    opening_weekend: float,
    roi: float,
    official_rating: float,
    user_rating: Optional[float] = None,
    language: str = "",
    country: str = "",
    filming_locations: str = "",
    production_companies: str = ""
) -> Dict:
    """
    Analyze a movie review using the OpenAI API.

    Builds a prompt from the movie metadata and the review text, sends it via
    get_completion_with_retry, and parses the reply as a JSON object.

    Parameters:
        review_text: Review body to analyze
        movie_title, director, writers, release_year: Movie identification
        budget, gross, opening_weekend: Monetary figures, formatted as
            whole-number dollar amounts in the prompt
        roi: Return on investment, formatted with two decimals
        official_rating: IMDb rating shown as "<value>/10"
        user_rating: Reviewer's own rating, or None when not provided
        language, country, filming_locations, production_companies:
            Additional metadata inserted verbatim into the prompt

    Returns:
        The parsed JSON object on success. If the reply contains no JSON
        object, a default dict tagged "parsing_failed"; if any exception
        occurs while requesting or parsing, a default dict tagged
        "analysis_failed". Request and parsing errors are never raised.
    """
    # Compare against None explicitly so a rating of 0 is not reported as
    # missing, and so a missing rating does not render as "Not provided/10".
    user_rating_text = f"{user_rating}/10" if user_rating is not None else "Not provided"

    # Create prompt
    prompt = f"""
    You are a professional film critic and sentiment analysis expert. Please analyze the following movie review and provide a detailed sentiment analysis.

    MOVIE INFORMATION:
    - Title: {movie_title}
    - Director: {director}
    - Writers: {writers}
    - Release Year: {release_year}
    - Budget: ${budget:,.0f}
    - Opening Weekend (US/Canada): ${opening_weekend:,.0f}
    - Worldwide Gross: ${gross:,.0f}
    - ROI: {roi:.2f}
    - IMDb Rating: {official_rating}/10
    - User Rating: {user_rating_text}
    - Language: {language}
    - Country of Origin: {country}
    - Filming Locations: {filming_locations}
    - Production Companies: {production_companies}

    REVIEW TEXT:
    "{review_text}"

    Analyze this review's sentiment and attitude. Return ONLY a JSON object with the following keys:
    1. sentiment_score: Score from 1-10 (1=extremely negative, 10=extremely positive)
    2. emotion_keywords: List of 5 keywords/phrases that best represent the emotional tone
    3. primary_emotion: Main emotion expressed (e.g., admiration, disappointment, anger, surprise)
    4. review_focus: What aspects the review focuses on (e.g., plot, acting, visuals, directing)
    5. bias_analysis: Analysis of potential biases or subjective factors
    6. summary: Brief summary (50 words or less)

    Return ONLY the JSON result with no additional text or explanation.
    """

    try:
        # Call API for analysis
        response = get_completion_with_retry(prompt)

        # Parse JSON response
        result = _parse_json_object(response)
        if result is not None:
            return result

        # Return default result if parsing fails
        print(f"Could not parse response as JSON, using default values: {response[:100]}...")
        return _default_analysis("parsing_failed", "Failed to parse review content")

    except Exception as e:
        print(f"Error analyzing review: {str(e)}")
        # Return default result instead of failing
        return _default_analysis("analysis_failed", "Error during analysis process")

def _to_cell_value(value):
    """Flatten list/dict-like values to text so each cell holds a scalar."""
    if isinstance(value, (list, tuple, dict, set)):
        return str(value)
    return value


def analyze_movie_reviews_batch(df, sample_size=None, start_idx=0, save_interval=5):
    """
    Batch analyze movie reviews dataset

    Each selected row is passed to analyze_movie_review and the returned
    fields are written into new columns. Progress is checkpointed to the
    "movie_analysis_results" directory as Excel files, with CSV as a fallback
    when the Excel write fails.

    Parameters:
        df: DataFrame containing movie reviews
        sample_size: Sample size to process (None means process all)
        start_idx: Starting index for continuation
        save_interval: How often to save results; values below 1 are treated
            as 1 (save after every review)

    Returns:
        DataFrame with analysis results (a copy of the selected rows with the
        result columns added). Rows with an empty 'Comments' value are
        skipped and keep None in every result column.
    """
    # `(i + 1) % save_interval` below would raise ZeroDivisionError for 0,
    # and only after the first review had already been sent to the API.
    if save_interval < 1:
        print(f"Invalid save interval {save_interval}, saving after every review instead")
        save_interval = 1

    # Select data to process
    if sample_size is not None:
        df_to_process = df.iloc[start_idx:start_idx + sample_size].copy()
    else:
        df_to_process = df.iloc[start_idx:].copy()

    # Create new columns for results
    result_columns = [
        'sentiment_score',
        'emotion_keywords',
        'primary_emotion',
        'review_focus',
        'bias_analysis',
        'summary',
    ]
    for column in result_columns:
        df_to_process[column] = None

    # Create save directory
    save_dir = "movie_analysis_results"
    os.makedirs(save_dir, exist_ok=True)

    total = len(df_to_process)

    # Process each review
    for i, (idx, row) in enumerate(tqdm(df_to_process.iterrows(), total=total, desc="Analyzing reviews")):
        if pd.isna(row['Comments']):
            # Skip empty reviews, but still fall through to the checkpoint
            # below so a skipped row on a save boundary does not delay saving.
            print(f"Skipping empty review at index {idx}")
        else:
            try:
                # Analyze review
                result = analyze_movie_review(
                    review_text=str(row['Comments'])[:3000],  # Limit review length
                    movie_title=str(row['Title']),
                    director=str(row['Director']),
                    writers=str(row['Writers']),
                    release_year=int(row['Release_Year']),
                    budget=float(row['Budget']),
                    opening_weekend=float(row['Opening_Weekend_US_Canada']),
                    gross=float(row['Gross_Worldwide']),
                    roi=float(row['ROI']),
                    official_rating=float(row['Rating_movie']),
                    user_rating=float(row['Review_Rating']) if not pd.isna(row['Review_Rating']) else None,
                    language=str(row['Language']),
                    country=str(row['Country_of_origin']),
                    filming_locations=str(row['Filming_Locations']),
                    production_companies=str(row['Production_Companies'])
                )

                # Save results to DataFrame. The reply may hold lists or
                # dicts in any field; they are stored as text, the same way
                # emotion_keywords always has been.
                df_to_process.at[idx, 'sentiment_score'] = _to_cell_value(result.get('sentiment_score'))
                df_to_process.at[idx, 'emotion_keywords'] = str(result.get('emotion_keywords', []))
                df_to_process.at[idx, 'primary_emotion'] = _to_cell_value(result.get('primary_emotion'))
                df_to_process.at[idx, 'review_focus'] = _to_cell_value(result.get('review_focus'))
                df_to_process.at[idx, 'bias_analysis'] = _to_cell_value(result.get('bias_analysis'))
                df_to_process.at[idx, 'summary'] = _to_cell_value(result.get('summary'))

            except Exception as e:
                print(f"Error processing review at index {idx}: {str(e)}")
                # Record error but continue processing
                df_to_process.at[idx, 'summary'] = f"Processing error: {str(e)[:100]}"

        # Save results periodically
        if (i + 1) % save_interval == 0 or i == total - 1:
            save_path = os.path.join(save_dir, f"movie_reviews_analysis_{start_idx}_{start_idx + i + 1}.xlsx")
            try:
                df_to_process.iloc[:i+1].to_excel(save_path, index=False)
                print(f"Completed {i + 1}/{total} reviews, results saved to {save_path}")
            except Exception as e:
                print(f"Error saving results to {save_path}: {str(e)}")
                # Try saving as CSV (more reliable)
                csv_path = save_path.replace('.xlsx', '.csv')
                try:
                    df_to_process.iloc[:i+1].to_csv(csv_path, index=False)
                    print(f"Saved as CSV instead: {csv_path}")
                except Exception:
                    print("Failed to save CSV as well, continuing but may lose partial results")

    # Save final results
    final_path = os.path.join(save_dir, f"movie_reviews_analysis_final_{start_idx}_{start_idx + total}.xlsx")
    try:
        df_to_process.to_excel(final_path, index=False)
        print(f"Analysis complete! Final results saved to {final_path}")
    except Exception as e:
        print(f"Error saving final results: {str(e)}")
        # Try saving as CSV
        final_csv = final_path.replace('.xlsx', '.csv')
        try:
            df_to_process.to_csv(final_csv, index=False)
            print(f"Saved final results as CSV: {final_csv}")
        except Exception as csv_error:
            # Do not raise: the caller still receives the in-memory results.
            print(f"Failed to save final results as CSV as well: {str(csv_error)}")

    return df_to_process

# Main entry point
def main():
    """
    Run the interactive review-analysis workflow.

    Loads the review spreadsheet, asks on stdin for the sample size, start
    index and save interval, then hands everything to
    analyze_movie_reviews_batch. Any exception is printed rather than raised.
    """
    try:
        # Load the movie review data
        print("正在读取电影评论数据...")
        reviews = pd.read_excel(r"E:\MTL\Data\Exmperience Test\IMDb_All_Information_Review.xlsx")
        print(f"共读取 {len(reviews)} 条评论")

        # Ask whether only a sample should be processed
        answer = input("是否只处理部分样本数据？(y/n): ")
        sample_size = None
        if answer.lower() == 'y':
            sample_size = int(input("请输入样本大小: "))

        # The start index (for resuming) is requested in both modes
        start_idx = int(input("请输入起始索引（用于断点续传，默认0）: ") or "0")

        # Number of reviews between checkpoint saves
        save_interval = int(input("请输入保存间隔（每处理多少条评论保存一次，默认5）: ") or "5")

        # Analyze the reviews
        print("开始分析电影评论...")
        analyze_movie_reviews_batch(reviews, sample_size, start_idx, save_interval)

        print("全部分析完成！")

    except Exception as err:
        print(f"程序执行过程中出现严重错误: {str(err)}")
        print("请检查错误信息并重新运行程序。")

# Start the interactive workflow only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()