**Project Automated-Customer Reviews**

**- This is phase 3 of this project**<br>

"<i><i>Bla, Bla, Bla text.

</i>"

**BART**<br>
This code was adapted from a GPT-generated draft. It looks reasonable, but it still has to be tested.

In [None]:
# %%
# Imports and Setup (UPDATED)

import pandas as pd
from transformers import pipeline
import numpy as np 

# Input CSV files read by load_and_merge_data(). These are bare file names,
# so they are resolved relative to the current working directory.
SENTIMENT_FILE = 'sentiment_analysis_output.csv'
CLUSTERING_FILE = 'clustering_output.csv'

# Hugging Face checkpoint handed to the summarization pipeline.
# BART is commonly used with the 'facebook/bart-large-cnn' checkpoint for summarization.
BART_MODEL = 'facebook/bart-large-cnn' 

# Columns that must be present in the merged DataFrame; load_and_merge_data()
# returns None if any of them is missing.
# NOTE(review): the functions in this file read 'product_id',
# 'product_category', 'meta_category', 'sentiment_score' and 'review_text'
# (and merge on 'review_id'), none of which are listed here. Confirm the real
# CSV schema and align this list and those column names with it.
REQUIRED_COLUMNS = [
    'ProductID',
    'Product Name',
    'Category',
    'Brand',
    'Ratings',
    'Cluster',
    'sentiment',
    'reviews.text',
]

# Runs at import / cell execution time to confirm which model is targeted.
print(f"Setup Complete. Targeting model: {BART_MODEL}")

**Data loading and preparation**<br>TODO: set the paths of the input CSV files in the cell below.

In [None]:
# %%
# Function to Load and Merge Data

def load_and_merge_data():
    """Load the sentiment and clustering CSVs and inner-join them on 'review_id'.

    Reads SENTIMENT_FILE and CLUSTERING_FILE with ``pd.read_csv``, merges the
    two frames, and verifies that every name in REQUIRED_COLUMNS is present in
    the merged result.

    Returns:
        The merged DataFrame, or None (after printing the reason) when an
        input file does not exist, the merge key is absent from either file,
        or a required column is missing from the merged frame.
    """
    # NOTE: Adjust the merge key based on your actual data structure.
    merge_key = 'review_id'

    print("--- 1. Loading and Merging Data ---")
    try:
        # TODO: set the paths of the input CSV files (SENTIMENT_FILE / CLUSTERING_FILE).
        df_sentiment = pd.read_csv(SENTIMENT_FILE)
        df_clustering = pd.read_csv(CLUSTERING_FILE)
    except FileNotFoundError as e:
        print(f"Error: {e}. Please ensure the input CSV files exist.")
        return None

    # pd.merge raises KeyError when the key is missing on either side; report
    # it the same way as the other input problems instead of crashing.
    for file_name, frame in ((SENTIMENT_FILE, df_sentiment), (CLUSTERING_FILE, df_clustering)):
        if merge_key not in frame.columns:
            print(f"ERROR: Merge key '{merge_key}' not found in {file_name}.")
            return None

    df_merged = pd.merge(df_sentiment, df_clustering, on=merge_key, how='inner')

    missing_cols = [col for col in REQUIRED_COLUMNS if col not in df_merged.columns]
    if missing_cols:
        print(f"ERROR: Merged DataFrame is missing required columns: {missing_cols}")
        return None

    print(f"Merged DataFrame shape: {df_merged.shape}")
    return df_merged

**Step: Prepare the AI prompt**

In [None]:
# %%
# Function to Prepare AI Prompt

def _join_review_texts(texts):
    """Join a Series of review texts into one ' | '-separated, newline-free string."""
    return texts.str.cat(sep=" | ").replace('\n', ' ')


def prepare_category_input(df_category, min_reviews=10, positive_threshold=0.7, negative_threshold=0.3):
    """
    Build the model input prompt for the products of a single category.

    Products are ranked by their mean 'sentiment_score'; only products with at
    least `min_reviews` rows are considered. The prompt contains an
    instruction header, one section for each of the three best-ranked products
    (sample positive and negative reviews), and one section for the
    lowest-ranked product.

    Reads the columns 'product_id', 'sentiment_score', 'product_category',
    'meta_category' and 'review_text' of `df_category`.

    Args:
        df_category: DataFrame holding the reviews of one product category.
        min_reviews: minimum number of reviews a product needs to be ranked.
        positive_threshold: reviews scoring >= this value count as positive.
        negative_threshold: reviews scoring <= this value count as negative.

    Returns:
        (category_name, prompt) on success. When fewer than four products
        qualify (three top products plus a distinct worst one are needed), returns
        (None, message) where the message starts with "Only found".
    """
    # 1. Calculate Average Sentiment and Identify Top/Worst Products
    product_summary = (
        df_category.groupby('product_id')['sentiment_score']
        .agg(['mean', 'count'])
        .reset_index()
    )
    product_summary = product_summary[product_summary['count'] >= min_reviews]

    # Bail out before ranking: the Top 3 and the worst product must be distinct.
    n_products = product_summary.shape[0]
    if n_products < 4:
        return None, f"Only found {n_products} products with at least {min_reviews} reviews."

    product_summary = product_summary.sort_values(by='mean', ascending=False)
    top_3 = product_summary['product_id'].head(3).tolist()
    worst_product = product_summary['product_id'].iloc[-1]

    # 2. Build the Multi-Part Prompt
    category_name = df_category['product_category'].iloc[0]
    prompt_sections = [
        f"Generate a compelling, well-structured blog article recommending products in the '{category_name}' category. "
        "The article MUST detail the Top 3, highlight key differences, list primary complaints for the Top 3, and explain why the Worst Product should be avoided, based ONLY on the provided review data."
    ]

    # --- Section A & B: Top 3 Products, Features, and Complaints ---
    for rank, product_id in enumerate(top_3, start=1):
        df_product = df_category[df_category['product_id'] == product_id]
        df_product_pos = df_product[df_product['sentiment_score'] >= positive_threshold]
        df_product_neg = df_product[df_product['sentiment_score'] <= negative_threshold]

        # Up to 3 positive reviews per 'meta_category' group, and up to 5 complaints.
        meta_features = _join_review_texts(
            df_product_pos.groupby('meta_category')['review_text'].head(3)
        )
        complaint_texts = _join_review_texts(df_product_neg['review_text'].head(5))

        prompt_sections.append(
            f"\n\n--- Product #{rank} (Top Rated): {product_id} ---"
            f"\nCore Strengths (Clustered Reviews): {meta_features}"
            f"\nTop Complaints/Weaknesses: {complaint_texts}"
        )

    # --- Section C: Worst Product to Avoid ---
    df_worst = df_category[df_category['product_id'] == worst_product]
    df_worst_neg = df_worst[df_worst['sentiment_score'] <= negative_threshold]
    worst_neg_texts = _join_review_texts(df_worst_neg['review_text'].head(5))

    prompt_sections.append(
        f"\n\n--- Product to AVOID: {worst_product} (Lowest Rated) ---"
        f"\nKey Negative Reviews/Reasons to Avoid: {worst_neg_texts}"
    )

    final_prompt = "\n".join(prompt_sections)

    return category_name, final_prompt

In [None]:
# %%
# Function for Generative AI (BART) - REFORMULATED

# Loaded summarization pipelines keyed by model name, so the checkpoint is
# loaded once per session instead of once per generated article.
_SUMMARIZER_CACHE = {}


def generate_article_bart(prompt):
    """Generate article text from `prompt` with the BART summarization pipeline.

    The pipeline for BART_MODEL is created on first use and reused on later
    calls.

    Args:
        prompt: the input text to summarize.

    Returns:
        The generated text (the 'summary_text' field of the first pipeline
        result), or the literal string "MODEL LOADING FAILED" when the model
        cannot be loaded.
    """
    # NOTE: BART is very good at summarization, but it may struggle to follow
    # the multi-part structure of a long instruction-style prompt.
    print(f"  -> Generating with {BART_MODEL} (Running on CPU)...")

    generator = _SUMMARIZER_CACHE.get(BART_MODEL)
    if generator is None:
        try:
            # We use the 'summarization' pipeline, which is common for BART models.
            generator = pipeline(
                "summarization",
                model=BART_MODEL,
                device='cpu',  # Explicitly use CPU
            )
        except Exception as e:
            print(f"  ERROR: Could not load model {BART_MODEL}. Check installation/path.")
            print(f"  Error details: {e}")
            return "MODEL LOADING FAILED"
        # Only cache a pipeline that loaded successfully, so a failed load is retried.
        _SUMMARIZER_CACHE[BART_MODEL] = generator

    # Generate the article.
    # BART is trained to take long input and produce shorter output.
    result = generator(
        prompt,
        max_length=500,
        min_length=150,
        do_sample=True,
        temperature=0.7,
        # Prompts built from many reviews can exceed the model's maximum input
        # length (1024 tokens for this checkpoint); truncate instead of
        # failing inside the model.
        truncation=True,
    )

    return result[0]['summary_text']  # Output key for summarization pipeline is 'summary_text'

**5: Main Execution Block**

In [None]:
# %%
# Main Execution Block

def main_article_generator():
    """Run the full pipeline: load data, build one prompt per category, generate articles.

    Groups the merged data by 'product_category', builds a prompt for each
    group with prepare_category_input(), generates an article for it with
    generate_article_bart(), and prints all generated articles at the end.
    Categories for which no prompt can be built are skipped with a message.

    Returns:
        A dict mapping category name to generated article text, or None when
        the input data could not be loaded.
    """
    df_merged = load_and_merge_data()

    if df_merged is None:
        return

    print("\n--- 2. Generating Articles by Category ---")

    final_articles = {}

    # Iterate over each distinct product category
    for category_name, df_category in df_merged.groupby('product_category'):

        print(f"\nProcessing Category: {category_name}")

        # 1. Prepare the highly structured input prompt.
        # prepare_category_input returns (None, reason) when the category has
        # too few qualifying products, so its first value is kept separate
        # from the loop's category_name, which is still needed for the message.
        prompt_category, ai_prompt = prepare_category_input(df_category)

        if prompt_category is None:
            print(f"  Skipping category: {category_name}. {ai_prompt}")
            continue

        # 2. Call the Generative Model
        article = generate_article_bart(ai_prompt)

        final_articles[category_name] = article

        print(f"✅ Article Generated for: {category_name}")

    print("\n--- Process Complete. Final Articles: ---")

    # Final Output Display
    for category, article in final_articles.items():
        print("\n=====================================")
        print(f"GENERATED ARTICLE: {category}")
        print("=====================================")
        print(article)

    return final_articles

# Entry point: run the full pipeline only when this file is executed directly
# (not when it is imported). The result is kept in `generated_articles`; it is
# None when main_article_generator() returns early because loading failed.
if __name__ == '__main__':
    generated_articles = main_article_generator()