In [1]:
import os
import pandas as pd
import logging
import re
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from pathlib import Path

# Import custom modules (make sure they are in the Python path or same directory)
from customer_feedback_analyzer import CustomerFeedbackAnalyzer
from bertopic import BERTopic
from bert_sentiment_trainer import BertSentimentModel

# Configure logging (optional for notebook, but good practice)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("New Data Processor Notebook")

# --- Parameters to Modify --- #
INPUT_FILE = "Synthetic_Data_and_AI_results-1 (1).xlsx" # Path to your new data file (Excel or CSV)
TEXT_COLUMN = "Comment"            # Name of the column containing the feedback text
OPENAI_API_KEY = None           # Set your OpenAI API key here if you want to use LLM preprocessing, otherwise leave as None
# --- --- --- --- --- --- --- --

# Directories (using absolute paths from the original script for consistency)
BASE_DIR = Path("/Users/DINGZEEFS/Documents/Fine_tuning_odido")
OUTPUT_DIR = BASE_DIR / "output_dir"
MODEL_DIR = OUTPUT_DIR # Models are expected to be in the main output directory
NEW_OUTPUT_DIR = OUTPUT_DIR / "new_data_notebook_results" # Separate output for this notebook
VIS_DIR = NEW_OUTPUT_DIR / "visualizations"

# Ensure input file path is absolute or relative to the notebook's location
# If INPUT_FILE is just a filename, assume it's in the same directory as the notebook
if not os.path.isabs(INPUT_FILE) and not os.path.exists(INPUT_FILE):
    INPUT_FILE = BASE_DIR / INPUT_FILE

logger.info(f"Input File: {INPUT_FILE}")
logger.info(f"Output Directory: {NEW_OUTPUT_DIR}")
logger.info(f"Model Directory: {MODEL_DIR}")

# Dutch stop words (copied from regenerate_visualizations.py)
DUTCH_STOP_WORDS = [
    'de', 'het', 'een', 'van', 'in', 'op', 'met', 'is', 'zijn', 'was', 'dat',
    'ik', 'je', 'hij', 'zij', 'wij', 'jullie', 'ze', 'deze', 'dit', 'die',
    'bij', 'aan', 'als', 'door', 'uit', 'over', 'om', 'voor', 'na', 'er', 'niet',
    'ook', 'maar', 'dan', 'dus', 'nog', 'al', 'zo', 'kan', 'of', 'naar', 'worden',
    'tot', 'onder', 'al', 'ben', 'tegen', 'sinds', 'zonder', 'te', 'wel', 'omdat',
    'zich', 'u', 'uw', 'mijn', 'hun', 'veel', 'meer', 'andere', 'heeft'
]

  from .autonotebook import tqdm as notebook_tqdm
2025-03-29 00:31:46,382 - New Data Processor Notebook - INFO - Input File: Synthetic_Data_and_AI_results-1 (1).xlsx
2025-03-29 00:31:46,382 - New Data Processor Notebook - INFO - Output Directory: /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results
2025-03-29 00:31:46,382 - New Data Processor Notebook - INFO - Model Directory: /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir


In [2]:
# Function from regenerate_visualizations.py
def clean_topic_terms(topic_model):
    """Get more meaningful topic terms by filtering common words."""
    improved_topics = {}
    topic_info = topic_model.get_topic_info()

    for topic_id in topic_info['Topic']:
        if topic_id == -1: # Skip outlier topic
            continue

        # Get topic terms
        topic_terms = topic_model.get_topic(topic_id)
        if not topic_terms: # Handle case where topic might not have terms
             improved_topics[topic_id] = []
             continue

        # Filter the terms
        filtered_terms = []
        for word, score in topic_terms:
            # Create a clean word by replacing underscores with spaces (common in BERTopic)
            clean_word = word.replace('_', ' ')

            # Filter out single words that are stopwords or very short
            if len(clean_word.split()) > 1:
                # This is a multi-word phrase, keep it
                filtered_terms.append((clean_word, score))
            elif clean_word.lower() not in DUTCH_STOP_WORDS and len(clean_word) > 3:
                # This is a single word but not a stopword and longer than 3 chars
                filtered_terms.append((clean_word, score))

        # If we have meaningful terms, use them
        if filtered_terms:
            improved_topics[topic_id] = filtered_terms[:10]  # Get top 10 terms
        else:
            # Fallback to the original terms but skip very common words
            fallback_terms = []
            for word, score in topic_terms:
                clean_word = word.replace('_', ' ')
                if clean_word.lower() not in DUTCH_STOP_WORDS:
                    fallback_terms.append((clean_word, score))
            improved_topics[topic_id] = fallback_terms[:10]

    return improved_topics

In [3]:
# Function from regenerate_visualizations.py
def create_custom_barchart(topic_model, improved_topics, output_dir, display_inline=True):
    """Create a custom barchart with descriptive topic labels and more meaningful terms."""
    # Get topic info with descriptive labels
    topic_info = topic_model.get_topic_info()

    # Dictionary to map raw topic names to cleaner names (adjust as needed)
    topic_name_mapping = {
        "Product": "Productkwaliteit",
        "Customerservice": "Klantenservice",
        "Aankoop / Levering": "Aankoop en Levering",
        "Merk Perceptie": "Merk Perceptie",
        "4_snelheid_10 van_10 van de_van de beloofde": "Snelheid internet",
        "5_ipv6_niet_instellen_de": "IPv6 Issues",
        "6_tv_de tv_product_bij": "TV Box Issues",
        "7_in_product_goed_dekking": "Bereik/Dekking",
        "8_app_de ben_ben app_de ben app": "Ben App",
        "9_simkaart_nieuwe simkaart_de_nieuwe": "Simkaart Issues",
        "10_prijs_de prijs_prijs de prijs_prijs de": "Prijs",
        "11_wifi_de wifi_de_meer": "Wifi Issues",
        "12_ben_de_klant_is": "Klantrelatie",
        "13_de_van_is_het": "Algemeen"
        # Add more mappings if new raw topic names appear
    }

    # Get top topics excluding outliers
    top_topics = topic_info[topic_info['Topic'] != -1].sort_values('Count', ascending=False).head(8)

    if top_topics.empty:
        logger.warning("No topics found (excluding outliers) to generate custom barchart.")
        return None

    num_topics = len(top_topics)
    cols = min(num_topics, 4)
    rows = (num_topics + cols - 1) // cols # Calculate rows needed

    # Create subplot grid
    subplot_titles = [topic_name_mapping.get(row['Name'], row['Name']) for _, row in top_topics.iterrows()]
    fig = make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=subplot_titles,
        vertical_spacing=0.25, # Adjust spacing if needed
        horizontal_spacing=0.1
    )

    # Colors (ensure enough colors for top_n topics)
    colors = px.colors.qualitative.Plotly[:num_topics]

    # Add the bar charts for each topic
    for i, (_, row) in enumerate(top_topics.iterrows()):
        topic_id = row['Topic']
        topic_count = row['Count']
        raw_name = row['Name']
        clean_name = topic_name_mapping.get(raw_name, raw_name)

        subplot_row = (i // cols) + 1
        subplot_col = (i % cols) + 1

        # Get improved terms for this topic
        if topic_id in improved_topics and improved_topics[topic_id]:
            terms = improved_topics[topic_id][:5] # Top 5 improved words
        else:
             terms = topic_model.get_topic(topic_id)
             if terms: # Check if terms exist
                 terms = terms[:5]
             else: # Handle case with no terms
                 terms = []

        if not terms:
            # Add placeholder text if no terms are available
             fig.add_annotation(text="No terms found",
                               xref="paper", yref="paper",
                               x=0.5, y=0.5,
                               showarrow=False,
                               row=subplot_row, col=subplot_col)
             continue # Skip adding bar trace if no terms

        words, weights = zip(*terms)

        # Add bar chart to the subplot
        fig.add_trace(
            go.Bar(
                x=weights,
                y=words,
                orientation='h',
                marker=dict(color=colors[i]),
                name=clean_name,
                showlegend=False
            ),
            row=subplot_row,
            col=subplot_col
        )

        # Add count annotation below title
        # Note: Annotations relative to subplots can be tricky. This adds it near the top of the plot area.
        # We might need to adjust positioning based on actual plot rendering.
        # fig.layout.annotations[i].update(text=f"{subplot_titles[i]}<br><i>{topic_count} mentions</i>")

    # Update layout
    fig.update_layout(
        title_text="Top Topics with Meaningful Terms",
        height=300 * rows, # Adjust height based on rows
        # width=1200,
        margin=dict(t=80, b=40, l=50, r=30),
        font=dict(size=10),
        showlegend=False
    )

    # Update axes for all subplots
    # Finding a common range can be tricky, let Plotly auto-range for now
    # max_weight = max(max(zip(*improved_topics[tid][:5]))[1] for tid in top_topics['Topic'] if tid in improved_topics and improved_topics[tid])
    # fig.update_xaxes(title_text="Term Weight", range=[0, max_weight * 1.1])
    fig.update_yaxes(autorange="reversed") # Show top terms at the top

    # Save the figure
    output_path = os.path.join(output_dir, "meaningful_topic_barchart.html")
    fig.write_html(output_path)
    logger.info(f"Meaningful topic barchart saved to {output_path}")

    if display_inline:
        fig.show()

    return fig

In [4]:
# Function from visualizations.py (modified for inline display)
def create_topic_distribution_chart(df, output_file=None, display_inline=True):
    """Create a chart showing the distribution of topics."""
    if 'topic_name' not in df.columns:
        logger.error("Column 'topic_name' not found in DataFrame for topic distribution chart.")
        return None

    # Count topics, excluding potential outliers mapped to 'Overig' if desired
    # Or use 'schema_main_category' if that's the final topic representation
    topic_counts = df['topic_name'].value_counts().reset_index()
    topic_counts.columns = ['Topic', 'Count']

    if topic_counts.empty:
        logger.warning("No topic data found to generate distribution chart.")
        return None

    # Create chart
    fig = px.bar(topic_counts, x='Count', y='Topic', orientation='h',
                 title='Topic Distribution',
                 labels={'Count': 'Number of Comments', 'Topic': 'Topic'},
                 text='Count') # Add count labels to bars

    fig.update_layout(yaxis={'categoryorder':'total ascending'}) # Sort topics by count
    fig.update_traces(textposition='outside')

    # Save to file
    if output_file:
        fig.write_html(output_file)
        logger.info(f"Topic distribution chart saved to {output_file}")

    if display_inline:
        fig.show()

    return fig

In [5]:
# Function from visualizations.py (modified for inline display)
def create_sentiment_pie_chart(df, sentiment_column='bert_sentiment', output_file=None, display_inline=True):
    """Create a pie chart showing sentiment distribution."""
    if sentiment_column not in df.columns:
        logger.error(f"Column '{sentiment_column}' not found in DataFrame for sentiment pie chart.")
        return None

    # Count sentiments
    sentiment_counts = df[sentiment_column].value_counts().reset_index()
    sentiment_counts.columns = ['Sentiment', 'Count']

    if sentiment_counts.empty:
        logger.warning("No sentiment data found to generate pie chart.")
        return None

    # Create chart
    fig = px.pie(sentiment_counts, values='Count', names='Sentiment',
                 title=f'Sentiment Distribution ({sentiment_column})',
                 color='Sentiment', # Optional: color mapping
                 color_discrete_map={'Positive':'green', 'Negative':'red', 'Neutral':'grey'})

    fig.update_traces(textposition='inside', textinfo='percent+label+value')

    # Save to file
    if output_file:
        fig.write_html(output_file)
        logger.info(f"Sentiment pie chart saved to {output_file}")

    if display_inline:
        fig.show()

    return fig

In [6]:
# Create output directories
os.makedirs(NEW_OUTPUT_DIR, exist_ok=True)
os.makedirs(VIS_DIR, exist_ok=True)

# Initialize analyzer
# Note: We only pass arguments expected by the __init__ method.
# model_dir is likely inferred from output_dir or handled internally by load/save methods.
# text_column is passed to specific methods later.
logger.info("Initializing analyzer...")
analyzer = CustomerFeedbackAnalyzer(
    openai_api_key=OPENAI_API_KEY, # Will be None if not provided
    output_dir=NEW_OUTPUT_DIR    # Output specific to this notebook run
)
logger.info("Analyzer initialized.")

# The TEXT_COLUMN variable defined in the setup cell will be used when calling
# methods like analyzer.apply_bert_sentiment(..., text_column=TEXT_COLUMN) later.

2025-03-29 00:31:46,432 - New Data Processor Notebook - INFO - Initializing analyzer...
2025-03-29 00:31:46,434 - New Data Processor Notebook - INFO - Analyzer initialized.


In [7]:
logger.info(f"Loading data from {INPUT_FILE}...")
try:
    if str(INPUT_FILE).endswith('.xlsx'):
        df = pd.read_excel(INPUT_FILE)
    elif str(INPUT_FILE).endswith('.csv'):
        # Try detecting separator, fall back to comma
        try:
            df = pd.read_csv(INPUT_FILE, sep=None, engine='python') # Auto-detect separator
        except Exception:
             df = pd.read_csv(INPUT_FILE) # Default to comma
    else:
        raise ValueError("Input file must be .xlsx or .csv")

    logger.info(f"Loaded {len(df)} rows from {INPUT_FILE}")
    logger.info(f"Columns: {df.columns.tolist()}")

    # Ensure the specified text column exists
    if TEXT_COLUMN not in df.columns:
        raise ValueError(f"Specified text column '{TEXT_COLUMN}' not found in the input file. Available columns: {df.columns.tolist()}")

    # Display first few rows
    display(df.head())

except FileNotFoundError:
    logger.error(f"Error: Input file not found at {INPUT_FILE}")
    # Stop execution or handle error appropriately
    raise
except Exception as e:
    logger.error(f"Error loading data: {str(e)}")
    raise

2025-03-29 00:31:46,441 - New Data Processor Notebook - INFO - Loading data from Synthetic_Data_and_AI_results-1 (1).xlsx...
2025-03-29 00:31:46,523 - New Data Processor Notebook - INFO - Loaded 20 rows from Synthetic_Data_and_AI_results-1 (1).xlsx
2025-03-29 00:31:46,523 - New Data Processor Notebook - INFO - Columns: ['Business Unit', 'List Id', 'Campaign Id', 'Campaign Name', 'Campaign Type', 'Mail Id', 'Mail Name', 'Reason', 'Score', 'Comment', 'Event Timestamp']


Unnamed: 0,Business Unit,List Id,Campaign Id,Campaign Name,Campaign Type,Mail Id,Mail Name,Reason,Score,Comment,Event Timestamp
0,B2C - BEN - MOBILE,4058,10933,,Unknown,,,Brand,8,Ben is betaalbaar in deze dure tijd | Vroeger ...,2025-02-19T16:01:29.999Z
1,B2C - Odido_mobile_fixed,4059,10934,Zomercampagne,Retention,2315.0,Zomer Actie,Service,2,Mijn verbinding werkt dus niet! | Mijn verbind...,2025-02-20T09:14:53.234Z
2,B2C - BEN - MOBILE,4062,10940,Loyaliteit 2025,Loyalty,2317.0,Klant Beloning,Service,8,Al 3 jaar klant en erg tevreden over de klante...,2025-02-20T13:42:17.567Z
3,B2C - Odido_mobile_fixed,4065,10945,,Unknown,,,Product,3,"De TV box hapert constant, bij elke zender. He...",2025-02-21T08:23:45.123Z
4,B2C - BEN - MOBILE,4070,10950,Voorjaar 2025,Acquisition,2325.0,Nieuwe Klanten,Price,7,Overgestapt naar Ben vanwege de lage prijzen. ...,2025-02-22T10:15:32.456Z


In [8]:
import re # Make sure re is imported if not already done in the first cell
import numpy as np # Import numpy for nan

logger.info("Preprocessing data...")
preprocessed_texts = []

# Determine if LLM preprocessing should be used
use_llm_preprocessing = bool(analyzer.openai_client)

if use_llm_preprocessing:
    logger.info("Using OpenAI API for enhanced preprocessing...")
    # --- LLM Preprocessing Handling ---
    # This section needs careful implementation based on how your LLM preprocessing
    # (e.g., analyzer.preprocess_data or llm_preprocessing.py) actually works.
    # It should ideally populate columns like 'sentiment', 'sentiment_score', etc.
    # For now, falling back to simple preprocessing with placeholders.
    logger.warning("LLM preprocessing logic in notebook needs review/implementation. Falling back to simple preprocessing.")
    for text in df[TEXT_COLUMN].astype(str):
        if pd.isna(text) or not text:
            cleaned_text = ""
        else:
            cleaned_text = str(text).lower()
            cleaned_text = re.sub(r'[^a-z\s]', '', cleaned_text) # Keep only lowercase letters and spaces
        preprocessed_texts.append(cleaned_text)

    processed_df = df.copy()
    processed_df['preprocessed_text'] = preprocessed_texts
    processed_df['combined_text'] = processed_df['preprocessed_text']
    # Add placeholders assuming LLM didn't run or populate these
    processed_df['sentiment'] = 'Neutral' # Placeholder
    processed_df['sentiment_score'] = 0.0    # Placeholder
    processed_df['sentiment_label'] = 1      # Placeholder (Neutral)
    # --- End LLM Handling Placeholder ---

else:
    # Simple preprocessing (lowercase, remove special chars)
    logger.info("No API key provided. Using simple preprocessing...")
    for text in df[TEXT_COLUMN].astype(str):
        if pd.isna(text) or not text:
             cleaned_text = ""
        else:
             cleaned_text = str(text).lower()
             cleaned_text = re.sub(r'[^a-z\s]', '', cleaned_text)
        preprocessed_texts.append(cleaned_text)

    # Create a new DataFrame or add to existing one
    processed_df = df.copy()
    processed_df['preprocessed_text'] = preprocessed_texts
    processed_df['combined_text'] = processed_df['preprocessed_text']

    # *** Add placeholder columns required by apply_bert_sentiment ***
    logger.info("Adding placeholder sentiment columns for compatibility.")
    processed_df['sentiment'] = 'Neutral' # Placeholder value
    processed_df['sentiment_score'] = 0.0    # Placeholder value
    processed_df['sentiment_label'] = 1      # Placeholder value (assuming 1=Neutral)


logger.info(f"Preprocessing complete for {len(processed_df)} rows")
# Display relevant columns including the new placeholder
display(processed_df[[TEXT_COLUMN, 'preprocessed_text', 'sentiment']].head())

2025-03-29 00:31:46,538 - New Data Processor Notebook - INFO - Preprocessing data...
2025-03-29 00:31:46,615 - New Data Processor Notebook - INFO - No API key provided. Using simple preprocessing...
2025-03-29 00:31:46,660 - New Data Processor Notebook - INFO - Adding placeholder sentiment columns for compatibility.
2025-03-29 00:31:46,663 - New Data Processor Notebook - INFO - Preprocessing complete for 20 rows


Unnamed: 0,Comment,preprocessed_text,sentiment
0,Ben is betaalbaar in deze dure tijd | Vroeger ...,ben is betaalbaar in deze dure tijd vroeger m...,Neutral
1,Mijn verbinding werkt dus niet! | Mijn verbind...,mijn verbinding werkt dus niet mijn verbindin...,Neutral
2,Al 3 jaar klant en erg tevreden over de klante...,al jaar klant en erg tevreden over de klanten...,Neutral
3,"De TV box hapert constant, bij elke zender. He...",de tv box hapert constant bij elke zender heb ...,Neutral
4,Overgestapt naar Ben vanwege de lage prijzen. ...,overgestapt naar ben vanwege de lage prijzen t...,Neutral


In [9]:
# Load sentiment model (uses saved BERT)
logger.info("Loading trained BERT sentiment model...")
try:
    # Load BERT model directly using the BertSentimentModel class
    sentiment_model_path = MODEL_DIR / "bert-sentiment-model"
    bert_model_wrapper = BertSentimentModel.load(str(sentiment_model_path))
    # Assign the loaded model wrapper to the analyzer instance
    analyzer.bert_sentiment_model = bert_model_wrapper
    logger.info("BertSentimentModel wrapper loaded and assigned to analyzer.")

except FileNotFoundError:
    logger.error(f"Error: Sentiment model file/directory not found at {sentiment_model_path}")
    logger.error("Ensure the model was trained and saved correctly using run_pipeline.py")
    raise
except Exception as e:
    logger.error(f"Error loading sentiment model: {e}")
    raise

# Load topic model (uses saved BERTopic model)
logger.info("Loading trained topic model...")
try:
    # Load BERTopic model directly
    topic_model_path = MODEL_DIR / "topic_model"
    topic_model = BERTopic.load(str(topic_model_path))
    # Assign the loaded model to the analyzer instance (optional, but might be useful)
    analyzer.topic_model = topic_model
    logger.info("Topic model loaded successfully and assigned to analyzer.")
except FileNotFoundError:
     logger.error(f"Error: Topic model file not found at {topic_model_path}")
     logger.error("Ensure the model was trained and saved correctly using run_pipeline.py")
     raise
except Exception as e:
    logger.error(f"Error loading topic model: {e}")
    raise

# Note: We are loading the models directly here, not using methods from the analyzer object.
# The loaded models are stored in bert_model_wrapper and topic_model variables
# and also assigned to the analyzer instance for potential use by its other methods.

2025-03-29 00:31:46,671 - New Data Processor Notebook - INFO - Loading trained BERT sentiment model...


Loading tokenizer from /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/bert-sentiment-model...
Loading model from /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/bert-sentiment-model...


2025-03-29 00:31:46,906 - New Data Processor Notebook - INFO - BertSentimentModel wrapper loaded and assigned to analyzer.
2025-03-29 00:31:46,906 - New Data Processor Notebook - INFO - Loading trained topic model...
2025-03-29 00:31:49,226 - New Data Processor Notebook - INFO - Topic model loaded successfully and assigned to analyzer.


In [10]:
# Apply sentiment model (uses BERT)
# Ensure the correct text column is passed
logger.info("Applying trained BERT sentiment model...")
df_with_sentiment = analyzer.apply_bert_sentiment(processed_df, text_column='combined_text')
logger.info("Sentiment analysis complete.")
display(df_with_sentiment[['combined_text', 'bert_sentiment', 'bert_sentiment_score']].head())

# --- Apply Topic Model --- #

# Step 1: Compute embeddings (if not already done by analyzer)
# Check if embeddings are needed explicitly for transform
logger.info("Computing embeddings for topic modeling...")
embeddings = analyzer.compute_embeddings(df_with_sentiment, text_column='combined_text')
logger.info(f"Embeddings computed with shape: {embeddings.shape}")

# Step 2: Apply topic model using transform()
logger.info("Applying topic model...")
# Make sure BERTopic was initialized with calculate_probabilities=True if you want probabilities
topics, probs = topic_model.transform(
    df_with_sentiment['combined_text'].tolist(),
    embeddings=embeddings # Pass pre-computed embeddings
)
logger.info("Topic modeling complete.")

# --- Correctly handle probabilities ---
# Add raw topic IDs to the DataFrame
df_with_sentiment['raw_topic_id'] = topics

# Calculate the probability of the assigned topic (maximum probability for that doc)
if probs is not None:
    # Ensure probs is a NumPy array for calculations
    probs = np.array(probs)
    if probs.ndim == 2:
        # Get the probability of the MOST likely topic for each doc
        assigned_topic_probabilities = np.max(probs, axis=1)
        df_with_sentiment['topic_probability'] = assigned_topic_probabilities
        logger.info("Assigned topic probabilities calculated from probability matrix.")
    else:
        # Handle unexpected shape for probs
        logger.warning(f"Expected a 2D array for topic probabilities, but got shape {probs.shape}. Setting probabilities to 0.")
        df_with_sentiment['topic_probability'] = 0.0
else:
    # Handle case where probabilities were not calculated (e.g., calculate_probabilities=False)
    logger.warning("Topic probabilities were not calculated by BERTopic. Setting probabilities to 0.")
    df_with_sentiment['topic_probability'] = 0.0
# --- End probability handling ---

display(df_with_sentiment[['combined_text', 'bert_sentiment', 'raw_topic_id', 'topic_probability']].head())


2025-03-29 00:31:49,236 - New Data Processor Notebook - INFO - Applying trained BERT sentiment model...
2025-03-29 00:31:49,238 - Customer Feedback Analysis - INFO - Applying BERT sentiment model to reviews...
Predicting sentiment: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
2025-03-29 00:31:50,267 - Customer Feedback Analysis - INFO - BERT sentiment analysis complete
2025-03-29 00:31:50,270 - Customer Feedback Analysis - INFO - Agreement between LLM and BERT sentiment: 0.00%
2025-03-29 00:31:50,271 - New Data Processor Notebook - INFO - Sentiment analysis complete.


Unnamed: 0,combined_text,bert_sentiment,bert_sentiment_score
0,ben is betaalbaar in deze dure tijd vroeger m...,positive,0.853181
1,mijn verbinding werkt dus niet mijn verbindin...,negative,0.980791
2,al jaar klant en erg tevreden over de klanten...,positive,0.983737
3,de tv box hapert constant bij elke zender heb ...,negative,0.98384
4,overgestapt naar ben vanwege de lage prijzen t...,positive,0.977336


2025-03-29 00:31:50,275 - New Data Processor Notebook - INFO - Computing embeddings for topic modeling...
2025-03-29 00:31:50,276 - Customer Feedback Analysis - INFO - Loading precomputed embeddings from /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/embeddings.npy
2025-03-29 00:31:50,279 - New Data Processor Notebook - INFO - Embeddings computed with shape: (20, 768)
2025-03-29 00:31:50,280 - New Data Processor Notebook - INFO - Applying topic model...
2025-03-29 00:31:50,285 - New Data Processor Notebook - INFO - Topic modeling complete.
2025-03-29 00:31:50,286 - New Data Processor Notebook - INFO - Assigned topic probabilities calculated from probability matrix.


Unnamed: 0,combined_text,bert_sentiment,raw_topic_id,topic_probability
0,ben is betaalbaar in deze dure tijd vroeger m...,positive,10,0.501629
1,mijn verbinding werkt dus niet mijn verbindin...,negative,11,0.486776
2,al jaar klant en erg tevreden over de klanten...,positive,1,0.62995
3,de tv box hapert constant bij elke zender heb ...,negative,6,0.50888
4,overgestapt naar ben vanwege de lage prijzen t...,positive,11,0.395845


In [11]:
logger.info("Generating final analysis results (combining sentiment, topics, schema mapping, etc.)...")
# Call the method, expecting only the main results DataFrame in return
try:
    results_df = analyzer.generate_analysis_results(df_with_sentiment, topics, probs, topic_model)
    logger.info("Main analysis results DataFrame generated.")
    display(results_df.head())
except Exception as e:
    logger.error(f"Error during generate_analysis_results: {e}")
    # Potentially display df_with_sentiment to see what was passed in
    logger.info("DataFrame passed to generate_analysis_results:")
    display(df_with_sentiment.head())
    raise # Re-raise the exception

# --- Calculate sentiment_by_topic separately ---
logger.info("Calculating sentiment distribution by schema category...")
try:
    # Ensure the necessary columns exist in the returned DataFrame
    if 'schema_main_category' in results_df.columns and 'bert_sentiment' in results_df.columns:
        sentiment_by_topic = results_df.groupby('schema_main_category')['bert_sentiment'].value_counts().unstack(fill_value=0)
        logger.info("Sentiment Distribution by Schema Category calculated:")
        display(sentiment_by_topic)
    else:
        logger.warning("Could not calculate sentiment by topic: Required columns ('schema_main_category' or 'bert_sentiment') not found in results_df.")
        sentiment_by_topic = None # Set to None if calculation fails
except Exception as e:
     logger.error(f"Error calculating sentiment by topic: {e}")
     sentiment_by_topic = None
# --- End separate calculation ---


# Save results
output_csv_path = NEW_OUTPUT_DIR / "new_data_analysis_notebook.csv"
results_df.to_csv(output_csv_path, index=False)
logger.info(f"Results saved to {output_csv_path}")


2025-03-29 00:31:50,299 - New Data Processor Notebook - INFO - Generating final analysis results (combining sentiment, topics, schema mapping, etc.)...
2025-03-29 00:31:50,301 - Customer Feedback Analysis - INFO - Generating final analysis results...
2025-03-29 00:31:50,330 - Customer Feedback Analysis - INFO - Analysis results saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/analysis_results.csv
2025-03-29 00:31:50,332 - Customer Feedback Analysis - INFO - Sentiment agreement between LLM and BERT: 0.00%
2025-03-29 00:31:50,332 - New Data Processor Notebook - INFO - Main analysis results DataFrame generated.


Unnamed: 0,original_index,text,intent,combined_text,representative_text,opinion_phrases,original_topic_id,original_topic_name,original_topic_keywords,schema_main_category,schema_sub_topic,topic_id,topic_name,topic_keywords,topic_probability,llm_sentiment,llm_sentiment_score,bert_sentiment,bert_sentiment_score,named_entities
0,0,,,ben is betaalbaar in deze dure tijd vroeger m...,ben is betaalbaar in deze dure tijd vroeger m...,Betalen daarom ben ik tevreden bij ben maar ik.,10,Onderwerp: Prijsstelling,"prijs, data, product",Prijs & Kosten,Prijs-kwaliteitverhouding,10,Prijs & Kosten,Prijs-kwaliteitverhouding,0.501629,Neutral,0.0,positive,0.853181,[]
1,1,,,mijn verbinding werkt dus niet mijn verbindin...,mijn verbinding werkt dus niet mijn verbindin...,Mijn verbinding werkt dus niet mijn verbinding...,11,Wifi-problemen bij meerdere apparaten,"wifi, meer, router, valt, product, verbinding,...",Productkwaliteit & Ervaring,Prestaties & Functionaliteit,11,Productkwaliteit & Ervaring,Prestaties & Functionaliteit,0.486776,Neutral,0.0,negative,0.980791,[]
2,2,,,al jaar klant en erg tevreden over de klanten...,al jaar klant en erg tevreden over de klanten...,Al jaar klant en erg tevreden over de klantens...,1,Klantenservice bereikbaarheidsproblemen,"klantenservice, geen, klantenservice de klant",Customer Service,Responsiviteit,1,Customer Service,Responsiviteit,0.62995,Neutral,0.0,positive,0.983737,[]
3,3,,,de tv box hapert constant bij elke zender heb ...,de tv box hapert constant bij elke zender heb ...,Gebeld maar nog steeds niet opgelost nu weer m...,6,Technische problemen met de TV box,"tv, product, bij, signaal, elke",Productkwaliteit & Ervaring,Gebruiksgemak,6,Productkwaliteit & Ervaring,Gebruiksgemak,0.50888,Neutral,0.0,negative,0.98384,[]
4,4,,,overgestapt naar ben vanwege de lage prijzen t...,overgestapt naar ben vanwege de lage prijzen t...,Overgestapt naar ben vanwege de.; Tot nu toe z...,11,Wifi-problemen bij meerdere apparaten,"wifi, meer, router, valt, product, verbinding,...",Productkwaliteit & Ervaring,Prestaties & Functionaliteit,11,Productkwaliteit & Ervaring,Prestaties & Functionaliteit,0.395845,Neutral,0.0,positive,0.977336,[]


2025-03-29 00:31:50,345 - New Data Processor Notebook - INFO - Calculating sentiment distribution by schema category...
2025-03-29 00:31:50,351 - New Data Processor Notebook - INFO - Sentiment Distribution by Schema Category calculated:


bert_sentiment,negative,positive
schema_main_category,Unnamed: 1_level_1,Unnamed: 2_level_1
Customer Service,1,5
Overig,0,1
Prijs & Kosten,3,2
Productkwaliteit & Ervaring,5,3


2025-03-29 00:31:50,361 - New Data Processor Notebook - INFO - Results saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/new_data_analysis_notebook.csv


In [12]:
logger.info("Generating visualizations...")

# Create topic distribution chart
logger.info("Creating Topic Distribution Chart...")
create_topic_distribution_chart(
    results_df,
    output_file=VIS_DIR / "new_data_topic_distribution.html",
    display_inline=True
)

# Create sentiment pie charts
logger.info("Creating Sentiment Pie Chart (BERT)...")
create_sentiment_pie_chart(
    results_df,
    sentiment_column='bert_sentiment', # Make sure this column name matches the output of apply_bert_sentiment
    output_file=VIS_DIR / "new_data_bert_sentiment.html",
    display_inline=True
)

# Get improved topic terms
logger.info("Cleaning topic terms...")
improved_topics = clean_topic_terms(topic_model)

# Create meaningful topic visualization
logger.info("Creating Custom Topic Barchart...")
create_custom_barchart(
    topic_model,
    improved_topics,
    VIS_DIR,
    display_inline=True
)

logger.info(f"Visualizations saved to {VIS_DIR} and displayed inline.")
logger.info(f"Processing complete. Final results CSV at: {output_csv_path}")

2025-03-29 00:31:50,372 - New Data Processor Notebook - INFO - Generating visualizations...
2025-03-29 00:31:50,374 - New Data Processor Notebook - INFO - Creating Topic Distribution Chart...
2025-03-29 00:31:50,907 - New Data Processor Notebook - INFO - Topic distribution chart saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/visualizations/new_data_topic_distribution.html


2025-03-29 00:31:51,410 - New Data Processor Notebook - INFO - Creating Sentiment Pie Chart (BERT)...
2025-03-29 00:31:51,458 - New Data Processor Notebook - INFO - Sentiment pie chart saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/visualizations/new_data_bert_sentiment.html


2025-03-29 00:31:51,463 - New Data Processor Notebook - INFO - Cleaning topic terms...
2025-03-29 00:31:51,466 - New Data Processor Notebook - INFO - Creating Custom Topic Barchart...
2025-03-29 00:31:51,508 - New Data Processor Notebook - INFO - Meaningful topic barchart saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/visualizations/meaningful_topic_barchart.html


2025-03-29 00:31:51,512 - New Data Processor Notebook - INFO - Visualizations saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/visualizations and displayed inline.
2025-03-29 00:31:51,513 - New Data Processor Notebook - INFO - Processing complete. Final results CSV at: /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/new_data_analysis_notebook.csv


In [13]:
# Function from visualizations.py (modified for inline display)
def create_topic_sentiment_heatmap(df, sentiment_column='bert_sentiment', output_file=None, display_inline=True):
    """Create a heatmap showing sentiment distribution by topic."""
    if 'topic_name' not in df.columns or sentiment_column not in df.columns:
         logger.error(f"Required columns ('topic_name', '{sentiment_column}') not found for heatmap.")
         return None

    try:
        # Create cross-tabulation, normalize by topic (index)
        topic_sentiment = pd.crosstab(df['topic_name'], df[sentiment_column], normalize='index') * 100
        topic_sentiment = topic_sentiment.fillna(0) # Fill NaN if a topic has only one sentiment

        # Reshape for plotting
        plot_df = topic_sentiment.reset_index().melt(id_vars='topic_name',
                                                    var_name='Sentiment',
                                                    value_name='Percentage')

        # Create heatmap
        fig = px.density_heatmap(plot_df, x='Sentiment', y='topic_name', z='Percentage',
                               title=f'Sentiment Distribution by Topic (%) - {sentiment_column}',
                               labels={'topic_name': 'Topic', 'Percentage': 'Percentage (%)'},
                               color_continuous_scale="Viridis") # Or choose another scale

        fig.update_layout(yaxis={'categoryorder':'total descending'}) # Order topics by count or alphabetically

        # Save to file
        if output_file:
            fig.write_html(output_file)
            logger.info(f"Topic sentiment heatmap saved to {output_file}")

        if display_inline:
            fig.show()

        return fig
    except Exception as e:
        logger.error(f"Error creating topic sentiment heatmap: {e}")
        return None

In [14]:
# Function from visualizations.py (modified for inline display)
# Requires networkx library (pip install networkx) and import networkx as nx
def create_theme_network_graph(df, min_co_occurrence=3, output_file=None, display_inline=True):
    """Create a network graph of co-occurring topics/themes."""
    if 'topic_name' not in df.columns:
        logger.error("Column 'topic_name' not found for theme network graph.")
        return None
    if 'original_index' not in df.columns: # Assuming 'original_index' links rows after potential processing
        logger.warning("Column 'original_index' not found. Co-occurrence might be inaccurate if rows were duplicated.")
        # Use simple index if original_index is missing
        df_indices = df.index
    else:
        df_indices = df['original_index']


    try:
        import networkx as nx # Ensure networkx is imported

        # Create a graph
        G = nx.Graph()

        # Add topics as nodes with their counts
        topic_counts = df['topic_name'].value_counts()
        for topic, count in topic_counts.items():
             G.add_node(topic, size=count) # Store count for potential sizing later

        # Find co-occurrences
        # Group by original document index (if available) to see topics mentioned in the same review
        # This assumes your 'results_df' might have multiple rows per original review if preprocessing expanded it.
        # If each row in results_df represents one unique review, grouping by index is simpler.
        # Using df_indices determined above.
        grouped = df.groupby(df_indices)['topic_name'].apply(list)

        co_occurrence_counts = {}
        for topics_in_doc in grouped:
            unique_topics = sorted(list(set(topics_in_doc))) # Unique topics per doc
            # Iterate through all unique pairs within the document
            for i in range(len(unique_topics)):
                for j in range(i + 1, len(unique_topics)):
                    topic1, topic2 = unique_topics[i], unique_topics[j]
                    pair = tuple(sorted((topic1, topic2))) # Ensure consistent pair order
                    co_occurrence_counts[pair] = co_occurrence_counts.get(pair, 0) + 1

        # Add edges based on co-occurrence count
        for pair, count in co_occurrence_counts.items():
            if count >= min_co_occurrence:
                G.add_edge(pair[0], pair[1], weight=count)

        if not G.edges():
            logger.warning(f"No topic co-occurrences found with minimum count {min_co_occurrence}.")
            # Optionally display just the nodes
            # return None # Or display node-only graph

        # Convert to plotly
        pos = nx.spring_layout(G, k=0.6, iterations=50) # Adjust layout parameters

        # Create Edges
        edge_x, edge_y = [], []
        edge_weights = []
        for edge in G.edges(data=True):
            x0, y0 = pos[edge[0]]
            x1, y1 = pos[edge[1]]
            edge_x.extend([x0, x1, None])
            edge_y.extend([y0, y1, None])
            edge_weights.append(edge[2]['weight']) # Get weight for potential line thickness

        # Normalize weights for line width (example)
        min_weight = min(edge_weights) if edge_weights else 1
        max_weight = max(edge_weights) if edge_weights else 1
        normalized_widths = [1 + 4 * (w - min_weight) / max(1, max_weight - min_weight) for w in edge_weights] # Scale width e.g. 1 to 5

        # Create edge trace with varying width (tricky with None separators)
        # Simpler: use constant width or color intensity
        edge_trace = go.Scatter(
            x=edge_x, y=edge_y,
            line=dict(width=1.5, color='#888'), # Use constant width for simplicity
            hoverinfo='none',
            mode='lines')

        # Create Nodes
        node_x, node_y, node_text, node_size, node_color = [], [], [], [], []
        node_adjacencies = []
        for node, adjacencies in enumerate(G.adjacency()):
             node_adjacencies.append(len(adjacencies[1]))

        # Determine node sizes (e.g., based on degree or topic count)
        node_degrees = [G.degree(node) for node in G.nodes()]
        min_degree = min(node_degrees) if node_degrees else 0
        max_degree = max(node_degrees) if node_degrees else 1
        scaled_sizes = [10 + 20 * (d - min_degree) / max(1, max_degree - min_degree) for d in node_degrees] # Scale size e.g. 10 to 30


        for i, node in enumerate(G.nodes()):
            x, y = pos[node]
            node_x.append(x)
            node_y.append(y)
            node_text.append(f"{node}<br>Connections: {node_degrees[i]}<br>Mentions: {topic_counts.get(node, 0)}")
            node_size.append(scaled_sizes[i])
            node_color.append(scaled_sizes[i]) # Color by size/degree


        node_trace = go.Scatter(
            x=node_x, y=node_y, text=node_text, mode='markers+text', textposition="top center",
            hoverinfo='text',
            marker=dict(
                showscale=True,
                colorscale='Viridis', # Or another scale
                color=node_color,
                size=node_size,
                colorbar=dict(
                    thickness=15,
                    title='Node Connections/Size', # Adjust title
                    xanchor='left',
                    titleside='right'
                ),
                line_width=1,
                line_color='black'
             )
        )


        # Create figure
        fig = go.Figure(data=[edge_trace, node_trace],
                     layout=go.Layout(
                        title=f'Topic Co-occurrence Network (Min Co-occurrence: {min_co_occurrence})',
                        titlefont_size=16,
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20,l=5,r=5,t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )

        # Save to file
        if output_file:
            fig.write_html(output_file)
            logger.info(f"Theme network graph saved to {output_file}")

        if display_inline:
            fig.show()

        return fig

    except ImportError:
        logger.error("NetworkX library not found. Please install it using 'pip install networkx'")
        return None
    except Exception as e:
        logger.error(f"Error creating theme network graph: {e}")
        return None

In [15]:
# Function from visualizations.py (modified for inline display where possible)
def create_bertopic_visualizations(topic_model, output_dir, display_inline=True):
    """Create standard BERTopic visualizations."""
    if topic_model is None:
        logger.error("BERTopic model is None, cannot create visualizations.")
        return False

    vis_successful = False

    # Create topic similarity map
    try:
        logger.info("Creating BERTopic: Topic Similarity Map...")
        fig_similarity = topic_model.visualize_topics()
        if fig_similarity:
            output_path = os.path.join(output_dir, "bertopic_similarity_map.html")
            fig_similarity.write_html(output_path)
            logger.info(f"Topic Similarity Map saved to {output_path}")
            if display_inline:
                fig_similarity.show()
            vis_successful = True
        else:
            logger.warning("Could not generate topic similarity map.")
    except Exception as e:
        logger.error(f"Error creating topic similarity visualization: {str(e)}")

    # Create barcharts for top topics
    try:
        logger.info("Creating BERTopic: Topic Barchart...")
        # Use custom_labels=True if your model has them set, otherwise False
        # Check if custom labels exist before trying to use them
        has_custom_labels = topic_model.custom_labels_ is not None
        fig_barchart = topic_model.visualize_barchart(top_n_topics=12, custom_labels=has_custom_labels, height=400)
        if fig_barchart:
            output_path = os.path.join(output_dir, "bertopic_barchart.html")
            fig_barchart.write_html(output_path)
            logger.info(f"Topic Barchart saved to {output_path}")
            if display_inline:
                fig_barchart.show()
            vis_successful = True
        else:
            logger.warning("Could not generate topic barchart.")
    except Exception as e:
        logger.error(f"Error creating barchart visualization: {str(e)}")

    # Create topic word scores (Term Rank) - Usually generates multiple plots, harder to show inline
    logger.info("Creating BERTopic: Term Rank Charts (saving to files)...")
    try:
        topics_to_visualize = sorted(set(topic_model.topics_))
        term_rank_saved = False
        for topic_id in topics_to_visualize:
            if topic_id != -1:  # Skip outlier topic
                try:
                    # Use custom_labels=True if available
                    has_custom_labels = topic_model.custom_labels_ is not None
                    fig_term_rank = topic_model.visualize_term_rank(topics=[topic_id], custom_labels=has_custom_labels)
                    if fig_term_rank:
                        output_path = os.path.join(output_dir, f"bertopic_term_rank_topic_{topic_id}.html")
                        fig_term_rank.write_html(output_path)
                        term_rank_saved = True
                        # fig_term_rank.show() # Showing potentially many plots inline might be too much
                except Exception as e_inner:
                     logger.warning(f"Could not generate term rank for topic {topic_id}: {e_inner}")
        if term_rank_saved:
             logger.info(f"Term Rank charts saved to {output_dir}")
             vis_successful = True # Mark as successful if at least one saved
        else:
             logger.warning("Could not save any term rank charts.")

    except Exception as e:
        logger.error(f"Error creating term rank visualizations: {str(e)}")

    # Hierarchical clustering visualization (often requires original documents, might fail)
    # logger.info("Attempting BERTopic: Hierarchical Clustering...")
    # try:
    #     # Reduce topics first for a cleaner hierarchy if many topics exist
    #     if len(topic_model.get_topic_info()) > 50: # Example threshold
    #          hierarchical_topics = topic_model.hierarchical_topics(docs) # Requires original 'docs'
    #     else:
    #          hierarchical_topics = topic_model.hierarchical_topics() # Might still need docs
    #
    #     fig_hierarchy = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
    #     if fig_hierarchy:
    #         output_path = os.path.join(output_dir, "bertopic_hierarchy.html")
    #         fig_hierarchy.write_html(output_path)
    #         logger.info(f"Topic Hierarchy saved to {output_path}")
    #         if display_inline:
    #             fig_hierarchy.show()
    #         vis_successful = True
    #     else:
    #          logger.warning("Could not generate topic hierarchy chart.")
    # except Exception as e:
    #     logger.warning(f"Could not create hierarchical visualization (may require original docs): {str(e)}")

    return vis_successful

In [16]:
# Function from sentiment_visualizations.py
# Mapping between Dutch and English sentiment labels for consistency
SENTIMENT_MAPPING = {
    # Dutch to English (ensure lowercase for matching)
    'positief': 'positive',
    'neutraal': 'neutral',
    'negatief': 'negative',
    'gemengd': 'mixed',
    # English to Dutch (or just ensure consistent output, e.g., always English)
    'positive': 'positive', # Keep consistent
    'neutral': 'neutral',   # Keep consistent
    'negative': 'negative', # Keep consistent
    'mixed': 'mixed',       # Keep consistent
    # Add potential variations if needed
    'Positive': 'positive',
    'Neutral': 'neutral',
    'Negative': 'negative',
    'Mixed': 'mixed',
}

def normalize_sentiment_labels(df):
    """
    Ensure consistent sentiment labels (lowercase English) by mapping Dutch/English variations.
    If a label isn't in our mapping, leave it as is but convert to lowercase.

    Args:
        df: DataFrame with sentiment columns

    Returns:
        DataFrame with normalized sentiment labels
    """
    df_copy = df.copy()
    sentiment_cols = [col for col in ['llm_sentiment', 'bert_sentiment', 'sentiment'] if col in df_copy.columns]

    for col in sentiment_cols:
        # Apply mapping, converting to lowercase first for better matching
        # If not found in map, keep original value but ensure lowercase
        df_copy[col] = df_copy[col].astype(str).str.lower().map(
            lambda x: SENTIMENT_MAPPING.get(x, x)
        )
        logger.info(f"Normalized '{col}'. Unique values now: {df_copy[col].unique()}")

    return df_copy

In [17]:
# Function from sentiment_visualizations.py (modified for inline display)

# Dutch UI text (ensure this is defined before the function if not already)
DUTCH_LABELS = {
    'dashboard_title': 'LLM vs. BERT Sentimentanalyse Vergelijking',
    'sentiment_distribution': 'LLM vs. BERT Sentimentverdeling',
    'sentiment_agreement': 'Sentimentovereenstemming per Onderwerp',
    'score_correlation': 'Sentimentscore Correlatie',
    'confidence_comparison': 'Vergelijking van Betrouwbaarheidsscores',
    'sentiment': 'Sentiment',
    'count': 'Aantal',
    'agreement': 'Overeenkomst (%)',
    'topic': 'Onderwerp',
    'llm_score': 'LLM Score',
    'bert_score': 'BERT Score',
    'text': 'Tekst',
    'perfect_correlation': 'Perfecte correlatie',
    'correlation': 'Correlatie',
    'model_sentiment': 'Model & Sentiment',
    'confidence': 'Absolute Betrouwbaarheidsscore', # Adjusted label
    'confusion_title': 'LLM vs. BERT Sentiment Confusion Matrix',
    'bert_prediction': 'BERT Sentiment Voorspelling',
    'llm_assignment': 'LLM Sentiment Toekenning',
    'percentage': 'Percentage (%)',
    'topic_sentiment_title': 'LLM vs. BERT Sentimentverdeling per Onderwerp'
}

# Fallback English labels
ENGLISH_LABELS = {
    'dashboard_title': 'LLM vs. BERT Sentiment Analysis Comparison',
    'sentiment_distribution': 'LLM vs. BERT Sentiment Distribution',
    'sentiment_agreement': 'Sentiment Agreement by Topic',
    'score_correlation': 'Sentiment Score Correlation',
    'confidence_comparison': 'Sentiment Confidence Comparison',
    'sentiment': 'Sentiment',
    'count': 'Count',
    'agreement': 'Agreement (%)',
    'topic': 'Topic',
    'llm_score': 'LLM Score',
    'bert_score': 'BERT Score',
    'text': 'Text',
    'perfect_correlation': 'Perfect correlation',
    'correlation': 'Correlation',
    'model_sentiment': 'Model & Sentiment',
    'confidence': 'Absolute Confidence Score',
    'confusion_title': 'LLM vs. BERT Sentiment Confusion Matrix',
    'bert_prediction': 'BERT Sentiment Prediction',
    'llm_assignment': 'LLM Sentiment Assignment',
    'percentage': 'Percentage (%)',
    'topic_sentiment_title': 'LLM vs. BERT Sentiment Distribution by Topic'
}


def create_sentiment_comparison_dashboard(df, output_file="sentiment_comparison.html", use_dutch=False, display_inline=True):
    """
    Create a dashboard comparing LLM and BERT sentiment analysis.

    Args:
        df: DataFrame with both LLM and BERT sentiment columns (e.g., 'llm_sentiment', 'bert_sentiment')
            and their corresponding score columns (e.g., 'llm_sentiment_score', 'bert_sentiment_score').
            Requires 'topic_name' and 'combined_text' as well.
        output_file: Path to save the output HTML file
        use_dutch: Whether to use Dutch labels in the visualizations
        display_inline: Whether to show the plot in the notebook output
    """
    required_cols = ['llm_sentiment', 'bert_sentiment', 'llm_sentiment_score', 'bert_sentiment_score', 'topic_name', 'combined_text']
    if not all(col in df.columns for col in required_cols):
        logger.error(f"Dashboard requires columns: {required_cols}. Found: {df.columns.tolist()}")
        print(f"Error: Dashboard requires columns: {required_cols}. Found: {df.columns.tolist()}") # Also print error
        return None

    # Normalize sentiment labels first
    df_norm = normalize_sentiment_labels(df) # Use the function defined previously

    # Select language labels
    labels = DUTCH_LABELS if use_dutch else ENGLISH_LABELS

    try:
        # Create subplots with 2 rows and 2 columns
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=[
                labels['sentiment_distribution'],
                labels['sentiment_agreement'],
                labels['score_correlation'],
                labels['confidence_comparison']
            ],
            specs=[
                [{"type": "bar"}, {"type": "bar"}],
                [{"type": "scatter"}, {"type": "box"}]
            ],
            vertical_spacing=0.15,
            horizontal_spacing=0.1
        )

        # 1. Compare sentiment distributions (LLM vs BERT)
        llm_counts = df_norm['llm_sentiment'].value_counts().sort_index()
        bert_counts = df_norm['bert_sentiment'].value_counts().sort_index()

        # Ensure all categories are represented
        all_sentiments = sorted(set(list(llm_counts.index) + list(bert_counts.index)))
        llm_counts = llm_counts.reindex(all_sentiments, fill_value=0)
        bert_counts = bert_counts.reindex(all_sentiments, fill_value=0)

        fig.add_trace(
            go.Bar(
                x=all_sentiments,
                y=llm_counts.values,
                name="LLM", # Simpler name
                marker_color='#636EFA'
            ),
            row=1, col=1
        )

        fig.add_trace(
            go.Bar(
                x=all_sentiments,
                y=bert_counts.values,
                name="BERT", # Simpler name
                marker_color='#EF553B'
            ),
            row=1, col=1
        )
        fig.update_xaxes(title_text=labels['sentiment'], row=1, col=1)
        fig.update_yaxes(title_text=labels['count'], row=1, col=1)


        # 2. Agreement by topic
        # Calculate agreement percentage by topic
        topic_agreement = df_norm.groupby('topic_name').apply(
            lambda x: (x['llm_sentiment'] == x['bert_sentiment']).mean() * 100 if len(x) > 0 else 0
        ).reset_index()
        topic_agreement.columns = ['topic_name', 'agreement_percent']

        # Add topic counts for context
        topic_counts = df_norm['topic_name'].value_counts().reset_index()
        topic_counts.columns = ['topic_name', 'topic_count']
        topic_agreement = pd.merge(topic_agreement, topic_counts, on='topic_name', how='left')


        # Sort by agreement, show top/bottom N topics might be better
        topic_agreement = topic_agreement.sort_values('agreement_percent', ascending=True)
        # Keep limited number for readability
        n_topics_to_show = min(len(topic_agreement), 15)
        topic_agreement_display = topic_agreement.tail(n_topics_to_show)


        fig.add_trace(
            go.Bar(
                x=topic_agreement_display['agreement_percent'],
                y=topic_agreement_display['topic_name'],
                text=topic_agreement_display['topic_count'].astype(str) + ' docs', # Show count on bar
                textposition='outside',
                orientation='h',
                marker_color='#00CC96'
            ),
            row=1, col=2
        )
        fig.update_xaxes(title_text=labels['agreement'], range=[0,105], row=1, col=2)
        fig.update_yaxes(title_text=labels['topic'], categoryorder='total ascending', row=1, col=2)


        # 3. Sentiment score correlation
        # Ensure scores are numeric
        df_norm['llm_sentiment_score'] = pd.to_numeric(df_norm['llm_sentiment_score'], errors='coerce')
        df_norm['bert_sentiment_score'] = pd.to_numeric(df_norm['bert_sentiment_score'], errors='coerce')
        df_corr = df_norm.dropna(subset=['llm_sentiment_score', 'bert_sentiment_score']) # Drop rows where scores aren't numeric

        fig.add_trace(
            go.Scatter(
                x=df_corr['llm_sentiment_score'],
                y=df_corr['bert_sentiment_score'],
                mode='markers',
                opacity=0.6,
                marker=dict(
                    size=7,
                    color=df_corr['bert_sentiment'].map({'positive': 'green', 'negative': 'red', 'neutral': 'grey'}), # Color by BERT sentiment
                    # colorscale='Viridis', # Alternative coloring
                    showscale=False
                ),
                text=df_corr['combined_text'].str[:100] + '...', # Longer hover text
                hovertemplate=(f"<b>LLM:</b> %{{x:.2f}} (<i>{df_corr['llm_sentiment']}</i>)<br>"
                               f"<b>BERT:</b> %{{y:.2f}} (<i>{df_corr['bert_sentiment']}</i>)<br>"
                               f"<b>Text:</b> %{{text}}<extra></extra>")
            ),
            row=2, col=1
        )

        # Add correlation line and coefficient if possible
        if not df_corr.empty:
             corr = df_corr['llm_sentiment_score'].corr(df_corr['bert_sentiment_score'])
             min_val = min(df_corr['llm_sentiment_score'].min(), df_corr['bert_sentiment_score'].min())
             max_val = max(df_corr['llm_sentiment_score'].max(), df_corr['bert_sentiment_score'].max())

             fig.add_trace(
                 go.Scatter(
                     x=[min_val, max_val], y=[min_val, max_val],
                     mode='lines', line=dict(color='rgba(0,0,0,0.5)', dash='dash'),
                     name=labels['perfect_correlation'], showlegend=False
                 ), row=2, col=1
             )
             fig.add_annotation(
                 xref="x domain", yref="y domain",
                 x=0.05, y=0.95, # Position in domain coords
                 text=f"{labels['correlation']}: {corr:.2f}", showarrow=False,
                 font=dict(size=12), bgcolor="rgba(255,255,255,0.7)",
                 row=2, col=1
            )
        fig.update_xaxes(title_text=labels['llm_score'], row=2, col=1)
        fig.update_yaxes(title_text=labels['bert_score'], row=2, col=1, scaleanchor="x2", scaleratio=1) # Ensure square aspect ratio if desired


        # 4. Confidence comparison (boxplot) using absolute scores
        scores_data = pd.DataFrame({
            'Source': ['LLM'] * len(df_norm) + ['BERT'] * len(df_norm),
            'Sentiment': list(df_norm['llm_sentiment']) + list(df_norm['bert_sentiment']),
            'Confidence': list(np.abs(pd.to_numeric(df_norm['llm_sentiment_score'], errors='coerce'))) + \
                          list(np.abs(pd.to_numeric(df_norm['bert_sentiment_score'], errors='coerce'))),
        }).dropna(subset=['Confidence']) # Drop rows where conversion failed

        fig.add_trace(
            go.Box(
                x=scores_data['Sentiment'],
                y=scores_data['Confidence'],
                color=scores_data['Source'], # Group boxes by source (LLM/BERT)
                boxmean=True, # Show mean
                name="Confidence Scores" # Legend entry
            ),
            row=2, col=2
        )
        fig.update_xaxes(title_text=labels['sentiment'], categoryorder='array', categoryarray=['negative', 'neutral', 'positive'], row=2, col=2)
        fig.update_yaxes(title_text=labels['confidence'], row=2, col=2)


        # Update layout
        fig.update_layout(
            title_text=labels['dashboard_title'],
            height=700, # Adjust height if needed
            # width=1000,
            showlegend=True,
            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
            boxmode='group' # Group boxes for confidence plot
        )

        # Save to file
        if output_file:
            fig.write_html(output_file)
            logger.info(f"Sentiment comparison dashboard saved to {output_file}")

        if display_inline:
            fig.show()

        return fig

    except Exception as e:
        logger.error(f"Error creating sentiment comparison dashboard: {e}")
        import traceback
        traceback.print_exc() # Print detailed traceback
        return None


In [18]:
# Function from sentiment_visualizations.py (modified for inline display)

def create_confusion_matrix(df, output_file="confusion_matrix.html", use_dutch=False, display_inline=True):
    """
    Create a confusion matrix heatmap comparing LLM and BERT sentiment predictions.

    Args:
        df: DataFrame with normalized 'llm_sentiment' and 'bert_sentiment' columns.
        output_file: Path to save the output HTML file.
        use_dutch: Whether to use Dutch labels.
        display_inline: Whether to show the plot in the notebook output.
    """
    required_cols = ['llm_sentiment', 'bert_sentiment']
    if not all(col in df.columns for col in required_cols):
        logger.error(f"Confusion matrix requires columns: {required_cols}. Found: {df.columns.tolist()}")
        print(f"Error: Confusion matrix requires columns: {required_cols}. Found: {df.columns.tolist()}")
        return None

    # Ensure labels are normalized
    df_norm = normalize_sentiment_labels(df)

    # Select language labels
    labels = DUTCH_LABELS if use_dutch else ENGLISH_LABELS

    try:
        from sklearn.metrics import confusion_matrix as sk_confusion_matrix

        # Define the order of labels
        sentiment_order = ['positive', 'neutral', 'negative', 'mixed'] # Add 'mixed' if present
        present_sentiments = sorted([s for s in df_norm['llm_sentiment'].unique() if s in sentiment_order] + \
                                    [s for s in df_norm['bert_sentiment'].unique() if s in sentiment_order])
        # Keep only unique present sentiments in the desired order
        unique_present_sentiments = sorted(list(set(present_sentiments)), key=lambda x: sentiment_order.index(x) if x in sentiment_order else 99)


        # Calculate confusion matrix
        cm = sk_confusion_matrix(
            df_norm['llm_sentiment'],
            df_norm['bert_sentiment'],
            labels=unique_present_sentiments # Use the defined order
        )

        # Calculate percentages for annotation text
        cm_sum = np.sum(cm, axis=1, keepdims=True)
        cm_perc = cm / np.maximum(cm_sum, 1) * 100 # Avoid division by zero

        # Create annotations (count and percentage)
        annotations = []
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                annotations.append(
                    dict(
                        x=unique_present_sentiments[j],
                        y=unique_present_sentiments[i],
                        text=f"{cm[i, j]}<br>({cm_perc[i, j]:.1f}%)", # Count and Percentage
                        showarrow=False,
                        font=dict(color="white" if cm_perc[i,j] > 50 else "black") # Adjust text color based on cell darkness
                    )
                )

        # Create heatmap figure
        fig = px.imshow(
            cm_perc, # Color based on percentage
            labels=dict(x=labels['bert_prediction'], y=labels['llm_assignment'], color=labels['percentage']),
            x=unique_present_sentiments,
            y=unique_present_sentiments,
            text_auto=False, # Disable default text auto-display
            color_continuous_scale=px.colors.sequential.Blues, # Choose a colorscale
            aspect="auto"
        )

        # Add custom annotations
        fig.update_layout(
            title=labels['confusion_title'],
            xaxis_title=labels['bert_prediction'],
            yaxis_title=labels['llm_assignment'],
            annotations=annotations
         )
        # Ensure axis labels match the order
        fig.update_xaxes(side="bottom", type='category')
        fig.update_yaxes(type='category')


        # Save to file
        if output_file:
            fig.write_html(output_file)
            logger.info(f"Confusion matrix saved to {output_file}")

        if display_inline:
            fig.show()

        return fig

    except ImportError:
         logger.error("Scikit-learn library not found. Please install it using 'pip install scikit-learn'")
         print("Error: Scikit-learn library not found. Please install it using 'pip install scikit-learn'")
         return None
    except Exception as e:
        logger.error(f"Error creating confusion matrix: {e}")
        import traceback
        traceback.print_exc()
        return None


In [19]:
# Function from sentiment_visualizations.py (modified for inline display)

def create_topic_sentiment_comparison(df, output_file="topic_sentiment_comparison.html", use_dutch=False, display_inline=True):
    """
    Create grouped bar charts comparing LLM vs BERT sentiment for each topic.

    Args:
        df: DataFrame with normalized 'llm_sentiment', 'bert_sentiment', and 'topic_name' columns.
        output_file: Path to save the output HTML file.
        use_dutch: Whether to use Dutch labels.
        display_inline: Whether to show the plot in the notebook output.
    """
    required_cols = ['llm_sentiment', 'bert_sentiment', 'topic_name']
    if not all(col in df.columns for col in required_cols):
        logger.error(f"Topic sentiment comparison requires columns: {required_cols}. Found: {df.columns.tolist()}")
        print(f"Error: Topic sentiment comparison requires columns: {required_cols}. Found: {df.columns.tolist()}")
        return None

    # Ensure labels are normalized
    df_norm = normalize_sentiment_labels(df)

    # Select language labels
    labels = DUTCH_LABELS if use_dutch else ENGLISH_LABELS

    try:
        # Calculate counts for LLM and BERT sentiment per topic
        counts = df_norm.groupby('topic_name').agg(
            llm_positive=('llm_sentiment', lambda x: (x == 'positive').sum()),
            llm_neutral=('llm_sentiment', lambda x: (x == 'neutral').sum()),
            llm_negative=('llm_sentiment', lambda x: (x == 'negative').sum()),
            bert_positive=('bert_sentiment', lambda x: (x == 'positive').sum()),
            bert_neutral=('bert_sentiment', lambda x: (x == 'neutral').sum()),
            bert_negative=('bert_sentiment', lambda x: (x == 'negative').sum()),
            total_count=('topic_name', 'size') # Get total count per topic
        ).reset_index()

        # Sort topics by total count for better visualization
        counts = counts.sort_values('total_count', ascending=False)

        # Keep top N topics for readability if too many
        n_topics_display = min(20, len(counts))
        counts_display = counts.head(n_topics_display)

        # Create figure
        fig = go.Figure()

        # Add bars for LLM
        fig.add_trace(go.Bar(name='LLM Positive', x=counts_display['topic_name'], y=counts_display['llm_positive'], marker_color='lightgreen'))
        fig.add_trace(go.Bar(name='LLM Neutral', x=counts_display['topic_name'], y=counts_display['llm_neutral'], marker_color='lightgrey'))
        fig.add_trace(go.Bar(name='LLM Negative', x=counts_display['topic_name'], y=counts_display['llm_negative'], marker_color='lightcoral'))

        # Add bars for BERT (offset slightly or use different pattern - difficult with standard bar)
        # Alternative: Stacked or separate charts might be clearer. Let's use stacked for now.
        # Re-creating fig for stacked layout:
        fig = go.Figure()
        # LLM Bars (Stacked)
        fig.add_trace(go.Bar(name='LLM Negative', x=counts_display['topic_name'], y=counts_display['llm_negative'], marker_color='red', legendgroup='LLM', legendgrouptitle_text='LLM'))
        fig.add_trace(go.Bar(name='LLM Neutral', x=counts_display['topic_name'], y=counts_display['llm_neutral'], marker_color='grey', legendgroup='LLM'))
        fig.add_trace(go.Bar(name='LLM Positive', x=counts_display['topic_name'], y=counts_display['llm_positive'], marker_color='green', legendgroup='LLM'))

        # BERT Bars (Stacked - will stack on top of LLM, need adjustment or separate plot)
        # Plotting side-by-side grouped bars is better here. Re-adjust fig creation.

        fig = go.Figure()
        bar_width = 0.3

        # LLM Bars
        fig.add_trace(go.Bar(
            name='LLM Positive', x=counts_display['topic_name'], y=counts_display['llm_positive'],
            marker_color='lightgreen', legendgroup='LLM', legendgrouptitle_text='LLM',
            offsetgroup=0, base=0 # Use offsetgroup for side-by-side
        ))
        fig.add_trace(go.Bar(
            name='LLM Neutral', x=counts_display['topic_name'], y=counts_display['llm_neutral'],
             marker_color='lightgrey', legendgroup='LLM', base=counts_display['llm_positive'],
             offsetgroup=0
        ))
        fig.add_trace(go.Bar(
            name='LLM Negative', x=counts_display['topic_name'], y=counts_display['llm_negative'],
             marker_color='lightcoral', legendgroup='LLM', base=counts_display['llm_positive'] + counts_display['llm_neutral'],
             offsetgroup=0
        ))


        # BERT Bars (Shifted using offsetgroup)
        fig.add_trace(go.Bar(
            name='BERT Positive', x=counts_display['topic_name'], y=counts_display['bert_positive'],
            marker_color='darkgreen', legendgroup='BERT', legendgrouptitle_text='BERT',
            offsetgroup=1, base=0
        ))
        fig.add_trace(go.Bar(
            name='BERT Neutral', x=counts_display['topic_name'], y=counts_display['bert_neutral'],
            marker_color='darkgrey', legendgroup='BERT', base=counts_display['bert_positive'],
            offsetgroup=1
        ))
        fig.add_trace(go.Bar(
             name='BERT Negative', x=counts_display['topic_name'], y=counts_display['bert_negative'],
            marker_color='darkred', legendgroup='BERT', base=counts_display['bert_positive'] + counts_display['bert_neutral'],
            offsetgroup=1
        ))

        # Update layout for grouped stacked bars
        fig.update_layout(
            barmode='relative', # Bars are relative within their offset group
            title=labels['topic_sentiment_title'],
            xaxis_title=labels['topic'],
            yaxis_title=labels['count'],
            legend_title="Model & Sentiment",
            xaxis_tickangle=-45 # Angle ticks if too many topics
        )

        # Save to file
        if output_file:
            fig.write_html(output_file)
            logger.info(f"Topic sentiment comparison saved to {output_file}")

        if display_inline:
            fig.show()

        return fig

    except Exception as e:
        logger.error(f"Error creating topic sentiment comparison: {e}")
        import traceback
        traceback.print_exc()
        return None


In [20]:
logger.info("Generating visualizations...")

# --- Original Visualizations ---

# Create topic distribution chart (using schema_main_category or topic_name)
logger.info("Creating Topic Distribution Chart...")
# Decide which column represents the final topics ('topic_name' or 'schema_main_category')
topic_col_for_dist = 'schema_main_category' if 'schema_main_category' in results_df.columns else 'topic_name'
create_topic_distribution_chart(
    results_df,
    output_file=VIS_DIR / "new_data_topic_distribution.html",
    display_inline=True
)

# Create sentiment pie charts (BERT)
logger.info("Creating Sentiment Pie Chart (BERT)...")
create_sentiment_pie_chart(
    results_df,
    sentiment_column='bert_sentiment', # Make sure this column name matches output
    output_file=VIS_DIR / "new_data_bert_sentiment.html",
    display_inline=True
)

# Get improved topic terms (if needed for custom barchart)
logger.info("Cleaning topic terms for custom barchart...")
improved_topics = clean_topic_terms(topic_model)

# Create meaningful topic visualization (Custom Barchart)
logger.info("Creating Custom Topic Barchart...")
create_custom_barchart(
    topic_model,
    improved_topics,
    VIS_DIR,
    display_inline=True
)

# --- Additional Visualizations ---

# Create Topic Sentiment Heatmap (BERT)
logger.info("Creating Topic Sentiment Heatmap (BERT)...")
create_topic_sentiment_heatmap(
    results_df,
    sentiment_column='bert_sentiment',
    output_file=VIS_DIR / "topic_sentiment_heatmap_bert.html",
    display_inline=True
)

# Create Topic Co-occurrence Network Graph
logger.info("Creating Topic Co-occurrence Network Graph...")
# Check if enough data points exist for a meaningful graph
if len(results_df) > 20: # Example threshold
     create_theme_network_graph(
         results_df,
         min_co_occurrence=2, # Adjust threshold as needed
         output_file=VIS_DIR / "theme_network_graph.html",
         display_inline=True
     )
else:
     logger.warning("Skipping theme network graph due to insufficient data.")


# Create standard BERTopic visualizations (will save multiple files)
logger.info("Creating Standard BERTopic Visualizations...")
create_bertopic_visualizations(
    topic_model,
    VIS_DIR,
    display_inline=True # Will show similarity map and barchart inline if possible
)


# --- LLM vs BERT Comparison Visualizations (Optional) ---
# Check if LLM results are present before attempting comparison plots
llm_cols_present = all(col in results_df.columns for col in ['llm_sentiment', 'llm_sentiment_score'])

if llm_cols_present:
    logger.info("LLM sentiment columns found. Creating comparison visualizations...")

    # Create LLM vs BERT Sentiment Dashboard
    logger.info("Creating LLM vs BERT Sentiment Dashboard...")
    create_sentiment_comparison_dashboard(
        results_df,
        output_file=VIS_DIR / "llm_vs_bert_dashboard.html",
        use_dutch=False, # Set to True for Dutch labels
        display_inline=True
    )

    # Create LLM vs BERT Confusion Matrix
    logger.info("Creating LLM vs BERT Confusion Matrix...")
    create_confusion_matrix(
        results_df,
        output_file=VIS_DIR / "llm_vs_bert_confusion_matrix.html",
        use_dutch=False, # Set to True for Dutch labels
        display_inline=True
    )

    # Create LLM vs BERT Topic Sentiment Comparison Bars
    logger.info("Creating LLM vs BERT Topic Sentiment Comparison...")
    create_topic_sentiment_comparison(
        results_df,
        output_file=VIS_DIR / "llm_vs_bert_topic_sentiment.html",
        use_dutch=False, # Set to True for Dutch labels
        display_inline=True
    )

else:
    logger.warning("LLM sentiment columns ('llm_sentiment', 'llm_sentiment_score') not found in results. Skipping LLM vs BERT comparison plots.")


# --- Final Summary ---
logger.info(f"All requested visualizations generated (or skipped) and saved to {VIS_DIR} and displayed inline where applicable.")
logger.info(f"Processing complete. Final results CSV at: {output_csv_path}")


2025-03-29 00:43:37,194 - New Data Processor Notebook - INFO - Generating visualizations...
2025-03-29 00:43:37,197 - New Data Processor Notebook - INFO - Creating Topic Distribution Chart...
2025-03-29 00:43:37,283 - New Data Processor Notebook - INFO - Topic distribution chart saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/visualizations/new_data_topic_distribution.html


2025-03-29 00:43:37,286 - New Data Processor Notebook - INFO - Creating Sentiment Pie Chart (BERT)...
2025-03-29 00:43:37,314 - New Data Processor Notebook - INFO - Sentiment pie chart saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/visualizations/new_data_bert_sentiment.html


2025-03-29 00:43:37,318 - New Data Processor Notebook - INFO - Cleaning topic terms for custom barchart...
2025-03-29 00:43:37,324 - New Data Processor Notebook - INFO - Creating Custom Topic Barchart...
2025-03-29 00:43:37,366 - New Data Processor Notebook - INFO - Meaningful topic barchart saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/visualizations/meaningful_topic_barchart.html


2025-03-29 00:43:37,370 - New Data Processor Notebook - INFO - Creating Topic Sentiment Heatmap (BERT)...
2025-03-29 00:43:37,428 - New Data Processor Notebook - INFO - Topic sentiment heatmap saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/visualizations/topic_sentiment_heatmap_bert.html


2025-03-29 00:43:37,432 - New Data Processor Notebook - INFO - Creating Topic Co-occurrence Network Graph...
2025-03-29 00:43:37,433 - New Data Processor Notebook - INFO - Creating Standard BERTopic Visualizations...
2025-03-29 00:43:37,433 - New Data Processor Notebook - INFO - Creating BERTopic: Topic Similarity Map...
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-03-29 00:43:41,981 - New Data Processor Notebook - INFO - Topic Similarity Map saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/visualizations/bertopic_similarity_map.html


2025-03-29 00:43:41,983 - New Data Processor Notebook - INFO - Creating BERTopic: Topic Barchart...
2025-03-29 00:43:42,025 - New Data Processor Notebook - INFO - Topic Barchart saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/visualizations/bertopic_barchart.html


2025-03-29 00:43:42,028 - New Data Processor Notebook - INFO - Creating BERTopic: Term Rank Charts (saving to files)...
2025-03-29 00:43:42,425 - New Data Processor Notebook - INFO - Term Rank charts saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/visualizations
2025-03-29 00:43:42,426 - New Data Processor Notebook - INFO - LLM sentiment columns found. Creating comparison visualizations...
2025-03-29 00:43:42,426 - New Data Processor Notebook - INFO - Creating LLM vs BERT Sentiment Dashboard...
2025-03-29 00:43:42,428 - New Data Processor Notebook - INFO - Normalized 'llm_sentiment'. Unique values now: ['neutral']
2025-03-29 00:43:42,429 - New Data Processor Notebook - INFO - Normalized 'bert_sentiment'. Unique values now: ['positive' 'negative']



invalid value encountered in divide


invalid value encountered in divide

2025-03-29 00:43:42,459 - New Data Processor Notebook - ERROR - Error creating sentiment comparison dashboard: Invalid pro

2025-03-29 00:43:42,501 - New Data Processor Notebook - INFO - Creating LLM vs BERT Topic Sentiment Comparison...
2025-03-29 00:43:42,502 - New Data Processor Notebook - INFO - Normalized 'llm_sentiment'. Unique values now: ['neutral']
2025-03-29 00:43:42,503 - New Data Processor Notebook - INFO - Normalized 'bert_sentiment'. Unique values now: ['positive' 'negative']
2025-03-29 00:43:42,536 - New Data Processor Notebook - INFO - Topic sentiment comparison saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/visualizations/llm_vs_bert_topic_sentiment.html


2025-03-29 00:43:42,539 - New Data Processor Notebook - INFO - All requested visualizations generated (or skipped) and saved to /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/visualizations and displayed inline where applicable.
2025-03-29 00:43:42,540 - New Data Processor Notebook - INFO - Processing complete. Final results CSV at: /Users/DINGZEEFS/Documents/Fine_tuning_odido/output_dir/new_data_notebook_results/new_data_analysis_notebook.csv
