In [7]:
company_name = "MCPV"
industry_of_interest = "Solar panels"
region = "Netherlands"
language = "English"
general_folder = "KnowledgeBase/MediaCoverageAnalytics/MCPV"
articles_sorted = load_data_from_json(f"{general_folder}/Outputs/CompiledOutputs/ArticlesList.json")




Data successfully loaded from 'KnowledgeBase/MediaCoverageAnalytics/MCPV/Outputs/CompiledOutputs/ArticlesList.json'.


In [9]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import re
import json
import collections
import time
import pandas as pd
import marvin
import fitz
import argparse
import gradio as gr
from pathlib import Path
from datetime import datetime, timedelta
from collections import defaultdict
from tabulate import tabulate
from scipy.spatial.distance import cosine
from scipy.signal import find_peaks
from difflib import SequenceMatcher
from typing import List, Dict, Union, Optional, Tuple
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from docx import Document as DocxDocument
from difflib import SequenceMatcher
from Classes.SimplifiedChatbots import ChatGPT, BigSummarizerGPT
import logging
import traceback
import ast
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import base64
import io
import mammoth
import csv
from pathlib import Path
import logging
import pypandoc
import tempfile

logging.basicConfig(level=logging.DEBUG, filename='app.log', filemode='w',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Helper functions

def check_input_paths(pdf_folder_path=None, docx_file_path=None):
        """
        Check if the provided paths exist and are accessible.
        Returns True if at least one valid path is provided and accessible.
        """
        valid_path_found = False

        if pdf_folder_path:
            if os.path.exists(pdf_folder_path):
                print(f"✓ PDF folder path exists: {pdf_folder_path}")
                # Check if it contains any PDF files
                pdf_files = [f for f in os.listdir(pdf_folder_path) if f.lower().endswith('.pdf')]
                if pdf_files:
                    print(f"✓ Found {len(pdf_files)} PDF files:")
                    for pdf in pdf_files:
                        print(f"  - {pdf}")
                    valid_path_found = True
                else:
                    print("✗ No PDF files found in the folder")
            else:
                print(f"✗ PDF folder path does not exist: {pdf_folder_path}")

        if docx_file_path:
            if os.path.exists(docx_file_path):
                print(f"✓ DOCX file exists: {docx_file_path}")
                valid_path_found = True
            else:
                print(f"✗ DOCX file does not exist: {docx_file_path}")

        return valid_path_found

def parse_relative_date(date_str):
    """
    Parse relative date strings like "2 years ago" and convert them to absolute dates.
    Returns date in "Month Day, Year" format.
    """
    now = datetime.now()
    
    # Handle relative dates
    if 'ago' in date_str.lower():
        match = re.search(r'(\d+)\s*(year|month|week|day)s?\s*ago', date_str.lower())
        if match:
            amount = int(match.group(1))
            unit = match.group(2)
            
            if unit == 'year':
                delta = timedelta(days=amount * 365)
            elif unit == 'month':
                delta = timedelta(days=amount * 30)
            elif unit == 'week':
                delta = timedelta(weeks=amount)
            else:  # days
                delta = timedelta(days=amount)
                
            result_date = now - delta
            return result_date.strftime('%B %d, %Y')
    
    # Handle existing absolute date formats
    try:
        # Try parsing standard format first
        parsed_date = datetime.strptime(date_str, '%B %d, %Y')
        return parsed_date.strftime('%B %d, %Y')
    except ValueError:
        pass
    
    try:
        # Try parsing other common formats
        for fmt in ('%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y', '%d-%m-%Y', '%Y/%m/%d'):
            try:
                parsed_date = datetime.strptime(date_str, fmt)
                return parsed_date.strftime('%B %d, %Y')
            except ValueError:
                continue
    except Exception:
        pass
    
    # If no valid date format is found, return a default date
    return 'January 1, 2024'

def clean_date_string(date_str):
    """
    Clean and standardize date strings before parsing.
    """
    # Remove extra whitespace and normalize format
    date_str = ' '.join(date_str.split())
    
    # Handle common variations
    date_str = date_str.replace('- (extract the date in the format Month Day, Year - IN ENGLISH)', '')
    date_str = date_str.replace('[Date not provided in the document]', 'January 1, 2024')
    
    # Remove any parenthetical notes
    date_str = re.sub(r'\([^)]*\)', '', date_str)
    
    return date_str.strip()

def process_article_date(article):
    """
    Process the date field in an article dictionary.
    """
    if 'date' not in article:
        article['date'] = 'January 1, 2024'
        return article
    
    try:
        # Clean the date string
        cleaned_date = clean_date_string(article['date'])
        
        # Parse the date
        parsed_date = parse_relative_date(cleaned_date)
        
        # Update the article
        article['date'] = parsed_date
        
        # Convert to timestamp for sorting
        article['timestamp'] = datetime.strptime(parsed_date, '%B %d, %Y').timestamp()
        
    except Exception as e:
        print(f"Error processing date '{article.get('date', 'No date')}': {str(e)}")
        # Set default date if parsing fails
        article['date'] = 'January 1, 2024'
        article['timestamp'] = datetime.strptime('January 1, 2024', '%B %d, %Y').timestamp()
    
    return article

def ensure_directory_exists(file_path):
    """
    Ensure that the directory containing the specified file path exists.
    If not, create the directory.
    """
    try:
        os.makedirs(file_path, exist_ok=True)
    except Exception as e:
        print(f"Error creating directory {file_path}: {str(e)}")
        raise

def extract_hyperlinks(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        links = []

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            link_list = page.get_links()

            for link in link_list:
                if link['kind'] == fitz.LINK_URI:
                    links.append(link['uri'])

        doc.close()
        return links
    except Exception as e:
        print(f"Error processing PDF {os.path.basename(pdf_path)}: {str(e)}")
        return []

def process_pdfs(file_folder):
    processed_files = []
    error_files = []

    for filename in os.listdir(file_folder):
        if filename.lower().endswith('.pdf'):
            file_path = os.path.join(file_folder, filename)
            try:
                with fitz.open(file_path) as doc:
                    # Just try to access the first page to check if the PDF is valid
                    doc.load_page(0)
                processed_files.append(file_path)
            except Exception as e:
                error_files.append((filename, str(e)))

    return processed_files, error_files

def get_files(folder_path):
    processed_files = []
    error_files = []

    for filename in os.listdir(folder_path):
        if filename.lower().endswith('.pdf'):
            file_path = os.path.join(folder_path, filename)
            try:
                with fitz.open(file_path) as doc:
                    # Just try to access the first page to check if the PDF is valid
                    doc.load_page(0)
                processed_files.append(file_path)
            except Exception as e:
                error_files.append((filename, str(e)))

    if error_files:
        print("The following files could not be processed:")
        for filename, error in error_files:
            print(f"- {filename}: {error}")
    
    return processed_files

def save_data_to_json(data, filename):
    with open(filename, 'w') as file:
        json.dump(data, file, indent=4)
    print(f"Data successfully saved to '{filename}'.")

def load_data_from_json(filename):
    with open(filename, 'r') as file:
        data = json.load(file)
    print(f"Data successfully loaded from '{filename}'.")
    return data

def extract_metadata(articles: List[Dict]) -> List[Dict]:
    for article in articles:
        chatbot = ChatGPT(model_name="gpt-4o-mini", max_tokens=300, temperature=0)

        prompt = f"""
You will be given a document from which you have to extract the metadata. You must extract its title, the author, the media outlet and the date of publication. 
In case the author is not explicitly mentioned at the beginning of the document, you should return the value "Anonymous".
When extracting the media outlet, you should return the name of the media outlet without any extension like ".com", ".nl", ".fr", etc.
                           
Here is the document:{article['content']}

Format your response as follows:
Title: [Title of the document]
Author: [Author of the document]
Media Outlet: [Name of the media outlet]
Date of Publication: [Date of publication]- (extract the date in the format Month Day, Year - IN ENGLISH)

Here is an example of document provided: Document(metadata='source': "KnowledgeBase/CompanyAnalysis/Embraer/MediaCoverage/NewsEmbraer/What near-disasters, 'SNL' jabs mean for Alaska 's reputation.PDF", 'page': [1, 2, 3], page_content='Page 1 of 3\nWhat near-disasters, \'SNL\' jabs mean for Alaska \'s reputation\nWhat near-disasters, \'SNL\' jabs mean for Alaska\'s reputation - Correction \nAppended\nThe Seattle Times\nJanuary 25, 2024 Thursday\n Correction Appended\nCopyright 2024 The Seattle Times Company All Rights Reserved\nSection: Pg. A 1\nLength: 1340 words\nByline: Renata Geraldo, Seattle Times staff reporter\nBody\nWith two flights narrowly escaping disaster just months apart, Alaska Airlines was again in the national spotlight \nover the weekend, this time with a "Saturday Night Live" skit.\nFeaturing "Saltburn" star Jacob Elordi and "SNL" regulars Kenan Thompson and Heidi Gardner, the skit, which \naired last weekend, parodied an Alaska Airlines ad. "Our new slogan is \'Alaska Airlines: You didn\'t die, and you got \na cool story,\' " said Gardner, who played a fli......')
Here is the output you should provide:
Title: What near-disasters, 'SNL' jabs mean for Alaska 's reputation
Author: Renata Geraldo
Media Outlet: The Seattle Times
Date of Publication: January 25, 2024

Here is a second example of document provided: Document(metadata='source': 'KnowledgeBase/CompanyAnalysis/Embraer/MediaCoverage/NewsEmbraer/TUI fly lance une nouvelle destination depuis la Belgique.PDF', 'page': [1], page_content="Page 1 of 1\nTUI fly lance une nouvelle destination depuis la Belgique\nTUI fly lance une nouvelle destination depuis la Belgique\nLe Soir\njeudi 8 février 2024\nCopyright 2024 Rossel & Cie. S.A. tous droits réservés\nSection: NEWS\nLength: 156 words\nBody\n Par la rédaction\n L a compagnie aérienne TUI fly lance ce jeudi une nouvelle destination qui reliera l'aéroport d'Anvers à Oujda \n(Maroc) pendant l'été. Deux vols par semaine seront opérés, les mercredis et les dimanches, du 26 juin au 22 \nseptembre 2024.\n TUI fly répond ainsi à une demande importante de la communauté marocaine de la région d'Anvers de pouvoir se \nrendre directement à Oujda. Cette nouvelle ligne permet de rendre facilement visite aux familles et amis au Maroc. \nLes vols sont opérés avec l'Embraer E195-E2, avion moderne et plus durable, qui avait été mis en service à \nl'aéroport d'Anvers l'été dernier.\n Cette nouvelle destination porte donc à 15 le nombre de destinations desservies par TUI fly au départ d'Anvers, \navec, entre autres plusieurs destinations vers l'Espagne....
Here is the output you should provide:
Title: TUI fly lance une nouvelle destination depuis la Belgique
Author: Par la rédaction
Media Outlet: Le Soir
Date of Publication: February 8, 2024

Here is a third example of document provided: Document(metadata='source': 'KnowledgeBase/CompanyAnalysis/Embraer/MediaCoverage/NewsEmbraer/No Headline In Original(2).PDF', 'page': [1, 2], page_content='Page 1 of 2\nNo Headline In Original\nNo Headline In Original\nFlight International\nApril 25, 2024\nCopyright 2024 DVV Media International Ltd All Rights Reserved\nSection: IN FOCUS\nLength: 613 words\nBody\nEmbraer starts E190F flight testing\nConverted freighter makes maiden sortie, as airframer touts potential of civil cargo role for C-390 military transport\nAlfred Chua Sao Jose dos Campos\nHoward Hardee Sacramento\nEmbraer has performed the first flight of its passenger-to-freighter (P2F) conversion, with the modified E190 taking \nto the skies over Sao Jose dos Campos on 5 April.\nThe E190F – a 2010-built example first operated by Avianca El Salvador – flew for about 2h, allowing the Embraer \nteam to complete an \xadinitial evaluation of the aircraft. A second sortie followed five days later.\nThe jet will undergo further flight testing \xadbefore being delivered to US lessor Regional One, \xadEmbraer says.\nPreviously, the company had stated its intention to deliver the E190F in the second quarter of 2024.\n“We are very pleased with E190F’s and E195F’s fast progress during the testing period,” says Embraer chief \n\xadexecutive Francisco Gomes Neto.....
Here is the output you should provide:
Title: No Headline In Original
Author: Alfred Chua, Howard Hardee
Media Outlet: Flight International
Date of Publication: April 25, 2024

Here is a fourth example of document provided: Document(metadata='source': 'KnowledgeBase/MediaCoverageAnalytics/Philips/NewsPhilipsFull/Spectaculaire koerswinst Philips na schikking van apneuaffaire.PDF', 'page': [1, 2], page_content="Page 1 of 2\nSpectaculaire koerswinst Philips na schikking van apneuaffaire\nSpectaculaire koerswinst Philips na schikking van apneuaffaire\nHet Financieele Dagblad\n30 april 2024 dinsdag 12:00 AM GMT\nCopyright 2024 FD Mediagroep B.V. All Rights Reserved\nSection: PAGINA 3; Blz. 3\nLength: 473 words\nBody\nVervolg van pagina 1\nPhilips maakte de schikking gisterochtend bekend, gelijktijdig met de presentatie van zijn cijfers over het eerste \nkwartaal. In het akkoord erkent Philips geen schuld voor mogelijke schade bij patiënten, die vreesden dat ze \nkanker, astma en andere aandoeningen hadden opgelopen door het gebruik van de Philips-apparaten. Het \nconcern kondigde gisteren ook aan dat zijn verzekeraars ruim €0,5 mrd van de kosten zullen vergoeden.\nDe schikking betreft zowel de claims voor medische schade als voor medische controle voor patiënten. \nClaimadvocaten hadden meer dan 760 zaken aangebracht. Nog eens 60.000 patiënten hadden zich laten \nregistreren om in een later stadium aanspraak te kunnen maken op een vergoeding. Hoeveel geld afzonderlijke \npatiënten ontvangen, is nog niet duidelijk. Het is aan de advocaten om het totaalbedrag onder hun cliënten te \nverdelen. Philips heeft zich stevig verweerd tegen de schadeclaims. Volgens het bedrijf is uit onderzoek gebleken \ndat de uitstoot van schadelijke stoffen te gering was om ernstige gezondheidsschade te kunnen veroorzaken. \nPhilips stemde toch in met een schikking omdat de uitkomst van rechtszaken moeilijk te voorspellen is. In het \nAmerikaanse rechtssysteem kunnen jury's soms onverwacht grote schadevergoedingen toekennen. Eerder \nbereikte Philips al een akkoord over economische schade voor patiënten, evenals een schikkingsakkoord met de \nAmerikaanse medische toezichthouder, de FDA. Met de nieuwe, tweeledige schikking zijn de totale kosten voor het \nafhandelen van de apneuaffaire opgelopen tot €5,2 mrd, zo blijkt uit berekeningen van het FD.....
Here is the output you should provide:
Title: Spectaculaire koerswinst Philips na schikking van apneuaffaire
Author: Anonymous
Media Outlet: Het Financieele Dagblad
Date of Publication: 30 april, 2024

Here is a fifth example of document provided: Document(metadata='source': 'KnowledgeBase/MediaCoverageAnalytics/Swissport/NewsSwissport/Ontluisterend rapport over bagagetillen op Schiphol_ medewerkers sjouwen zich nog tien jaar een breu.PDF', 'page': [1, 2, 3], page_content="Page 1 of 3\nOntluisterend rapport over bagagetillen op Schiphol: medewerkers sjouwen zich nog tien jaar een breuk\nOntluisterend rapport over bagagetillen op Schiphol: medewerkers sjouwen \nzich nog tien jaar een breuk\nPZC.nl\n16 september 2023 zaterdag 01:00 AM GMT\nCopyright 2023 DPG Media B.V. All Rights Reserved\nLength: 2147 words\nByline: David Bremmer\nBody\nHet gaat jaren duren voordat de honderden bagagemedewerkers op Schiphol niet langer  veel te zware koffers \nmeer hoeven te tillen, zoals de Arbeidsinspectie eist. Een nieuw rapport schetst een somber beeld van de \nbagageafhandeling op Schiphol: de tijd heeft er stilgestaan. Zelf zeggen Schiphol en de zes bagageafhandelaars \nhet werk snel flink lichter te kunnen maken.De Arbeidsinspectie wil korte metten maken met het vele getil en \ngesjouw op Schiphol. Bij drukte tillen medewerkers al snel 200 koffers per u......")
Here is the output you should provide:
Title: Ontluisterend rapport over bagagetillen op Schiphol: medewerkers sjouwen zich nog tien jaar een breuk
Author: David Bremmer
Media Outlet: PZC
Date of Publication: September 16, 2023
"""

        response = chatbot.ask(prompt)
        try:
            lines = response.split("\n")
            article['title'] = lines[0].split(": ", 1)[1].strip() if len(lines) > 0 else "Untitled"
            article['author_name'] = lines[1].split(": ", 1)[1].strip() if len(lines) > 1 else "Anonymous"
            article['media_outlet'] = lines[2].split(": ", 1)[1].strip() if len(lines) > 2 else "Unknown"
            
            if len(lines) > 3:
                article['date'] = lines[3].split(": ", 1)[1].strip()
            else:
                article['date'] = 'January 1, 2024'
            
            article = process_article_date(article)
            
        except Exception as e:
            logging.error(f"Error extracting metadata: {str(e)}")
            # Set default values if extraction fails
            article['title'] = article.get('title', 'Untitled')
            article['author_name'] = article.get('author_name', 'Anonymous')
            article['media_outlet'] = article.get('media_outlet', 'Unknown')
            article['date'] = article.get('date', 'January 1, 2024')
            article = process_article_date(article)
    
    return articles

def clean_articles(articles):
    cleaned_articles = []
    
    for article in articles:
        author_name = article['author_name']
        author_name = author_name.title()  # Capitalize the first letter of each word
        
        # Ensure small letters for non-proper nouns (preserve 'van', 'de', etc.)
        author_name = re.sub(r'\b(Van|De|Der)\b', lambda m: m.group(0).lower(), author_name)
        
        article['author_name'] = author_name
        cleaned_articles.append(article)
    
    return cleaned_articles

def filter_duplicates(articles, embeddings, similarity_threshold=0.92, content_similarity_threshold=0.9):
    def text_similarity(text1, text2):
        return SequenceMatcher(None, text1, text2).ratio()

    unique_articles = []
    unique_embeddings = []
    deleted_articles = []

    for idx, (article, emb) in enumerate(zip(articles, embeddings)):
        is_duplicate = False
        for unique_idx, unique_emb in enumerate(unique_embeddings):
            embedding_similarity = 1 - cosine(emb, unique_emb)
            
            if embedding_similarity > similarity_threshold:
                content_similarity = text_similarity(article['content'], unique_articles[unique_idx]['content'])
                
                if content_similarity > content_similarity_threshold:
                    is_duplicate = True
                    deleted_articles.append((article, unique_articles[unique_idx]))
                    break

        if not is_duplicate:
            unique_articles.append(article)
            unique_embeddings.append(emb)

    print("Deleted Articles:")
    for deleted, kept in deleted_articles:
        print(f"Deleted: {deleted['file_path']} (Duplicate of {kept['file_path']})")

    return unique_articles

# Additional helper function for embeddings
def get_embeddings(articles, embedding_model):
    embeddings = []
    for article in articles:
        embeddings.append(embedding_model.encode(article['content'], normalize_embeddings=False))
    return embeddings

# Initialize the embedding model (you might want to do this outside the function in a global scope)
embeddings_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def filter_relevant_articles(articles, company_name, industry_of_interest, region):
    filtered_articles = []
    system_prompt = f"""You are a helpful assistant. Your role is to decide whether or not a given article is relevant to a given company. If the given company is not a specific entity but a topic or industry, make sure that the article's content is directly and closely related to that topic or industry. The company or topic to be considered is the following: {company_name}."""
    model_name = "gpt-4o-mini"  # You might want to make this configurable
    temperature = 0
    max_tokens = 50

    for article in articles:
        article_content = article.get('content', '')
        chatbot = ChatGPT(
            system_prompt=system_prompt,
            model_name=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
        )

        question = f"""
You will be provided with a news media article and a company, topic or industry name. Here is the company or industry of interest: {company_name}. It operates in the {industry_of_interest}, with a prefered focus on the {region} market. Your task is to determine if the article is sufficiently relevant to {company_name} based on the following criterion:

Centrality: The article is considered relevant if {company_name} is central to the article's content, meaning the focus of the conversation revolves around {company_name}.

If {company_name} is only mentioned a few times without being a primary focus, the article should be considered not relevant.

Here is the article you will be evaluating: {article_content}

Your response should be based on whether the article meets the relevance criteria for {company_name}.
        """

        response = marvin.classify(
            question,
            labels=["Yes", "No"]
        )

        print(response)
        article['relevance'] = response

        if response != 'No':
            filtered_articles.append(article)

    return filtered_articles

def filter_top_categories(posts, keep_percentage=90):
    category_counts = collections.Counter(post['category'] for post in posts)
    total_posts = sum(category_counts.values())
    sorted_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)
    cumulative_percentage = 0
    keep_categories = set()
    
    for category, count in sorted_categories:
        if count < 2:
            continue
        cumulative_percentage += (count / total_posts) * 100
        keep_categories.add(category)
        if cumulative_percentage >= keep_percentage:
            break
    
    filtered_posts = [post for post in posts if post['category'] in keep_categories]
    return filtered_posts, keep_categories

def extract_sentiment_score(response):
    try:
        match = re.search(r'(-?[0-5])', response.strip())
        if match:
            return int(match.group(1))
        else:
            raise ValueError("Invalid sentiment score format.")
    except Exception as e:
        print(f"Error extracting sentiment score: {e}")
        return None

def save_plot_base64():
    buf = io.BytesIO()
    plt.savefig(buf, format='png', dpi=300)
    buf.seek(0)
    return base64.b64encode(buf.getvalue()).decode('utf-8')

def create_bar_chart_compiled_insights(data: dict, title: str, xlabel: str, ylabel: str, rotate_labels: bool = False, 
                    figsize: tuple = (8, 4), color: str = '#1f77b4') -> str:
    """
    Create a bar chart with improved styling and compact layout.
    
    Args:
        data (dict): Dictionary of labels and values
        title (str): Chart title
        xlabel (str): X-axis label
        ylabel (str): Y-axis label
        rotate_labels (bool): Whether to rotate x-axis labels
        figsize (tuple): Figure size in inches
        color (str): Bar color
        
    Returns:
        str: Base64 encoded image
    """
    plt.style.use('seaborn-v0_8')
    fig, ax = plt.subplots(figsize=figsize)
    
    # Create bars
    bars = ax.bar(range(len(data)), list(data.values()), 
                 color=color, alpha=0.7,
                 edgecolor=color, linewidth=1)
    
    # Customize title and labels
    ax.set_title(title, fontsize=12, pad=15, fontweight='bold')
    ax.set_xlabel(xlabel, fontsize=10, labelpad=8)
    ax.set_ylabel(ylabel, fontsize=10, labelpad=8)
    
    # Customize x-axis labels
    if rotate_labels:
        ax.set_xticks(range(len(data)))
        ax.set_xticklabels(list(data.keys()), rotation=45, ha='right', fontsize=9)
    else:
        ax.set_xticks(range(len(data)))
        ax.set_xticklabels(list(data.keys()), fontsize=9)
    
    # Customize y-axis
    ax.yaxis.set_tick_params(labelsize=9)
    
    # Add value labels on top of bars
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{int(height)}',
                ha='center', va='bottom', fontsize=9)
    
    # Customize grid and spines
    ax.grid(axis='y', linestyle='--', alpha=0.3)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    
    # Adjust layout
    plt.tight_layout()
    
    # Convert plot to base64 string
    buf = io.BytesIO()
    plt.savefig(buf, format='png', dpi=300, bbox_inches='tight', facecolor='white')
    buf.seek(0)
    plt.close()
    return base64.b64encode(buf.getvalue()).decode('utf-8')

def create_sentiment_graph(df, media_outlet, ax):
    df_outlet = df[df['media_outlet'] == media_outlet].sort_values('date')
    df_outlet['moving_avg'] = df_outlet['sentiment score'].rolling(window=30, min_periods=1).mean()
    
    ax.plot(df_outlet['date'], df_outlet['moving_avg'], linewidth=2, color='#1f77b4')
    ax.fill_between(df_outlet['date'], df_outlet['moving_avg'], alpha=0.3, color='#1f77b4')
    
    ax.set_title(media_outlet, fontsize=12, pad=10)
    ax.set_xlabel('Date', fontsize=10, labelpad=5)
    ax.set_ylabel('Sentiment Score', fontsize=10, labelpad=5)
    
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')
    
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.legend(['30-Day Moving Average'], loc='upper left', fontsize=8)

def create_category_sentiment_graph(df, category, ax):
    df_category = df[df['category'] == category].sort_values('date')
    if df_category.empty:
        ax.text(0.5, 0.5, f"No data for {category}", ha='center', va='center')
    else:
        df_category['moving_avg'] = df_category['sentiment score'].rolling(window=30, min_periods=1).mean()
        
        ax.plot(df_category['date'], df_category['moving_avg'], linewidth=2, color='#1f77b4')
        ax.fill_between(df_category['date'], df_category['moving_avg'], alpha=0.3, color='#1f77b4')
        
        ax.set_title(f'Sentiment Scores: {category}', fontsize=12, pad=10)
        ax.set_xlabel('Date', fontsize=10, labelpad=5)
        ax.set_ylabel('Sentiment Score', fontsize=10, labelpad=5)
        
        ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')
        
        ax.grid(True, linestyle='--', alpha=0.7)
        
        y_min, y_max = df_category['moving_avg'].min(), df_category['moving_avg'].max()
        if np.isfinite(y_min) and np.isfinite(y_max) and y_min != y_max:
            y_range = y_max - y_min
            ax.set_ylim(y_min - 0.1 * y_range, y_max + 0.1 * y_range)
        else:
            ax.set_ylim(-5, 5)  # Set a default range
        
        min_date, max_date = df_category['date'].min(), df_category['date'].max()
        if not pd.isnull(min_date) and not pd.isnull(max_date):
            ax.annotate(f'Start: {min_date.strftime("%b %Y")}', xy=(min_date, df_category.loc[df_category['date'] == min_date, 'moving_avg'].values[0]),
                        xytext=(10, 10), textcoords='offset points', ha='left', va='bottom',
                        bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                        arrowprops=dict(arrowstyle = '->', connectionstyle='arc3,rad=0'), fontsize=8)
            ax.annotate(f'End: {max_date.strftime("%b %Y")}', xy=(max_date, df_category.loc[df_category['date'] == max_date, 'moving_avg'].values[0]),
                        xytext=(-10, 10), textcoords='offset points', ha='right', va='bottom',
                        bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                        arrowprops=dict(arrowstyle = '->', connectionstyle='arc3,rad=0'), fontsize=8)
        
        ax.legend(['30-Day Moving Average'], loc='upper left', fontsize=8)

def create_horizontal_bar_chart(data, title, xlabel, ylabel):
    fig, ax = plt.subplots(figsize=(12, 8))
    bars = ax.barh(data.index, data.values, color='skyblue', edgecolor='navy')
    ax.set_title(title, fontsize=16, fontweight='bold')
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    for i, v in enumerate(data.values):
        ax.text(v + 0.5, i, str(v), va='center', fontweight='bold')
    ax.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    return fig, ax

def create_stacked_bar_chart(data, title, xlabel, ylabel):
    total_counts = data.sum(axis=1)
    data_sorted = data.loc[total_counts.sort_values(ascending=False).index]
    
    fig, ax = plt.subplots(figsize=(14, 6))
    color_map = {'Negative': 'darkred', 'Neutral': 'gray', 'Positive': 'darkgreen'}
    data_sorted.plot(kind='bar', stacked=True, ax=ax, color=[color_map.get(tone, 'blue') for tone in data_sorted.columns])
    
    percentages = data_sorted.div(data_sorted.sum(axis=1), axis=0) * 100
    for i, container in enumerate(ax.containers):
        for j, bar in enumerate(container):
            height = bar.get_height()
            if height > 0:
                percentage = percentages.iloc[j, i]
                color = bar.get_facecolor()
                text_color = 'white' if sum(color[:3]) < 1.5 else 'black'
                ax.text(bar.get_x() + bar.get_width()/2, bar.get_y() + height/2, 
                        f'{percentage:.1f}%', ha='center', va='center', 
                        color=text_color, fontsize=9, fontweight='bold')
    
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45, ha='right')
    plt.legend(title='Tone')
    plt.tight_layout()
    return fig, ax

def generate_media_outlet_pie_chart(df):
    media_counts = df['media_outlet'].value_counts()
    total_articles = media_counts.sum()
    threshold = 0.04  # 4% threshold

    main_outlets = media_counts[media_counts / total_articles >= threshold]
    other_outlets = media_counts[media_counts / total_articles < threshold]

    if not other_outlets.empty:
        main_outlets['Others'] = other_outlets.sum()

    num_colors = len(main_outlets)
    colors = plt.cm.get_cmap('tab20')(np.linspace(0, 1, num_colors))

    plt.figure(figsize=(12, 8))
    plt.pie(main_outlets, labels=main_outlets.index, autopct='%1.1f%%', startangle=140, colors=colors)
    plt.title('Proportion of Articles by Media Outlet', fontsize=16)
    plt.axis('equal')
    chart = save_plot_base64()
    plt.close()
    return chart

def generate_media_outlet_tone_chart(df):
    tone_media_counts = df.groupby(['media_outlet', 'tone']).size().unstack(fill_value=0)
    fig, ax = create_stacked_bar_chart(tone_media_counts, 'Number of Articles per Media Outlet (Divided by Tone)', 'Media Outlet', 'Number of Articles')
    chart = save_plot_base64()
    plt.close()
    return chart

def generate_overall_sentiment_trend(df, company_name):
    plt.figure(figsize=(16, 7))
    df['moving_avg'] = df['sentiment score'].rolling(window=30).mean()
    plt.plot(df['date'], df['moving_avg'], linewidth=2, color='#1f77b4')
    plt.fill_between(df['date'], df['moving_avg'], alpha=0.3, color='#1f77b4')
    plt.title(f'30-Day Moving Average of Sentiment Scores for {company_name}', fontsize=20, pad=20)
    plt.xlabel('Date', fontsize=14, labelpad=10)
    plt.ylabel('Sentiment Score', fontsize=14, labelpad=10)
    plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=2))
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
    plt.xticks(rotation=45, ha='right')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(['30-Day Moving Average'], loc='upper left')
    plt.tight_layout()
    chart = save_plot_base64()
    plt.close()
    return chart

def generate_sentiment_trends_by_outlet(df):
    media_outlets = df['media_outlet'].value_counts()
    eligible_outlets = media_outlets[media_outlets > 15].head(9).index
    num_outlets = len(eligible_outlets)
    rows = (num_outlets + 2) // 3
    fig, axs = plt.subplots(rows, 3, figsize=(20, 5*rows))
    axs = axs.flatten()
    for i, outlet in enumerate(eligible_outlets):
        create_sentiment_graph(df, outlet, axs[i])
    for j in range(i+1, len(axs)):
        axs[j].axis('off')
    plt.tight_layout()
    chart = save_plot_base64()
    plt.close()
    return chart

def generate_sentiment_trends_by_category(df):
    categories = df['category'].value_counts()
    eligible_categories = categories[categories > 15].head(9).index
    num_categories = len(eligible_categories)
    rows = (num_categories + 2) // 3
    fig, axs = plt.subplots(rows, 3, figsize=(20, 5*rows))
    axs = axs.flatten()
    for i, category in enumerate(eligible_categories):
        create_category_sentiment_graph(df, category, axs[i])
    for j in range(i+1, len(axs)):
        axs[j].axis('off')
    plt.tight_layout()
    chart = save_plot_base64()
    plt.close()
    return chart

def generate_articles_per_category(df):
    category_counts = df['category'].value_counts()
    fig, ax = create_horizontal_bar_chart(category_counts, 'Number of Articles per Category', 'Number of Articles', 'Category')
    chart = save_plot_base64()
    plt.close()
    return chart

def generate_category_tone_chart(df):
    tone_category_counts = df.groupby(['category', 'tone']).size().unstack(fill_value=0)
    fig, ax = create_stacked_bar_chart(tone_category_counts, 'Number of Articles per Category (Divided by Tone)', 'Category', 'Number of Articles')
    chart = save_plot_base64()
    plt.close()
    return chart

def generate_top_journalists_chart(df, company_name):
    top_authors = df[df['author_name'] != 'Anonymous']
    author_counts = top_authors['author_name'].value_counts().head(10)
    author_sentiments = top_authors.groupby('author_name')['sentiment score'].mean()
    author_outlets = top_authors.groupby('author_name')['media_outlet'].apply(lambda x: ', '.join(x.unique()))

    fig, ax = plt.subplots(figsize=(14, 5))
    bars = ax.bar(author_counts.index, author_counts.values, color='skyblue', edgecolor='navy')
    ax.set_title(f'Top 10 Journalists Writing on {company_name}', fontsize=16)
    ax.set_xlabel('Journalist', fontsize=12)
    ax.set_ylabel('Number of Articles', fontsize=12)

    ax.set_xticks(range(len(author_counts.index)))
    ax.set_xticklabels(author_counts.index, rotation=45, ha='right', fontsize=10)

    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height, f'{height}', ha='center', va='bottom')

    plt.tight_layout()
    chart = save_plot_base64()
    plt.close()
    return chart

def read_insights_content(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return ""

def generate_toc(content):
    headings = re.findall(r'^(#{1,2}) (.+)$', content, re.MULTILINE)
    toc = "## Table of Contents\n\n"
    for heading_marks, heading_text in headings:
        level = len(heading_marks)
        link = re.sub(r'\W+', '-', heading_text.lower())
        toc += f"{'  ' * (level - 1)}- [{heading_text}](#{link})\n"
    return toc

def extract_categories(articles_sorted: List[Dict], company_name: str, industry_of_interest: str, region: str) -> List[Dict]:
    """
    Extract categories from articles and assign them to each article.
    
    Args:
        articles_sorted (List[Dict]): List of preprocessed articles
        company_name (str): Name of the company being analyzed
        industry_of_interest (str): Industry sector being analyzed
        region (str): Geographic region of interest
        
    Returns:
        List[Dict]: Updated articles list with categories assigned
    """
    try:
        logging.info("Starting category extraction process")
        
        # Generate one-sentence descriptions
        compiled_sentences = ""
        system_prompt = """You are a helpful assistant. Your role is to describe in one single sentence what a given news media article says about a company. The final goal of this exercise is to be able to extract general themes and topics from the article. The one sentence you have to write should be focussed on a given company."""
        
        for article in articles_sorted:
            article_content = article.get('content', '')
            chatbot = ChatGPT(
                system_prompt=system_prompt,
                model_name="gpt-4o-mini",
                temperature=0,
                max_tokens=350,
            )

            question = f"""
Please write a single sentence about the content of the news article. The one sentence description should highlight in which regards does the article relate to {company_name}. Your output should only consist of that one sentence.
This one sentence should highlight the main topic or theme of the article from the perspective of {company_name}. We are interested about what is said on {company_name} in the article and on this overall topic or industry: {industry_of_interest} in the {region} market.

Here is the article: {article_content}
            """

            response = chatbot.ask(question)
            article['one_sentence_description'] = response
            compiled_sentences += response + "\n"

        # Define categories
        logging.info("Defining topic categories")
        system_prompt = """You are a helpful assistant. Your role is to define topic categories based on a series of one-sentence descriptions of news articles related to a company. The goal is to identify exclusive, non-overlapping topic categories based on the media coverage of the company."""
        chatbot = ChatGPT(
            system_prompt=system_prompt,
            model_name="chatgpt-4o-latest",
            temperature=0,
            max_tokens=1000,
        )

        question = f"""
You will be provided with a document named compiled_sentences. This document contains a series of one-sentence descriptions, each summarizing a news article related to {company_name}. Your task is to identify a maximum of 10 exclusive, non-overlapping topic categories based on the media coverage of the company. However, it is better and prefered if fewer categories are sufficient to cover the main aspects of the media coverage.

Follow these guidelines:

Topic Categories: Define categories that are neither too general nor too specific. Ensure the categories are mutually exclusive, meaning no two categories should cover the same subject matter.

Clarity: Each category should have a clear focus, reflecting distinct aspects of the media coverage related to {company_name}. Secondary focus should be on the general topic or industry: {industry_of_interest} in the {region} market.

Output Format: List the categories in a bullet-point format with a brief description (1-2 sentences) explaining each category. do not produce a numbered list but just a bullet point list starting with "-" symbol.

Here is the compiled_sentences document: {compiled_sentences}

Be sure to focus on key themes present in the document and avoid redundant or overly broad topics. The fewer the number of categories, the better, as long as they are distinct and cover the main aspects of the media coverage.
Avoid defining categories that are too semantically similar or overlapping. For instance, "Financial Performance" and "Economic Growth" are too closely related to be separate categories. For example, Staffing Shortages, Labor Relations, Working Conditions and Recruitment Challenges are too closely related too and should be grouped under a single category like "Human Resources Issues".
        """

        response = chatbot.ask(question)
        category_titles = [
            re.sub(r'^\d+\.\s*', '', title.strip())
            for title in re.findall(r'\*\*(.*?)\*\*', response)
        ]

        # Categorize articles
        logging.info("Categorizing articles")
        for article in articles_sorted:
            article_content = article.get('content', '')
            chatbot = ChatGPT(
                system_prompt=f"""You are a helpful assistant. Your role is to categorize a given news media article about {company_name} into one of the predefined categories. Your output should consist solely of the category name, chosen from the provided list of categories.""",
                model_name="gpt-4o-mini",
                temperature=0,
                max_tokens=50,
            )

            question = f"""
Please categorize the following article about {company_name} into one of the predefined categories. 
Your output should only consist of the category name.
Here is the article content: {article_content}

Based on the content of the article, choose the most appropriate category from the following list: {category_titles}
Your output should solely be the name of the category chosen and nothing else.
            """

            article['category'] = chatbot.ask(question)

        logging.info("Category extraction completed successfully")
        return articles_sorted

    except Exception as e:
        logging.error(f"Error in category extraction: {str(e)}")
        logging.error(traceback.format_exc())
        raise

def generate_markdown_report(company_name, total_articles, date_range, avg_sentiment, median_sentiment,
                             media_outlet_pie_chart, top_journalists_chart, media_outlet_tone_chart,
                             overall_sentiment_trend, sentiment_trends_by_outlet, media_outlet_stats,
                             articles_per_category, category_tone_chart, sentiment_trends_by_category, df):
    markdown_content = f"""
# {company_name} - Media Analytics Report

## Table of Contents
1. [Introduction](#introduction)
2. [Data Overview](#data-overview)
3. [Proportion of Articles by Media Outlet](#proportion-of-articles-by-media-outlet)
4. [Top Journalists](#top-journalists)
5. [Sentiment Analysis](#sentiment-analysis)
6. [Media Outlet Statistics](#media-outlet-statistics)
7. [Category Analysis](#category-analysis)

## Introduction
This report presents an analysis of media coverage about {company_name}. It includes various visualizations and statistics to provide insights into the sentiment, categories, and sources of the articles.

## Data Overview
- Total number of articles: {total_articles}
- Date range: {date_range}
- Average sentiment score: {avg_sentiment:.2f}
- Median sentiment score: {median_sentiment:.2f}

## Proportion of Articles by Media Outlet
The pie chart below shows the distribution of articles across different media outlets. Outlets with less than 4% of the total coverage are combined into the "Others" category.

![Proportion of Articles by Media Outlet](data:image/png;base64,{media_outlet_pie_chart})

<div style="page-break-after: always;"></div>

## Top Journalists
![Top 10 Journalists](data:image/png;base64,{top_journalists_chart})

### Top 10 Journalists and Their Media Outlets
| Journalist | Media Outlet(s) | Number of Articles | Average Sentiment |
|------------|----------------|-------------------|-------------------|
"""

    top_authors = df[df['author_name'] != 'Anonymous']
    author_counts = top_authors['author_name'].value_counts().head(10)
    author_sentiments = top_authors.groupby('author_name')['sentiment score'].mean()
    author_outlets = top_authors.groupby('author_name')['media_outlet'].apply(lambda x: ', '.join(x.unique()))

    for author, count in author_counts.items():
        avg_sentiment = author_sentiments[author]
        markdown_content += f"| {author} | {author_outlets[author]} | {count} | {avg_sentiment:.2f} |\n"

    markdown_content += f"""

### Articles per Media Outlet (Divided by Tone)
![Articles per Media Outlet (Divided by Tone)](data:image/png;base64,{media_outlet_tone_chart})

## Sentiment Analysis
![Overall Sentiment Trend](data:image/png;base64,{overall_sentiment_trend})

"""
    # Add sentiment evolution analysis
    sentiment_analysis_content = generate_sentiment_analysis_section(df, company_name)
    markdown_content += sentiment_analysis_content

    markdown_content += f"""

### Sentiment Trends by Media Outlet
![Sentiment Trends by Media Outlet](data:image/png;base64,{sentiment_trends_by_outlet})

## Media Outlet Statistics
| Media Outlet | Number of Articles | Average Sentiment | Median Sentiment |
|--------------|---------------------|-------------------|-------------------|
"""

    for stat in media_outlet_stats:
        markdown_content += f"| {stat['outlet']} | {stat['articles']} | {stat['avg_sentiment']:.2f} | {stat['median_sentiment']:.2f} |\n"

    markdown_content += f"""
## Category Analysis
### Number of Articles per Category
![Number of Articles per Category](data:image/png;base64,{articles_per_category})

### Articles per Category (Divided by Tone)
![Articles per Category (Divided by Tone)](data:image/png;base64,{category_tone_chart})
<div style="page-break-after: always;"></div>

### Sentiment Trends by Category
![Sentiment Trends by Category](data:image/png;base64,{sentiment_trends_by_category})

"""
    category_analysis_content = generate_category_sentiment_section(df, company_name)
    markdown_content += category_analysis_content

    return markdown_content

def process_stakeholder_info(company_name, articles):
    """
    Extracts stakeholder information from a collection of articles.
    
    Args:
        company_name (str): Name of the company being analyzed
        articles_sorted (list): List of processed articles
        
    Returns:
        str: Markdown table containing stakeholder information
    """
    summary_md = f"""
# Stakeholder quotes retrieved from Media Coverage - {company_name}
| Stakeholder Name | Stakeholder quote related to {company_name} | Translation | Sentiment |
| --- | --- | --- | --- | --- |
"""
    
    for article in articles:
        article_link = article.get('link', '#')
        
        # First chatbot: Extract quotes
        quote_extractor = ChatGPT(
            model_name="gpt-4o-mini",
            temperature=0,
            max_tokens=800,
        )       
        quotes_response = quote_extractor.ask(f"""
You are tasked with extracting stakeholder information and direct quotes from a given news article related to {company_name}. Your output should be in the form of a markdown table without column titles or any additional information. Each table row should contain:
1. The name of a real person who has expressed a quote in the provided article
2. Their original quote about {company_name} exactly as it appears in the article, without any modifications or translations

Guidelines:
- Include only real people: Exclude organizations, companies, or any other non-person entities
- Direct connection: Ensure that each selected individual is explicitly connected to {company_name}
- Only include those whose statements are directly about {company_name}
- Exclude individuals whose comments are not directly relevant to {company_name} or are too general
- Pertinent stakeholders only: Only include the most relevant individuals
- Do not include stakeholders who are not directly associated with {company_name}

Output Format Requirements:
- Each line must contain exactly two columns separated by "|" symbols
- Column 1: Stakeholder's name
- Column 3: Stakeholder's role or link to {company_name}. If not directly direclty described in the article, mention "not mentioned".
- Column 2: Original quote in its original language

Example of correct formatting:
| Pierre Dupont | Consumer | "Cette décision de {company_name} est très importante." |
| John Smith | Client |"We need to review {company_name}'s proposal carefully." |
| Hans Weber | Financial analyst | "Die Strategie von {company_name} ist überzeugend." |

If no relevant individuals with quotes are found, return "None."

Article:
{article['content']}
        """)
        
        print("Quotes extraction response:", quotes_response)
        
        # Second chatbot: Add translations
        translator = ChatGPT(
            model_name="gpt-4o-mini",
            temperature=0,
            max_tokens=800,
        )
        
        translation_response = translator.ask(f"""
You are tasked with adding English translations to a set of stakeholder quotes. For each quote:
- If the quote is in English, add "N.A." as the translation
- If the quote is in any other language, provide an accurate English translation

Input format is a markdown table with stakeholder names and quotes.
Output should be a markdown table with three columns: name, original quote, and translation.

IMPORTANT RULES:
- Never translate English quotes to English - use "N.A." instead
- Only translate non-English quotes
- Keep original quotes exactly as they are
- Maintain the markdown table format, without including any horizontal divider lines

Example input:
| Pierre Dupont | Consumer | "Cette décision est importante." |
| John Smith | {company_name}'s corporate lawyer | "We need to review this carefully." |

Example output:
| Pierre Dupont | Consumer | "Cette décision est importante." | "This decision is important." |
| John Smith | {company_name}'s corporate lawyer | "We need to review this carefully." | N.A. |

If the input is "None", return "None".

Input quotes:
{quotes_response}
        """)
        
        print("Translation response:", translation_response)

        chatbot2 = ChatGPT(
            model_name="gpt-4o-mini",
            temperature=0,
            max_tokens=800,
        )

        response2 = chatbot2.ask(f"""
You are tasked with analyzing the following list of stakeholders and returning only the lines that represent real people. Disregard any lines that mention entities or organizations. Remove all rows which are table dividers using "-".

Input:
{translation_response}

Guidelines:
1. Only include lines that represent real people or individuals. Only return lines that mention a specific person's name in the first column.
2. Exclude any lines that mention companies, organizations or entities.
3. If a line is ambiguous, err on the side of exclusion.
4. Exclude mentions of John Doe or Jane Smith. Exclude the author of the article or other people which are not directly related to {company_name}.
5. Maintain the original format of each line (markdown table row). Keep the whole line, including the stakeholder description. Each line must be a valid markdown table row with 4 columns, strictly.
6. If no lines represent real people, return "None".
7. Filter out too general or irrelevant stakeholders. Only include stakeholders that express direct opinions on {company_name} or are directly involved with the company.
8. Remove all rows which are only containing the "-" symbol as your output should only be the rows that represent real people.
9. Maintain the markdown table format, without including any horizontal divider lines

Please provide the filtered list of stakeholders below:
        """)
        
        print("Filtered response:", response2)

        chatbot3 = ChatGPT(
            model_name="gpt-4o-mini",
            temperature=0,
            max_tokens=800,
        )

        response3 = chatbot3.ask(f"""
You are tasked with analyzing the impact and implications of stakeholder quotes for {company_name}'s reputation, business, and industry position. Your analysis should focus on how each quote affects {company_name}, not just the quote's general tone.

CRITICAL ANALYSIS GUIDELINES:
1. Business Impact Assessment
- Evaluate how the quote affects {company_name}'s:
  * Market position and competitive advantage
  * Stakeholder trust and relationships
  * Regulatory compliance and legal standing
  * Public perception and brand value

2. Context-Sensitive Analysis
- Consider the broader industry context
- Evaluate current market conditions and challenges
- Account for regulatory environment and compliance requirements
- Factor in {company_name}'s current strategic objectives

3. Stakeholder Influence Assessment
- Consider the stakeholder's role and influence
- Evaluate the potential reach and impact of their statement
- Assess how their opinion might affect other stakeholders

SENTIMENT CLASSIFICATION:
Positive (from {company_name}'s perspective):
- Statements that support or validate {company_name}'s strategy
- Comments that could enhance market position or stakeholder trust
- Quotes that defend {company_name} against criticism
- Statements highlighting {company_name}'s strengths or improvements

Negative (from {company_name}'s perspective):
- Statements that could damage reputation or credibility
- Comments raising concerns about practices or decisions
- Quotes that could trigger regulatory scrutiny
- Statements that might negatively influence other stakeholders
- Even positively-worded criticism (e.g., "They're making progress, but still far behind competitors")

Neutral (from {company_name}'s perspective):
- Factual statements without significant impact
- Balanced observations that neither help nor harm
- Technical or procedural comments without clear implications

FORMAT REQUIREMENTS:
- Maintain the original markdown table format
- Add sentiment classification as the final column
- Use only "Positive," "Negative," or "Neutral" in the sentiment column

Example Analysis:
Original quote: "The company is making impressive progress in sustainability."
Surface tone: Positive
Deeper analysis: Could be Negative if competitors are far ahead or if it implies previous poor performance

Example Input:
| John Weber | Expert | "The technological improvements at {company_name} are impressive, but they're still years behind industry leaders." | N.A. |
| Maria Chen | Lawyer | "{company_name} follows all regulations perfectly." | N.A. |
| Pierre Dubois | Business partner | "We are pleased to partner with {company_name} on this groundbreaking initiative." | N.A. |

Example Output:
| John Weber | Expert | "The technological improvements at {company_name} are impressive, but they're still years behind industry leaders." | N.A. | Negative |
| Maria Chen | Lawyer | "{company_name} follows all regulations perfectly." | N.A. | Neutral |
| Pierre Dubois | Business partner | "We are pleased to partner with {company_name} on this groundbreaking initiative." | N.A. | Positive |

Article Context:
{article['content']}

Quotes Table:
{response2}

Your output must contain the initial Quotes table provided with an additional column containing the appropriate sentiment classifications. You must include all the previous md table columns, even if it contains the value "N.A.", in the Quotes Table provided. Do not change anything from the provided initial Quotes Table. Your output should therefore have 4 columns. Return "None" if the Quotes Table is "None." Your response must be returned as a md table. Remove all rows which are only containing the "-" symbol , Maintain the markdown table format, without including any horizontal divider lines.
        """)
        
        print("Sentiment response:", response3)

        if response3.strip().lower() not in ["none", "none."]:
            response_lines = response3.strip().split("\n")
            for line in response_lines:
                if "none" not in line.lower():
                    # Split the line into its components
                    parts = line.split('|')
                    if len(parts) >= 4:  # Ensure we have enough columns
                        stakeholder_name = parts[1].strip()
                        # Add hyperlink to stakeholder name
                        parts[1] = f" [{stakeholder_name}]({article_link}) "
                        # Rejoin the line with the hyperlinked stakeholder name
                        hyperlinked_line = '|'.join(parts)
                        summary_md += f"{hyperlinked_line}| \n"
        
    return summary_md

def process_markdown_table(md_content):
    """
    Process a markdown table by:
    1. Sorting rows alphabetically by stakeholder name
    2. Removing duplicate or near-duplicate quotes
    
    Args:
        md_content (str): The markdown table content as a string
        
    Returns:
        str: Processed markdown table
    """
    # Split content into header and rows
    lines = md_content.strip().split('\n')
    
    # Preserve the title and header rows
    title = lines[0]
    headers = lines[1:3]  # This includes the header row and the separator row
    
    # Process only the data rows (skip headers and separator)
    data_rows = [row for row in lines[3:] if row.strip() and '|' in row]
    
    # Function to extract stakeholder name from a row
    def get_stakeholder(row):
        return row.split('|')[1].strip()
    
    # Function to extract quote from a row
    def get_quote(row):
        return row.split('|')[2].strip()
    
    # Function to calculate similarity between two quotes
    def quote_similarity(quote1, quote2):
        from difflib import SequenceMatcher
        return SequenceMatcher(None, quote1.lower(), quote2.lower()).ratio()
    
    # Remove duplicates while preserving order
    seen_quotes = {}
    unique_rows = []
    
    for row in data_rows:
        quote = get_quote(row)
        is_duplicate = False
        
        # Check against existing quotes for similarity
        for existing_quote in seen_quotes:
            if quote_similarity(quote, existing_quote) > 0.8:  # 80% similarity threshold
                is_duplicate = True
                break
                
        if not is_duplicate:
            seen_quotes[quote] = True
            unique_rows.append(row)
    
    # Sort unique rows by stakeholder name
    sorted_rows = sorted(unique_rows, key=get_stakeholder)
    
    # Reconstruct the markdown table
    processed_table = '\n'.join([title] + headers + sorted_rows)
    
    return processed_table

def translate_content(content: str, source_language: str, target_language: str) -> str:
    """
    Translate content from source language to target language using ChatGPT.
    
    Args:
        content (str): Content to translate
        source_language (str): Source language (detected automatically if not specified)
        target_language (str): Target language to translate into
        
    Returns:
        str: Translated content
    """
    try:
        # Skip translation if target language is English
        if target_language.lower() == "english":
            return content

        # Initialize translation chatbot
        translator = ChatGPT(
            system_prompt=f"""You are a professional translator specializing in {target_language} translations. 
            Your task is to translate content while maintaining the original formatting, including markdown syntax, 
            titles, headings, and any special characters. Preserve all markdown formatting exactly as in the original.""",
            model_name="gpt-4o-mini",
            temperature=0,
            max_tokens=4000,
        )

        question = f"""
Please translate the following content into {target_language}. Maintain all formatting, 
including markdown syntax (#, ##, etc.), bullet points, and special characters. 
Keep any technical terms, company names, and proper nouns in their original form.

Content to translate:
{content}

Important guidelines:
1. Preserve all markdown formatting (#, ##, -, *, etc.)
2. Keep URLs, file paths, and technical terms unchanged
3. Maintain all line breaks and spacing
4. Do not translate proper nouns or company names
5. Keep any HTML tags or special formatting intact
"""

        translated_content = translator.ask(question)
        return translated_content

    except Exception as e:
        logging.error(f"Error in translation: {str(e)}")
        logging.error(traceback.format_exc())
        return content  # Return original content if translation fails

def convert_md_to_pdf(input_file: str, output_file: str = None, css_file: str = 'template/CompactCSSTemplate.css') -> str:
    """
    Convert a Markdown file to PDF using pypandoc.
    
    Args:
        input_file (str): Path to the input Markdown file
        output_file (str): Path for the output PDF file (optional)
        css_file (str): Path to the CSS template file
        
    Returns:
        str: Path to the generated PDF file
    """
    if output_file is None:
        output_file = str(Path(input_file).with_suffix('.pdf'))
        
    try:
        # Read the CSS file
        with open(css_file, 'r', encoding='utf-8') as f:
            css_content = f.read()
            
        # Create a temporary CSS file with the content
        with tempfile.NamedTemporaryFile(mode='w', suffix='.css', delete=False) as temp_css:
            temp_css.write(css_content)
            temp_css_path = temp_css.name
            
        # Configure pandoc options
        extra_args = [
            '--pdf-engine=pdflatex',
            f'--css={temp_css_path}',
            '--toc',
            '--toc-depth=3',
            '-V', 'geometry:margin=2.5cm',
            '-V', 'documentclass=article',
            '-V', 'fontsize=11pt',
            '--highlight-style=tango'
        ]
        
        # Convert markdown to PDF
        pypandoc.convert_file(
            input_file,
            'pdf',
            outputfile=output_file,
            extra_args=extra_args
        )
        
        # Clean up temporary CSS file
        Path(temp_css_path).unlink()
        
        logging.info(f"Successfully converted {input_file} to {output_file}")
        return output_file
        
    except Exception as e:
        logging.error(f"Error converting file to PDF: {str(e)}")
        raise

def create_markdown_anchor(category_name: str) -> str:
    """
    Creates a properly formatted markdown anchor from a category name.
    
    Args:
        category_name (str): The name of the category
        
    Returns:
        str: A properly formatted markdown anchor
    """
    # Convert to lowercase
    anchor = category_name.lower()
    # Replace spaces and special characters with hyphens
    anchor = re.sub(r'[^\w\s-]', '', anchor)
    # Replace spaces with hyphens
    anchor = re.sub(r'\s+', '-', anchor)
    # Remove any duplicate hyphens
    anchor = re.sub(r'-+', '-', anchor)
    # Remove leading/trailing hyphens
    anchor = anchor.strip('-')
    
    return anchor

def find_extrema_points(df, min_days_separation=45):
    """
    Find local minima and maxima in the sentiment moving average with minimum separation,
    excluding the first 60 days of data.
    
    Args:
        df: DataFrame with 'date' and 'sentiment score' columns
        min_days_separation: Minimum days between extrema points
    
    Returns:
        tuple: Lists of (maxima_dates, maxima_values), (minima_dates, minima_values)
    """
    
    # Calculate 30-day moving average if not already present
    if 'moving_avg' not in df.columns:
        df = df.copy()
        df['moving_avg'] = df['sentiment score'].rolling(window=30).mean()
    
    # Sort by date and exclude first 60 days
    df = df.sort_values('date')
    start_date = df['date'].min()
    exclusion_date = start_date + pd.Timedelta(days=60)
    df_filtered = df[df['date'] > exclusion_date].copy()
    
    # Convert moving average to numpy array, handling NaN values
    values = df_filtered['moving_avg'].fillna(method='bfill').fillna(method='ffill').values
    
    # Find peaks (maxima) and valleys (minima)
    min_separation = min_days_separation  # minimum separation in data points
    maxima_idx, _ = find_peaks(values, distance=min_separation)
    minima_idx, _ = find_peaks(-values, distance=min_separation)
    
    # Get corresponding dates and values
    maxima_dates = df_filtered['date'].iloc[maxima_idx]
    maxima_values = values[maxima_idx]
    minima_dates = df_filtered['date'].iloc[minima_idx]
    minima_values = values[minima_idx]
    
    # Sort by value to get top extrema
    maxima = list(zip(maxima_dates, maxima_values))
    minima = list(zip(minima_dates, minima_values))
    
    maxima.sort(key=lambda x: x[1], reverse=True)
    minima.sort(key=lambda x: x[1])
    
    return maxima[:2], minima[:2]  # Return top 2 maxima and minima

def analyze_sentiment_period(df, start_date, end_date, company_name, is_peak=True):
    """
    Analyze articles and sentiment for a specific period with content size limits.
    
    Args:
        df: DataFrame with articles data
        start_date: Period start date
        end_date: Period end date
        company_name: Name of the company
        is_peak: Boolean indicating if this is a peak (True) or dip (False)
    
    Returns:
        str: Analysis of the period
    """
    # Filter articles for the period
    mask = (df['date'] >= start_date - pd.Timedelta(days=40)) & (df['date'] <= end_date)
    period_df = df[mask].copy()
    
    # Prepare article information with content limits
    articles_info = []
    total_chars = 0
    max_article_chars = 5000
    max_total_chars = 500000
    
    for _, row in period_df.iterrows():
        # Skip if we've exceeded total character limit
        if total_chars >= max_total_chars:
            break
            
        # Truncate content if needed
        content = row.get('content', '')
        if content:
            content = content[:max_article_chars]
        
        # Update total character count
        total_chars += len(content)
        
        article_info = {
            'date': row['date'].strftime('%Y-%m-%d'),
            'title': row.get('title', 'No title'),
            'sentiment': row['sentiment score'],
            'content': content,
            'media_outlet': row.get('media_outlet', 'Unknown')
        }
        articles_info.append(article_info)
    
    # Create prompt for the chatbot
    period_type = "peak" if is_peak else "dip"
    period_sign = "positive" if is_peak else "negative"
    chatbot = ChatGPT(
        system_prompt=f"""You are an expert media analyst focused on analyzing sentiment changes in media coverage. 
        Your task is to explain a significant {period_type} in sentiment regarding {company_name} by analyzing relevant articles from the period. Since the {period_type} represents a {period_sign} sentiment shift, I expect you retrieve and analyse articles that contributed to this {period_sign} sentiment change.
        Provide a concise one-paragraph analysis highlighting the key events or narratives that drove the sentiment trend.""",
        model_name="models/gemini-1.5-pro",
        temperature=0,
        max_tokens=2000
    )
    
    question = f"""
    Analyze the media coverage of {company_name} between {start_date.strftime('%Y-%m-%d')} and {end_date.strftime('%Y-%m-%d')}, 
    which represents a sentiment {period_type}. Based on the provided articles, explain what events or narratives drove this trend.
    Focus on key stories, key events narrated in the media about {company_name}, which could explain the {period_type} trend, and their impact on public perception. Since we are investigating the reasons for a sentiment {period_type}, we want to observe which {period_sign} events contributed to that trend. Provide tangible facts and examples from the coverage. Reference specific media outlets and dates where relevant.
    
    Articles for analysis:
    {json.dumps(articles_info, indent=2)}
    
    Provide a comprehensive but not too long paragraph analysis that explains the sentiment trend during this period. Include tangible examples or facts from the coverage. Cite your sources into brackets: (media outlet, date)
    """
    
    try:
        response = chatbot.ask(question)
        return response if response else "Analysis could not be generated for this period."
    except Exception as e:
        print(f"Error analyzing period {start_date} to {end_date}: {str(e)}")
        return f"Error analyzing period: {str(e)}"

def generate_sentiment_analysis_section(df, company_name):
    """
    Generate comprehensive sentiment analysis section for the report.
    """
    # Ensure date column is datetime
    df['date'] = pd.to_datetime(df['date'])
    
    # Calculate moving average
    df = df.copy()
    df['moving_avg'] = df['sentiment score'].rolling(window=30).mean()
    
    # Sort by date
    df = df.sort_values('date')
    
    # Find extrema points
    maxima, minima = find_extrema_points(df)
    print(maxima, minima)
    
    # Generate analysis for each period
    analyses = []
    
    # Analyze periods around maxima
    for peak_date, peak_value in maxima:
        analysis = analyze_sentiment_period(
            df,
            peak_date - pd.Timedelta(days=40),
            peak_date,
            company_name,
            is_peak=True
        )
        analyses.append((peak_date, "Peak", analysis))
        print(analysis)
    
    # Analyze periods around minima
    for dip_date, dip_value in minima:
        analysis = analyze_sentiment_period(
            df,
            dip_date - pd.Timedelta(days=40),
            dip_date,
            company_name,
            is_peak=False
        )
        analyses.append((dip_date, "Dip", analysis))
    
    # Generate overall analysis
    chatbot = ChatGPT(
        system_prompt=f"""You are an expert media analyst. Summarize the overall sentiment trajectory for {company_name} 
        based on the previous period analyses.""",
        model_name="models/gemini-1.5-pro",
        temperature=0,
        max_tokens=1000
    )
    
    overall_analysis = chatbot.ask(f"Synthesize the following period analyses into an overall sentiment trajectory:\n{json.dumps([a[2] for a in analyses])}")
    
    # Format the sentiment analysis section
    sentiment_section = "\n### Sentiment Evolution Analysis\n\n"
    
    # Add identified extrema points
    sentiment_section += "#### Key Sentiment Points\n"
    sentiment_section += "**Peak Periods:**\n"
    for date, value in maxima:
        sentiment_section += f"- {date.strftime('%B %d, %Y')}: {value:.3f}\n"
    sentiment_section += "\n**Dip Periods:**\n"
    for date, value in minima:
        sentiment_section += f"- {date.strftime('%B %d, %Y')}: {value:.3f}\n"
    
    # Add chronological analyses
    analyses.sort(key=lambda x: x[0])
    for date, period_type, analysis in analyses:
        sentiment_section += f"\n#### {period_type} Period - {date.strftime('%B %Y')}\n{analysis}\n"
    
    sentiment_section += f"\n#### Overall Sentiment Trajectory\n{overall_analysis}\n"
    
    return sentiment_section

def find_category_extrema(df, category_name):
    """
    Find the maximum and minimum sentiment points for a specific category.
    
    Args:
        df: DataFrame with category data
        category_name: Name of the category to analyze
        
    Returns:
        tuple: (maximum_point, minimum_point) where each point is (date, value)
    """
    # Filter for the specific category
    category_df = df[df['category'] == category_name].copy()
    
    # Calculate 30-day moving average
    if 'moving_avg' not in category_df.columns:
        category_df['moving_avg'] = category_df['sentiment score'].rolling(window=30).mean()
    
    # Sort by date and exclude first 60 days
    category_df = category_df.sort_values('date')
    start_date = category_df['date'].min()
    exclusion_date = start_date + pd.Timedelta(days=60)
    df_filtered = category_df[category_df['date'] > exclusion_date].copy()
    
    # Find maximum and minimum points
    max_idx = df_filtered['moving_avg'].idxmax()
    min_idx = df_filtered['moving_avg'].idxmin()
    
    max_point = (df_filtered.loc[max_idx, 'date'], df_filtered.loc[max_idx, 'moving_avg'])
    min_point = (df_filtered.loc[min_idx, 'date'], df_filtered.loc[min_idx, 'moving_avg'])
    
    return max_point, min_point

def analyze_category_period(df, category, start_date, end_date, company_name, is_peak=True):
    """
    Analyze articles for a specific category and period.
    """
    # Filter for category and period
    mask = (
        (df['category'] == category) & 
        (df['date'] >= start_date - pd.Timedelta(days=40)) & 
        (df['date'] <= end_date)
    )
    period_df = df[mask].copy()
    
    # Prepare article information with content limits
    articles_info = []
    total_chars = 0
    max_article_chars = 5000
    max_total_chars = 500000
    
    for _, row in period_df.iterrows():
        if total_chars >= max_total_chars:
            break
            
        content = row.get('content', '')[:max_article_chars] if row.get('content') else ''
        total_chars += len(content)
        
        article_info = {
            'date': row['date'].strftime('%Y-%m-%d'),
            'title': row.get('title', 'No title'),
            'sentiment': row['sentiment score'],
            'content': content,
            'media_outlet': row.get('media_outlet', 'Unknown')
        }
        articles_info.append(article_info)
    
    # Create prompt for the chatbot
    period_type = "peak" if is_peak else "dip"
    period_sign = "positive" if is_peak else "negative"
    chatbot = ChatGPT(
        system_prompt=f"""
        You are an expert media analyst focused on analyzing sentiment changes in media coverage. 
        Your task is to explain a significant {period_type} in sentiment regarding {company_name}, specifically regarding {category}-related topics, by analyzing relevant articles from the period. Therefore, since the sentiment has been {period_sign}, you need to analyze the articles to explain what specific events or narratives within this category drove the sentiment trend.
        Explain in your output, with tanglible examples (media outlet, date) what specific events or narratives drove the {period_type}, {period_sign,} sentiment trend.
        Provide a complete but not too long one-paragraph analysis highlighting the key events or narratives that drove the sentiment trend, specifically with regards to {company_name} on {category}-related topics.
        """,
        model_name="models/gemini-1.5-pro",
        temperature=0,
        max_tokens=2000
    )
    
    question = f"""
    Analyze the media coverage of {company_name}'s {category}-related topics between {start_date.strftime('%Y-%m-%d')} and {end_date.strftime('%Y-%m-%d')}, 
    a period of {period_type} sentiment. Based on the provided articles, explain what specific events or narratives within this category drove the sentiment trend.
    
    Articles for analysis:
    {json.dumps(articles_info, indent=2)}
    
    Provide a comprehensive but not too long paragraph analysis that explains explaining the {period_sign } sentiment trend for {category}-related coverage during this period, from the perspective of {company_name}. Include tangible examples or facts from the coverage. Cite your sources into brackets: (media outlet, date). Your explanation should be chrnological.
    """
    
    try:
        response = chatbot.ask(question)
        return response if response else f"Analysis could not be generated for {category} during this period."
    except Exception as e:
        print(f"Error analyzing {category} period {start_date} to {end_date}: {str(e)}")
        return f"Error analyzing period: {str(e)}"

def generate_category_sentiment_section(df, company_name):
    """
    Generate sentiment analysis for top 3 categories.
    """
    # Get top 3 categories by article count
    category_counts = df['category'].value_counts()
    print(category_counts)
    top_categories = category_counts.head(3).index.tolist()
    print(top_categories)
    
    # Initialize section content
    category_section = "\n### Category Sentiment Analysis\n\n"
    
    for category in top_categories:
        category_section += f"\n#### {category}\n"
        
        # Find extrema points for category
        try:
            max_point, min_point = find_category_extrema(df, category)
            print(max_point, min_point)
            
            # Add extrema points
            category_section += f"**Peak**: {max_point[0].strftime('%B %d, %Y')} (Score: {max_point[1]:.3f})\n"
            category_section += f"**Dip**: {min_point[0].strftime('%B %d, %Y')} (Score: {min_point[1]:.3f})\n\n"

            # Analyze peak period
            peak_analysis = analyze_category_period(
                df, category, 
                max_point[0] - pd.Timedelta(days=40),
                max_point[0],
                company_name, True
            )
            print(peak_analysis)
            category_section += f"**Peak Period Analysis:**\n{peak_analysis}\n\n"
            
            # Analyze dip period
            dip_analysis = analyze_category_period(
                df, category,
                min_point[0] - pd.Timedelta(days=40),
                min_point[0],
                company_name, False
            )
            print(dip_analysis)
            category_section += f"**Dip Period Analysis:**\n{dip_analysis}\n\n"
            
        except Exception as e:
            category_section += f"Error analyzing {category}: {str(e)}\n\n"
    
    return category_section

def setup_journalist_directories(news_folder_path: str, journalist_name: str) -> str:
    """
    Set up directory structure for journalist analysis based on the provided news folder path.
    
    Args:
        news_folder_path (str): Path to the folder containing news articles
        journalist_name (str): Name of the journalist being analyzed
        
    Returns:
        str: Path to the general folder for outputs
    """
    try:
        # Get the journalist's base directory by going up one level from news folder
        general_folder = os.path.dirname(news_folder_path)
        outputs_folder = os.path.join(general_folder, "Outputs")
        compiled_outputs = os.path.join(outputs_folder, "CompiledOutputs")
        
        # Create directories
        os.makedirs(compiled_outputs, exist_ok=True)
        
        logging.info(f"Created directory structure at {general_folder}")
        return general_folder
        
    except Exception as e:
        logging.error(f"Error creating directory structure: {str(e)}")
        raise
    
def preprocess_journalist_articles(journalist_name: str, articles: List[Dict], news_folder_path: str, 
                                 force_reprocess: bool = False) -> Tuple[List[Dict], str, bool]:
    try:
        logging.info(f"Starting article preprocessing for journalist {journalist_name}")
        print(f"Found {len(articles)} articles to process")
        
        # Set up directory structure
        general_folder = setup_journalist_directories(news_folder_path, journalist_name)
        print(f"Set up directory structure at: {general_folder}")
        
        preprocessed_path = os.path.join(general_folder, "Outputs", "CompiledOutputs", "PreprocessedArticles.json")
        
        # Check if we have preprocessed articles saved
        if not force_reprocess and os.path.exists(preprocessed_path):
            logging.info("Loading previously preprocessed articles")
            print("Found existing preprocessed articles, loading...")
            articles_sorted = load_data_from_json(preprocessed_path)
            return articles_sorted, general_folder, True
        
        # If no saved data or force_reprocess, do the preprocessing
        logging.info("Preprocessing articles")
        print("Starting preprocessing...")

        # Extract metadata and clean articles
        print("Extracting metadata...")
        articles = extract_metadata(articles)
        articles = clean_articles(articles)
        print(f"Cleaned and extracted metadata from {len(articles)} articles")

        # Extract hyperlinks
        print("Extracting hyperlinks...")
        for article in articles:
            hyperlinks = extract_hyperlinks(article['file_path'])
            article['link'] = hyperlinks[0] if hyperlinks else None

        # Sort articles by date
        print("Sorting articles...")
        articles_sorted = sorted(articles, 
                               key=lambda x: datetime.strptime(x.get('date', '2024-01-01'), '%B %d, %Y'))

        # Save preprocessed articles
        print("Saving preprocessed articles...")
        save_data_to_json(articles_sorted, preprocessed_path)
        
        logging.info("Article preprocessing completed successfully")
        print("Preprocessing completed successfully")
        
        if not articles_sorted:
            print("Warning: No articles remained after preprocessing")
            return None, general_folder, False
            
        return articles_sorted, general_folder, True

    except Exception as e:
        logging.error(f"Error in article preprocessing: {str(e)}")
        logging.error(traceback.format_exc())
        print(f"Error during preprocessing: {str(e)}")
        print(traceback.format_exc())
        return None, None, False

def generate_introduction(profile_md, journalist_name, max_retries=3):
    for attempt in range(max_retries):
        try:
            intro_bot = ChatGPT(
                model_name="chatgpt-4o-latest",
                temperature=0,
                max_tokens=1000,
                max_retries=3,  # Add retry logic to the ChatGPT instance
                retry_delay=2
            )
            
            intro_prompt = f"""
Based on the provided journalist profile, create a comprehensive introduction section.

Profile content:
{profile_md}

Write an engaging introduction that:
1. Provides an overview of {journalist_name}'s primary areas of expertise and focus
2. Highlights their most significant or impactful coverage topics
3. Identifies any overarching patterns or themes across their work
4. Notes their typical approach to reporting and story development

Keep the introduction to approximately 300-400 words, making it substantive but concise.
Use a professional, analytical tone while remaining engaging.
Base all observations strictly on the evidence from the analyzed articles.
Start your output with "## Introduction".
"""
            
            introduction = intro_bot.ask(intro_prompt)
            if introduction and introduction.strip():
                print("Successfully generated introduction")
                return introduction
                
        except Exception as e:
            print(f"Attempt {attempt + 1} failed to generate introduction: {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(2 * (attempt + 1))  # Exponential backoff
                continue
            else:
                raise Exception(f"Failed to generate introduction after {max_retries} attempts: {str(e)}")
    
    raise Exception("Failed to generate valid introduction after all retry attempts")

# Class to process document inputs
class DocumentProcessor:
    def __init__(self, min_length: int = 1000, max_length: int = 25500, similarity_threshold: float = 0.8):
        """
        Initialize the document processor with configurable parameters.
        
        Args:
            min_length (int): Minimum character length for valid documents
            max_length (int): Maximum character length for valid documents
            similarity_threshold (float): Threshold for detecting duplicate content
        """
        self.min_length = min_length
        self.max_length = max_length
        self.similarity_threshold = similarity_threshold
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def _text_similarity(self, text1: str, text2: str) -> float:
        """Calculate similarity ratio between two texts"""
        return SequenceMatcher(None, text1.strip(), text2.strip()).ratio()

    def _get_files(self, folder_path: str) -> List[str]:
        """Get list of files from folder, excluding system files"""
        if not os.path.exists(folder_path):
            self.logger.error(f"Folder path does not exist: {folder_path}")
            return []
        
        files = [f"{folder_path}/{file}" for file in os.listdir(folder_path)]
        return [f for f in files if f.lower().endswith('.pdf')]

    def _extract_text_from_pdf(self, file_path: str) -> Optional[str]:
        """Extract text content from a PDF file using PyPDF2"""
        try:
            reader = PdfReader(file_path)
            text = ""
            for page in reader.pages:
                text += page.extract_text() or ""
            return text.strip()
        except Exception as e:
            self.logger.error(f"Error extracting text from {file_path}: {e}")
            return None

    def _extract_text_from_docx(self, file_path: str) -> Optional[str]:
        """Extract text content from a DOCX file using python-docx"""
        try:
            doc = DocxDocument(file_path)
            text = "\n".join([para.text for para in doc.paragraphs])
            return text.strip()
        except Exception as e:
            self.logger.error(f"Error extracting text from DOCX {file_path}: {e}")
            return None

    def process_pdf_folder(self, folder_path: str) -> List[Dict]:
        """Process all PDFs in a folder."""
        if not os.path.exists(folder_path):
            self.logger.error(f"PDF folder path does not exist: {folder_path}")
            return []

        files = self._get_files(folder_path)
        self.logger.info(f"Found {len(files)} files in folder")

        articles = [{'file_path': file, 'position': idx + 1} for idx, file in enumerate(files)]

        # Process files using PyPDF2
        processed_documents = []
        for file in files:
            try:
                pdf_text = self._extract_text_from_pdf(file)

                if pdf_text is None or not self.min_length <= len(pdf_text) <= self.max_length:
                    self.logger.info(
                        f"Removing file {file} (text extraction failed or length out of range)"
                    )
                    continue

                pdf_text = pdf_text.replace("William Masquelier", " ")  # Optional cleanup
                processed_documents.append({'file_path': file, 'content': pdf_text})
            except Exception as e:
                self.logger.error(f"Error processing PDF {file}: {e}")

        self.logger.info(f"Processed {len(processed_documents)} valid PDF files")
        return processed_documents

    def process_docx(self, docx_path: str, separator: str = "--") -> List[Dict]:
        """
        Process a DOCX file containing multiple articles.
        
        Args:
            docx_path (str): Path to the DOCX file
            separator (str): Separator used between articles
            
        Returns:
            List[Dict]: List of processed articles
        """
        if not os.path.exists(docx_path):
            self.logger.error(f"DOCX file does not exist: {docx_path}")
            return []

        try:
            # Extract text from DOCX
            docx_text = self._extract_text_from_docx(docx_path)
            if not docx_text:
                self.logger.error(f"Failed to extract text from DOCX {docx_path}")
                return []

            # Split into articles based on separator
            raw_sections = docx_text.split(separator)
            articles = []

            for idx, section in enumerate(raw_sections, 1):
                content = section.strip()
                if not self.min_length <= len(content) <= self.max_length:
                    continue

                is_duplicate = any(
                    self._text_similarity(content, existing['content']) > self.similarity_threshold 
                    for existing in articles
                )

                if not is_duplicate:
                    articles.append({
                        'file_path': docx_path,
                        'content': content,
                        'position': idx
                    })

            self.logger.info(f"Processed {len(articles)} articles from DOCX")
            return articles

        except Exception as e:
            self.logger.error(f"Error processing DOCX {docx_path}: {e}")
            return []

    def process_documents(self, 
                         pdf_folder_path: Optional[str] = None, 
                         docx_file_path: Optional[str] = None, 
                         docx_separator: str = "--") -> List[Dict]:
        """
        Main processing function that handles both PDF folder and DOCX file inputs.
        
        Args:
            pdf_folder_path (str, optional): Path to folder containing PDFs
            docx_file_path (str, optional): Path to DOCX file
            docx_separator (str): Separator for DOCX processing
            
        Returns:
            List[Dict]: Combined list of processed articles from both sources
        """
        all_articles = []
        position = 1

        if pdf_folder_path:
            pdf_articles = self.process_pdf_folder(pdf_folder_path)
            for article in pdf_articles:
                article['reordered_position'] = position
                position += 1
            all_articles.extend(pdf_articles)

        if docx_file_path:
            docx_articles = self.process_docx(docx_file_path, docx_separator)
            for article in docx_articles:
                article['reordered_position'] = position
                position += 1
            all_articles.extend(docx_articles)

        self.logger.info(f"Total processed articles: {len(all_articles)}")
        return all_articles

#Function to create the base for all the outputs

def preprocess_articles(company_name, articles, industry_of_interest, region):
    """
    Performs common preprocessing steps on articles that are required for all output types.
    
    Args:
        company_name (str): Name of the company being analyzed
        articles (list): List of article dictionaries
        industry_of_interest (str): Industry relevant to the analysis
        region (str): Geographic region of interest
        
    Returns:
        tuple: (preprocessed_articles, general_folder, directories_created)
    """
    try:
        logging.info(f"Starting article preprocessing for {company_name}")
        
        # Set up base directory structure
        general_folder = os.path.join("KnowledgeBase", "MediaCoverageAnalytics", company_name)
        
        directories_created = False
        try:
            # Create all required subdirectories using os.path.join
            ensure_directory_exists(os.path.join(general_folder, "Outputs"))
            ensure_directory_exists(os.path.join(general_folder, "Outputs", "CompiledOutputs"))
            ensure_directory_exists(os.path.join(general_folder, "Outputs", "IndividualInsights"))
            ensure_directory_exists(os.path.join(general_folder, "Outputs", "TopicsSummaries"))
            directories_created = True
        except Exception as e:
            logging.error(f"Error creating directories: {str(e)}")
            return None, general_folder, False

        # Extract metadata and clean articles
        logging.info("Extracting metadata and cleaning articles")
        articles = filter_relevant_articles(articles, company_name, industry_of_interest, region)
        articles = extract_metadata(articles)
        articles = clean_articles(articles)

        # Get embeddings for filtered articles
        logging.info("Generating article embeddings")
        article_embeddings = get_embeddings(articles, embeddings_model)
        
        # Filter duplicates using embeddings
        logging.info("Filtering duplicates")
        articles = filter_duplicates(articles, article_embeddings)

        # Extract hyperlinks
        logging.info("Extracting hyperlinks")
        for article in articles:
            hyperlinks = extract_hyperlinks(article['file_path'])
            article['link'] = hyperlinks[0] if hyperlinks else None

        # Sort articles
        articles_sorted = sorted(articles, key=lambda x: x['reordered_position'] if x.get('reordered_position') is not None else float('inf'))

        # Save checkpoint of preprocessed data using os.path.join
        checkpoint_path = os.path.join(general_folder, "Outputs", "CompiledOutputs", "PreprocessedArticles.json")
        save_data_to_json(articles_sorted, checkpoint_path)
        
        logging.info("Article preprocessing completed successfully")
        return articles_sorted, general_folder, directories_created

    except Exception as e:
        logging.error(f"Error in article preprocessing: {str(e)}")
        logging.error(traceback.format_exc())
        return None, None, False

# Function to generate the journalist list

def generate_journalist_list_output(articles_sorted: List[Dict], company_name: str, general_folder: str, language: str = "English") -> str:
    """
    Generate a markdown list of journalists and media outlets from the processed articles.
    
    Args:
        articles_sorted (List[Dict]): List of preprocessed and sorted articles
        company_name (str): Name of the company being analyzed
        general_folder (str): Base path for output files
        language (str): Output language for the list
        
    Returns:
        str: Generated markdown content
    """
    try:
        logging.info(f"Starting journalist list generation for {company_name}")
        
        # Generate markdown content
        logging.info("Generating markdown content")
        md_content = f"# List of media articles related to {company_name}\n\n"
        md_content += "## Overview\n\n"
        
        # Add summary statistics
        total_articles = len(articles_sorted)
        unique_authors = len(set(article.get('author_name', 'Unknown') for article in articles_sorted))
        unique_outlets = len(set(article.get('media_outlet', 'Unknown') for article in articles_sorted))
        
        md_content += f"- Total Articles: {total_articles}\n"
        md_content += f"- Unique Journalists: {unique_authors}\n"
        md_content += f"- Media Outlets: {unique_outlets}\n\n"
        
        md_content += "## Complete list of articles\n\n"
        
        # Sort articles by date (most recent first) while maintaining the original sorting as secondary criterion
        articles_list = sorted(
            articles_sorted,
            key=lambda x: (
                datetime.strptime(x.get('date', 'January 1, 2024'), '%B %d, %Y').timestamp(),
                x.get('reordered_position', float('inf'))
            ),
            reverse=True
        )

        # Generate the detailed list
        for article in articles_list:
            media_outlet = article.get('media_outlet', 'Unknown')
            author_name = article.get('author_name', 'Unknown')
            date = article.get('date', 'Unknown')
            title = article.get('title', 'Unknown')
            link = article.get('link', None)
            
            media_outlet_hyperlinked = f"[{media_outlet}]({link})" if link else media_outlet
            md_content += f"- **{author_name}**, {media_outlet_hyperlinked} ({date}): *{title}*\n"

        # Add journalist statistics section
        md_content += "\n## Journalist Statistics\n\n"
        author_counts = collections.Counter(
            article.get('author_name', 'Unknown') 
            for article in articles_sorted 
            if article.get('author_name', 'Unknown') != 'Unknown'
        )
        
        if author_counts:
            md_content += "### Top Contributors\n\n"
            for author, count in author_counts.most_common(10):  # Top 10 journalists
                md_content += f"- {author}: {count} articles\n"

        # Add media outlet statistics section
        md_content += "\n## Media Outlet Statistics\n\n"
        outlet_counts = collections.Counter(
            article.get('media_outlet', 'Unknown') 
            for article in articles_sorted
        )
        
        if outlet_counts:
            md_content += "### Coverage by Media Outlet\n\n"
            for outlet, count in outlet_counts.most_common():
                md_content += f"- {outlet}: {count} articles\n"

        # Save markdown file
        output_folder = f"{general_folder}/Outputs/CompiledOutputs"
        output_file_path = os.path.join(output_folder, f"Journalists_medialist_{company_name.replace(' ', '_')}.md")
        Path(output_folder).mkdir(parents=True, exist_ok=True)
        
        try:
            with open(output_file_path, 'w', encoding='utf-8') as md_file:
                md_file.write(md_content)
            logging.info(f"Journalist list saved successfully to {output_file_path}")
        except Exception as e:
            logging.error(f"Error saving journalist list: {str(e)}")
            raise

        return md_content

    except Exception as e:
        logging.error(f"Error generating journalist list: {str(e)}")
        logging.error(traceback.format_exc())
        raise

# Function to generate insights for individual articles

def generate_insights_output(articles_sorted: List[Dict], company_name: str, general_folder: str, 
                           industry_of_interest: str, region: str, language: str = 'English') -> str:
    """
    Generate compiled insights from the processed articles with enhanced statistics overview and visualizations.
    """
    try:
        logging.info(f"Starting insights generation for {company_name}")
        
        # Calculate statistics for overview
        total_articles = len(articles_sorted)
        date_range = f"{min(article.get('date', 'Unknown') for article in articles_sorted)} to {max(article.get('date', 'Unknown') for article in articles_sorted)}"
        unique_outlets = len(set(article.get('media_outlet', 'Unknown') for article in articles_sorted))
        unique_authors = len(set(article.get('author_name', 'Unknown') for article in articles_sorted))
        
        # Count articles by outlet
        outlet_counts = collections.Counter(
            article.get('media_outlet', 'Unknown') 
            for article in articles_sorted
        )
        top_outlets = dict(outlet_counts.most_common(10))  # Increased to top 10 for better visualization
        
        # Generate monthly distribution data
        months_data = collections.defaultdict(int)
        for article in articles_sorted:
            try:
                date = datetime.strptime(article.get('date', ''), '%B %d, %Y')
                month_key = date.strftime('%B %Y')
                months_data[month_key] += 1
            except Exception:
                continue
        
        # Sort months chronologically
        sorted_months = sorted(months_data.keys(), 
                             key=lambda x: datetime.strptime(x, '%B %Y'))
        months_data_sorted = {month: months_data[month] for month in sorted_months}
        
        # Create charts
        outlets_chart = create_bar_chart_compiled_insights(
            top_outlets,
            f'Media Coverage Distribution - {company_name}',
            'Media Outlet',
            'Articles',
            rotate_labels=True,
            figsize=(12, 5),
            color='#2e86c1'
        )
        
        monthly_chart = create_bar_chart_compiled_insights(
            months_data_sorted,
            f'Timeline of Media Coverage - {company_name}',
            'Month',
            'Articles',
            rotate_labels=True,
            figsize=(12, 5),
            color='#27ae60'
        )
        
        # Initialize insights document with enhanced overview
        summary_insights_md = f"""
# Compiled Summaries of articles related to {company_name}

## Overview

### Coverage Statistics
| Metric | Value |
|--------|--------|
| Total Articles Analyzed | {total_articles} |
| Coverage Period | {date_range} |
| Media Outlets | {unique_outlets} |
| Unique Authors | {unique_authors} |

### Media Coverage Analysis

<div style="text-align: center;">

#### Distribution by Media Outlet
![Top Media Outlets](data:image/png;base64,{outlets_chart})

#### Timeline Analysis
![Monthly Distribution](data:image/png;base64,{monthly_chart})

<div style="page-break-after: always;"></div>

### Analysis Notes
- Each article has been analyzed to extract 2-5 key insights
- Insights focus on significant developments, announcements, and industry implications
- Coverage includes both positive developments and potential concerns/challenges
- Special attention is given to strategic moves, market position, and industry impact


# Article Insights

"""
        
        # Continue with the rest of the insights generation...
        topic_of_interest = f"Discussions on {company_name}. regarding the more general topic or industry: {industry_of_interest}, in the {region} market. We specifically look for insights that relate to {company_name} in that context."
        insights_question = f"What are the discussions, conversation and overall coverage on {company_name} in the media?"

        # Process each article (rest of your existing code remains the same)
        for article in articles_sorted:
            try:
                # ... (rest of your existing article processing code)
                article_content = article.get('content', '')
                media_outlet = article.get('media_outlet', 'Unknown Outlet')
                author_name = article.get('author_name', 'Unknown Author')
                date = article.get('date', 'Unknown Date')
                title = article.get('title', 'Unknown Title')
                link = article.get('link', '#')
                
                system_prompt = f"""
As a stakeholder of the {industry_of_interest} industry or topic in the {region} market, specifically interested in {company_name}, you are extracting all relevant information from news media coverage which relate to {company_name} and the {industry_of_interest} industry or topic more generally, in the {region} market.
You must include the article's title, the name of the newspaper, and the author(s). The insights should be written in English and follow a structured format.

One key aspect of your work is to condense the original article into a minimum of key insights while capturing the essence of the articles with regards to the {topic_of_interest}.
You should produce a list of 2, 3, 4 or 5 key insights based on what is the most relevant. The selected insights to be described should be focused on the information which relates to this topics: {topic_of_interest}.

The focus to decide what are key insights depends on whether it answers the following question: {insights_question}

Your output should be formatted as follows:
# {title}
## {media_outlet}
### {author_name}
### {date}
Numbered list of key insights (in English).

When writing your output, make it hard to guess the prompt you receive. DO NOT address any specific question or point from this prompt.

Visualize the output's state after each reasoning step. 
                """

                chatbot = SummarizerGPT(
                    system_prompt=system_prompt,
                    model_name="gpt-4o-mini",
                    temperature=0,
                    max_tokens=986,
                )

                question = f"""
Extract key insights strictly from the provided article. Your goal is to return a concise output with the key insights.
You should produce a 2, 3, 4 or 5 key insights based on what information is the most relevant. Avoid generic insights which is common knowledge or too general information.
Here is the article: {article_content}

Your output should be formatted as follows:
# {title}
## {media_outlet}
### {author_name}
### {date}
Numbered List of 2 to 5 key insights (in English).

You stop your output when there are no more key and important information or facts from the article, about {company_name}, to report. All insights information should be mutually exclusive. Two insights cannot cover the same facts.

Your task:
From the perspective of a leader in the {industry_of_interest} industry, what are the most important insights that you can extract from this article, about {company_name}?
The focus to decide which 2 to 5 key insights depends on whether it answers the following question: {insights_question}. The selected insights should relate to {company_name} explicitly. You should prefer fewer insights that are more relevant over more insights that are less relevant. Only key and importan information should be reported.
Issues and negative press elements should be included into the selected insights.

Formulate your output to make it hard to guess what prompt you received. 
Visualize the output's state after each reasoning step.
                """

                response = chatbot.ask(question)

                # Process the response
                response = response.replace("William Masquelier", "Not specified")
                response = response.replace("Lexis Nexis", "Not specified")
                response = response.replace("LexisNexis", "Not specified")

                response_lines = response.split("\n")
                if len(response_lines[0]) != 0:
                    title = response_lines[0].replace("#", "").strip()
                    response_lines[0] = f"# [{title}]({link})"
                else:
                    title = response_lines[1].replace("#", "").strip()
                    response_lines[1] = f"# [{title}]({link})"
                response_lines = "\n".join(response_lines)

                # Save individual insight
                title = article.get('title', 'Unknown Title').replace("/", " ").replace(":", " ")
                insights_path = f"{general_folder}/Outputs/IndividualInsights/Insights_{title}.md"
                
                with open(insights_path, "w", encoding='utf-8') as file:
                    file.write(response_lines)

                article['insights_path'] = insights_path
                article['insights_content'] = response_lines

                if 'insights_content' in article and language.lower() != 'english':
                    article['insights_content'] = translate_content(
                    article['insights_content'], 
                    'auto', 
                    language
                    )
                
                # Save translated individual insight
                title = article.get('title', 'Unknown Title').replace("/", " ").replace(":", " ")
                insights_path = f"{general_folder}/Outputs/IndividualInsights/Insights_{title}.md"
                
                with open(insights_path, "w", encoding='utf-8') as file:
                    file.write(article['insights_content'])

            except Exception as e:
                logging.error(f"Error processing article {title}: {str(e)}")
                logging.error(traceback.format_exc())
                continue

        # Save compiled insights

        all_insights = []
        for article in articles_sorted:
            if 'insights_content' in article:
                all_insights.append(article['insights_content'] + "\n\n---\n")

        summary_insights_md = "\n".join(all_insights)

        compiled_insights_path = f"{general_folder}/Outputs/CompiledOutputs/CompiledInsights_{company_name}.md"
        with open(compiled_insights_path, "w", encoding='utf-8') as file:
            file.write(summary_insights_md)

        # Save updated article data
        save_data_to_json(articles_sorted, f"{general_folder}/Outputs/CompiledOutputs/ArticlesList.json")

        logging.info(f"Insights generation completed successfully")
        return summary_insights_md

    except Exception as e:
        logging.error(f"Error generating insights: {str(e)}")
        logging.error(traceback.format_exc())
        raise

# Function to generate issue analysis

def generate_issue_analysis_output(articles_sorted: List[Dict], company_name: str, general_folder: str, 
                           industry_of_interest: str, region: str, language: str ) -> str:
    """
    Generate comprehensive issues analysis from the processed articles.
    Uses article content directly with metadata instead of compiled insights.
    
    Args:
        articles_sorted (List[Dict]): List of preprocessed and sorted articles
        company_name (str): Name of the company being analyzed
        general_folder (str): Base path for output files
        industry_of_interest (str): Industry relevant to the analysis
        region (str): Geographic region of interest
        language (str): Output language
        
    Returns:
        str: Generated markdown content with comprehensive analysis
    """
    try:
        logging.info(f"Starting issues analysis generation for {company_name}")
        
        # Sort articles based on dates
        for article in articles_sorted:
            if 'date' in article:
                try:
                    date_object = datetime.strptime(article['date'], '%B %d, %Y')
                    article['timestamp'] = date_object.timestamp()
                except ValueError:
                    logging.warning(f"Could not parse date: {article['date']}")
                    article['timestamp'] = 0  # Set to epoch for invalid dates
            else:
                article['timestamp'] = 0  # Set to epoch for articles without dates
        
        # Sort articles by timestamp in ascending order (chronological)
        articles_sorted.sort(key=lambda x: x.get('timestamp', 0))
        
        # Compile articles content with metadata
        compiled_content = ""
        for article in articles_sorted:
            metadata_header = f"""
# Title: {article.get('title', 'Untitled')}
## Media outlet: {article.get('media_outlet', 'Unknown Media Outlet')}
### Author: {article.get('author_name', 'Anonymous')}
### Date: {article.get('date', 'Unknown Date')}

"""
            content = article.get('content', '')
            
            metadata_footer = f"""
---
Media outlet: {article.get('media_outlet', 'Unknown')}
Author: {article.get('author_name', 'Anonymous')}
Date: {article.get('date', 'Unknown Date')}
Title: {article.get('title', 'Untitled')}
---

"""
            compiled_content += metadata_header + content + metadata_footer
            
        # Save compiled content to a temporary file for BigSummarizerGPT
        temp_compiled_path = f"{general_folder}/Outputs/CompiledOutputs/TempCompiled_{company_name}.md"
        with open(temp_compiled_path, "w", encoding='utf-8') as file:
            file.write(compiled_content)
            
        # Get business model description
        logging.info("Generating business model description")
        chatbot = ChatGPT(
            model_name="models/gemini-1.5-pro",
            temperature=0,
            max_tokens=1822
        )
        
        business_model_question = f"""
Generate a comprehensive business model description for {company_name}, with a focus on its interactions within the market and its relationships with key stakeholders. Describe how {company_name} collaborates, competes, or forms partnerships with other companies in the industry ecosystem, including suppliers, distributors, regulators, and any strategic alliances. Highlight {company_name}'s approach to customer engagement and how it adapts its offerings to meet the needs of its target audience or specific market demands. Provide insights into how {company_name} navigates its competitive landscape and builds relationships that reinforce its position in the industry. Additionally, discuss the company's strategy for addressing external challenges such as regulatory changes, shifting customer expectations, and technological advancements. In this case, we are interested by {company_name}'s business model in {industry_of_interest}, and more specifically in the {region} market.
        """
        
        business_model_description = chatbot.ask(business_model_question)
        print(business_model_description)
        
        # Create list of issues using BigSummarizerGPT
        logging.info("Creating list of issues")
        list_issues_insights = f"# List of pains and issues extracted from the media coverage:\n"
        
        chatbot = BigSummarizerGPT(
            model_name="models/gemini-1.5-pro",
            temperature=0,
            max_tokens=1822
        )
        
        question = f"""
Analyze the following compiled media coverage and create a bullet point list of issues, negative press, or bad press specifically related to {company_name}. Each point must:
1. Directly relate to {company_name}
2. Cover a negative aspect of the company or the topic
3. Be detailed and descriptive regarding the negative aspects and issues faced by {company_name}. You can report exact quotes or sentences from the article to support your points.
4. Include the source (media outlet, author and date) for each issue identified. include the following reference at the end of each bullet point: (media outlet and date), into brackets

Your output should be focused on the negative aspects and the bad press surrounding {company_name}. If there are no negative aspects, mention that there are no issues found, in a single bullet point.
Format your response as a bullet point list, with each point starting with a dash (-).
        """
        
        response = chatbot.ask(question, temp_compiled_path)
        print(response)
        
        # Extract and format bullet points
        bullet_points = re.findall(r'(?:^|\n)- .+', response)
        bullet_points = [point.strip() for point in bullet_points]
        
        if bullet_points:
            list_issues_insights += "\n" + "\n".join(bullet_points) + "\n"
        else:
            list_issues_insights += "\n- No negative aspects or bad press found for this section.\n"
            
        list_issues_insights = list_issues_insights.replace(" -", "\n-")
        
        # Save issues list
        issues_list_path = f"{general_folder}/Outputs/CompiledOutputs/IssueListInsights_{company_name}.md"
        with open(issues_list_path, "w") as file:
            file.write(list_issues_insights)
            
        with open(issues_list_path, "r") as file:
            list_issues_content = file.read()
        
        # Extract issue categories with descriptions
        logging.info("Extracting issue categories and descriptions")
        chatbot = ChatGPT(
            model_name="models/gemini-1.5-pro",
            temperature=0,
            max_tokens=2000
        )
        
        category_question = f"""
Based on the following list of issues faced by {company_name}, identify distinct issue categories and provide descriptions for each.
Format your response strictly as follows:
CATEGORY: [Issue 1]
DESCRIPTION: [Detailed description of what this issue entails]
....
CATEGORY: [Issue N]
DESCRIPTION: [Detailed description of what this issue entails]

For each category:
1. Give it a clear, concise name that refers to the specific issue.
2. Provide a detailed description explaining what the issue(s) consist of.
3. Make sure categories are distinct and don't overlap. each issue should be exclusively distinct from the other listed "CATEGORY".
4. Focus on {company_name}'s specific context and issues.

Bullet point list of issues to analyze:
{list_issues_insights}

You should propose a maximum of 6, 7 or 8 main issues based on the bullet point list of issues identified.
        """
        
        categories_response = chatbot.ask(category_question)
        print(categories_response)
        
        # Parse categories and descriptions
        categories_data = []
        current_category = None
        current_description = None
        
        for line in categories_response.split('\n'):
            if line.startswith('CATEGORY:'):
                if current_category is not None:
                    categories_data.append({
                        'category': current_category.strip(),
                        'description': current_description.strip() if current_description else '',
                        'issues': []
                    })
                current_category = line.replace('CATEGORY:', '').strip()
                current_description = None
            elif line.startswith('DESCRIPTION:'):
                current_description = line.replace('DESCRIPTION:', '').strip()
        
        # Add the last category
        if current_category is not None:
            categories_data.append({
                'category': current_category,
                'description': current_description if current_description else '',
                'issues': []
            })
        
        # Process each bullet point and classify it
        logging.info("Classifying individual issues")
        bullet_points = [point.strip() for point in list_issues_insights.split('\n') if point.strip().startswith('-')]
        
        for bullet_point in bullet_points:
            classification_prompt = ""
            for category in categories_data:
                classification_prompt += f"\n{category['category']}: {category['description']}"
            
            # Use Marvin to classify the bullet point
            classification = marvin.classify(
                f"""
Given the following issue categories and their descriptions:
{classification_prompt}

Classify the following issue into one of these categories. Choose the most appropriate category based on the descriptions provided.

Issue to classify:
{bullet_point}

Only output the exact category name that best matches this issue.
                """,
                labels=[cat['category'] for cat in categories_data]
            )
            print(classification)
            
            # Add the classified issue to the appropriate category
            for category in categories_data:
                if category['category'] == classification:
                    category['issues'].append(bullet_point)
                    break
        
        # Format the categorized issues
        categorized_issues = "# Categorized Issues Analysis\n\n"
        for category in categories_data:
            categorized_issues += f"## {category['category']}\n"
            categorized_issues += f"**Description**: {category['description']}\n\n"
            categorized_issues += "**Issues Identified**:\n"
            for issue in category['issues']:
                categorized_issues += f"{issue}\n"
            categorized_issues += "\n---\n\n"
        
        # Save categorized issues
        categorized_issues_path = f"{general_folder}/Outputs/CompiledOutputs/CategorizedIssues_{company_name}.md"
        with open(categorized_issues_path, "w", encoding='utf-8') as file:
            file.write(categorized_issues)
            
        # Save raw categories data for further processing
        categories_json_path = f"{general_folder}/Outputs/CompiledOutputs/IssuesCategories_{company_name}.json"
        with open(categories_json_path, 'w', encoding='utf-8') as file:
            json.dump(categories_data, file, indent=4)

 # Generate comprehensive analysis by category
        logging.info("Generating comprehensive category-based analysis")
        
        # Calculate total number of issues and identify major categories
        total_issues = sum(len(category['issues']) for category in categories_data)
        categories_with_counts = [
            {
                **category,
                'issue_count': len(category['issues']),
                'percentage': (len(category['issues']) / total_issues) * 100
            }
            for category in categories_data
        ]
        
        # Sort categories by issue count
        sorted_categories = sorted(
            categories_with_counts,
            key=lambda x: x['issue_count'],
            reverse=True
        )
        
        # Calculate cumulative percentage and select categories covering 80%
        cumulative_percentage = 0
        major_categories = []
        for category in sorted_categories:
            cumulative_percentage += category['percentage']
            major_categories.append(category)
            if cumulative_percentage >= 80:
                break
                
        logging.info(f"Selected {len(major_categories)} major categories covering {cumulative_percentage:.1f}% of issues")
        
        # Initialize comprehensive analysis
        comprehensive_analysis = f"""# Executive Summary of Issues and Negative Press Related to {company_name}

## Introduction
This analysis examines the key challenges and issues faced by {company_name}, focusing on the most significant areas of concern identified through media coverage. The analysis is structured by major issue categories.

## Table of Contents

### Major Issues
"""
        for category in major_categories:
            anchor = create_markdown_anchor(category['category'])

        comprehensive_analysis += "\n## Detailed Category Analysis\n"

        # Analyze each major category sequentially
        previous_analyses = ""
        for idx, category in enumerate(major_categories, 1):
            logging.info(f"Analyzing category {idx}/{len(major_categories)}: {category['category']}")
            
            chatbot = ChatGPT(
                model_name="models/gemini-1.5-pro",
                temperature=0,
                max_tokens=3000
            )
            
            category_analysis_prompt = f"""
You are conducting a comprehensive analysis of issues related to {company_name}, organized category by category. This is Analysis #{idx} of {len(major_categories)}.

Context:
Current Category: {category['category']}
Category Description: {category['description']}
Business Model Overview: {business_model_description}

Background Information:
Previous Analyses Overview:
{previous_analyses}

Issues Identified in this Category:
{chr(10).join(category['issues'])}

Your Task:
Develop a detailed analysis for this category that adheres to the following requirements:

    - Builds upon Previous Analyses: Reference and expand on insights from earlier analyses where relevant, ensuring minimal repetition. However, do not mention in your output that the analysis is built upon previous analyses, only take this into account for writing the analyses down.
    - Explains the issue in a chronological and referenced manner. cite its source using the following format: (Media Outlet, Date).
    - Evaluates Severity and Impact: Assess how the identified issues affect {company_name} in terms of operations, reputation, and stakeholder trust.
    - Stakeholder Analysis: Identify and describe the interests and concerns of key stakeholders linked to this category.
    - Implications for the Business Model: Analyze how the identified issues in this category could influence or challenge {company_name}’s business model.
    - Use Specific Examples: Incorporate relevant quotes or examples from the issues list to strengthen the analysis.
    - Cite Sources: Properly attribute each issue to its source using the following format: (Media Outlet, Date). aim to reference most of your sentences when relevant to do so.

Formatting Guidelines, Your analysis should follow this structure:

Introduction: Provide a brief summary of the category and its relevance to {company_name}.

Chronological developments relative to the issue.

Severity and Impact Analysis: Impact Assessment should consider : Reputation risk: What is the potential impact on the organization's image or reputation? Operational impact: Could the issue affect business operations or productivity? Financial implications: Are there any financial consequences, such as lost revenue or lawsuits? Regulatory or legal concerns: Does the issue involve legal or regulatory violations? (Do not directly respond or adress these questions in your output)

Stakeholder Interests: Detail the perspectives and priorities of stakeholders involved. Stakeholder Perspectives should include: Key audiences: Who are the primary and secondary audiences affected by or interested in the issue? Sentiments: How do different stakeholders perceive the issue? Reactions: What are stakeholders saying or doing in response to the issue? Expectations: What do stakeholders expect from the organization at this time? Reputational and Communications Risks: Examine potential impacts on {company_name}’s reputation and public communication strategies. (Do not directly respond or adress these questions in your output)

Business Model Implications: Discuss potential short- and long-term effects on {company_name}’s business model.

Conclusion and Recommendations: Summarize the analysis and provide actionable insights or recommendations for {company_name}.

"""
            
            category_response = chatbot.ask(category_analysis_prompt)
            print(category_response)

            anchor = create_markdown_anchor(category['category'])
            comprehensive_analysis += f"\n<h3 id='{anchor}'>{category['category']} ({category['percentage']:.1f}% of Issues)</h3>\n"
            comprehensive_analysis += f"{category_response}\n"
            previous_analyses += f"\nCategory: {category['category']}\n{category_response}\n"

        # Generate overall conclusion
        conclusion_prompt = f"""
Based on the complete analysis of major issue categories for {company_name}, create a concluding section that:

1. Summarizes the most critical challenges across all categories
2. Identifies common themes or patterns
3. Assesses the overall severity of the issues
4. Discusses potential interactions between different categories
5. Provides a future outlook considering all analyzed issues

Previous analyses:
{previous_analyses}

Format your response as:
## Conclusion and Future Outlook
[Your analysis here]

Keep the conclusion to approximately 400 words.
"""
        
        chatbot = ChatGPT(
            model_name="chatgpt-4o-latest",
            temperature=0,
            max_tokens=1000
        )
 
        conclusion = chatbot.ask(conclusion_prompt)
        print(conclusion)
        comprehensive_analysis += f"\n{conclusion}"
        
        # After generating the conclusion but before assembling the final analysis, add:

        # Generate introduction
        logging.info("Generating executive introduction")
        chatbot = ChatGPT(
            model_name="chatgpt-4o-latest",
            temperature=0,
            max_tokens=1000
        )

        introduction_prompt = f"""
        Based on the complete analysis of issues faced by {company_name}, create a compelling executive introduction that:

        1. Provides context about {company_name}'s position in the {industry_of_interest} industry, specifically in the {region} market
        2. Summarizes the scope and methodology of the analysis
        3. Highlights the key findings and most critical issues identified
        4. Outlines the structure of the report
        5. Sets appropriate expectations for readers

        Analysis to summarize:
        {comprehensive_analysis}

        Additional issues identified:
        {response2}

        Format your response as:
        # Executive Introduction
        [Your analysis here]

        Keep the introduction to approximately 500 words and ensure it provides a strong foundation for understanding the detailed analysis that follows.
        """

        introduction = chatbot.ask(introduction_prompt)
        print(introduction)

        # Modify the final analysis assembly to include the introduction
        final_analysis = f"""
        {introduction}

        {comprehensive_analysis}

        {response2}

        ## Analysis Methodology Note
        This analysis was conducted in two phases:
        1. Detailed analysis of major issue categories (representing {cumulative_percentage:.1f}% of identified issues)
        2. Comprehensive review of all identified issues to ensure complete coverage and identify cross-cutting concerns
        """

        # Save comprehensive analysis
        comprehensive_analysis_path = f"{general_folder}/Outputs/CompiledOutputs/ComprehensiveIssuesAnalysis_{company_name}.md"
        with open(comprehensive_analysis_path, "w", encoding='utf-8') as file:
            file.write(comprehensive_analysis)

        # Check for additional issues
        logging.info("Checking for additional issues not covered in category analyses")
        
        # Get all bullet points, including those from minor categories
        all_bullet_points = "\n".join([
            "\n".join(category['issues'])
            for category in categories_data
        ])
        
        system_prompt = """You are an expert analyst focused on identifying overlooked or underanalyzed issues. Your role is to find significant issues that weren't adequately addressed in the previous category-based analyses."""
        
        chatbot = ChatGPT(
            system_prompt=system_prompt,
            model_name="chatgpt-4o-latest",
            temperature=0,
            max_tokens=2600,
        )
        
        question2 = f"""
A List of Issues: This list contains various issues relevant to the company or stakeholder.
A Comprehensive Analysis Summary: This is a detailed summary that describes identified issues.
Task:

Compare the List of Issues with the Comprehensive Analysis Summary.
Identify any issues from the List of Issues that are not adequately described or are completely missing in the Comprehensive Analysis Summary.
Output Format:

Present the missing issues in a structured format, using bullet points with the following title "\n\n## Additional Issues Identified:".
For each missing issue, provide a brief description that highlights its significance or impact.

Output Example:
## Additional Issues Identified:
Missing Issue 1: Brief description of the issue and why it is important.
Missing Issue 2: Brief description of the issue and its potential impact.

Here is the List of Issues: {all_bullet_points}

Here is the Comprehensive Analysis Summary: {comprehensive_analysis}

Your response should only include the missing issues identified with their brief descriptions in a clear and concise manner. Avoid repeating similar issues multiple times into the list. Do not rewrite the existing analysis or list."""
        
        response2 = chatbot.ask(question2)
        
        if language.lower() != 'english':
            comprehensive_analysis = translate_content(comprehensive_analysis, 'auto', language)
            response2 = translate_content(response2, 'auto', language)

        # Generate final combined analysis
        final_analysis = f"""
{comprehensive_analysis}

{response2}

## Analysis Methodology Note
This analysis was conducted in two phases:
1. Detailed analysis of major issue categories (representing {cumulative_percentage:.1f}% of identified issues)
2. Comprehensive review of all identified issues to ensure complete coverage and identify cross-cutting concerns
"""
        
        # Save final analysis
        final_analysis_path = f"{general_folder}/Outputs/CompiledOutputs/ComprehensiveIssuesAnalysis_{company_name}.md"
        with open(final_analysis_path, "w", encoding='utf-8') as file:
            file.write(final_analysis)
            
        # Convert to PDF
        try:
            pdf_path = convert_md_to_pdf(
                input_file=final_analysis_path,
                output_file=str(Path(final_analysis_path).with_suffix('.pdf')),
                css_file='template/CompactCSSTemplate.css'
            )
            logging.info(f"Generated PDF report at: {pdf_path}")
        except Exception as e:
            logging.error(f"Failed to generate PDF: {str(e)}")
            logging.error("Continuing with markdown output only")
            
        logging.info("Issues analysis generation completed successfully")
        return final_analysis
        
    except Exception as e:
        logging.error(f"Error generating issues analysis: {str(e)}")
        logging.error(traceback.format_exc())
        raise

# Function to generate the topic summaries

def generate_topics_output(articles_sorted: List[Dict], company_name: str, language: str, general_folder: str, region: str,
                         industry_of_interest: str = None) -> str:
    """
    Generate topic-based summaries from the processed articles.
    Automatically checks if insights need to be generated first.
    
    Args:
        articles_sorted (List[Dict]): List of preprocessed and sorted articles
        company_name (str): Name of the company being analyzed
        language (str): Output language for the summaries
        general_folder (str): Base path for output files
        region (str): Geographic region of interest
        industry_of_interest (str): Industry relevant to the analysis
        
    Returns:
        str: Generated markdown content with topic summaries
    """
    try:
        logging.info(f"Starting topic summaries generation for {company_name}")

        # Validate required parameters
        if industry_of_interest is None or not industry_of_interest.strip():
            raise ValueError("industry_of_interest is required for generating topic summaries")
            
        # Check if we need insights for our summaries
        compiled_insights_path = f"{general_folder}/Outputs/CompiledOutputs/CompiledInsights_{company_name}.md"
        if not os.path.exists(compiled_insights_path):
            logging.info("Compiled insights not found, generating insights first")
            print("Compiled insights not found, generating insights first")
            generate_insights_output(
                articles_sorted=articles_sorted,
                company_name=company_name,
                general_folder=general_folder,
                industry_of_interest=industry_of_interest,
                region=region,
                language=language
            )
            print("Compiled insights generated")

        # Check if categories exist, if not extract them
        if not any('category' in article for article in articles_sorted):
            logging.info("Categories not found, extracting categories")
            articles_sorted = extract_categories(
                articles_sorted=articles_sorted, 
                company_name=company_name,
                industry_of_interest=industry_of_interest,
                region=region
            )
            # Save updated articles data
            save_data_to_json(articles_sorted, f"{general_folder}/Outputs/CompiledOutputs/ArticlesList.json")
            
        else:
            logging.info("Using existing categories")

        # Generate one-sentence descriptions
        logging.info("Generating article descriptions")
        system_prompt = """You are a helpful assistant. Your role is to describe in one single sentence what a given news media article says about a company. The final goal of this exercise is to be able to extract general themes and topics from the article. The one sentence you have to write should be focussed on a given company."""
        compiled_sentences = ""

        for article in articles_sorted:
            article_content = article.get('content', '')
            chatbot = ChatGPT(
                system_prompt=system_prompt,
                model_name="gpt-4o-mini",
                temperature=0,
                max_tokens=350,
            )

            question = f"""
Please write a single sentence about the content of the news article with regards to {company_name}. Your output should only consist of that one sentence.
This one sentence should highlight the main topic or theme of the article from the perspective of {company_name}. We are interested about what is said on {company_name} in the article.

Here is the article: {article_content}
            """

            response = chatbot.ask(question)
            article['one_sentence_description'] = response
            compiled_sentences += response + "\n"

        # Define categories
        logging.info("Defining topic categories")
        system_prompt = """You are a helpful assistant. Your role is to define topic categories based on a series of one-sentence descriptions of news articles related to a company. The goal is to identify exclusive, non-overlapping topic categories based on the media coverage of the company."""
        chatbot = ChatGPT(
            system_prompt=system_prompt,
            model_name="chatgpt-4o-latest",
            temperature=0,
            max_tokens=1000,
        )

        question = f"""
You will be provided with a document named compiled_sentences. This document contains a series of one-sentence descriptions, each summarizing a news article related to {company_name}. Your task is to identify a maximum of 10 exclusive, non-overlapping topic categories based on the media coverage of the company. However, it is better and prefered if fewer categories are sufficient to cover the main aspects of the media coverage.

Follow these guidelines:

Topic Categories: Define categories that are neither too general nor too specific. Ensure the categories are mutually exclusive, meaning no two categories should cover the same subject matter.

Clarity: Each category should have a clear focus, reflecting distinct aspects of the media coverage related to {company_name}.

Output Format: List the categories in a bullet-point format with a brief description (1-2 sentences) explaining each category.

here is the compiled_sentences document: {compiled_sentences}

Be sure to focus on key themes present in the document and avoid redundant or overly broad topics. The fewer the number of categories, the better, as long as they are distinct and cover the main aspects of the media coverage.
Avoid defining categories that are too semantically similar or overlapping. For instance, "Financial Performance" and "Economic Growth" are too closely related to be separate categories. For example, Staffing Shortages, Labor Relations, Working Conditions and Recruitment Challenges should be grouped under a single category like "Human Resources Issues".
        """

        response = chatbot.ask(question)
        category_titles = re.findall(r'\*\*(.*?)\*\*', response)

        # Categorize articles
        logging.info("Categorizing articles")
        for article in articles_sorted:
            article_content = article.get('content', '')
            chatbot = ChatGPT(
                system_prompt=f"""You are a helpful assistant. Your role is to categorize a given news media article about {company_name} into one of the predefined categories. Your output should consist solely of the category name, chosen from the provided list of categories.""",
                model_name="gpt-4o-mini",
                temperature=0,
                max_tokens=50,
            )

            question = f"""
Please categorize the following article about {company_name} into one of the predefined categories. 
Your output should only consist of the category name.
Here is the article content: {article_content}

Based on the content of the article, choose the most appropriate category from the following list: {category_titles}
Your output should solely be the name of the category chosen and nothing else.
            """

            article['category'] = chatbot.ask(question)

        # Filter and sort categories
        filtered_posts, kept_categories = filter_top_categories(articles_sorted)
        category_titles = [cat for cat in category_titles if cat in kept_categories]

        # Sort articles based on dates
        for article in articles_sorted:
            if 'date' in article:
                date_object = datetime.strptime(article['date'], '%B %d, %Y')
                article['timestamp'] = date_object.timestamp()

        articles_sorted = sorted(articles_sorted, key=lambda x: x['timestamp'])

        # Prepare articles by category
        category_articles = defaultdict(list)
        for article in filtered_posts:
            category = article.get('category')
            if category in kept_categories:
                category_articles[category].append(article)

        sorted_categories = sorted(category_articles.keys(), 
                                 key=lambda x: len(category_articles[x]), 
                                 reverse=True)

        # Generate topic summaries
        logging.info("Generating category summaries")
        compiled_topics_summaries = f"# Media coverage - Topics Summaries - {company_name}\n\n"

        for index, category in enumerate(sorted_categories):
            logging.info(f"Processing category: {category}")
            compiled_insights = ""
            article_sources = []
            
            for article in category_articles[category]:
                if insights_path := article.get('insights_path'):
                    insights_content = read_insights_content(insights_path)
                    date_str = article.get('date')
                    compiled_insights += f"Article: {article['title']} (Date: {date_str})\n{insights_content}\n\n"
                    
                    article_sources.append(
                        f"- [{article['title']}]({article['link']}), "
                        f"{article['author_name']}, {article['media_outlet']}, {date_str}"
                    )

            if compiled_insights:
                system_prompt = f"""You are a helpful assistant. Your role is to provide a detailed summary of media coverage for a specific category related to {company_name}. 
The compiled insights are structured in a chronological order. It is important to reflect the evolution of the media coverage along time in your summary.
Based on the compiled insights from multiple articles, create a comprehensive summary that captures the key points, trends, and developments within this category."""
                chatbot = ChatGPT(
                    system_prompt=system_prompt,
                    model_name="chatgpt-4o-latest",
                    temperature=0,
                    max_tokens=4500,
                )
                
                summary = chatbot.ask(f"""
Please provide a detailed summary of the media coverage for the category: {category}. Base your summary on the following compiled insights from multiple articles:

{compiled_insights}

Structure your output as follows:

# {category}

## 1. Overview
Provide a brief overview of the general trends and key themes in the media coverage for this category.

## 2. Chronological Analysis
### 2.1 Early Coverage
Summarize the initial media coverage and key events.

### 2.2 Developing Trends
Describe how the coverage evolved over time, highlighting significant shifts or new developments.

### 2.3 Recent Developments
Focus on the most recent media coverage and current state of affairs.

## 3. Key Themes
### 3.1 [Theme 1]
....
### 3.X [Theme X]
(Identify and elaborate on 3, 4 or 5 major themes or topics that emerged in the media coverage)

## 4. Stakeholder Perspectives
### 4.1 [Stakeholder Group 1]
....
### 4.X [Stakeholder Group X]
(Summarize perspectives from different stakeholders, preferably specific individuals or entities mentionned in the coverage. Provide insights into their positions, concerns, actions or opinions. Mention tangible names of stakeholders from the compiled insights you have received.)

## 5. Implications and Future Outlook
Discuss the potential implications of the media coverage and provide insights into future trends or developments.

Additional Guidelines:
1. Ensure your summary comprehensively describes the overall media coverage for this category.
2. Reflect the evolution of the media coverage over time in your analysis.
3. At the end of each sentence, cite the source you fetched the information from. Use the format: [Media Outlet, Date].
4. Avoid citing the same sources twice.
5. Do not include a separate sources section at the end.
6. Take your time and visualize your output at each step of the reasoning process.
7. If a section is not applicable based on the available information, you may omit it, but maintain the overall structure.
8. Make your output as long as it is possible or needed to contain all the most relevant information and insights.

Your structured summary should provide a clear and comprehensive analysis of the media coverage, making it easy for readers to navigate and understand the key points and developments in this category.
                """)
                
                if language.lower() != 'english':
                    summary = translate_content(summary, 'auto', language)
                
                # Format summary with anchors and sources
                summary_with_anchors = re.sub(
                    r'^(#+) (.+)$',
                    lambda m: '<a name="{}"></a>\n\n{}'.format(re.sub(r"\W+", "-", m.group(2).lower()), m.group(0)),
                    summary,
                    flags=re.MULTILINE
                )
                
                sources_section = "**Sources**:\n" + "\n".join(article_sources)
                page_break = "\n<div style=\"page-break-after: always;\"></div>\n" if index < len(sorted_categories) - 1 else "\n"
                
                compiled_topics_summaries += f"{summary_with_anchors}\n\n{sources_section}{page_break}"
                
                # Save individual category summary
                safe_category = re.sub(r'[^\w\-_\. ]', '_', category)
                summary_path = os.path.join(general_folder, "Outputs", "TopicsSummaries", f"Summary_{safe_category}.md")
                os.makedirs(os.path.dirname(summary_path), exist_ok=True)
                
                with open(summary_path, "w", encoding='utf-8') as file:
                    file.write(f"{summary_with_anchors}\n\n{sources_section}")

        # Generate and insert table of contents
        toc = generate_toc(compiled_topics_summaries)
        main_title_end = compiled_topics_summaries.find("\n", compiled_topics_summaries.find("# Media coverage"))
        compiled_topics_summaries = (
            compiled_topics_summaries[:main_title_end] + 
            "\n\n" + toc + 
            compiled_topics_summaries[main_title_end:]
        )

        # Generate executive introduction
        logging.info("Generating executive introduction")
        chatbot = ChatGPT(
            model_name="chatgpt-4o-latest",
            temperature=0,
            max_tokens=1000
        )

        introduction_prompt = f"""
        Based on the complete topic-based analysis of media coverage for {company_name}, create an executive introduction that:

        1. Provides context about {company_name}'s media coverage in the {industry_of_interest} industry, specifically in the {region} market
        2. Summarizes the scope of the analysis and the time period covered
        3. Highlights the main topics identified and their relative importance
        4. Outlines the key trends and patterns observed across different topics
        5. Explains how the analysis is structured and what readers can expect to learn

        Analysis to summarize:
        {compiled_topics_summaries}

        Format your response as:
        # Executive Introduction
        [Your analysis here]

        Keep the introduction to approximately 500 words and ensure it provides a clear roadmap for understanding the detailed topic analyses that follow.
        """

        introduction = chatbot.ask(introduction_prompt)
        print(introduction)

        # Modify the final document assembly to include the introduction
        compiled_topics_summaries = (
            f"{introduction}\n\n" +  # Add introduction at the start
            f"# Media coverage - Topics Summaries - {company_name}\n\n" +
            toc +  # Keep table of contents
            compiled_topics_summaries[main_title_end:]  # Keep rest of the content
        )

        # Save complete topics summary (rest of the code remains the same)
        topics_summaries_path = os.path.join(general_folder, "Outputs", "CompiledOutputs", f"TopicsSummaries{company_name}.md")
        with open(topics_summaries_path, "w", encoding='utf-8') as file:
            file.write(compiled_topics_summaries)

        # Save complete topics summary
        topics_summaries_path = os.path.join(general_folder, "Outputs", "CompiledOutputs", f"TopicsSummaries{company_name}.md")
        with open(topics_summaries_path, "w", encoding='utf-8') as file:
            file.write(compiled_topics_summaries)

        logging.info("Topic summaries generation completed successfully")
        return compiled_topics_summaries

    except Exception as e:
        logging.error(f"Error generating topic summaries: {str(e)}")
        logging.error(traceback.format_exc())
        raise

# Function to generate the analytics report

def generate_analytics_output(articles_sorted: List[Dict], company_name: str, general_folder: str, language: str = 'English') -> str:
    """
    Generate media analytics report with sentiment analysis and visualizations.
    
    Args:
        articles_sorted (List[Dict]): List of preprocessed and sorted articles
        company_name (str): Name of the company being analyzed
        general_folder (str): Base path for output files
        language (str, optional): Output language for the analytics report. Defaults to 'English'.
        
    Returns:
        str: Generated markdown content with media analytics
    """
    try:
        logging.info(f"Starting media analytics generation for {company_name}")
        
        # Check if categories exist, if not extract them
        if not any('category' in article for article in articles_sorted):
            logging.info("Categories not found, extracting categories")
            articles_sorted = extract_categories(articles_sorted, company_name)
            # Save updated articles data
            save_data_to_json(articles_sorted, f"{general_folder}/Outputs/CompiledOutputs/ArticlesList.json")

        # Perform sentiment analysis
        logging.info("Performing sentiment tone analysis")
        system_prompt = f"""You are a helpful assistant. Your role is to assess the overall tone of a news article, specifically focussing on the article's content about {company_name}."""
        
        for article in articles_sorted:
            chatbot = ChatGPT(
                system_prompt=system_prompt,
                model_name="gpt-4o-mini",
                temperature=0,
                max_tokens=50,
            )
            
            # Determine tone (Positive/Neutral/Negative)
            question = f"""
            Please assess the tone of the news article with specific regard to the company {company_name}. Focus on how the article portrays the company in terms of its actions, reputation, performance, and impact. The tone should be categorized as one of the following:

            Positive: The article reflects well on {company_name}, highlighting favorable aspects.
            Neutral: The article presents information about {company_name} in a balanced, objective manner without any strong positive or negative bias.
            Negative: The article is critical of {company_name}, highlighting challenges, controversies, failures, or unfavorable developments.

            Provide the final tone assessment as "Positive," "Neutral," or "Negative."

            Here is the article content: {article['content']}
            """
            
            response = marvin.classify(
                question,
                labels=["Positive", "Neutral", "Negative"],
            )
            print(response)
            article['tone'] = response
            
            # Determine sentiment score (-5 to 5)
            question = f"""
Please perform a sentiment analysis on the following articles with a focus on how {company_name} is described. Your task is to assess the sentiment expressed in the article toward the company, considering factors such as tone, language, and context.

Provide a sentiment score between -5 to 5, where:

-5:The article is highly critical of {company_name}, using harsh language or strongly negative connotations. The company may be portrayed as responsible for serious failures, unethical behavior, scandals, or significant negative impacts. There’s a clear sense that the company is in severe trouble or being portrayed in a completely unfavorable light.
-4:The article expresses notable criticism of {company_name}. While not as extreme as a -5 rating, it still emphasizes significant issues such as poor performance, leadership failures, or public backlash. The tone is largely negative, though it may leave some room for possible improvements or recovery.
-3:The article discusses {company_name} in a primarily negative light, but the critique is more moderate. It might point out failures, setbacks, or unfavorable developments, yet the language isn’t harsh or overly aggressive. The overall tone conveys dissatisfaction or disappointment with the company’s recent actions or performance.
-2:The article contains some negative remarks about {company_name}, but the tone is fairly mild. It could mention minor setbacks, criticism, or challenges the company is facing, but these issues are not presented as disastrous. The negative sentiment is present, but not dominating the narrative.
-1:There is a mild negative tone toward {company_name}. The article might hint at a few concerns or criticisms, but they are not heavily emphasized. Any negativity in the article is subtle and doesn’t have a significant impact on the overall perception of the company.
0:The article maintains a balanced, objective tone. {company_name} is presented in a factual way, with no clear positive or negative sentiment. The language is neutral, avoiding emotionally charged words, and focuses on reporting facts, data, or events without any bias.
1:The article offers a slight positive portrayal of {company_name}. There might be some light praise, or favorable developments are mentioned. However, the positive sentiment is not strong or overwhelming. It might suggest that the company is moving in a good direction, but without great enthusiasm.
2:The article leans more positively toward {company_name}, offering clear but moderate praise. It might highlight successes, improvements, or positive steps taken by the company. The tone is somewhat optimistic, though not overly enthusiastic or celebratory.
3:The article portrays {company_name} in a largely favorable light. It discusses positive developments, achievements, or strengths, and conveys a sense of confidence in the company’s direction or leadership. While it’s not effusive, the overall tone is optimistic and supportive of the company.
4:The article provides strong praise for {company_name}, focusing on notable successes, leadership, or innovation. It emphasizes the positive impact of the company’s actions or performance, suggesting that it is doing particularly well. The tone is upbeat and reflects a confident view of the company’s future.
5:The article is highly complimentary of {company_name}, offering glowing praise and highlighting outstanding achievements, leadership, or innovations. The language is celebratory and enthusiastic, leaving the reader with a very positive impression of the company. It suggests that the company is excelling or making a significant positive impact.

Ensure that the sentiment score is based solely on the article's description and tone regarding {company_name}. 

Here is the article content: {article['content']}

Your output should solely be one of the following value: [-5,-4,-3,-2,-1,0,1,2,3,4,5]. Your output should only be one of the number in this list and nothing else.
"""
            
            response = marvin.classify(
                question,
                labels=["-5", "-4", "-3", "-2", "-1", "0", "1", "2", "3", "4", "5"],
            )
            print(response)
            sentiment_score = extract_sentiment_score(response)
            article['sentiment score'] = sentiment_score if sentiment_score is not None else 'Invalid response'

        # Save updated articles data
        save_data_to_json(articles_sorted, f"{general_folder}/Outputs/CompiledOutputs/ArticlesList.json")
        articles_sorted = load_data_from_json(f"{general_folder}/Outputs/CompiledOutputs/ArticlesList.json")

        # Create DataFrame for analysis
        df = pd.DataFrame(articles_sorted)
        df['date'] = pd.to_datetime(df['timestamp'], unit='s')
        
        # Set visualization style
        plt.style.use('ggplot')

        # Initialize charts dictionary
        charts = {}
        
        # Function to safely generate chart
        def generate_chart_safely(chart_name: str, chart_function, *args, **kwargs):
            try:
                logging.info(f"Generating chart: {chart_name}")
                if df.empty:
                    logging.warning(f"DataFrame is empty, skipping chart: {chart_name}")
                    return None
                chart = chart_function(*args, **kwargs)
                logging.info(f"Successfully generated chart: {chart_name}")
                return chart
            except Exception as e:
                logging.error(f"Error generating {chart_name}: {str(e)}")
                logging.error(traceback.format_exc())
                return None

        # Generate each chart with error handling
        chart_generators = {
            'media_outlet_pie': (generate_media_outlet_pie_chart, [df]),
            'media_outlet_tone': (generate_media_outlet_tone_chart, [df]),
            'overall_sentiment': (generate_overall_sentiment_trend, [df, company_name]),
            'sentiment_by_outlet': (generate_sentiment_trends_by_outlet, [df]),
            'sentiment_by_category': (generate_sentiment_trends_by_category, [df]),
            'articles_by_category': (generate_articles_per_category, [df]),
            'category_tone': (generate_category_tone_chart, [df]),
            'top_journalists': (generate_top_journalists_chart, [df, company_name])
        }

        for chart_name, (func, args) in chart_generators.items():
            charts[chart_name] = generate_chart_safely(chart_name, func, *args)

        # Calculate statistics (only if we have data)
        if df.empty:
            logging.warning("DataFrame is empty, using default statistics")
            stats = {
                'total_articles': 0,
                'date_range': "No data available",
                'avg_sentiment': 0,
                'median_sentiment': 0
            }
        else:
            try:
                stats = {
                    'total_articles': len(df),
                    'date_range': f"from {df['date'].min().strftime('%d %b %Y')} to {df['date'].max().strftime('%d %b %Y')}",
                    'avg_sentiment': df['sentiment score'].mean(),
                    'median_sentiment': df['sentiment score'].median()
                }
            except Exception as e:
                logging.error(f"Error calculating statistics: {str(e)}")
                stats = {
                    'total_articles': len(df),
                    'date_range': "Error calculating date range",
                    'avg_sentiment': 0,
                    'median_sentiment': 0
                }

        try:
            media_outlet_stats = []
            for outlet in df['media_outlet'].unique():
                try:
                    outlet_df = df[df['media_outlet'] == outlet]
                    media_outlet_stats.append({
                        'outlet': outlet,
                        'articles': len(outlet_df),
                        'avg_sentiment': outlet_df['sentiment score'].mean(),
                        'median_sentiment': outlet_df['sentiment score'].median()
                    })
                except Exception as e:
                    logging.error(f"Error processing outlet {outlet}: {str(e)}")
                    continue
            
            media_outlet_stats.sort(key=lambda x: x['articles'], reverse=True)
        except Exception as e:
            logging.error(f"Error generating media outlet statistics: {str(e)}")
            media_outlet_stats = []

        # Generate final report with error handling for missing charts
        try:
            markdown_content = generate_markdown_report(
                company_name=company_name,
                total_articles=stats['total_articles'],
                date_range=stats['date_range'],
                avg_sentiment=stats['avg_sentiment'],
                median_sentiment=stats['median_sentiment'],
                media_outlet_pie_chart=charts.get('media_outlet_pie', ''),
                top_journalists_chart=charts.get('top_journalists', ''),
                media_outlet_tone_chart=charts.get('media_outlet_tone', ''),
                overall_sentiment_trend=charts.get('overall_sentiment', ''),
                sentiment_trends_by_outlet=charts.get('sentiment_by_outlet', ''),
                media_outlet_stats=media_outlet_stats,
                articles_per_category=charts.get('articles_by_category', ''),
                category_tone_chart=charts.get('category_tone', ''),
                sentiment_trends_by_category=charts.get('sentiment_by_category', ''),
                df=df  # Make sure df contains 'date', 'sentiment score', and 'content' columns
            )
        except Exception as e:
            logging.error(f"Error generating markdown report: {str(e)}")
            markdown_content = f"Error generating media analytics report: {str(e)}"

        # Save the report
        try:
            media_analytics_path = f'{general_folder}/Outputs/CompiledOutputs/MediaAnalytics{company_name}.md'
            with open(media_analytics_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
        except Exception as e:
            logging.error(f"Error saving media analytics report: {str(e)}")

        logging.info("Media analytics report generation completed")
        return markdown_content

    except Exception as e:
        logging.error(f"Error in generate_analytics_output: {str(e)}")
        logging.error(traceback.format_exc())
        return f"Error generating media analytics: {str(e)}"

def generate_stakeholder_quotes(articles_sorted: List[Dict], company_name: str, general_folder: str, language: str = 'English') -> str:
    """
    Generate stakeholder analysis from the processed articles.
    
    Args:
        articles_sorted (List[Dict]): List of preprocessed and sorted articles
        company_name (str): Name of the company being analyzed
        general_folder (str): Base path for output files
        language (str, optional): Output language for the stakeholder analysis. Defaults to 'English'.
        
    Returns:
        str: Generated markdown content with stakeholder analysis
    """
    try:
        logging.info(f"Starting stakeholder analysis generation for {company_name}")
        
        # Process stakeholder information
        stakeholder_quotes = process_stakeholder_info(company_name, articles_sorted)
        
        # Process and clean the markdown table
        stakeholder_quotes_processed = process_markdown_table(stakeholder_quotes)
        
        # Save the processed content
        output_folder = os.path.join(general_folder, "Outputs", "CompiledOutputs")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save both markdown and CSV versions
        md_filename = os.path.join(output_folder, f"StakeholderQuotes_{company_name}.md")
        csv_filename = os.path.join(output_folder, f"StakeholderQuotes_{company_name}.csv")
        
        # Create the full markdown document
        md_document = f"""# Stakeholder Analysis Report - {company_name}
Generated on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}

## Overview
This report contains stakeholder quotes and sentiments related to {company_name}.

## Data Table
{stakeholder_quotes_processed}

## Notes
- Quotes have been deduplicated and sorted alphabetically by stakeholder name
- Translations are provided where the original quote is not in English
- Sentiment analysis is based on the context and content of each quote
"""
        
        # Save markdown file
        with open(md_filename, 'w', encoding='utf-8') as md_file:
            md_file.write(md_document)
            
        # Extract and save CSV data
        lines = stakeholder_quotes_processed.strip().split('\n')
        header_row = [col.strip() for col in lines[1].split('|')[1:-1]]
        
        data_rows = []
        for line in lines[3:]:
            if '|' in line:
                row = [cell.strip() for cell in line.split('|')[1:-1]]
                data_rows.append(row)
        
        with open(csv_filename, 'w', encoding='utf-8', newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(header_row)
            writer.writerows(data_rows)
        
        logging.info("Stakeholder analysis generation completed successfully")
        return md_document

    except Exception as e:
        logging.error(f"Error in generate_stakeholder_quotes: {str(e)}")
        logging.error(traceback.format_exc())
        return f"Error generating stakeholder analysis: {str(e)}"

def generate_consolidated_stakeholder_analysis(company_name: str, articles: list, general_folder: str) -> str:
    """
    Generate a consolidated stakeholder analysis by combining quotes from the same stakeholder
    and analyzing their overall opinion and sentiment towards the company.
    
    Args:
        company_name (str): Name of the company being analyzed
        articles (list): List of processed articles
        general_folder (str): Base path for output files
        
    Returns:
        str: Generated markdown content with consolidated stakeholder analysis
    """
    try:
        # Check for existing stakeholder quotes file
        existing_quotes_path = os.path.join(general_folder, "Outputs", "CompiledOutputs", f"StakeholderQuotes_{company_name}.md")
        
        if not os.path.exists(existing_quotes_path):
            logging.info("No existing stakeholder quotes file found, generating new quotes")
            # Generate the stakeholder quotes first
            md_document = generate_stakeholder_quotes(articles, company_name, general_folder)
            
            if md_document.startswith("Error"):
                raise ValueError(f"Failed to generate stakeholder quotes: {md_document}")
        else:
            logging.info(f"Found existing stakeholder quotes file: {existing_quotes_path}")
            with open(existing_quotes_path, 'r', encoding='utf-8') as f:
                md_document = f.read()

        # Extract the data table section from the markdown document
        table_start = md_document.find('## Data Table\n') + len('## Data Table\n')
        table_end = md_document.find('## Notes')
        if table_end == -1:  # If '## Notes' section doesn't exist
            table_end = len(md_document)
        
        raw_stakeholder_data = md_document[table_start:table_end].strip()

        # Initialize dictionary to store stakeholder information
        stakeholders = {}
        
        # Parse the markdown table and group by stakeholder
        lines = raw_stakeholder_data.split('\n')
        header_found = False
        
        for line in lines:
            # Skip lines until we find the header row
            if '| Stakeholder Name |' in line:
                header_found = True
                continue
            # Skip the separator line
            if header_found and '|---' in line:
                continue
            # Process data rows
            if header_found and '|' in line and not line.strip().startswith('#'):
                parts = [part.strip() for part in line.split('|')]
                if len(parts) < 4:  # Skip malformed rows
                    continue
                
                # Extract stakeholder name from markdown link if present
                stakeholder_name = parts[1].strip()
                if '[' in stakeholder_name and ']' in stakeholder_name:
                    stakeholder_name = stakeholder_name[stakeholder_name.find('[')+1:stakeholder_name.find(']')]
                
                # Skip rows where stakeholder name is empty or contains "Stakeholder Name"
                if not stakeholder_name or 'Stakeholder Name' in stakeholder_name:
                    continue

                try:
                    role = parts[2].strip()
                    quote = parts[3].strip()
                    translation = parts[4].strip() if len(parts) > 4 else "N.A."
                    sentiment = parts[5].strip() if len(parts) > 5 else "Unknown"
                    
                    # Find corresponding article content
                    article_link = None
                    if '[' in parts[1] and '](' in parts[1] and ')' in parts[1]:
                        article_link = parts[1][parts[1].find('](')+2:parts[1].find(')')]
                    
                    if stakeholder_name not in stakeholders:
                        stakeholders[stakeholder_name] = {
                            'roles': set(),
                            'quotes': [],
                            'articles': [],
                            'sentiments': []
                        }
                    
                    stakeholders[stakeholder_name]['roles'].add(role)
                    stakeholders[stakeholder_name]['quotes'].append(quote)
                    for article in articles:
                        if article.get('link') == article_link:
                            stakeholders[stakeholder_name]['articles'].append(article.get('content', ''))
                            break
                    stakeholders[stakeholder_name]['sentiments'].append(sentiment)
                    
                    logging.debug(f"Successfully parsed stakeholder: {stakeholder_name}")
                except IndexError as e:
                    logging.error(f"Error parsing row for stakeholder {stakeholder_name}: {str(e)}")
                    continue

        # Generate consolidated analysis
        consolidated_md = f"""# Consolidated Stakeholder Analysis - {company_name}

## Overview
This analysis consolidates stakeholder opinions and sentiments, providing a comprehensive view of key perspectives on {company_name}. Each stakeholder's quotes have been analyzed in context to understand their overall position and potential impact on the company.

## Stakeholder Perspectives

| Stakeholder Name | Role/Position | Opinion and Sentiment Analysis |
|-----------------|---------------|--------------------------------|
"""
        
        # Process each stakeholder
        for stakeholder_name, data in stakeholders.items():
            # Skip if no meaningful data
            if not data['quotes'] or not data['articles']:
                continue
                
            # Combine all quotes and article contexts for analysis
            context = "\n\n".join([
                f"Quote: {quote}\nArticle Context: {article}"
                for quote, article in zip(data['quotes'], data['articles'])
            ])
            
            # Analyze stakeholder role
            role_chatbot = ChatGPT(
                model_name="chatgpt-4o-latest",
                temperature=0,
                max_tokens=50
            )
            
            role_prompt = f"""
Based on the following context, determine a clear, concise role or position for the stakeholder {stakeholder_name} in relation to {company_name}.

Previously identified role(s): {', '.join(data['roles'])}

Article contexts:
{chr(10).join(data['articles'])}

Requirements:
1. Provide a single role or position in 1-4 words maximum
2. Be specific and relevant to {company_name} or its industry
3. Focus on the stakeholder's professional capacity or expertise
4. Use concise, clear terminology

Example good responses:
- "Chief Technology Officer"
- "Industry Analyst"
- "Solar Energy Expert"
- "Investment Director"

Example bad responses:
- "A highly experienced professional in the field" (too long)
- "Person involved in operations" (too vague)
- "Stakeholder" (too generic)

Output only the role/position, nothing else.
"""
            
            role_analysis = role_chatbot.ask(role_prompt).strip()
            logging.debug(f"Determined role for {stakeholder_name}: {role_analysis}")
            
            # Initialize analysis chatbot for opinion/sentiment
            opinion_chatbot = ChatGPT(
                model_name="gpt-4o-mini",
                temperature=0,
                max_tokens=500
            )
            
            analysis_prompt = f"""
Analyze the following stakeholder's opinions and sentiment towards {company_name}. The stakeholder is {stakeholder_name}, who has been identified as: {role_analysis}.

Context and Quotes:
{context}

Provide a 3-4 sentence analysis that covers:
1. The stakeholder's overall position towards {company_name}
2. The consistency or evolution of their views
3. The potential impact of their opinions on {company_name}'s reputation or operations
4. Any notable patterns in their sentiment ({', '.join(data['sentiments'])})

Format your response as a single paragraph without any headers or bullet points.
"""
            
            analysis = opinion_chatbot.ask(analysis_prompt)
            
            # Add row to markdown table using the analyzed role
            consolidated_md += f"| {stakeholder_name} | {role_analysis} | {analysis} |\n"
        
        # Save the consolidated analysis
        output_path = os.path.join(general_folder, "Outputs", "CompiledOutputs", f"ConsolidatedStakeholderAnalysis_{company_name}.md")
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(consolidated_md)
        
        return consolidated_md

    except Exception as e:
        logging.error(f"Error in generate_consolidated_stakeholder_analysis: {str(e)}")
        logging.error(traceback.format_exc())
        return f"Error generating consolidated stakeholder analysis: {str(e)}"
def generate_journalist_profile(articles_sorted: List[Dict], journalist_name: str, news_folder_path: str, 
                              language: str = 'English', force_reprocess: bool = False) -> str:
    """
    Generate comprehensive profile analysis of a specific journalist based on their articles.
    
    Args:
        articles_sorted (List[Dict]): List of preprocessed articles
        journalist_name (str): Name of the journalist being analyzed
        news_folder_path (str): Path to the folder containing news articles
        language (str): Output language for the analysis
        force_reprocess (bool): If True, reprocess everything even if saved data exists
        
    Returns:
        str: Generated markdown content with journalist analysis
    """
    try:
        logging.info(f"Starting journalist profile analysis for {journalist_name}")
        
        # Set up directory structure
        general_folder = setup_journalist_directories(news_folder_path, journalist_name)
        processed_articles_path = os.path.join(general_folder, "Outputs", "CompiledOutputs", "ProcessedArticles.json")
        categorized_articles_path = os.path.join(general_folder, "Outputs", "CompiledOutputs", "CategorizedArticles.json")
        
        # Initialize compiled_sentences
        compiled_sentences = ""
        
        # Load or generate one-sentence descriptions
        if not force_reprocess and os.path.exists(processed_articles_path):
            logging.info("Loading previously processed articles")
            articles_sorted = load_data_from_json(processed_articles_path)
            # Compile sentences from loaded articles
            compiled_sentences = "\n".join([article.get('one_sentence_description', '') 
                                         for article in articles_sorted 
                                         if 'one_sentence_description' in article])
        else:
            # Generate one-sentence descriptions
            logging.info("Generating article descriptions")
            compiled_sentences = ""
            system_prompt = """You are a helpful assistant. Your role is to describe in one single sentence what a given news media article's main topic and angle is."""

            for article in articles_sorted:
                article_content = article.get('content', '')
                chatbot = ChatGPT(
                    system_prompt=system_prompt,
                    model_name="gpt-4o-mini",
                    temperature=0,
                    max_tokens=350,
                )

                question = f"""
Please write a single sentence summarizing this article's main topic and {journalist_name}'s angle or approach.
Focus on both the subject matter and how {journalist_name} covers it.

Article: {article_content}
                """

                response = chatbot.ask(question)
                print(response)
                article['one_sentence_description'] = response
                compiled_sentences += response + "\n"

            # Save processed articles
            save_data_to_json(articles_sorted, processed_articles_path)

        # Check if categories need to be generated
        need_categorization = True
        if not force_reprocess and os.path.exists(categorized_articles_path):
            logging.info("Loading previously categorized articles")
            saved_data = load_data_from_json(categorized_articles_path)
            
            # Check if ALL articles have categories
            all_categorized = all('category' in article for article in articles_sorted)
            
            if all_categorized:
                need_categorization = False
                # Reconstruct categories_data from saved data
                categories_data = []
                for category_info in saved_data['categories']:
                    category = {
                        'category': category_info['category'],
                        'description': category_info['description'],
                        'articles': []
                    }
                    # Find articles belonging to this category
                    for article in articles_sorted:
                        if article.get('category') == category_info['category']:
                            category['articles'].append(article)
                    categories_data.append(category)
            else:
                logging.info("Some articles missing categories - will recategorize")
                
        if need_categorization:
            logging.info("Defining coverage categories")
            system_prompt = f"""You are a media analyst. Your role is to identify the main recurring stories, narratives, and series of connected events that {journalist_name} covers across multiple articles."""
            
            chatbot = ChatGPT(
                system_prompt=system_prompt,
                model_name="models/gemini-1.5-pro",
                temperature=0,
                max_tokens=1000,
            )

            question = f"""
Based on these article summaries, identify the main recurring stories or narrative threads in {journalist_name}'s articles.
Create 5-7 distinct categories that represent specific stories, events, or series of connected events that appear across multiple articles.

Article summaries:
{compiled_sentences}

For each category:
1. Give it a name that describes the specific story/narrative (e.g., "Tech Company X Layoff Series" rather than just "Tech Industry")
2. Explain what specific events, developments, or connected stories this narrative encompasses
3. Focus on identifying stories that span multiple articles or connected events rather than broad topics

Format as:
CATEGORY: [Story/Narrative Name]
DESCRIPTION: [Explanation of the specific story thread and how it develops across articles]
"""

            categories_response = chatbot.ask(question)
            print(categories_response)

            categories_data = []
            current_category = None
            current_description = None
        
            for line in categories_response.split('\n'):
                if line.startswith('CATEGORY:'):
                    if current_category is not None:
                        categories_data.append({
                            'category': current_category.strip(),
                            'description': current_description.strip() if current_description else '',
                            'articles': []
                        })
                    current_category = line.replace('CATEGORY:', '').strip()
                    current_description = None
                elif line.startswith('DESCRIPTION:'):
                    current_description = line.replace('DESCRIPTION:', '').strip()

            # Add the last category
            if current_category is not None:
                categories_data.append({
                    'category': current_category,
                    'description': current_description if current_description else '',
                    'articles': []
                })

            # Add "Other" category automatically
            categories_data.append({
                'category': 'Other',
                'description': 'Articles that do not clearly align with specific narrative threads or recurring stories',
                'articles': []
            })

            # Check if articles already have categories
            all_categorized = all('category' in article for article in articles_sorted)
            print(f"Are all articles categorized? {all_categorized}")

            if not all_categorized:
                # Categorize articles
                logging.info("Categorizing articles")
                for article in articles_sorted:
                    if 'category' not in article:  # Only categorize if not already done
                        classification_prompt = ""
                        for category in categories_data:
                            classification_prompt += f"\n{category['category']}: {category['description']}"
                        
                        # Use chatbot to classify the article
                        chatbot = ChatGPT(
                            model_name="gpt-4o-mini",
                            temperature=0,
                            max_tokens=200
                        )
                        
                        classification = chatbot.ask(
                            f"""
Given these coverage categories for the journalist: {journalist_name}:
{classification_prompt}

Classify this article into one of these categories. Choose the most appropriate category based on the descriptions provided.

Article content:
{article['content']}

Only output the exact category name that best matches this article.
                            """
                        )
                        print(classification)
                        
                        # Add debugging
                        print(f"Trying to classify article: {article.get('title', 'Unknown')}")
                        print(f"Classification received: {classification}")
                        print(f"Available categories: {[cat['category'] for cat in categories_data]}")
                        
                        # Add article to appropriate category and save category in article
                        for category in categories_data:
                            cleaned_category = category['category'].replace('**', '').strip()
                            cleaned_classification = classification.replace('**', '').strip()
                            if cleaned_category == cleaned_classification:
                                category['articles'].append(article)
                                article['category'] = cleaned_category  # Save category in article
                                break
                
                # Save the updated articles with their categories
                print("Saving categorized articles...")
                save_data_to_json(articles_sorted, processed_articles_path)
        pass

        # Save in a format that avoids circular references
        save_data = {
            'categories': [
                {
                    'category': cat['category'].replace('**', '').strip(),  # Remove asterisks
                    'description': cat['description']
                }
                for cat in categories_data
            ],
            'articles': []
        }

        # For debugging
        processed_articles = set()  # Keep track of processed articles

        # Add articles with safer category assignment
        for article in articles_sorted:
            # Find the category for this article
            article_category = None
            for cat in categories_data:
                cat_name = cat['category'].replace('**', '').strip()  # Remove asterisks

                # Check if article is in category by title
                article_titles_in_category = {a.get('title', '') for a in cat['articles']}
                if article.get('title', '') in article_titles_in_category:
                    article_category = cat_name
                    processed_articles.add(article.get('title', ''))
                    break
                
            # Create article data with category
            article_data = article.copy()
            if article_category:
                article_data['category'] = article_category
            else:
                if article.get('title', '') not in processed_articles:  # Only warn about truly uncategorized articles
                    print(f"\nWARNING: No category found for article: {article.get('title', 'Unknown title')}")
                article_data['category'] = 'Uncategorized'

            save_data['articles'].append(article_data)

        print(f"\nSuccessfully categorized {len(processed_articles)} articles")
        save_data_to_json(save_data, categorized_articles_path)
        
        # Generate profile analysis
        logging.info("Generating profile analysis")
        profile_md = f"""
# Journalist Profile Analysis: {journalist_name}

## Overview
This analysis examines the work and coverage patterns of {journalist_name} based on {len(articles_sorted)} articles.

## Coverage Areas
"""
        # Sort categories by number of articles, keeping 'Other' last
        categories_data = sorted(
            categories_data,
            key=lambda x: (
                1 if x['category'] == 'Other' else 0,  # Force 'Other' to end
                -len(x['articles'])  # Sort rest by number of articles (descending)
            )
        )
        
        # Analyze each category
        # Inside the generate_journalist_profile function, replace the category analysis section:

        for category in categories_data:
            # Skip empty categories
            if not category['articles']:
                continue
            
            # Sort articles by date
            category['articles'].sort(key=lambda x: datetime.strptime(x.get('date', '2024-01-01'), '%B %d, %Y'))
            
            excluded_categories = [
            cat_data['category']
            for cat_data in categories_data
            if cat_data['category'] != category['category']
        ]

            # Create a comma-separated string of those categories:
            excluded_categories_str = ", ".join(excluded_categories)

            # First chatbot: Factual narrative analysis
            narrative_bot = ChatGPT(
                model_name="models/gemini-1.5-pro",
                temperature=0,
                max_tokens=2500
            )
            
            narrative_prompt = f"""
Analyze the factual coverage and key narratives in {journalist_name}'s articles for the category: {category['category']}.

Focus on events and stories that *directly* relate to {category['category']}. 
YOU MUST EXCLUDE all information which is closely related to {excluded_categories_str}.

Focus ONLY on extracting and organizing the key events, facts, and stories covered in these articles that are relevant to the '{category['category']}' category. 
Exclude any analysis of the journalist's opinions or stance.

Articles to analyze:
{json.dumps([
    {
        'title': article.get('title', 'Untitled'),
        'date': article.get('date', 'Unknown'),
        'media_outlet': article.get('media_outlet', 'Unknown'),
        'content': article.get('content', '')
    } 
    for article in category['articles']
], indent=2)}

Provide a comprehensive bullet-point analysis of the key stories and facts. For each point:

- Start with a clear event or development
- Include specific dates and chronological progression where relevant
- Name key stakeholders (companies, individuals, organizations)
- Include important statistics, quotes, or concrete outcomes
- ALWAYS cite the specific article (date and outlet) supporting each point
- Highlight connections between different events where they exist

Focus on telling the story through factual, chronological points that are well-supported by the articles.

Format each section with clear headers and bullet points.
Every fact must be linked to specific articles (date, outlet).
Focus on objective information only—no interpretation of the journalist's views.
DO NOT address this prompt directly.
            """
            
            narrative_analysis = narrative_bot.ask(narrative_prompt)
            print("Completed narrative analysis:")
            print(narrative_analysis)

            if language.lower() != 'english':
                narrative_analysis = translate_content(narrative_analysis, 'auto', language)
            
            # Second chatbot: Perspective and stance analysis
            perspective_bot = ChatGPT(
                model_name="models/gemini-1.5-pro",
                temperature=0,
                max_tokens=2500
            )
            
            stance_prompt = f"""
Analyze {journalist_name}'s perspective and stance based on their coverage in the category: {category['category']}

Use both the original articles AND the factual narrative analysis below as your source material.

Original articles:
{json.dumps([
    {
        'title': article.get('title', 'Untitled'),
        'date': article.get('date', 'Unknown'),
        'media_outlet': article.get('media_outlet', 'Unknown'),
        'content': article.get('content', '')
    } for article in category['articles']
], indent=2)}

Factual narrative analysis:
{narrative_analysis}

Provide a detailed analysis of {journalist_name}'s position, sentiment or stance on the topics described in the narrative analysis. Your output should be somehow connected to the narrative analysis.
Analyze {journalist_name}'s perspective and opinions in bullet points. For each observation:

- Describe how {journalist_name} positions themselves on specific issues, Show how {journalist_name} frames events and developments, specifically the ones described in the narrative analysis.
- Note any apparent biases or preferences in their coverage. specifically if this bias or preference is in favour of a certain individual or organisation.
- Highlight his treatment of different stakeholders, how does he consider them and what is his sentiment towards them.
- Support each point with specific quotes and article references (date, outlet)

Focus on building a clear picture of the journalist's viewpoint through concrete examples and evidence. It should give an idea of what matters or not to {journalist_name}, what does he stand for through his coverage.

Every observation must be supported by specific quotes or examples from the articles.
Focus on identifying patterns across multiple articles pieces rather than single instances.
Highlight any evolution in the journalist's perspective over time, if applicable.
DO NOT address this prompt directly.
"""
        
            stance_analysis = perspective_bot.ask(stance_prompt)
            print("Completed stance analysis:")
            print(stance_analysis)

            if language.lower() != 'english':
                stance_analysis = translate_content(stance_analysis, 'auto', language)
            
            # Add to markdown document
            profile_md += f"""      
{f"## Miscellaneous Coverage" if category['category'] == 'Other' else ''}{'' if category['category'] == 'Other' else ''}
{narrative_analysis}

## Opinion, tone and stance interpretation:    
{stance_analysis}
        
#### Articles covering this topic:
"""
            
            # Add article list
            for article in category['articles']:
                profile_md += f"- [{article.get('title', 'Untitled')}]({article.get('link', '#')}) - {article.get('media_outlet', 'Unknown')}, {article.get('date', 'Unknown Date')}\n"
            
            profile_md += "\n---\n"

            print(profile_md)

        # Generate introduction
        try:
            introduction = generate_introduction(profile_md, journalist_name)
            if language.lower() != 'english':
                introduction = translate_content(introduction, 'auto', language)

            # Generate the TOC now that we have the entire "skeleton" of profile_md
            toc = generate_toc(profile_md)

            # Replace the original overview block with (TOC + Introduction)
            profile_md = profile_md.replace(
                f"""## Overview
This analysis examines the work and coverage patterns of {journalist_name} based on {len(articles_sorted)} articles.""",
                f"""{toc}\n\n{introduction}"""
            )
        except Exception as e:
            print(f"Error generating introduction: {e}")
            # If introduction generation fails, still insert a TOC before a fallback introduction
            toc = generate_toc(profile_md)
            fallback_intro = f"""## Introduction
This analysis examines the work and coverage patterns of {journalist_name} based on {len(articles_sorted)} articles, 
covering various topics and themes throughout their journalistic career."""
            
            profile_md = profile_md.replace(
                f"""## Overview
This analysis examines the work and coverage patterns of {journalist_name} based on {len(articles_sorted)} articles.""",
                f"""{toc}\n\n{fallback_intro}"""
            )
        
        # Generate conclusion
        chatbot = ChatGPT(
            model_name="chatgpt-4o-latest",
            temperature=0,
            max_tokens=1500
        )
        
        conclusion_prompt = f"""
Based on the complete analysis of {journalist_name}'s coverage topics, create a concluding section.

Topical coverage:
{profile_md}

Keep the conclusion to approximately 300 words.
"""
        
        conclusion = chatbot.ask(conclusion_prompt)
        print(conclusion)

        if language.lower() != 'english':
            conclusion = translate_content(conclusion, 'auto', language)
            
        profile_md += conclusion

        # Save the analysis
        output_path = os.path.join(general_folder, "Outputs", "CompiledOutputs", 
                                 f"JournalistProfile_{journalist_name.replace(' ', '_')}.md")
        
        # Make sure the directory exists before writing
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(profile_md)

        return profile_md

    except Exception as e:
        logging.error(f"Error in generate_journalist_profile: {str(e)}")
        logging.error(traceback.format_exc())
        raise

# Gradio interface
def gradio_journalist_list(company_name, file_folder, docx_file_path, industry_of_interest, region, language, news_folder_path, force_reprocess,
                         generate_journalist_list=False, generate_insights=False, 
                         generate_analysis=False, generate_topics=False, 
                         generate_analytics=False, generate_quotes=False,
                         generate_consolidated_quotes=False, generate_journalist_coverage_profile=False,
                         journalist_name=None):
    try:
        # Clean up input paths
        pdf_folder = file_folder.strip() if file_folder else None
        docx_path = docx_file_path.strip() if docx_file_path else None
        
        print("\nChecking input paths...")
        if not check_input_paths(pdf_folder, docx_path):
            raise ValueError("No valid input paths provided. Please check your folder/file paths.")
        
        # Initialize document processor
        processor = DocumentProcessor(
            min_length=1000,
            max_length=25500,
            similarity_threshold=0.8
        )

        # Validate inputs
        if not file_folder and not docx_file_path:
            raise ValueError("Please provide either a PDF folder path or a DOCX file path")

        # Process documents using the DocumentProcessor
        logging.info("Processing documents")
        articles = processor.process_documents(
            pdf_folder_path=file_folder if file_folder.strip() else None,
            docx_file_path=docx_file_path if docx_file_path.strip() else None,
            docx_separator="--"
        )

        if not articles:
            raise ValueError("No valid articles found after processing documents")

        if journalist_name:
            articles_sorted, general_folder, directories_created = preprocess_journalist_articles(
                journalist_name=journalist_name,
                articles=articles,
                news_folder_path=news_folder_path,
                force_reprocess=force_reprocess
            )
        else:
            articles_sorted, general_folder, directories_created = preprocess_articles(
                company_name=company_name,
                articles=articles,
                industry_of_interest=industry_of_interest,
                region=region
            )

        if not directories_created:
            raise ValueError("Failed to create necessary directories")

        # Initialize output variables
        md_content = None
        summary_insights = None
        combined_analysis = None
        topics_summaries = None
        media_analytics = None
        stakeholder_quotes = None
        consolidated_quotes = None
        journalist_profile = None

        # Pass language parameter to each generation function
        if generate_journalist_list:
            logging.info("Generating journalist list")
            md_content = generate_journalist_list_output(
                articles_sorted=articles_sorted,
                company_name=company_name,
                general_folder=general_folder,
                language=language
            )

        if generate_insights:
            logging.info("Generating insights")
            summary_insights = generate_insights_output(
                articles_sorted=articles_sorted,
                company_name=company_name,
                general_folder=general_folder,
                industry_of_interest=industry_of_interest,
                region=region,
                language=language
            )

        if generate_analysis:
            logging.info("Generating comprehensive analysis")
            combined_analysis = generate_issue_analysis_output(
                articles_sorted=articles_sorted,
                company_name=company_name,
                general_folder=general_folder,
                industry_of_interest=industry_of_interest,
                region=region,
                language=language
            )

        if generate_topics:
            logging.info("Generating topic summaries")
            topics_summaries = generate_topics_output(
                articles_sorted=articles_sorted,
                company_name=company_name,
                general_folder=general_folder,
                industry_of_interest=industry_of_interest,  
                region=region,  
                language=language
            )

        if generate_analytics:
            logging.info("Generating media analytics")
            media_analytics = generate_analytics_output(
                articles_sorted=articles_sorted,
                company_name=company_name,
                general_folder=general_folder,
                language=language
            )

        if generate_quotes:
            logging.info("Generating stakeholder analysis")
            stakeholder_quotes = generate_stakeholder_quotes(
                articles_sorted=articles_sorted,
                company_name=company_name,
                general_folder=general_folder,
                language=language
            )

        if generate_consolidated_quotes:
            logging.info("Generating consolidated stakeholder analysis")
            consolidated_quotes = generate_consolidated_stakeholder_analysis(
                company_name=company_name,
                articles=articles_sorted,
                general_folder=general_folder
            )

        if generate_journalist_coverage_profile and journalist_name:
            logging.info(f"Generating journalist profile for {journalist_name}")
            journalist_profile = generate_journalist_profile(
                articles_sorted=articles_sorted,
                journalist_name=journalist_name,
                news_folder_path=file_folder,
                language=language
            )

        return (
            md_content if generate_journalist_list else "Journalist list not requested",
            summary_insights if generate_insights else "Summary insights not requested",
            combined_analysis if generate_analysis else "Comprehensive analysis not requested",
            topics_summaries if generate_topics else "Topic summaries not requested",
            media_analytics if generate_analytics else "Media analytics not requested",
            stakeholder_quotes if generate_quotes else "Stakeholder analysis not requested",
            consolidated_quotes if generate_consolidated_quotes else "Consolidated stakeholder analysis not requested",
            journalist_profile if generate_journalist_coverage_profile else "Journalist profile not requested"
        )

    except Exception as e:
        error_message = f"An error occurred: {str(e)}\nPlease check the app.log file for more details."
        logging.error(f"An error occurred in Gradio interface: {str(e)}")
        logging.error(traceback.format_exc())
        return error_message, error_message, error_message, error_message, error_message, error_message, error_message, error_message



INFO:httpx:HTTP Request: GET https://api.gradio.app/gradio-messaging/en "HTTP/1.1 200 OK"
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps


INFO:httpx:HTTP Request: GET https://checkip.amazonaws.com/ "HTTP/1.1 200 "
INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"


IMPORTANT: You are using gradio version 4.22.0, however version 4.44.1 is available, please upgrade.
--------


INFO:httpx:HTTP Request: POST https://api.gradio.app/gradio-initiated-analytics/ "HTTP/1.1 200 OK"


In [None]:
from PyPDF2 import PdfReader
from docx import Document as DocxDocument  # For DOCX files
from difflib import SequenceMatcher
import os
import logging
from typing import List, Dict, Optional


class DocumentProcessor:
    def __init__(self, min_length: int = 1000, max_length: int = 25500, similarity_threshold: float = 0.8):
        """
        Initialize the document processor with configurable parameters.
        
        Args:
            min_length (int): Minimum character length for valid documents
            max_length (int): Maximum character length for valid documents
            similarity_threshold (float): Threshold for detecting duplicate content
        """
        self.min_length = min_length
        self.max_length = max_length
        self.similarity_threshold = similarity_threshold
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def _text_similarity(self, text1: str, text2: str) -> float:
        """Calculate similarity ratio between two texts"""
        return SequenceMatcher(None, text1.strip(), text2.strip()).ratio()

    def _get_files(self, folder_path: str) -> List[str]:
        """Get list of files from folder, excluding system files"""
        if not os.path.exists(folder_path):
            self.logger.error(f"Folder path does not exist: {folder_path}")
            return []
        
        files = [f"{folder_path}/{file}" for file in os.listdir(folder_path)]
        return [f for f in files if f.lower().endswith('.pdf')]

    def _extract_text_from_pdf(self, file_path: str) -> Optional[str]:
        """Extract text content from a PDF file using PyPDF2"""
        try:
            reader = PdfReader(file_path)
            text = ""
            for page in reader.pages:
                text += page.extract_text() or ""
            return text.strip()
        except Exception as e:
            self.logger.error(f"Error extracting text from {file_path}: {e}")
            return None

    def _extract_text_from_docx(self, file_path: str) -> Optional[str]:
        """Extract text content from a DOCX file using python-docx"""
        try:
            doc = DocxDocument(file_path)
            text = "\n".join([para.text for para in doc.paragraphs])
            return text.strip()
        except Exception as e:
            self.logger.error(f"Error extracting text from DOCX {file_path}: {e}")
            return None

    def process_pdf_folder(self, folder_path: str) -> List[Dict]:
        """Process all PDFs in a folder."""
        if not os.path.exists(folder_path):
            self.logger.error(f"PDF folder path does not exist: {folder_path}")
            return []

        files = self._get_files(folder_path)
        self.logger.info(f"Found {len(files)} files in folder")

        articles = [{'file_path': file, 'position': idx + 1} for idx, file in enumerate(files)]

        # Process files using PyPDF2
        processed_documents = []
        for file in files:
            try:
                pdf_text = self._extract_text_from_pdf(file)

                if pdf_text is None or not self.min_length <= len(pdf_text) <= self.max_length:
                    self.logger.info(
                        f"Removing file {file} (text extraction failed or length out of range)"
                    )
                    continue

                pdf_text = pdf_text.replace("William Masquelier", " ")  # Optional cleanup
                processed_documents.append({'file_path': file, 'content': pdf_text})
            except Exception as e:
                self.logger.error(f"Error processing PDF {file}: {e}")

        self.logger.info(f"Processed {len(processed_documents)} valid PDF files")
        return processed_documents

    def process_docx(self, docx_path: str, separator: str = "--") -> List[Dict]:
        """
        Process a DOCX file containing multiple articles.
        
        Args:
            docx_path (str): Path to the DOCX file
            separator (str): Separator used between articles
            
        Returns:
            List[Dict]: List of processed articles
        """
        if not os.path.exists(docx_path):
            self.logger.error(f"DOCX file does not exist: {docx_path}")
            return []

        try:
            # Extract text from DOCX
            docx_text = self._extract_text_from_docx(docx_path)
            if not docx_text:
                self.logger.error(f"Failed to extract text from DOCX {docx_path}")
                return []

            # Split into articles based on separator
            raw_sections = docx_text.split(separator)
            articles = []

            for idx, section in enumerate(raw_sections, 1):
                content = section.strip()
                if not self.min_length <= len(content) <= self.max_length:
                    continue

                is_duplicate = any(
                    self._text_similarity(content, existing['content']) > self.similarity_threshold 
                    for existing in articles
                )

                if not is_duplicate:
                    articles.append({
                        'file_path': docx_path,
                        'content': content,
                        'position': idx
                    })

            self.logger.info(f"Processed {len(articles)} articles from DOCX")
            return articles

        except Exception as e:
            self.logger.error(f"Error processing DOCX {docx_path}: {e}")
            return []

    def process_documents(self, 
                         pdf_folder_path: Optional[str] = None, 
                         docx_file_path: Optional[str] = None, 
                         docx_separator: str = "--") -> List[Dict]:
        """
        Main processing function that handles both PDF folder and DOCX file inputs.
        
        Args:
            pdf_folder_path (str, optional): Path to folder containing PDFs
            docx_file_path (str, optional): Path to DOCX file
            docx_separator (str): Separator for DOCX processing
            
        Returns:
            List[Dict]: Combined list of processed articles from both sources
        """
        all_articles = []
        position = 1

        if pdf_folder_path:
            pdf_articles = self.process_pdf_folder(pdf_folder_path)
            for article in pdf_articles:
                article['reordered_position'] = position
                position += 1
            all_articles.extend(pdf_articles)

        if docx_file_path:
            docx_articles = self.process_docx(docx_file_path, docx_separator)
            for article in docx_articles:
                article['reordered_position'] = position
                position += 1
            all_articles.extend(docx_articles)

        self.logger.info(f"Total processed articles: {len(all_articles)}")
        return all_articles


# Preprocess articles

In [19]:
from typing import List, Dict, Tuple
def preprocess_journalist_articles(journalist_name: str, articles: List[Dict], news_folder_path: str, 
                                 force_reprocess: bool = False) -> Tuple[List[Dict], str, bool]:
    """
    Preprocesses articles for journalist analysis with save/load functionality.
    """
    try:
        logging.info(f"Starting article preprocessing for journalist {journalist_name}")
        print(f"Found {len(articles)} articles to process")
        
        # Set up directory structure
        general_folder = setup_journalist_directories(news_folder_path, journalist_name)
        print(f"Set up directory structure at: {general_folder}")
        
        preprocessed_path = os.path.join(general_folder, "Outputs", "CompiledOutputs", "PreprocessedArticles.json")
        
        # Check if we have preprocessed articles saved
        if not force_reprocess and os.path.exists(preprocessed_path):
            logging.info("Loading previously preprocessed articles")
            print("Found existing preprocessed articles, loading...")
            articles_sorted = load_data_from_json(preprocessed_path)
            return articles_sorted, general_folder, True
        
        # If no saved data or force_reprocess, do the preprocessing
        logging.info("Preprocessing articles")
        print("Starting preprocessing...")

        # Extract metadata and clean articles
        print("Extracting metadata...")
        articles = extract_metadata(articles)
        articles = clean_articles(articles)
        print(f"Cleaned and extracted metadata from {len(articles)} articles")

        # Get embeddings
        print("Generating embeddings...")
        article_embeddings = get_embeddings(articles, embeddings_model)
        articles = filter_duplicates(articles, article_embeddings)
        print(f"After duplicate filtering: {len(articles)} articles")

        # Extract hyperlinks
        print("Extracting hyperlinks...")
        for article in articles:
            hyperlinks = extract_hyperlinks(article['file_path'])
            article['link'] = hyperlinks[0] if hyperlinks else None

        # Sort articles by date
        print("Sorting articles...")
        articles_sorted = sorted(articles, 
                               key=lambda x: datetime.strptime(x.get('date', '2024-01-01'), '%B %d, %Y'))

        # Save preprocessed articles
        print("Saving preprocessed articles...")
        save_data_to_json(articles_sorted, preprocessed_path)
        
        logging.info("Article preprocessing completed successfully")
        print("Preprocessing completed successfully")
        
        if not articles_sorted:
            print("Warning: No articles remained after preprocessing")
            return None, general_folder, False
            
        return articles_sorted, general_folder, True

    except Exception as e:
        logging.error(f"Error in article preprocessing: {str(e)}")
        logging.error(traceback.format_exc())
        print(f"Error during preprocessing: {str(e)}")
        print(traceback.format_exc())
        return None, None, False
    
def setup_journalist_directories(news_folder_path: str, journalist_name: str) -> str:
    """
    Set up directory structure for journalist analysis based on the provided news folder path.
    
    Args:
        news_folder_path (str): Path to the folder containing news articles
        journalist_name (str): Name of the journalist being analyzed
        
    Returns:
        str: Path to the general folder for outputs
    """
    try:
        # Get the journalist's base directory by going up one level from news folder
        general_folder = os.path.dirname(news_folder_path)
        outputs_folder = os.path.join(general_folder, "Outputs")
        compiled_outputs = os.path.join(outputs_folder, "CompiledOutputs")
        
        # Create directories
        os.makedirs(compiled_outputs, exist_ok=True)
        
        logging.info(f"Created directory structure at {general_folder}")
        return general_folder
        
    except Exception as e:
        logging.error(f"Error creating directory structure: {str(e)}")
        raise

def generate_introduction(profile_md, journalist_name, max_retries=3):
    for attempt in range(max_retries):
        try:
            intro_bot = ChatGPT(
                model_name="chatgpt-4o-latest",
                temperature=0,
                max_tokens=1000,
                max_retries=3,  # Add retry logic to the ChatGPT instance
                retry_delay=2
            )
            
            intro_prompt = f"""
Based on the provided journalist profile, create a comprehensive introduction section.

Profile content:
{profile_md}

Write an engaging introduction that:
1. Provides an overview of {journalist_name}'s primary areas of expertise and focus
2. Highlights their most significant or impactful coverage topics
3. Identifies any overarching patterns or themes across their work
4. Notes their typical approach to reporting and story development

Keep the introduction to approximately 300-400 words, making it substantive but concise.
Use a professional, analytical tone while remaining engaging.
Base all observations strictly on the evidence from the analyzed articles.
Start your output with "## Introduction".
"""
            
            introduction = intro_bot.ask(intro_prompt)
            if introduction and introduction.strip():
                print("Successfully generated introduction")
                return introduction
                
        except Exception as e:
            print(f"Attempt {attempt + 1} failed to generate introduction: {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(2 * (attempt + 1))  # Exponential backoff
                continue
            else:
                raise Exception(f"Failed to generate introduction after {max_retries} attempts: {str(e)}")
    
    raise Exception("Failed to generate valid introduction after all retry attempts")

# Journalist profile code

In [17]:
def generate_journalist_profile(articles_sorted: List[Dict], journalist_name: str, news_folder_path: str, 
                              language: str = 'English', force_reprocess: bool = False) -> str:
    """
    Generate comprehensive profile analysis of a specific journalist based on their articles.
    
    Args:
        articles_sorted (List[Dict]): List of preprocessed articles
        journalist_name (str): Name of the journalist being analyzed
        news_folder_path (str): Path to the folder containing news articles
        language (str): Output language for the analysis
        force_reprocess (bool): If True, reprocess everything even if saved data exists
        
    Returns:
        str: Generated markdown content with journalist analysis
    """
    try:
        logging.info(f"Starting journalist profile analysis for {journalist_name}")
        
        # Set up directory structure
        general_folder = setup_journalist_directories(news_folder_path, journalist_name)
        processed_articles_path = os.path.join(general_folder, "Outputs", "CompiledOutputs", "ProcessedArticles.json")
        categorized_articles_path = os.path.join(general_folder, "Outputs", "CompiledOutputs", "CategorizedArticles.json")
        
        # Initialize compiled_sentences
        compiled_sentences = ""
        
        # Load or generate one-sentence descriptions
        if not force_reprocess and os.path.exists(processed_articles_path):
            logging.info("Loading previously processed articles")
            articles_sorted = load_data_from_json(processed_articles_path)
            # Compile sentences from loaded articles
            compiled_sentences = "\n".join([article.get('one_sentence_description', '') 
                                         for article in articles_sorted 
                                         if 'one_sentence_description' in article])
        else:
            # Generate one-sentence descriptions
            logging.info("Generating article descriptions")
            compiled_sentences = ""
            system_prompt = """You are a helpful assistant. Your role is to describe in one single sentence what a given news media article's main topic and angle is."""

            for article in articles_sorted:
                article_content = article.get('content', '')
                chatbot = ChatGPT(
                    system_prompt=system_prompt,
                    model_name="gpt-4o-mini",
                    temperature=0,
                    max_tokens=350,
                )

                question = f"""
Please write a single sentence summarizing this article's main topic and {journalist_name}'s angle or approach.
Focus on both the subject matter and how {journalist_name} covers it.

Article: {article_content}
                """

                response = chatbot.ask(question)
                print(response)
                article['one_sentence_description'] = response
                compiled_sentences += response + "\n"

            # Save processed articles
            save_data_to_json(articles_sorted, processed_articles_path)

        # Check if categories need to be generated
        need_categorization = True
        if not force_reprocess and os.path.exists(categorized_articles_path):
            logging.info("Loading previously categorized articles")
            saved_data = load_data_from_json(categorized_articles_path)
            
            # Check if ALL articles have categories
            all_categorized = all('category' in article for article in articles_sorted)
            
            if all_categorized:
                need_categorization = False
                # Reconstruct categories_data from saved data
                categories_data = []
                for category_info in saved_data['categories']:
                    category = {
                        'category': category_info['category'],
                        'description': category_info['description'],
                        'articles': []
                    }
                    # Find articles belonging to this category
                    for article in articles_sorted:
                        if article.get('category') == category_info['category']:
                            category['articles'].append(article)
                    categories_data.append(category)
            else:
                logging.info("Some articles missing categories - will recategorize")
                
        if need_categorization:
            logging.info("Defining coverage categories")
            system_prompt = f"""You are a media analyst. Your role is to identify the main recurring stories, narratives, and series of connected events that {journalist_name} covers across multiple articles."""
            
            chatbot = ChatGPT(
                system_prompt=system_prompt,
                model_name="models/gemini-1.5-pro",
                temperature=0,
                max_tokens=1000,
            )

            question = f"""
Based on these article summaries, identify the main recurring stories or narrative threads in {journalist_name}'s articles.
Create 5-7 distinct categories that represent specific stories, events, or series of connected events that appear across multiple articles.

Article summaries:
{compiled_sentences}

For each category:
1. Give it a name that describes the specific story/narrative (e.g., "Tech Company X Layoff Series" rather than just "Tech Industry")
2. Explain what specific events, developments, or connected stories this narrative encompasses
3. Focus on identifying stories that span multiple articles or connected events rather than broad topics

Format as:
CATEGORY: [Story/Narrative Name]
DESCRIPTION: [Explanation of the specific story thread and how it develops across articles]
"""

            categories_response = chatbot.ask(question)
            print(categories_response)

            categories_data = []
            current_category = None
            current_description = None
        
            for line in categories_response.split('\n'):
                if line.startswith('CATEGORY:'):
                    if current_category is not None:
                        categories_data.append({
                            'category': current_category.strip(),
                            'description': current_description.strip() if current_description else '',
                            'articles': []
                        })
                    current_category = line.replace('CATEGORY:', '').strip()
                    current_description = None
                elif line.startswith('DESCRIPTION:'):
                    current_description = line.replace('DESCRIPTION:', '').strip()

            # Add the last category
            if current_category is not None:
                categories_data.append({
                    'category': current_category,
                    'description': current_description if current_description else '',
                    'articles': []
                })

            # Add "Other" category automatically
            categories_data.append({
                'category': 'Other',
                'description': 'Articles that do not clearly align with specific narrative threads or recurring stories',
                'articles': []
            })

            # Check if articles already have categories
            all_categorized = all('category' in article for article in articles_sorted)
            print(f"Are all articles categorized? {all_categorized}")

            if not all_categorized:
                # Categorize articles
                logging.info("Categorizing articles")
                for article in articles_sorted:
                    if 'category' not in article:  # Only categorize if not already done
                        classification_prompt = ""
                        for category in categories_data:
                            classification_prompt += f"\n{category['category']}: {category['description']}"
                        
                        # Use chatbot to classify the article
                        chatbot = ChatGPT(
                            model_name="gpt-4o-mini",
                            temperature=0,
                            max_tokens=200
                        )
                        
                        classification = chatbot.ask(
                            f"""
Given these coverage categories for the journalist: {journalist_name}:
{classification_prompt}

Classify this article into one of these categories. Choose the most appropriate category based on the descriptions provided.

Article content:
{article['content']}

Only output the exact category name that best matches this article.
                            """
                        )
                        print(classification)
                        
                        # Add debugging
                        print(f"Trying to classify article: {article.get('title', 'Unknown')}")
                        print(f"Classification received: {classification}")
                        print(f"Available categories: {[cat['category'] for cat in categories_data]}")
                        
                        # Add article to appropriate category and save category in article
                        for category in categories_data:
                            cleaned_category = category['category'].replace('**', '').strip()
                            cleaned_classification = classification.replace('**', '').strip()
                            if cleaned_category == cleaned_classification:
                                category['articles'].append(article)
                                article['category'] = cleaned_category  # Save category in article
                                break
                
                # Save the updated articles with their categories
                print("Saving categorized articles...")
                save_data_to_json(articles_sorted, processed_articles_path)
        pass

        # Save in a format that avoids circular references
        save_data = {
            'categories': [
                {
                    'category': cat['category'].replace('**', '').strip(),  # Remove asterisks
                    'description': cat['description']
                }
                for cat in categories_data
            ],
            'articles': []
        }

        # For debugging
        processed_articles = set()  # Keep track of processed articles

        # Add articles with safer category assignment
        for article in articles_sorted:
            # Find the category for this article
            article_category = None
            for cat in categories_data:
                cat_name = cat['category'].replace('**', '').strip()  # Remove asterisks

                # Check if article is in category by title
                article_titles_in_category = {a.get('title', '') for a in cat['articles']}
                if article.get('title', '') in article_titles_in_category:
                    article_category = cat_name
                    processed_articles.add(article.get('title', ''))
                    break
                
            # Create article data with category
            article_data = article.copy()
            if article_category:
                article_data['category'] = article_category
            else:
                if article.get('title', '') not in processed_articles:  # Only warn about truly uncategorized articles
                    print(f"\nWARNING: No category found for article: {article.get('title', 'Unknown title')}")
                article_data['category'] = 'Uncategorized'

            save_data['articles'].append(article_data)

        print(f"\nSuccessfully categorized {len(processed_articles)} articles")
        save_data_to_json(save_data, categorized_articles_path)
        
        # Generate profile analysis
        logging.info("Generating profile analysis")
        profile_md = f"""
# Journalist Profile Analysis: {journalist_name}

## Overview
This analysis examines the work and coverage patterns of {journalist_name} based on {len(articles_sorted)} articles.

## Coverage Areas
"""
        # Sort categories by number of articles, keeping 'Other' last
        categories_data = sorted(
            categories_data,
            key=lambda x: (
                1 if x['category'] == 'Other' else 0,  # Force 'Other' to end
                -len(x['articles'])  # Sort rest by number of articles (descending)
            )
        )
        
        # Analyze each category
        # Inside the generate_journalist_profile function, replace the category analysis section:

        for category in categories_data:
            # Skip empty categories
            if not category['articles']:
                continue
            
            # Sort articles by date
            category['articles'].sort(key=lambda x: datetime.strptime(x.get('date', '2024-01-01'), '%B %d, %Y'))
            
            excluded_categories = [
            cat_data['category']
            for cat_data in categories_data
            if cat_data['category'] != category['category']
        ]

            # Create a comma-separated string of those categories:
            excluded_categories_str = ", ".join(excluded_categories)

            # First chatbot: Factual narrative analysis
            narrative_bot = ChatGPT(
                model_name="models/gemini-1.5-pro",
                temperature=0,
                max_tokens=2500
            )
            
            narrative_prompt = f"""
Analyze the factual coverage and key narratives in {journalist_name}'s articles for the category: {category['category']}.

Focus on events and stories that *directly* relate to {category['category']}. 
YOU MUST EXCLUDE all information which is closely related to {excluded_categories_str}.

Focus ONLY on extracting and organizing the key events, facts, and stories covered in these articles that are relevant to the '{category['category']}' category. 
Exclude any analysis of the journalist's opinions or stance.

Articles to analyze:
{json.dumps([
    {
        'title': article.get('title', 'Untitled'),
        'date': article.get('date', 'Unknown'),
        'media_outlet': article.get('media_outlet', 'Unknown'),
        'content': article.get('content', '')
    } 
    for article in category['articles']
], indent=2)}

Provide a comprehensive bullet-point analysis of the key stories and facts. For each point:

- Start with a clear event or development
- Include specific dates and chronological progression where relevant
- Name key stakeholders (companies, individuals, organizations)
- Include important statistics, quotes, or concrete outcomes
- ALWAYS cite the specific article (date and outlet) supporting each point
- Highlight connections between different events where they exist

Focus on telling the story through factual, chronological points that are well-supported by the articles.

Format each section with clear headers and bullet points.
Every fact must be linked to specific articles (date, outlet).
Focus on objective information only—no interpretation of the journalist's views.
DO NOT address this prompt directly.
            """
            
            narrative_analysis = narrative_bot.ask(narrative_prompt)
            print("Completed narrative analysis:")
            print(narrative_analysis)

            if language.lower() != 'english':
                narrative_analysis = translate_content(narrative_analysis, 'auto', language)
            
            # Second chatbot: Perspective and stance analysis
            perspective_bot = ChatGPT(
                model_name="models/gemini-1.5-pro",
                temperature=0,
                max_tokens=2500
            )
            
            stance_prompt = f"""
Analyze {journalist_name}'s perspective and stance based on their coverage in the category: {category['category']}

Use both the original articles AND the factual narrative analysis below as your source material.

Original articles:
{json.dumps([
    {
        'title': article.get('title', 'Untitled'),
        'date': article.get('date', 'Unknown'),
        'media_outlet': article.get('media_outlet', 'Unknown'),
        'content': article.get('content', '')
    } for article in category['articles']
], indent=2)}

Factual narrative analysis:
{narrative_analysis}

Provide a detailed analysis of {journalist_name}'s position, sentiment or stance on the topics described in the narrative analysis. Your output should be somehow connected to the narrative analysis.
Analyze {journalist_name}'s perspective and opinions in bullet points. For each observation:

- Describe how {journalist_name} positions themselves on specific issues, Show how {journalist_name} frames events and developments, specifically the ones described in the narrative analysis.
- Note any apparent biases or preferences in their coverage. specifically if this bias or preference is in favour of a certain individual or organisation.
- Highlight his treatment of different stakeholders, how does he consider them and what is his sentiment towards them.
- Support each point with specific quotes and article references (date, outlet)

Focus on building a clear picture of the journalist's viewpoint through concrete examples and evidence. It should give an idea of what matters or not to {journalist_name}, what does he stand for through his coverage.

Every observation must be supported by specific quotes or examples from the articles.
Focus on identifying patterns across multiple articles pieces rather than single instances.
Highlight any evolution in the journalist's perspective over time, if applicable.
DO NOT address this prompt directly.
"""
        
            stance_analysis = perspective_bot.ask(stance_prompt)
            print("Completed stance analysis:")
            print(stance_analysis)

            if language.lower() != 'english':
                stance_analysis = translate_content(stance_analysis, 'auto', language)
            
            # Add to markdown document
            profile_md += f"""      
{f"## Miscellaneous Coverage" if category['category'] == 'Other' else ''}{'' if category['category'] == 'Other' else ''}
{narrative_analysis}

## Opinion, tone and stance interpretation:    
{stance_analysis}
        
#### Articles covering this topic:
"""
            
            # Add article list
            for article in category['articles']:
                profile_md += f"- [{article.get('title', 'Untitled')}]({article.get('link', '#')}) - {article.get('media_outlet', 'Unknown')}, {article.get('date', 'Unknown Date')}\n"
            
            profile_md += "\n---\n"

            print(profile_md)

        # Generate introduction
        try:
            introduction = generate_introduction(profile_md, journalist_name)
            if language.lower() != 'english':
                introduction = translate_content(introduction, 'auto', language)

            # Generate the TOC now that we have the entire "skeleton" of profile_md
            toc = generate_toc(profile_md)

            # Replace the original overview block with (TOC + Introduction)
            profile_md = profile_md.replace(
                f"""## Overview
This analysis examines the work and coverage patterns of {journalist_name} based on {len(articles_sorted)} articles.""",
                f"""{toc}\n\n{introduction}"""
            )
        except Exception as e:
            print(f"Error generating introduction: {e}")
            # If introduction generation fails, still insert a TOC before a fallback introduction
            toc = generate_toc(profile_md)
            fallback_intro = f"""## Introduction
This analysis examines the work and coverage patterns of {journalist_name} based on {len(articles_sorted)} articles, 
covering various topics and themes throughout their journalistic career."""
            
            profile_md = profile_md.replace(
                f"""## Overview
This analysis examines the work and coverage patterns of {journalist_name} based on {len(articles_sorted)} articles.""",
                f"""{toc}\n\n{fallback_intro}"""
            )
        
        # Generate conclusion
        chatbot = ChatGPT(
            model_name="chatgpt-4o-latest",
            temperature=0,
            max_tokens=1500
        )
        
        conclusion_prompt = f"""
Based on the complete analysis of {journalist_name}'s coverage topics, create a concluding section.

Topical coverage:
{profile_md}

Keep the conclusion to approximately 300 words.
"""
        
        conclusion = chatbot.ask(conclusion_prompt)
        print(conclusion)

        if language.lower() != 'english':
            conclusion = translate_content(conclusion, 'auto', language)
            
        profile_md += conclusion

        # Save the analysis
        output_path = os.path.join(general_folder, "Outputs", "CompiledOutputs", 
                                 f"JournalistProfile_{journalist_name.replace(' ', '_')}.md")
        
        # Make sure the directory exists before writing
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(profile_md)

        return profile_md

    except Exception as e:
        logging.error(f"Error in generate_journalist_profile: {str(e)}")
        logging.error(traceback.format_exc())
        raise

# Test

In [18]:
# Test
print("Starting preprocessing test...")
news_folder = "KnowledgeBase/JournalistProfile/Nick Evans/NewsNickEvans"
journalist_name = "Nick Evans"

try:
    # Check for existing preprocessed files
    preprocessed_path = os.path.join(
        os.path.dirname(news_folder),
        "Outputs",
        "CompiledOutputs",
        "PreprocessedArticles.json"
    )
    
    articles_sorted = None
    if os.path.exists(preprocessed_path):
        print("Found existing preprocessed articles, loading...")
        articles_sorted = load_data_from_json(preprocessed_path)
        general_folder = os.path.dirname(news_folder)
        success = True
    else:
        # Only process documents if no preprocessed files exist
        print("No preprocessed articles found, starting document processing...")
        processor = DocumentProcessor(
            min_length=1000,
            max_length=25500,
            similarity_threshold=0.9
        )
        
        print("Processing documents...")
        articles = processor.process_documents(
            pdf_folder_path=news_folder,
            docx_file_path=None,
            docx_separator="--"
        )
        
        print(f"Found {len(articles)} articles")
        
        if not articles:
            raise ValueError("No valid articles found in the news folder")
            
        # Now preprocess the articles
        print("Starting preprocessing...")
        articles_sorted, general_folder, success = preprocess_journalist_articles(
            journalist_name=journalist_name,
            articles=articles,
            news_folder_path=news_folder,
            force_reprocess=False
        )
    
    print(f"Preprocessing success: {success}")
    print(f"Number of processed articles: {len(articles_sorted) if articles_sorted else 0}")
    
    if success and articles_sorted:
        print("Generating profile...")
        # Generate profile
        profile_content = generate_journalist_profile(
            articles_sorted=articles_sorted,
            journalist_name=journalist_name,
            news_folder_path=news_folder,
            language="English",
            force_reprocess=False
        )
        print(profile_content)
    else:
        print("Preprocessing failed or no articles remained after preprocessing")
        
except Exception as e:
    print(f"Error in main process: {str(e)}")
    print(traceback.format_exc())

Starting preprocessing test...
Found existing preprocessed articles, loading...
Data successfully loaded from 'KnowledgeBase/JournalistProfile/Nick Evans/Outputs/CompiledOutputs/PreprocessedArticles.json'.
Preprocessing success: True
Number of processed articles: 82
Generating profile...
Data successfully loaded from 'KnowledgeBase/JournalistProfile/Nick Evans/Outputs/CompiledOutputs/ProcessedArticles.json'.
Data successfully loaded from 'KnowledgeBase/JournalistProfile/Nick Evans/Outputs/CompiledOutputs/CategorizedArticles.json'.

Successfully categorized 82 articles
Data successfully saved to 'KnowledgeBase/JournalistProfile/Nick Evans/Outputs/CompiledOutputs/CategorizedArticles.json'.
Completed narrative analysis:
### Lithium Market and Investments

* **Kali Metals IPO (January 6, 2024):** Kali Metals, a newly listed lithium company, raised $15 million in an oversubscribed IPO at 25 cents per share.  Key investors include Chris Ellison (Mineral Resources), Tim Roberts, Rod Jones, an

# Test new pdf loader

In [1]:
import os
import logging
import mammoth
import json
from difflib import SequenceMatcher
from typing import List, Dict, Union, Optional
from docling.document_converter import DocumentConverter

class DocumentProcessor:
    def __init__(self, min_length: int = 1000, max_length: int = 25500, similarity_threshold: float = 0.8):
        """
        Initialize the document processor with configurable parameters.
        
        Args:
            min_length (int): Minimum character length for valid documents
            max_length (int): Maximum character length for valid documents
            similarity_threshold (float): Threshold for detecting duplicate content
        """
        self.min_length = min_length
        self.max_length = max_length
        self.similarity_threshold = similarity_threshold
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def _text_similarity(self, text1: str, text2: str) -> float:
        """Calculate similarity ratio between two texts"""
        return SequenceMatcher(None, text1.strip(), text2.strip()).ratio()

    def _get_files(self, folder_path: str) -> List[str]:
        """Get list of files from folder, excluding system files"""
        if not os.path.exists(folder_path):
            self.logger.error(f"Folder path does not exist: {folder_path}")
            return []
        
        files = [f"{folder_path}/{file}" for file in os.listdir(folder_path)]
        return [f for f in files if not f.endswith('.DS_Store')]

    def _remove_file(self, file: str, articles: List[Dict]) -> None:
        """Remove file and update articles list"""
        try:
            os.remove(file)
            articles[:] = [article for article in articles if article['file_path'] != file]
        except OSError as e:
            self.logger.error(f"Error removing file {file}: {e}")

    def process_pdf_folder(self, folder_path: str) -> List[Dict]:
        if not os.path.exists(folder_path):
            self.logger.error(f"PDF folder path does not exist: {folder_path}")
            return []

        files = self._get_files(folder_path)
        self.logger.info(f"Found {len(files)} files in folder")

        articles = [{'file_path': file, 'position': idx + 1} for idx, file in enumerate(files)]

        # Initialize Docling's DocumentConverter once
        docling_converter = DocumentConverter()

        # Filter files based on content length using Docling
        for file in files[:]:
            try:
                # Convert the PDF to a Docling result
                docling_result = docling_converter.convert(file)

                # Export the text as Markdown
                doc_markdown = docling_result.document.export_to_markdown()

                content_length = len(doc_markdown)
                self.logger.info(f"Processing file {file}")
                self.logger.info(f"Content length: {content_length}")

                # Check if it meets length requirements
                if not self.min_length <= content_length <= self.max_length:
                    self.logger.info(
                        f"Removing file {file} (length: {content_length} outside "
                        f"range [{self.min_length}, {self.max_length}])"
                    )
                    self._remove_file(file, articles)
                else:
                    self.logger.info(f"File {file} passed length check")

            except Exception as e:
                self.logger.error(f"Error processing PDF {file}: {e}")
                self._remove_file(file, articles)

        # Process remaining files
        valid_files = [article['file_path'] for article in articles]
        self.logger.info(f"Processing {len(valid_files)} valid files")

        processed_documents = []
        for file in valid_files:
            try:
                # Convert again and store the text in processed_documents
                docling_result = docling_converter.convert(file)
                doc_markdown = docling_result.document.export_to_markdown()

                # Optional: do any text clean-ups
                doc_markdown = doc_markdown.replace("William Masquelier", " ")
                processed_documents.append(doc_markdown)
            except Exception as e:
                self.logger.error(f"Error processing PDF {file}: {e}")
                processed_documents.append("")

        # Update articles with content
        for article, markdown_content in zip(articles, processed_documents):
            article['content'] = markdown_content

        return articles

    def process_docx(self, docx_path: str, separator: str = "--") -> List[Dict]:
        """
        Process a DOCX file containing multiple articles.
        
        Args:
            docx_path (str): Path to the DOCX file
            separator (str): Separator used between articles
            
        Returns:
            List[Dict]: List of processed articles
        """
        if not os.path.exists(docx_path):
            self.logger.error(f"DOCX file does not exist: {docx_path}")
            return []

        try:
            with open(docx_path, 'rb') as docx_file:
                result = mammoth.convert_to_html(docx_file)
                full_text = result.value

                # Clean HTML tags
                for tag in ['p', 'ul', 'ol']:
                    full_text = full_text.replace(f'<{tag}>', '\n').replace(f'</{tag}>', '\n')
                full_text = full_text.replace('<li>', '').replace('</li>', '\n')

                # Split and process sections
                raw_sections = full_text.split(separator)
                articles = []
                
                for idx, section in enumerate(raw_sections, 1):
                    content = section.strip()
                    
                    # Apply length filters
                    if not self.min_length <= len(content) <= self.max_length:
                        continue

                    # Check for duplicates
                    is_duplicate = any(
                        self._text_similarity(content, existing['content']) > self.similarity_threshold 
                        for existing in articles
                    )

                    if not is_duplicate and len(content.split()) > 10:
                        articles.append({
                            'content': content,
                            'position': idx,
                            'file_path': docx_path
                        })

                return articles

        except Exception as e:
            self.logger.error(f"Error processing DOCX {os.path.basename(docx_path)}: {e}")
            return []

    def process_documents(self, 
                         pdf_folder_path: Optional[str] = None, 
                         docx_file_path: Optional[str] = None, 
                         docx_separator: str = "--") -> List[Dict]:
        """
        Main processing function that handles both PDF folder and DOCX file inputs.
        
        Args:
            pdf_folder_path (str, optional): Path to folder containing PDFs
            docx_file_path (str, optional): Path to DOCX file
            docx_separator (str): Separator for DOCX processing
            
        Returns:
            List[Dict]: Combined list of processed articles from both sources
        """
        if not pdf_folder_path and not docx_file_path:
            raise ValueError("At least one input path (PDF folder or DOCX file) must be provided")

        all_articles = []
        position = 1  # Initialize position counter
        
        # Process PDF folder if provided
        if pdf_folder_path:
            self.logger.info(f"Processing PDFs from folder: {pdf_folder_path}")
            pdf_articles = self.process_pdf_folder(pdf_folder_path)
            for article in pdf_articles:
                article['reordered_position'] = position
                position += 1
            all_articles.extend(pdf_articles)
            self.logger.info(f"Processed {len(pdf_articles)} articles from PDFs")

        # Process DOCX file if provided
        if docx_file_path:
            self.logger.info(f"Processing DOCX file: {docx_file_path}")
            docx_articles = self.process_docx(docx_file_path, docx_separator)
            for article in docx_articles:
                article['reordered_position'] = position
                position += 1
            all_articles.extend(docx_articles)
            self.logger.info(f"Processed {len(docx_articles)} articles from DOCX")

        # Check for duplicates across all sources
        unique_articles = []
        seen_content = set()
        for article in all_articles:
            is_duplicate = any(
                self._text_similarity(article['content'], existing['content']) > self.similarity_threshold
                for existing in unique_articles
            )
            content_hash = hash(article['content'])
            
            if not is_duplicate and content_hash not in seen_content:
                unique_articles.append(article)
                seen_content.add(content_hash)

        # Ensure all articles have required fields
        for article in unique_articles:
            if 'reordered_position' not in article:
                article['reordered_position'] = 0
            if 'file_path' not in article:
                article['file_path'] = "Unknown"
            if 'position' not in article:
                article['position'] = 0

        self.logger.info(f"Total unique articles processed: {len(unique_articles)}")
        return unique_articles


In [2]:
# Testing the DocumentProcessor with a sample PDF folder
processor = DocumentProcessor(min_length=1000, max_length=25000)

pdf_folder_path = "KnowledgeBase/JournalistProfile/Colin Packham/NewsColin Packham"  # Adjust to match your local path or create one
processed_articles = processor.process_documents(pdf_folder_path=pdf_folder_path)

print("Processed Articles:")
for idx, art in enumerate(processed_articles, 1):
    print(f"Article #{idx}:")
    print(f"File Path: {art['file_path']}")
    print(f"Reordered Position: {art['reordered_position']}")
    print(f"Content (first 300 chars):\n{art['content'][:300]}")
    print("-"*50)


INFO:__main__:Processing PDFs from folder: KnowledgeBase/JournalistProfile/Colin Packham/NewsColin Packham
INFO:__main__:Found 20 files in folder
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:easyocr.easyocr:Download complete
INFO:easyocr.easyocr:Download complete.
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.pipeline.base_pipeline:Processing document We'll be cooking with too much gas.PDF
INFO:docling.document_converter:Finished converting document We'll be cooking with too much gas.PDF in 987.71 sec.
INFO:__main__:Processing file KnowledgeBase/JournalistProfile/Colin Packham/NewsColin Packham/We'll be cooking with too much gas.PDF
INFO:__main__:Content length: 3438
INFO:__main__:File KnowledgeBase/JournalistProfile/Colin Packham/NewsColin Packham/We'll be cooking with too much gas.PDF passed length check

Processed Articles:
Article #1:
File Path: KnowledgeBase/JournalistProfile/Colin Packham/NewsColin Packham/We'll be cooking with too much gas.PDF
Reordered Position: 1
Content (first 300 chars):
<!-- image -->

## We'll be cooking with too much gas

The Daily Telegraph (Australia) January 10, 2025 Friday Telegraph Edition

Copyright 2025 Nationwide News Pty Limited All Rights Reserved

Section:

NEWS; Pg. 3

Length:

494 words

Byline:

Colin Packham

## Body

<!-- image -->

Surplus tipped
--------------------------------------------------
Article #2:
File Path: KnowledgeBase/JournalistProfile/Colin Packham/NewsColin Packham/Santos defers $3bn WA oil project as it focuses on dividends.PDF
Reordered Position: 2
Content (first 300 chars):
<!-- image -->

## Santos defers $3bn WA oil project as it focuses on dividends

The Australian January 22, 2025 Wednesday Australian Edition

Copyright 2025 Nationwide News Pty Limited All Rights Reserved

Section:

BUSINESS; Pg. 16

Length:

405 word

In [6]:
class DocumentProcessor:
    def __init__(self, min_length: int = 1000, max_length: int = 25500, similarity_threshold: float = 0.8):
        """
        Initialize the document processor with configurable parameters.
        
        Args:
            min_length (int): Minimum character length for valid documents
            max_length (int): Maximum character length for valid documents
            similarity_threshold (float): Threshold for detecting duplicate content
        """
        self.min_length = min_length
        self.max_length = max_length
        self.similarity_threshold = similarity_threshold
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def _text_similarity(self, text1: str, text2: str) -> float:
        """Calculate similarity ratio between two texts"""
        return SequenceMatcher(None, text1.strip(), text2.strip()).ratio()

    def _get_files(self, folder_path: str) -> List[str]:
        """Get list of files from folder, excluding system files"""
        if not os.path.exists(folder_path):
            self.logger.error(f"Folder path does not exist: {folder_path}")
            return []
        
        files = [f"{folder_path}/{file}" for file in os.listdir(folder_path)]
        return [f for f in files if f.lower().endswith('.pdf')]

    def _extract_text_from_pdf(self, file_path: str) -> Optional[str]:
        """Extract text content from a PDF file using PyPDF2"""
        try:
            reader = PdfReader(file_path)
            text = ""
            for page in reader.pages:
                text += page.extract_text() or ""
            return text.strip()
        except Exception as e:
            self.logger.error(f"Error extracting text from {file_path}: {e}")
            return None

    def _extract_text_from_docx(self, file_path: str) -> Optional[str]:
        """Extract text content from a DOCX file using python-docx"""
        try:
            doc = DocxDocument(file_path)
            text = "\n".join([para.text for para in doc.paragraphs])
            return text.strip()
        except Exception as e:
            self.logger.error(f"Error extracting text from DOCX {file_path}: {e}")
            return None

    def process_pdf_folder(self, folder_path: str) -> List[Dict]:
        """Process all PDFs in a folder."""
        if not os.path.exists(folder_path):
            self.logger.error(f"PDF folder path does not exist: {folder_path}")
            return []

        files = self._get_files(folder_path)
        self.logger.info(f"Found {len(files)} files in folder")

        articles = [{'file_path': file, 'position': idx + 1} for idx, file in enumerate(files)]

        # Process files using PyPDF2
        processed_documents = []
        for file in files:
            try:
                pdf_text = self._extract_text_from_pdf(file)

                if pdf_text is None or not self.min_length <= len(pdf_text) <= self.max_length:
                    self.logger.info(
                        f"Removing file {file} (text extraction failed or length out of range)"
                    )
                    continue

                pdf_text = pdf_text.replace("William Masquelier", " ")  # Optional cleanup
                processed_documents.append({'file_path': file, 'content': pdf_text})
            except Exception as e:
                self.logger.error(f"Error processing PDF {file}: {e}")

        self.logger.info(f"Processed {len(processed_documents)} valid PDF files")
        return processed_documents

    def process_docx(self, docx_path: str, separator: str = "--") -> List[Dict]:
        """
        Process a DOCX file containing multiple articles.
        
        Args:
            docx_path (str): Path to the DOCX file
            separator (str): Separator used between articles
            
        Returns:
            List[Dict]: List of processed articles
        """
        if not os.path.exists(docx_path):
            self.logger.error(f"DOCX file does not exist: {docx_path}")
            return []

        try:
            # Extract text from DOCX
            docx_text = self._extract_text_from_docx(docx_path)
            if not docx_text:
                self.logger.error(f"Failed to extract text from DOCX {docx_path}")
                return []

            # Split into articles based on separator
            raw_sections = docx_text.split(separator)
            articles = []

            for idx, section in enumerate(raw_sections, 1):
                content = section.strip()
                if not self.min_length <= len(content) <= self.max_length:
                    continue

                is_duplicate = any(
                    self._text_similarity(content, existing['content']) > self.similarity_threshold 
                    for existing in articles
                )

                if not is_duplicate:
                    articles.append({
                        'file_path': docx_path,
                        'content': content,
                        'position': idx
                    })

            self.logger.info(f"Processed {len(articles)} articles from DOCX")
            return articles

        except Exception as e:
            self.logger.error(f"Error processing DOCX {docx_path}: {e}")
            return []

    def process_documents(self, 
                         pdf_folder_path: Optional[str] = None, 
                         docx_file_path: Optional[str] = None, 
                         docx_separator: str = "--") -> List[Dict]:
        """
        Main processing function that handles both PDF folder and DOCX file inputs.
        
        Args:
            pdf_folder_path (str, optional): Path to folder containing PDFs
            docx_file_path (str, optional): Path to DOCX file
            docx_separator (str): Separator for DOCX processing
            
        Returns:
            List[Dict]: Combined list of processed articles from both sources
        """
        all_articles = []
        position = 1

        if pdf_folder_path:
            pdf_articles = self.process_pdf_folder(pdf_folder_path)
            for article in pdf_articles:
                article['reordered_position'] = position
                position += 1
            all_articles.extend(pdf_articles)

        if docx_file_path:
            docx_articles = self.process_docx(docx_file_path, docx_separator)
            for article in docx_articles:
                article['reordered_position'] = position
                position += 1
            all_articles.extend(docx_articles)

        self.logger.info(f"Total processed articles: {len(all_articles)}")
        return all_articles

In [8]:
processor = DocumentProcessor(
    min_length=500,  # Minimum length for a valid document
    max_length=25000,  # Maximum length for a valid document
    similarity_threshold=0.8  # Threshold for duplicate detection
)

# Define the folder path containing PDFs
pdf_folder_path = "KnowledgeBase/JournalistProfile/Colin Packham/NewsColin Packham"  # Replace with the actual path to your folder

# Process the PDFs in the folder
processed_articles = processor.process_pdf_folder(pdf_folder_path)

# Display the results
if processed_articles:
    print(f"Processed {len(processed_articles)} documents:")
    for idx, article in enumerate(processed_articles, 1):
        print(f"\n--- Document {idx} ---")
        print(f"File Path: {article['file_path']}")
        print(f"Content (first 500 characters):\n{article['content'][:1000]}")
else:
    print("No valid documents were processed.")

INFO:__main__:Found 20 files in folder
INFO:__main__:Processed 20 valid PDF files


Processed 20 documents:

--- Document 1 ---
File Path: KnowledgeBase/JournalistProfile/Colin Packham/NewsColin Packham/We'll be cooking with too much gas.PDF
Content (first 500 characters):
Page 1 of 2
We'll be cooking with too much gas
We'll be cooking with too much gas
The Daily Telegraph (Australia)
January 10, 2025 Friday
Telegraph Edition
Copyright 2025 Nationwide News Pty Limited All Rights Reserved
Section: NEWS; Pg. 3
Length: 494 words
Byline: Colin Packham
Body
Surplus tipped for east coast Australia's east coast will have a surplus of gas this year, the Australian Competition 
and Consumer Commission has concluded, an outlook that tempers concern of an immediate export curb.
Australia's LNG industry has been on heightened alert that exports from Queensland could be limited via the so-
called Aus-tralian Domestic Gas Security Mechanism.
The ADGSM can be triggered only if the ACCC finds a shortfall, but the competition watchdog said it now expected 
a surplus of between 77 peta