In [2]:
# Which project files are read and described (key -> enabled?).
# The keys must match the keys of `file_paths` below.
include_config = dict(
    main_code=False,       # MediaCoverageAnalysis.py
    helpers=True,          # Utils/Helpers.py  - helper functions are extracted from here
    outputs=True,          # Utils/Outputs.py  - output functions are extracted from here
    chatbots=False,        # Classes/SimplifiedChatbots.py
    doc_processor=False,   # Classes/DocumentProcessor.py
)

# Output functions to analyze: set the ones of interest to True.
# If every entry is False, all output functions found in the file are used.
output_functions_config = dict(
    generate_journalist_list_output=False,            # list of journalists and media outlets
    generate_insights_output=False,                   # article insights
    generate_issue_analysis_output=False,             # issue analysis
    generate_topics_output=False,                     # topic summaries
    generate_analytics_output=False,                  # media analytics
    generate_stakeholder_quotes=False,                # stakeholder analysis
    generate_consolidated_stakeholder_analysis=True,  # consolidated stakeholder analysis
    generate_journalist_article_list=False,           # article list for a journalist
    generate_journalist_profile=False,                # journalist profile
    analyze_journalist_topic_coverage=False,          # journalist topic coverage analysis
    generate_journalist_analysis_output=False,        # journalist analytics
)

# Location of each file, relative to the working directory
# (adjust to your project structure).
file_paths = dict(
    main_code='MediaCoverageAnalysis.py',
    helpers='Utils/Helpers.py',
    outputs='Utils/Outputs.py',
    chatbots='Classes/SimplifiedChatbots.py',
    doc_processor='Classes/DocumentProcessor.py',
)

def read_file_content(file_path):
    """Return the text of ``file_path``, decoded as UTF-8.

    This function does not raise for ordinary failures: if the file cannot
    be opened or read, the returned string is an
    ``"Error reading <path>: <reason>"`` message instead of the file text.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as source:
            text = source.read()
    except Exception as error:
        # Callers expect a string in every case, so report the problem inline.
        return f"Error reading {file_path}: {str(error)}"
    return text

def count_lines_of_code(content):
    """Count the lines of ``content`` that look like code.

    This is a line-based heuristic (the text is not parsed). A line is NOT
    counted when it is:

    * empty or whitespace-only,
    * a ``#`` comment line,
    * part of a triple-quoted block (the opening line, the closing line and
      everything in between), or
    * a one-line docstring, i.e. a line that starts with a triple quote and
      closes it on the same line.

    A line that merely contains a complete triple-quoted string after other
    code (``x = \"\"\"a\"\"\"``) is counted as code.

    Args:
        content: Source text to analyze.

    Returns:
        The number of counted lines as an ``int``.
    """
    delimiters = ('"""', "'''")
    count = 0
    open_delimiter = None  # delimiter of the triple-quoted block we are inside, if any

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if open_delimiter is not None:
            # Only the delimiter that opened the block can close it, so a
            # ''' inside a """ block does not end the block early.
            if open_delimiter in line:
                open_delimiter = None
            continue

        # Comments are checked before quotes so that a comment which merely
        # mentions a triple quote does not start a block.
        if line.startswith('#'):
            continue

        present = [d for d in delimiters if d in line]
        if present:
            # The delimiter that appears first on the line governs it.
            delimiter = min(present, key=line.find)
            if line.count(delimiter) % 2 == 1:
                # Unbalanced on this line: a multi-line block starts here.
                open_delimiter = delimiter
                continue
            if line.startswith(delimiter):
                # Opened and closed on the same line: one-line docstring.
                continue

        count += 1
    return count

def extract_imports(content):
    """Return the sorted, de-duplicated imports found in ``content``.

    The source is parsed with ``ast``, so only real import statements are
    reported; text inside comments or string literals is ignored.

    Entries have the following forms:

    * ``import a.b``             -> ``"a.b"``
    * ``from a import b``        -> ``"a.b"``
    * ``from a import *``        -> ``"a.*"``
    * ``from .pkg import b``     -> ``".pkg.b"`` (leading dots are kept)
    * ``from . import b``        -> ``".b"``

    In addition, one marker entry describes how ``Utils.Helpers`` is used:

    * ``"__ALL_HELPERS_AVAILABLE__"`` when it is wildcard-imported, otherwise
    * ``"__SPECIFIC_HELPERS__: a, b"`` when specific names are imported.

    Args:
        content: Python source text.

    Returns:
        A sorted list of strings. If the source cannot be parsed, the error
        is printed and an empty list is returned.
    """
    import ast

    imports = []
    helpers_wildcard_import = False
    specific_helper_imports = []

    try:
        tree = ast.parse(content)
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                imports.extend(alias.name for alias in node.names)
            elif isinstance(node, ast.ImportFrom):
                module = node.module or ''
                # Keep the relative-import dots so ".pkg.x" and "pkg.x"
                # are not reported as the same thing.
                prefix = '.' * (node.level or 0) + module
                separator = '.' if module else ''
                for alias in node.names:
                    imports.append(f"{prefix}{separator}{alias.name}")

                    # Track how the helpers module is imported
                    if module == 'Utils.Helpers':
                        if alias.name == '*':
                            helpers_wildcard_import = True
                        else:
                            specific_helper_imports.append(alias.name)

        # Add a special marker for helpers analysis
        if helpers_wildcard_import:
            imports.append("__ALL_HELPERS_AVAILABLE__")
        elif specific_helper_imports:
            imports.append(f"__SPECIFIC_HELPERS__: {', '.join(specific_helper_imports)}")

        return sorted(set(imports))
    except Exception as e:
        # Best effort: unparsable input yields no imports rather than a crash.
        print(f"Error extracting imports: {e}")
        return []

def extract_docstring(content):
    """Return the module-level docstring of the Python source in ``content``.

    Args:
        content: Python source text.

    Returns:
        The cleaned docstring when one is present and non-empty. Otherwise a
        warning string is returned: one message when the module has no
        docstring, another when the source cannot be parsed.
    """
    import ast

    try:
        docstring = ast.get_docstring(ast.parse(content))
    except Exception:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        return "⚠️ Unable to parse code for docstring."

    if docstring:
        return docstring
    return "⚠️ No module-level docstring found. Consider adding a description of this module's purpose."

def extract_function_calls(function_code):
    """Return the names that appear to be called inside ``function_code``.

    Two passes are combined:

    1. AST pass: direct calls (``name(...)``) and ``self.method(...)`` calls.
       The code is dedented first, because a method sliced out of a class
       body is indented and would otherwise fail to parse.
    2. Regex pass: every word immediately followed by ``(``. This
       deliberately over-approximates (it also picks up method names such as
       ``obj.method(``) so that possible helper calls are not missed, and it
       is the only source of results when the code cannot be parsed.

    Names introduced by ``def`` / ``class`` headers, Python keywords (such as
    ``if (`` or ``return (``), dunder names and a fixed list of common
    built-ins and method names are removed from the result.

    Args:
        function_code: Source text of a function or method.

    Returns:
        A ``set`` of name strings.
    """
    import ast
    import keyword
    import re
    import textwrap

    function_calls = set()

    # Pass 1: AST. An unparsable snippet is not an error here; the regex
    # pass below still produces a result.
    try:
        tree = ast.parse(textwrap.dedent(function_code))
    except (SyntaxError, ValueError, RecursionError, MemoryError):
        tree = None

    if tree is not None:
        for node in ast.walk(tree):
            if not isinstance(node, ast.Call):
                continue
            target = node.func
            if isinstance(target, ast.Name):
                # Direct function call like function_name()
                function_calls.add(target.id)
            elif (isinstance(target, ast.Attribute)
                  and isinstance(target.value, ast.Name)
                  and target.value.id == 'self'):
                # Method call on self, like self.method()
                function_calls.add(target.attr)

    # Pass 2: regex. Drop the name from "def name(" / "class Name(" headers
    # first, so that a definition is not reported as a call of itself; the
    # keyword that remains is removed by the keyword filter below.
    without_headers = re.sub(r'\b(def|class)\s+\w+', r'\1', function_code)
    function_calls.update(re.findall(r'(\w+)\s*\(', without_headers))

    # Filter out Python built-ins and common methods
    ignored_names = frozenset([
        'print', 'len', 'str', 'int', 'float', 'list', 'dict', 'set',
        'tuple', 'sum', 'min', 'max', 'sorted', 'range', 'enumerate',
        'zip', 'map', 'filter', 'round', 'open', 'os', 'logging', 'read',
        'write', 'append', 'pop', 'sort', 'datetime', 'strptime', 'Path',
        'mkdir', 'join', 'isfile', 'exists', 'dirname', 'lower', 'upper',
        'strip', 'replace', 'split', 'startswith', 'endswith', 'format',
        'get', 'keys', 'values', 'items', 'update', 'isinstance', 'raise',
    ])
    return {
        call for call in function_calls
        if call not in ignored_names
        and not call.startswith('__')
        and not keyword.iskeyword(call)
    }

def extract_functions(content, selected_functions=None):
    """Extract every function definition found in ``content``.

    The whole syntax tree is walked, so nested functions and methods are
    included alongside top-level functions; ``async def`` definitions are
    included as well.

    Imports from ``Utils.Helpers`` are read from the syntax tree, which
    handles parenthesised / multi-line import lists, trailing comments and
    ``as`` aliases. When helpers are imported by name (no wildcard), each
    imported helper that is called in a function's source is added to that
    function's ``function_calls`` under its original helper name.

    Args:
        content: Python source text.
        selected_functions: Optional collection of function names. When it
            is non-empty, only functions with these names are returned.

    Returns:
        A list of dicts with the keys ``name``, ``args`` (positional
        parameter names), ``docstring``, ``source_code``, ``function_calls``
        (a set), ``imports_all_helpers`` (bool) and
        ``specific_helper_imports`` (list of helper names). If anything goes
        wrong, the error is printed and an empty list is returned.
    """
    import ast
    import re

    helper_imports = []   # helper names as defined in Utils.Helpers
    local_names = {}      # helper name -> name it is bound to in this module
    helper_wildcard = False
    functions = []

    try:
        tree = ast.parse(content)
        nodes = list(ast.walk(tree))

        # First, scan for everything imported from the helpers module
        for node in nodes:
            if (isinstance(node, ast.ImportFrom)
                    and node.module == 'Utils.Helpers'
                    and not node.level):
                for alias in node.names:
                    if alias.name == '*':
                        helper_wildcard = True
                    elif alias.name not in local_names:
                        helper_imports.append(alias.name)
                        local_names[alias.name] = alias.asname or alias.name

        print(f"Helper imports found: {helper_imports}, Wildcard: {helper_wildcard}")

        # Split once; every function below is sliced out of these lines.
        source_lines = content.split('\n')

        for node in nodes:
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue

            # Skip if we're only looking for specific functions
            if selected_functions and node.name not in selected_functions:
                continue

            args = [arg.arg for arg in node.args.args]
            doc = ast.get_docstring(node) or "⚠️ No docstring provided"

            # lineno is 1-based and end_lineno is inclusive
            source_code = '\n'.join(source_lines[node.lineno - 1:node.end_lineno])

            # Extract function calls within this function
            function_calls = extract_function_calls(source_code)

            # With a wildcard import any call might be a helper, so nothing
            # extra can be added here. Otherwise make sure that explicitly
            # imported helpers that are called show up in function_calls.
            if not helper_wildcard:
                for helper in helper_imports:
                    # Look for the local (possibly aliased) name, but record
                    # the helper's real name so it can be matched later.
                    pattern = r'\b' + re.escape(local_names[helper]) + r'\s*\('
                    if re.search(pattern, source_code):
                        function_calls.add(helper)
                        print(f"Added explicitly imported helper: {helper} to {node.name}")

            functions.append({
                'name': node.name,
                'args': args,
                'docstring': doc,
                'source_code': source_code,
                'function_calls': function_calls,
                'imports_all_helpers': helper_wildcard,
                'specific_helper_imports': helper_imports
            })
        return functions
    except Exception as e:
        # Best effort: report the problem and return nothing.
        print(f"Error extracting functions: {e}")
        return []

def extract_classes(content, extract_all_methods=False, selected_methods=None):
    """Extract every class definition found in ``content``.

    The whole syntax tree is walked, so nested classes are included. For
    each class, the methods defined directly in its body are collected;
    ``async def`` methods are included as well.

    Args:
        content: Python source text.
        extract_all_methods: When true, ``selected_methods`` is ignored and
            every method is returned.
        selected_methods: Optional collection of method names. When it is
            non-empty and ``extract_all_methods`` is false, only methods
            with these names are returned.

    Returns:
        A list of dicts with the keys ``name``, ``docstring``, ``methods``
        and ``source_code``. Each entry of ``methods`` is a dict with the
        keys ``name``, ``args`` (positional parameter names), ``docstring``,
        ``source_code`` and ``function_calls``. If anything goes wrong, the
        error is printed and an empty list is returned.
    """
    import ast

    classes = []
    try:
        tree = ast.parse(content)

        # Split once; classes and methods are sliced out of these lines.
        source_lines = content.split('\n')

        def _source_of(definition):
            # lineno is 1-based and end_lineno is inclusive
            return '\n'.join(source_lines[definition.lineno - 1:definition.end_lineno])

        filter_methods = bool(selected_methods) and not extract_all_methods

        for node in ast.walk(tree):
            if not isinstance(node, ast.ClassDef):
                continue

            methods = []
            for child in node.body:
                if not isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
                    continue

                # Skip if we're looking for specific methods and this isn't one
                if filter_methods and child.name not in selected_methods:
                    continue

                method_source = _source_of(child)
                methods.append({
                    'name': child.name,
                    'args': [arg.arg for arg in child.args.args],
                    'docstring': ast.get_docstring(child) or "⚠️ No docstring provided",
                    'source_code': method_source,
                    'function_calls': extract_function_calls(method_source)
                })

            classes.append({
                'name': node.name,
                'docstring': ast.get_docstring(node) or "⚠️ No docstring provided",
                'methods': methods,
                'source_code': _source_of(node)
            })
        return classes
    except Exception as e:
        # Best effort: report the problem and return nothing.
        print(f"Error extracting classes: {e}")
        return []

def find_dependent_functions(output_functions, helper_functions):
    """Return the names of all helpers the output functions depend on.

    Starting from the names in each output function's ``function_calls``,
    every name that matches a helper is recorded, and that helper's own
    ``function_calls`` are followed in turn, so indirect dependencies are
    included. Names that match no helper are ignored.

    Args:
        output_functions: Iterable of dicts with a ``function_calls``
            collection of names.
        helper_functions: Iterable of dicts with a ``name`` and a
            ``function_calls`` collection of names. If a name occurs more
            than once, the first entry is used.

    Returns:
        A ``set`` of helper names. Progress is printed for debugging.
    """
    # Index helpers by name once instead of scanning the list for every name.
    helpers_by_name = {}
    for helper in helper_functions:
        helpers_by_name.setdefault(helper['name'], helper)

    # Collect initial function calls from output functions
    functions_to_check = set()
    for func in output_functions:
        functions_to_check.update(func['function_calls'])

    print(f"Initial functions to check from output functions: {functions_to_check}")

    dependencies = set()
    # Names already handled; this also stops cycles between helpers from
    # looping forever.
    processed_functions = set()

    # Process names until no more dependencies are found
    while functions_to_check:
        current_func_name = functions_to_check.pop()
        if current_func_name in processed_functions:
            continue
        processed_functions.add(current_func_name)

        helper = helpers_by_name.get(current_func_name)
        if helper is None:
            continue  # not a helper (e.g. a library call)

        dependencies.add(current_func_name)
        # Queue this helper's own calls that have not been handled yet
        new_dependencies = set(helper['function_calls']) - processed_functions
        functions_to_check.update(new_dependencies)
        print(f"Found helper: {current_func_name}, adding dependencies: {new_dependencies}")

    # For debugging: print the final dependencies
    print(f"Final dependencies: {dependencies}")

    return dependencies

def _render_imports_section(content):
    """Return the markdown "Dependencies" lines for ``content`` (empty list when nothing is imported)."""
    imports = extract_imports(content)
    if not imports:
        return []
    lines = ["\n### Dependencies", "The following modules are imported:"]
    lines.extend(f"- `{imp}`" for imp in imports)
    return lines


def _render_function_sections(functions):
    """Return the markdown lines (heading, docstring, source, calls) for each function dict."""
    lines = []
    for func in functions:
        args_str = ', '.join(func['args'])
        lines.append(f"\n#### `{func['name']}({args_str})`")
        lines.append(func['docstring'])
        lines.append("\nFunction Implementation:")
        lines.append("```python")
        lines.append(func['source_code'])
        lines.append("```")

        if func['function_calls']:
            lines.append("\nCalls the following functions:")
            lines.extend(f"- `{call}()`" for call in sorted(func['function_calls']))
    return lines


def generate_code_description():
    """Generate a structured description of the codebase focusing on selected output functions.

    Reads the module-level settings ``include_config``, ``file_paths`` and
    ``output_functions_config`` and builds a markdown document with:

    1. the selected output functions (all of them when none is selected),
       if the ``outputs`` file is enabled;
    2. the helper functions those output functions depend on, directly or
       indirectly, if the ``helpers`` file is enabled as well;
    3. statistics, imports, module docstring and full source of every other
       enabled file (``main_code``, ``chatbots``, ``doc_processor``).

    Progress information is printed while the analysis runs.

    Returns:
        The markdown document as a single string.
    """
    import re

    description = ["# Code Structure and Documentation with Focus on Output Functions\n"]

    # First, read the contents of every enabled file
    file_contents = {
        key: read_file_content(path)
        for key, path in file_paths.items()
        if include_config.get(key, False)
    }

    # Step 1: Extract outputs first to identify selected functions
    selected_output_functions = []
    # Bound up front so later steps never depend on which branch ran.
    uses_helper_wildcard = False
    if include_config.get('outputs', False) and 'outputs' in file_contents:
        outputs_content = file_contents['outputs']
        output_functions = extract_functions(outputs_content)

        # Get imports from outputs file to better understand dependencies
        output_imports = extract_imports(outputs_content)
        uses_helper_wildcard = any(
            "Utils.Helpers.*" in imp or "__ALL_HELPERS_AVAILABLE__" in imp
            for imp in output_imports
        )

        print(f"Imports in Outputs.py: {output_imports}")
        print(f"Uses Helper wildcard import: {uses_helper_wildcard}")

        if any(output_functions_config.values()):
            # Keep only the explicitly selected output functions
            selected_output_functions = [
                func for func in output_functions
                if output_functions_config.get(func['name'], False)
            ]
        else:
            # Nothing selected explicitly: use all of them
            selected_output_functions = output_functions

        print(f"Selected output functions: {[func['name'] for func in selected_output_functions]}")

    # Step 2: Extract helper functions
    helper_functions = []
    if include_config.get('helpers', False) and 'helpers' in file_contents:
        helper_functions = extract_functions(file_contents['helpers'])
        print(f"Available helper functions: {[func['name'] for func in helper_functions]}")

    # Step 3: Find dependent helper functions
    dependent_helper_functions = []
    if selected_output_functions and helper_functions:
        if uses_helper_wildcard:
            print("Detected wildcard import of Helpers, performing deeper dependency analysis")

        dependent_function_names = find_dependent_functions(selected_output_functions, helper_functions)

        if uses_helper_wildcard:
            # With a wildcard import any helper may be called, so also search
            # each output function's source for calls to every helper name.
            for output_func in selected_output_functions:
                source_code = output_func['source_code']
                for helper_func in helper_functions:
                    if re.search(r'\b' + re.escape(helper_func['name']) + r'\s*\(', source_code):
                        dependent_function_names.add(helper_func['name'])
                        print(f"Found additional helper function call: {helper_func['name']} in {output_func['name']}")

        dependent_helper_functions = [f for f in helper_functions if f['name'] in dependent_function_names]
        print(f"Found {len(dependent_helper_functions)} dependent helper functions")

    # Generate output for selected output functions
    if selected_output_functions:
        description.append("\n## Selected Output Functions from Utils/Outputs.py\n")

        outputs_content = file_contents.get('outputs', '')
        description.append("\n### File Statistics")
        description.append(f"- Lines of code: {count_lines_of_code(outputs_content)}")
        description.append(f"- Selected output functions: {len(selected_output_functions)}")

        description.extend(_render_imports_section(outputs_content))

        description.append("\n### Selected Output Functions")
        description.extend(_render_function_sections(selected_output_functions))

    # Generate output for dependent helper functions
    if dependent_helper_functions:
        description.append("\n## Dependent Helper Functions from Utils/Helpers.py\n")

        helpers_content = file_contents.get('helpers', '')
        description.append("\n### File Statistics")
        description.append(f"- Lines of code: {count_lines_of_code(helpers_content)}")
        description.append(f"- Dependent helper functions: {len(dependent_helper_functions)}")

        description.extend(_render_imports_section(helpers_content))

        description.append("\n### Helper Functions")
        description.extend(_render_function_sections(dependent_helper_functions))

    # Add other files if requested
    for key in ['main_code', 'chatbots', 'doc_processor']:
        if not include_config.get(key, False) or key not in file_contents:
            continue

        content = file_contents[key]

        description.append(f"\n## {file_paths[key]}\n")

        # Add code statistics
        description.append("\n### File Statistics")
        description.append(f"- Lines of code: {count_lines_of_code(content)}")

        # Add imports
        description.extend(_render_imports_section(content))

        # Add module docstring
        description.append("\n### Module Description")
        description.append(extract_docstring(content))

        # Add full source code
        description.append("\n### Source Code")
        description.append("```python")
        description.append(content)
        description.append("```")

    return '\n'.join(description)

In [3]:
# Which project files are read and described in this run (key -> enabled?).
include_config = dict(
    main_code=True,        # MediaCoverageAnalysis.py
    helpers=True,          # Utils/Helpers.py  - helper functions are extracted from here
    outputs=False,         # Utils/Outputs.py  - disabled in this run
    chatbots=True,         # Classes/SimplifiedChatbots.py
    doc_processor=True,    # Classes/DocumentProcessor.py
)

# Output functions to analyze: set the ones of interest to True.
# NOTE(review): generate_code_description() only consults this selection when
# the 'outputs' file is enabled, so with outputs=False above it appears to
# have no effect in this run - confirm that this is intended.
output_functions_config = dict(
    generate_journalist_list_output=False,             # list of journalists and media outlets
    generate_insights_output=False,                    # article insights
    generate_issue_analysis_output=False,              # issue analysis
    generate_topics_output=False,                      # topic summaries
    generate_analytics_output=False,                   # media analytics
    generate_stakeholder_quotes=False,                 # stakeholder analysis
    generate_consolidated_stakeholder_analysis=False,  # consolidated stakeholder analysis
    generate_journalist_article_list=False,            # article list for a journalist
    generate_journalist_profile=True,                  # journalist profile
    analyze_journalist_topic_coverage=False,           # journalist topic coverage analysis
    generate_journalist_analysis_output=False,         # journalist analytics
)

# Build the markdown description with the settings above and show it
description = generate_code_description()
print(description)

Helper imports found: [], Wildcard: False







Available helper functions: ['extract_hyperlinks', 'add_links_to_articles', 'check_input_paths', 'parse_relative_date', 'clean_date_string', 'process_article_date', 'ensure_directory_exists', 'process_pdfs', 'get_files', 'save_data_to_json', 'load_data_from_json', 'extract_metadata', 'clean_articles', 'filter_duplicates', 'get_embeddings', 'filter_relevant_articles', 'filter_top_categories', 'extract_sentiment_score', 'save_plot_base64', 'create_bar_chart_compiled_insights', 'create_sentiment_graph', 'create_category_sentiment_graph', 'create_horizontal_bar_chart', 'create_stacked_bar_chart', 'generate_media_outlet_pie_chart', 'generate_media_outlet_tone_chart', 'generate_overall_sentiment_trend', 'generate_sentiment_trends_by_category', 'generate_articles_per_category', 'generate_category_tone_chart', 'generate_top_journalists_chart', 'read_insights_content', 'generate_toc', 'extract_categories', 'generate_markdown_report', 'process_sta