In [28]:
import re
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from wordcloud import WordCloud, STOPWORDS
import networkx as nx
import numpy as np
from datetime import datetime

# Enhanced function to extract timestamps, speakers, roles and content
def extract_speech_data(text):
    """Parse a transcript into ``(timestamp, speaker, role, content)`` tuples.

    The text is split on lines that hold nothing but an ``H:MM``/``HH:MM``
    timestamp. Every chunk that follows a timestamp is scanned for
    ``Speaker (Role): content`` entries, where the role is optional and the
    content runs up to the next blank line or the end of the chunk. Text
    that precedes the first timestamp is not scanned on this path.

    If that yields no entries at all, the whole text is scanned once and
    every entry gets an empty-string timestamp.

    Args:
        text: Raw transcript text.

    Returns:
        A list of 4-tuples of stripped strings; the role is ``""`` when the
        entry has no parenthesised role.
    """
    # A timestamp sitting alone on its own line (format: HH:MM)
    timestamp_pattern = r'\n(\d{1,2}:\d{2})\n'

    # Speaker, optional role in parentheses, then content after the colon
    speech_pattern = r'(.*?)(?:\s*\((.*?)\))?\s*:\s*(.*?)(?=\n\n|\Z)'

    def parse_speeches(block, stamp):
        # One record per speaker/content pair found in this block.
        return [
            (stamp, who.strip(), title.strip() if title else "", said.strip())
            for who, title, said in re.findall(speech_pattern, block, re.DOTALL)
        ]

    # re.split with a capture group alternates text and timestamps:
    # [leading text, stamp, text, stamp, text, ...]
    pieces = re.split(timestamp_pattern, text)

    records = []
    for stamp, block in zip(pieces[1::2], pieces[2::2]):
        records.extend(parse_speeches(block, stamp))

    if records:
        return records

    # Nothing was found under a timestamp: scan the whole text instead.
    return parse_speeches(text, "")

# Process all text files in the directory
def process_all_files(directory):
    """Parse every dated ``.txt`` transcript found directly in *directory*.

    A file is considered only if its name ends with ``.txt`` and contains a
    ``DD_MM_YY`` group of two-digit numbers; the date is rendered as
    ``20YY-MM-DD``. Files that yield no speech records are left out.

    Files are visited in sorted filename order so that the row order of the
    combined frame and the order of the returned mapping are reproducible
    (``os.listdir`` itself returns entries in arbitrary order).

    Args:
        directory: Path of the folder holding the transcript files.

    Returns:
        A ``(combined_df, file_dates)`` tuple. ``combined_df`` has the
        columns ``Timestamp``, ``Speaker``, ``Role``, ``Content``, ``Date``
        and ``Filename``; ``file_dates`` maps each contributing filename to
        its date string. When nothing could be parsed the result is
        ``(pd.DataFrame(), {})``.
    """
    all_data = []
    file_dates = {}

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue

        date_match = re.search(r'(\d{2})_(\d{2})_(\d{2})', filename)
        if not date_match:
            continue

        day, month, year = date_match.groups()
        date = f"20{year}-{month}-{day}"

        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Extract speeches with timestamps
        data = extract_speech_data(text)
        if not data:
            continue

        # Create DataFrame for this file
        file_df = pd.DataFrame(data, columns=['Timestamp', 'Speaker', 'Role', 'Content'])
        file_df['Date'] = date
        file_df['Filename'] = filename

        all_data.append(file_df)
        file_dates[filename] = date

    # Combine all DataFrames
    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        return combined_df, file_dates

    return pd.DataFrame(), {}

# Add these visualization functions to your existing code

def get_extended_stopwords():
    """Return a new set: wordcloud's ``STOPWORDS`` plus extra filler words.

    The additions are common verbs, modals and generic nouns that would
    otherwise crowd out topical terms in debate transcripts.
    """
    filler_words = (
        'will', 'one', 'whether', 'said', 'get', 'also', 'make', 'well', 'say',
        'thank', 'think', 'way', 'come', 'right', 'know', 'take', 'see', 'going',
        'would', 'could', 'should', 'may', 'might', 'must', 'shall', 'can',
        'just', 'now', 'look', 'want', 'back', 'much', 'many', 'lot', 'thing',
        'next', 'made', 'like', 'good', 'set', 'put', 'year', 'day', 'time',
        'use', 'used', 'using', 'new', 'old', 'first', 'last', 'able', 'need',
        'point', 'every', 'across', 'example', 'really', 'quite', 'mean', 'within',
    )
    # Work on a copy so the library's own STOPWORDS is never modified.
    combined = set(STOPWORDS)
    combined.update(filler_words)
    return combined

def generate_word_cloud(text, title, output_path):
    """Render a stopword-filtered word cloud of *text* and save it as an image.

    Args:
        text: The text whose words are counted.
        title: Title drawn above the cloud.
        output_path: Destination path of the saved figure.
    """
    cloud_options = {
        'width': 1200,
        'height': 600,
        'background_color': 'white',
        'stopwords': get_extended_stopwords(),
        'max_words': 150,
        'collocations': False,
    }
    cloud = WordCloud(**cloud_options).generate(text)

    fig, ax = plt.subplots(figsize=(15, 8))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, pad=20)
    fig.savefig(output_path, bbox_inches='tight', dpi=300)
    plt.close(fig)

def generate_speaker_network(date_df, output_path):
    """Draw a directed "who spoke after whom" graph and save it as an image.

    Each pair of consecutive rows with different speakers adds one to the
    weight of the edge ``previous speaker -> current speaker``. Node size
    grows with node degree and edge width with edge weight.

    Node labels show the speaker name and, on a second line, the first
    non-empty role recorded for that speaker; speakers without any role are
    labelled with their name only (no empty parentheses).

    Args:
        date_df: Frame with ``Speaker`` and ``Role`` columns, in speaking
            order.
        output_path: Destination path of the saved figure.
    """
    speakers = date_df['Speaker'].tolist()

    # Count hand-overs between consecutive, different speakers.
    interactions = defaultdict(int)
    for prev_speaker, current_speaker in zip(speakers, speakers[1:]):
        if prev_speaker != current_speaker:
            interactions[(prev_speaker, current_speaker)] += 1

    G = nx.DiGraph()
    for (source, target), weight in interactions.items():
        G.add_edge(source, target, weight=weight)

    # First non-empty role per speaker, so a later row without a role
    # cannot blank out a role that was given earlier.
    roles = {}
    for speaker, role in zip(date_df['Speaker'], date_df['Role']):
        if speaker not in roles and isinstance(role, str) and role.strip():
            roles[speaker] = role

    plt.figure(figsize=(18, 12))
    try:
        # Fixed seed keeps the layout identical between runs.
        pos = nx.spring_layout(G, k=0.5, seed=42)

        # Node sizing based on degree
        node_sizes = [2000 + G.degree(speaker) * 300 for speaker in G.nodes()]

        nx.draw_networkx_nodes(
            G, pos,
            node_size=node_sizes,
            node_color='lightblue',
            alpha=0.9
        )

        # Edge styling
        edge_weights = [G[u][v]['weight'] * 2 for u, v in G.edges()]
        nx.draw_networkx_edges(
            G, pos,
            width=edge_weights,
            edge_color='gray',
            alpha=0.7,
            arrowsize=20
        )

        # Labels with roles (role line only when a role is known)
        labels = {
            node: f"{node}\n({roles[node]})" if node in roles else f"{node}"
            for node in G.nodes()
        }
        nx.draw_networkx_labels(G, pos, labels=labels, font_size=9, font_family='sans-serif')

        plt.title("Speaker Interaction Flow", pad=20)
        plt.axis('off')
        plt.savefig(output_path, bbox_inches='tight', dpi=300)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close()

# Main processing pipeline
def _sort_chronologically(df):
    """Return *df* ordered by date, then time of day, then original row order.

    ``Timestamp`` holds ``H:MM``/``HH:MM`` strings, so sorting it as text
    would put "10:00" before "9:30". It is converted to minutes since
    midnight for ordering; values that do not look like a clock time
    (e.g. the empty string) sort first within their date. The original row
    position is the final tie-breaker, so speeches that share a timestamp
    keep the order in which they were parsed.
    """
    clock = df['Timestamp'].astype(str).str.extract(r'^(\d{1,2}):(\d{2})$')
    minutes = (
        pd.to_numeric(clock[0], errors='coerce') * 60
        + pd.to_numeric(clock[1], errors='coerce')
    ).fillna(-1)

    ordered = df.assign(_minutes=minutes, _seq=list(range(len(df))))
    ordered = ordered.sort_values(['Date', '_minutes', '_seq'])
    return ordered.drop(columns=['_minutes', '_seq'])


def process_parliamentary_minutes(directory, output_dir):
    """Parse all transcripts in *directory* and write a CSV plus charts.

    Writes ``parliamentary_minutes.csv`` into *output_dir* and, for every
    distinct date, a word cloud, a speaker-frequency bar chart and a speaker
    interaction network (``wordcloud_<date>.png``, ``speaker_freq_<date>.png``
    and ``interaction_network_<date>.png``). *output_dir* is created if it
    does not exist.

    Args:
        directory: Folder containing the transcript ``.txt`` files.
        output_dir: Folder that receives the CSV and the images.

    Returns:
        A ``(df, speakers_df)`` tuple: all speeches in chronological order,
        and the per-speaker summary from ``extract_speakers_and_roles``.
        When no file could be parsed, both are empty frames so that callers
        can always unpack two values.
    """
    os.makedirs(output_dir, exist_ok=True)

    print(f"Processing files in: {directory}")
    df, file_dates = process_all_files(directory)

    if df.empty:
        print("No valid files found.")
        return df, pd.DataFrame()

    # Sort and save data
    df = _sort_chronologically(df)
    df.to_csv(os.path.join(output_dir, "parliamentary_minutes.csv"), index=False)

    # Process each date separately for visualizations. Several files can
    # share a date, so go through distinct dates (in first-seen order)
    # instead of regenerating the same images once per file.
    for date in dict.fromkeys(file_dates.values()):
        date_str = date.replace('-', '_')
        date_df = df[df['Date'] == date]

        print(f"Creating visualizations for {date}")

        # Word Cloud
        generate_word_cloud(
            ' '.join(date_df['Content']),
            f"Key Topics - {date}",
            os.path.join(output_dir, f"wordcloud_{date_str}.png")
        )

        # Speaker Frequency
        plt.figure(figsize=(14, 7))
        date_df['Speaker'].value_counts().plot(kind='barh', color='skyblue')
        plt.title(f"Speaking Frequency - {date}")
        plt.xlabel('Number of Contributions')
        plt.ylabel('Speaker')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f"speaker_freq_{date_str}.png"), dpi=300)
        plt.close()

        # Interaction Network
        generate_speaker_network(
            date_df,
            os.path.join(output_dir, f"interaction_network_{date_str}.png")
        )

    print(f"Processing complete. Results saved to: {output_dir}")
    return df, extract_speakers_and_roles(df)


# Extract speakers and their roles function
def extract_speakers_and_roles(df):
    """Summarise each speaker's role and speaking statistics.

    Args:
        df: Frame with ``Speaker``, ``Role`` and ``Content`` columns, one
            row per contribution.

    Returns:
        A frame with one row per speaker and the columns ``Speaker``,
        ``Role/Organization``, ``Number_of_Contributions``, ``Total_Words``
        and ``Average_Words_Per_Contribution``, sorted by number of
        contributions in descending order (ties keep first-appearance
        order). The role is the first non-empty role recorded for the
        speaker, or ``"Unknown"`` if there is none. An empty *df* gives an
        empty frame that still carries these columns.
    """
    columns = [
        'Speaker',
        'Role/Organization',
        'Number_of_Contributions',
        'Total_Words',
        'Average_Words_Per_Contribution',
    ]

    if df.empty:
        return pd.DataFrame(columns=columns)

    speakers_data = []

    # sort=False keeps speakers in order of first appearance.
    for speaker, speaker_df in df.groupby('Speaker', sort=False):
        # A speaker's first row may lack a role that a later row provides,
        # so look through all of them rather than only the first value.
        named_roles = [
            role for role in speaker_df['Role']
            if isinstance(role, str) and role.strip()
        ]
        role = named_roles[0] if named_roles else "Unknown"

        # Calculate speaking statistics
        word_count = speaker_df['Content'].str.split().str.len().sum()
        contribution_count = len(speaker_df)
        avg_words_per_contribution = word_count / contribution_count if contribution_count > 0 else 0

        speakers_data.append({
            'Speaker': speaker,
            'Role/Organization': role,
            'Number_of_Contributions': contribution_count,
            'Total_Words': word_count,
            'Average_Words_Per_Contribution': avg_words_per_contribution
        })

    summary = pd.DataFrame(speakers_data, columns=columns)
    # mergesort is stable, so equal counts stay in first-appearance order.
    return summary.sort_values('Number_of_Contributions', ascending=False, kind='mergesort')



# Driver: run the whole pipeline for one input folder.
# input_dir is scanned for .txt transcripts whose names contain a DD_MM_YY date.
input_dir = "C://Users//Usama.Khatab//Projects//Software Projects//mleng//project-info//data"  # Change to your directory with text files
# output_dir receives the combined CSV and the per-date PNG charts; it is created if missing.
output_dir = "output"  # Change to your desired output directory
# df holds one row per parsed speech; speakers_df holds the per-speaker summary.
df, speakers_df = process_parliamentary_minutes(input_dir, output_dir)


Processing files in: C://Users//Usama.Khatab//Projects//Software Projects//mleng//project-info//data
Creating visualizations for 2025-01-07
Creating visualizations for 2024-10-08
Creating visualizations for 2024-09-10
Creating visualizations for 2024-06-26
Processing complete. Results saved to: output
