In [9]:
import json
import networkx as nx
import matplotlib.pyplot as plt
import csv
import os

# Read and parse the JSON data
def load_data(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    articles = []
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            record = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would otherwise make json.loads raise.
            if not record:
                continue
            articles.append(json.loads(record))
    return articles

# Create field connections graph
def create_field_graph(articles):
    """Build the field co-occurrence graph.

    Every distinct field ``display_name`` becomes a node. A node's ``count``
    attribute is the number of articles that mention the field at least once.
    Two fields are linked when they occur on the same article, and the edge
    ``weight`` is the number of articles on which they co-occur. A node gets
    a ``domain`` attribute when at least one of its topics supplied a domain
    (the last one seen wins).

    Args:
        articles: Iterable of article dicts. Each may carry a ``'topics'``
            list of topic dicts whose optional ``'field'`` and ``'domain'``
            entries are dicts with a ``'display_name'`` key. Articles with a
            missing or ``None`` ``'topics'`` entry contribute nothing.

    Returns:
        An undirected ``networkx.Graph``.
    """
    from itertools import combinations

    G = nx.Graph()

    # Track field to domain mapping
    field_to_domain = {}

    for article in articles:
        # Unique fields of this article, so an article counts once per field.
        fields = set()
        for topic in article.get('topics') or ():
            field = topic.get('field')
            if field is None:
                continue
            field_name = field['display_name']
            fields.add(field_name)

            domain = topic.get('domain')
            if domain is not None:
                field_to_domain[field_name] = domain['display_name']

            # Nodes are created in order of first appearance.
            if not G.has_node(field_name):
                G.add_node(field_name, count=0)

        for field_name in fields:
            G.nodes[field_name]['count'] += 1

        # Sorting makes the pair orientation independent of set iteration
        # order, which varies between interpreter runs for strings.
        for first, second in combinations(sorted(fields), 2):
            if G.has_edge(first, second):
                G[first][second]['weight'] += 1
            else:
                G.add_edge(first, second, weight=1)

    # Add domain info to each node
    for node, domain_name in field_to_domain.items():
        G.nodes[node]['domain'] = domain_name

    return G

# Create subfield connections graph
def create_subfield_graph(articles):
    """Build the subfield co-occurrence graph.

    Every distinct subfield ``display_name`` becomes a node. A node's
    ``count`` attribute is the number of articles that mention the subfield
    at least once. Two subfields are linked when they occur on the same
    article, and the edge ``weight`` is the number of articles on which they
    co-occur. Nodes get ``field`` and ``domain`` attributes when at least one
    of their topics supplied them (the last one seen wins).

    Args:
        articles: Iterable of article dicts. Each may carry a ``'topics'``
            list of topic dicts whose optional ``'subfield'``, ``'field'``
            and ``'domain'`` entries are dicts with a ``'display_name'`` key.
            Articles with a missing or ``None`` ``'topics'`` entry contribute
            nothing.

    Returns:
        An undirected ``networkx.Graph``.
    """
    from itertools import combinations

    G = nx.Graph()

    # Track subfield to field and domain mapping for visualization
    subfield_to_field = {}
    subfield_to_domain = {}

    for article in articles:
        # Unique subfields of this article, so it counts once per subfield.
        subfields = set()
        for topic in article.get('topics') or ():
            subfield = topic.get('subfield')
            if subfield is None:
                continue
            subfield_name = subfield['display_name']
            subfields.add(subfield_name)

            field = topic.get('field')
            if field is not None:
                subfield_to_field[subfield_name] = field['display_name']

            domain = topic.get('domain')
            if domain is not None:
                subfield_to_domain[subfield_name] = domain['display_name']

            # Nodes are created in order of first appearance.
            if not G.has_node(subfield_name):
                G.add_node(subfield_name, count=0)

        for subfield_name in subfields:
            G.nodes[subfield_name]['count'] += 1

        # Sorting makes the pair orientation independent of set iteration
        # order, which varies between interpreter runs for strings.
        for first, second in combinations(sorted(subfields), 2):
            if G.has_edge(first, second):
                G[first][second]['weight'] += 1
            else:
                G.add_edge(first, second, weight=1)

    # Add field and domain info to each node
    for node in G.nodes():
        if node in subfield_to_field:
            G.nodes[node]['field'] = subfield_to_field[node]
        if node in subfield_to_domain:
            G.nodes[node]['domain'] = subfield_to_domain[node]

    return G

# Draw network graph as a traditional node-edge graph
def draw_network(G, title, node_size_attr='count'):
    """Render *G* as a node-link diagram and save it as a PNG.

    Nodes are sized by ``node_size_attr`` and coloured by their ``domain``
    attribute (``'Unknown'`` when absent). Edges get thicker and darker as
    their ``weight`` grows. The figure is written to
    ``<title with spaces replaced by underscores>.png`` at 400 dpi.

    Args:
        G: Graph whose nodes may carry ``node_size_attr`` and ``domain``
            attributes and whose edges may carry ``weight``.
        title: Figure title, also used to derive the output file name. The
            exact title ``"Subfield Connections"`` switches to selective
            labelling; any other title labels every node.
        node_size_attr: Node attribute used for sizing and for the label
            importance threshold.

    Returns:
        The ``{node: position}`` mapping produced by ``nx.spring_layout``.
    """
    import matplotlib as mpl

    plt.figure(figsize=(10, 7))

    nodes = list(G.nodes())

    # Node size: small constant base plus a term linear in the count.
    base_size = 2  # Minimum node size
    sizes = [base_size + (G.nodes[node].get(node_size_attr, 1) * 0.2) for node in nodes]

    # Edge line widths are interpolated between these two bounds.
    min_width = 0.1
    max_width = 0.32

    def edge_style(norm_weight):
        """Return (line width, RGB gray) for a weight normalised to [0, 1]."""
        # 0.8 (light gray) for the weakest edge down to 0.0 (black).
        gray = 0.8 - (norm_weight * 0.8)
        return min_width + norm_weight * (max_width - min_width), (gray, gray, gray)

    # Normalize weights for edge coloring and width
    all_weights = [data.get('weight', 1) for _, _, data in G.edges(data=True)]
    min_weight = min(all_weights, default=1)
    max_weight = max(all_weights, default=1)
    weight_span = max_weight - min_weight

    edge_widths = []
    edge_colors = []
    for weight in all_weights:
        # When all weights are equal there is no range; use the midpoint.
        norm_weight = (weight - min_weight) / weight_span if weight_span > 0 else 0.5
        width, color = edge_style(norm_weight)
        edge_widths.append(width)
        edge_colors.append(color)

    # Assign each domain an index in order of first appearance.
    domains = {}
    for node in nodes:
        domains.setdefault(G.nodes[node].get('domain', 'Unknown'), len(domains))

    cmap = mpl.colormaps['tab10']

    def domain_color(domain):
        # Wrap around the palette so that indices past its length do not
        # all collapse onto the last colour.
        return cmap(domains[domain] % cmap.N)

    node_colors = [domain_color(G.nodes[node].get('domain', 'Unknown')) for node in nodes]

    # Fixed seed keeps the layout reproducible between runs.
    pos = nx.spring_layout(G, k=0.9, seed=42)

    nx.draw_networkx_edges(G, pos, width=edge_widths, edge_color=edge_colors, alpha=0.6)
    nx.draw_networkx_nodes(G, pos, node_size=sizes, node_color=node_colors,
                           alpha=0.7, linewidths=0.1, edgecolors='black')

    # Decide which nodes get a text label.
    if title == "Subfield Connections" and nodes:
        # Label a node when its count or its degree reaches the value found
        # a quarter of the way down the respective descending ranking.
        counts = {node: G.nodes[node].get(node_size_attr, 1) for node in nodes}
        degrees = dict(G.degree())
        cutoff = min(len(nodes) // 4, len(nodes) - 1)
        count_threshold = sorted(counts.values(), reverse=True)[cutoff]
        degree_threshold = sorted(degrees.values(), reverse=True)[cutoff]
        labelled = [node for node in nodes
                    if counts[node] >= count_threshold or degrees[node] >= degree_threshold]
    else:
        labelled = nodes

    for node in labelled:
        x, y = pos[node]
        # Small offset and left alignment keep the text off the node marker.
        plt.text(x + 0.01, y + 0.01, node, fontsize=2.5, fontweight="normal",
                 ha='left', va='bottom',
                 bbox=dict(facecolor='white', alpha=0.4, boxstyle='round', pad=0.02, linewidth=0))

    # Add a legend for domains
    legend_elements = [
        plt.Line2D([0], [0], marker='o', color='w',
                   markerfacecolor=domain_color(domain), markersize=4, label=domain)
        for domain in domains
    ]

    if weight_span > 0:
        # Each sample line is styled with the same fraction that produces
        # its label, so the legend matches the edges actually drawn.
        weight_legend = []
        for fraction in (0.2, 0.6, 1.0):
            width, color = edge_style(fraction)
            sample_weight = weight_span * fraction + min_weight
            weight_legend.append(plt.Line2D([0], [0], color=color, linewidth=width,
                                            label=f'≈ {sample_weight:.1f}'))

        plt.legend(handles=legend_elements + weight_legend,
                   title="Domains & Connection Strength",
                   loc="upper right", fontsize=5, title_fontsize=6,
                   framealpha=0.7)
    else:
        plt.legend(handles=legend_elements, title="Domains",
                   loc="upper right", fontsize=5, title_fontsize=6)

    plt.title(title, fontsize=11, fontweight='bold')
    plt.axis('off')
    plt.tight_layout()

    plt.savefig(f"{title.replace(' ', '_')}.png", dpi=400, bbox_inches='tight')
    plt.close()

    return pos

# Export to Gephi format with clear node-edge structure
def export_to_gephi(G, filename):
    """Export *G* for Gephi as a GEXF file plus node and edge CSV files.

    Writes ``<filename>.gexf`` in the current directory and
    ``gephi_csv/<filename>_nodes.csv`` / ``gephi_csv/<filename>_edges.csv``
    (the directory is created when missing).

    Note: a ``'viz'`` attribute (size, position and colour for nodes;
    thickness and colour for edges) is added to every node and edge of *G*
    in place before the GEXF file is written.

    Args:
        G: Graph whose nodes may carry ``count``, ``domain`` and ``field``
            attributes and whose edges may carry ``weight``.
        filename: Base name (without extension) of the output files.
    """
    import hashlib

    # Get min and max weights for normalization
    all_weights = [data.get('weight', 1) for _, _, data in G.edges(data=True)]
    min_weight = min(all_weights, default=1)
    max_weight = max(all_weights, default=1)
    weight_span = max_weight - min_weight

    # The layout is the same for every node, so compute it once rather than
    # once per node.
    pos = nx.spring_layout(G, k=0.9, seed=42)

    for node, data in G.nodes(data=True):
        domain = data.get('domain', 'Unknown')
        # Derive a colour from the domain name: the three lowest bytes of
        # its MD5 digest become the integer r, g and b channels.
        domain_hash = int(hashlib.md5(domain.encode()).hexdigest(), 16)

        data['viz'] = {
            'size': 0.3 + (data.get('count', 1) * 0.1),
            'position': {'x': float(pos[node][0] * 1000),
                         'y': float(pos[node][1] * 1000),
                         'z': 0.0},
            'color': {'r': domain_hash & 0xFF,
                      'g': (domain_hash >> 8) & 0xFF,
                      'b': (domain_hash >> 16) & 0xFF,
                      'a': 0.7},
        }

    for u, v, data in G.edges(data=True):
        weight = data.get('weight', 1)
        # When all weights are equal there is no range; use the midpoint.
        norm_weight = (weight - min_weight) / weight_span if weight_span > 0 else 0.5

        # From 200 (light gray) for the weakest edge to 0 (black).
        gray_val = int(200 - (norm_weight * 200))
        data['viz'] = {
            'thickness': 0.05 + (norm_weight * 0.12),
            'color': {'r': gray_val, 'g': gray_val, 'b': gray_val, 'a': 0.6},
        }

    # Export to GEXF (Graph Exchange XML Format)
    nx.write_gexf(G, f"{filename}.gexf")

    # Create CSV files for nodes and edges
    os.makedirs('gephi_csv', exist_ok=True)

    # Explicit UTF-8 so names with non-ASCII characters are written the same
    # way on every platform.
    with open(f"gephi_csv/{filename}_nodes.csv", 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Id', 'Label', 'Count', 'Domain', 'Field'])
        for node, data in G.nodes(data=True):
            writer.writerow([node, node, data.get('count', 1),
                             data.get('domain', 'Unknown'), data.get('field', '')])

    with open(f"gephi_csv/{filename}_edges.csv", 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Source', 'Target', 'Weight', 'Type'])
        for u, v, data in G.edges(data=True):
            writer.writerow([u, v, data.get('weight', 1), 'Undirected'])

    print(f"Gephi files created: {filename}.gexf and CSV files in gephi_csv/")

def main():
    """Run the full pipeline for the field and subfield graphs.

    Reads ``all_articles.jsonl`` (path relative to the working directory),
    then for each of the two graphs: builds it, prints its node and edge
    counts, draws it and exports it for Gephi. Finally prints step-by-step
    Gephi import instructions.
    """
    articles = load_data("all_articles.jsonl")
    print(f"Loaded {len(articles)} articles")

    # Field-level graph: build, report, draw, export.
    field_graph = create_field_graph(articles)
    print(f"Field graph created with {len(field_graph.nodes())} nodes and {len(field_graph.edges())} edges")
    draw_network(field_graph, "Field Connections")
    export_to_gephi(field_graph, "field_connections")

    # Subfield-level graph: same four steps.
    subfield_graph = create_subfield_graph(articles)
    print(f"Subfield graph created with {len(subfield_graph.nodes())} nodes and {len(subfield_graph.edges())} edges")
    draw_network(subfield_graph, "Subfield Connections")
    export_to_gephi(subfield_graph, "subfield_connections")

    # Usage notes for opening the exported files in Gephi, one per line.
    instructions = (
        "\nGephi Import Instructions:",
        "1. Open Gephi",
        "2. Go to File -> Open and select the .gexf file",
        "3. In the Import Report dialog, click OK",
        "4. For a clear node-edge visualization:",
        "   - Go to the Layout panel and apply 'ForceAtlas 2'",
        "   - Check 'Prevent Overlap' and click 'Run'",
        "   - After the layout stabilizes, click 'Stop'",
        "5. In the Appearance panel:",
        "   - To color nodes by domain: Select Nodes > Partition > domain attribute",
        "   - To size nodes by count: Select Nodes > Ranking > count attribute",
        "6. To adjust edge visibility:",
        "   - Select Edges > Ranking > weight",
        "   - Adjust the min/max size to make connections visible",
        "7. In Preview, select a template like 'Default Straight' to see clear edges",
    )
    for line in instructions:
        print(line)


if __name__ == "__main__":
    main()

Loaded 62554 articles
Field graph created with 26 nodes and 296 edges
Gephi files created: field_connections.gexf and CSV files in gephi_csv/
Subfield graph created with 241 nodes and 6990 edges
Gephi files created: subfield_connections.gexf and CSV files in gephi_csv/

Gephi Import Instructions:
1. Open Gephi
2. Go to File -> Open and select the .gexf file
3. In the Import Report dialog, click OK
4. For a clear node-edge visualization:
   - Go to the Layout panel and apply 'ForceAtlas 2'
   - Check 'Prevent Overlap' and click 'Run'
   - After the layout stabilizes, click 'Stop'
5. In the Appearance panel:
   - To color nodes by domain: Select Nodes > Partition > domain attribute
   - To size nodes by count: Select Nodes > Ranking > count attribute
6. To adjust edge visibility:
   - Select Edges > Ranking > weight
   - Adjust the min/max size to make connections visible
7. In Preview, select a template like 'Default Straight' to see clear edges


In [18]:
import json
import networkx as nx
import matplotlib.pyplot as plt
import csv
import os
import pandas as pd
import numpy as np
from collections import Counter

# Read and parse the JSON data
def load_data(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    articles = []
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            record = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would otherwise make json.loads raise.
            if not record:
                continue
            articles.append(json.loads(record))
    return articles

# Create field connections graph
def create_field_graph(articles):
    """Build the field co-occurrence graph.

    Every distinct field ``display_name`` becomes a node. A node's ``count``
    attribute is the number of articles that mention the field at least once.
    Two fields are linked when they occur on the same article, and the edge
    ``weight`` is the number of articles on which they co-occur. A node gets
    a ``domain`` attribute when at least one of its topics supplied a domain
    (the last one seen wins).

    Args:
        articles: Iterable of article dicts. Each may carry a ``'topics'``
            list of topic dicts whose optional ``'field'`` and ``'domain'``
            entries are dicts with a ``'display_name'`` key. Articles with a
            missing or ``None`` ``'topics'`` entry contribute nothing.

    Returns:
        An undirected ``networkx.Graph``.
    """
    from itertools import combinations

    G = nx.Graph()

    # Track field to domain mapping
    field_to_domain = {}

    for article in articles:
        # Unique fields of this article, so an article counts once per field.
        fields = set()
        for topic in article.get('topics') or ():
            field = topic.get('field')
            if field is None:
                continue
            field_name = field['display_name']
            fields.add(field_name)

            domain = topic.get('domain')
            if domain is not None:
                field_to_domain[field_name] = domain['display_name']

            # Nodes are created in order of first appearance.
            if not G.has_node(field_name):
                G.add_node(field_name, count=0)

        for field_name in fields:
            G.nodes[field_name]['count'] += 1

        # Sorting makes the pair orientation independent of set iteration
        # order, which varies between interpreter runs for strings.
        for first, second in combinations(sorted(fields), 2):
            if G.has_edge(first, second):
                G[first][second]['weight'] += 1
            else:
                G.add_edge(first, second, weight=1)

    # Add domain info to each node
    for node, domain_name in field_to_domain.items():
        G.nodes[node]['domain'] = domain_name

    return G

# Create subfield connections graph
def create_subfield_graph(articles):
    """Build the subfield co-occurrence graph.

    Every distinct subfield ``display_name`` becomes a node. A node's
    ``count`` attribute is the number of articles that mention the subfield
    at least once. Two subfields are linked when they occur on the same
    article, and the edge ``weight`` is the number of articles on which they
    co-occur. Nodes get ``field`` and ``domain`` attributes when at least one
    of their topics supplied them (the last one seen wins).

    Args:
        articles: Iterable of article dicts. Each may carry a ``'topics'``
            list of topic dicts whose optional ``'subfield'``, ``'field'``
            and ``'domain'`` entries are dicts with a ``'display_name'`` key.
            Articles with a missing or ``None`` ``'topics'`` entry contribute
            nothing.

    Returns:
        An undirected ``networkx.Graph``.
    """
    from itertools import combinations

    G = nx.Graph()

    # Track subfield to field and domain mapping for visualization
    subfield_to_field = {}
    subfield_to_domain = {}

    for article in articles:
        # Unique subfields of this article, so it counts once per subfield.
        subfields = set()
        for topic in article.get('topics') or ():
            subfield = topic.get('subfield')
            if subfield is None:
                continue
            subfield_name = subfield['display_name']
            subfields.add(subfield_name)

            field = topic.get('field')
            if field is not None:
                subfield_to_field[subfield_name] = field['display_name']

            domain = topic.get('domain')
            if domain is not None:
                subfield_to_domain[subfield_name] = domain['display_name']

            # Nodes are created in order of first appearance.
            if not G.has_node(subfield_name):
                G.add_node(subfield_name, count=0)

        for subfield_name in subfields:
            G.nodes[subfield_name]['count'] += 1

        # Sorting makes the pair orientation independent of set iteration
        # order, which varies between interpreter runs for strings.
        for first, second in combinations(sorted(subfields), 2):
            if G.has_edge(first, second):
                G[first][second]['weight'] += 1
            else:
                G.add_edge(first, second, weight=1)

    # Add field and domain info to each node
    for node in G.nodes():
        if node in subfield_to_field:
            G.nodes[node]['field'] = subfield_to_field[node]
        if node in subfield_to_domain:
            G.nodes[node]['domain'] = subfield_to_domain[node]

    return G

# Draw network graph as a traditional node-edge graph
def draw_network(G, title, node_size_attr='count'):
    """Render *G* as a compact node-link diagram and save it as PDF and PNG.

    Nodes are sized by ``node_size_attr`` and coloured by their ``domain``
    attribute (``'Unknown'`` when absent). Edges get thicker and darker as
    their ``weight`` grows. Only a subset of nodes is labelled: those with a
    high count (and, for the subfield graph, a high degree) or lying far from
    the centroid of the layout. The figure is written to
    ``<title with spaces replaced by underscores>.pdf`` and ``.png``.

    Args:
        G: Graph whose nodes may carry ``node_size_attr`` and ``domain``
            attributes and whose edges may carry ``weight``.
        title: Figure title, also used to derive the output file names. The
            exact title ``"Field Connections"`` selects the smaller figure;
            the exact title ``"Subfield Connections"`` selects the stricter
            labelling thresholds and the smaller label font.
        node_size_attr: Node attribute used for sizing and for the label
            importance threshold.

    Returns:
        The ``{node: position}`` mapping produced by ``nx.spring_layout``.
    """
    import matplotlib as mpl

    # Field charts are drawn smaller than the denser subfield charts.
    side = 1.8 if title == "Field Connections" else 2.3
    plt.figure(figsize=(side, side))

    nodes = list(G.nodes())

    # Node size: tiny constant base plus a term linear in the count.
    base_size = 0.1
    max_size_factor = 0.005
    sizes = [base_size + (G.nodes[node].get(node_size_attr, 1) * max_size_factor) for node in nodes]

    # Edge line widths are interpolated between these two bounds.
    min_width = 0.05
    max_width = 0.15

    def edge_style(norm_weight):
        """Return (line width, RGB gray) for a weight normalised to [0, 1]."""
        # 0.7 (light gray) for the weakest edge down to 0.0 (black).
        gray = 0.7 - (norm_weight * 0.7)
        return min_width + norm_weight * (max_width - min_width), (gray, gray, gray)

    # Normalize weights for edge coloring and width
    all_weights = [data.get('weight', 1) for _, _, data in G.edges(data=True)]
    min_weight = min(all_weights, default=1)
    max_weight = max(all_weights, default=1)
    weight_span = max_weight - min_weight

    edge_widths = []
    edge_colors = []
    for weight in all_weights:
        # When all weights are equal there is no range; use the midpoint.
        norm_weight = (weight - min_weight) / weight_span if weight_span > 0 else 0.5
        width, color = edge_style(norm_weight)
        edge_widths.append(width)
        edge_colors.append(color)

    # Assign each domain an index in order of first appearance.
    domains = {}
    for node in nodes:
        domains.setdefault(G.nodes[node].get('domain', 'Unknown'), len(domains))

    cmap = mpl.colormaps['tab10']

    def domain_color(domain):
        # Wrap around the palette so that indices past its length do not
        # all collapse onto the last colour.
        return cmap(domains[domain] % cmap.N)

    node_colors = [domain_color(G.nodes[node].get('domain', 'Unknown')) for node in nodes]

    # Fixed seed keeps the layout reproducible between runs.
    pos = nx.spring_layout(G, k=2.5, iterations=200, seed=42)

    nx.draw_networkx_edges(G, pos, width=edge_widths, edge_color=edge_colors, alpha=0.5)
    nx.draw_networkx_nodes(G, pos, node_size=sizes, node_color=node_colors,
                           alpha=0.5, linewidths=0.03, edgecolors='black')

    def ranked_cutoff(values, divisor):
        """Return the value found 1/divisor of the way down the descending ranking."""
        ranked = sorted(values, reverse=True)
        return ranked[min(len(ranked) // divisor, len(ranked) - 1)]

    # Labelling needs at least one node: the centroid and the thresholds are
    # undefined for an empty graph.
    if pos:
        center_x = sum(x for x, _ in pos.values()) / len(pos)
        center_y = sum(y for _, y in pos.values()) / len(pos)
        distances = {node: np.sqrt((x - center_x)**2 + (y - center_y)**2)
                     for node, (x, y) in pos.items()}
        counts = {node: G.nodes[node].get(node_size_attr, 1) for node in pos}

        if title == "Subfield Connections":
            # Restrictive: the count/degree cutoff sits 1/20 of the way down
            # the ranking, the distance cutoff 1/7 of the way down.
            degrees = dict(G.degree())
            count_threshold = ranked_cutoff(counts.values(), 20)
            degree_threshold = ranked_cutoff(degrees.values(), 20)
            distance_threshold = ranked_cutoff(distances.values(), 7)
            font_size = 1.8
            labelled = [node for node in pos
                        if (counts[node] >= count_threshold
                            or degrees[node] >= degree_threshold
                            or distances[node] >= distance_threshold)]
        else:
            # More generous: the count cutoff sits 1/6 of the way down the
            # ranking, the distance cutoff 1/3 of the way down.
            count_threshold = ranked_cutoff(counts.values(), 6)
            distance_threshold = ranked_cutoff(distances.values(), 3)
            font_size = 2.5
            labelled = [node for node in pos
                        if counts[node] >= count_threshold
                        or distances[node] >= distance_threshold]

        for node in labelled:
            x, y = pos[node]
            plt.text(x, y, node, fontsize=font_size, ha='center', va='center',
                     bbox=dict(facecolor='white', alpha=0.9, boxstyle='round',
                               pad=0.3, linewidth=0))

    # Rank domains by how many nodes they contain. Ties keep first-appearance
    # order.
    ranked_domains = Counter(
        data.get('domain', 'Unknown') for _, data in G.nodes(data=True)
    ).most_common()

    # Keep the five largest domains plus any tied with the fifth, at most 8.
    if ranked_domains:
        cutoff_count = ranked_domains[min(4, len(ranked_domains) - 1)][1]
        ranked_domains = [entry for entry in ranked_domains if entry[1] >= cutoff_count][:8]

    legend_elements = [
        plt.Line2D([0], [0], marker='o', color='w',
                   markerfacecolor=domain_color(domain), markersize=3, label=domain)
        for domain, _ in ranked_domains
    ]

    # Legend samples reuse the edge styling function so that they show the
    # same width and gray level as the weakest and strongest drawn edges.
    weak_width, weak_color = edge_style(0.0)
    strong_width, strong_color = edge_style(1.0)
    weight_legend = [
        plt.Line2D([0], [0], color=weak_color, linewidth=weak_width, label='Weak Link'),
        plt.Line2D([0], [0], color=strong_color, linewidth=strong_width, label='Strong Link')
    ]

    # Always position legend at the bottom center
    plt.legend(handles=legend_elements + weight_legend,
               loc="lower center", bbox_to_anchor=(0.5, -0.15),
               ncol=2, fontsize=3.5, title_fontsize=0,
               framealpha=0.8, borderpad=0.4, labelspacing=0.3,
               handletextpad=0.4, handlelength=1.0)

    plt.title(title, fontsize=8, pad=5)
    plt.axis('off')
    plt.tight_layout()

    # Vector PDF for publication, PNG for quick reference.
    plt.savefig(f"{title.replace(' ', '_')}.pdf", format='pdf', bbox_inches='tight', dpi=1200)
    plt.savefig(f"{title.replace(' ', '_')}.png", dpi=1200, bbox_inches='tight')
    plt.close()

    return pos

# Generate top fields/subfields by frequency
def generate_top_nodes_table(G, title, count_attr='count', top_n=20):
    """Tabulate the ``top_n`` nodes of *G* ranked by ``count_attr``.

    Besides returning the table, writes it to
    ``Top_<top_n>_<title with underscores>.csv`` and to a ``.tex`` file of the
    same base name containing a LaTeX longtable.

    Args:
        G: Graph whose nodes may carry ``count_attr``, ``domain`` and
            ``field`` attributes.
        title: Human-readable name of the graph, used in the file names, the
            LaTeX caption and the LaTeX label.
        count_attr: Node attribute to rank by (treated as 0 when missing).
        top_n: Maximum number of rows to keep.

    Returns:
        A ``pandas.DataFrame`` with columns ``Name``, ``Count``, ``Domain``,
        ``Connections`` (node degree) and ``Normalized Centrality`` (degree
        centrality formatted to four decimals as a string), plus ``Field``
        when at least one node has a non-empty ``field`` attribute.
    """
    # Degree centrality covers the whole graph; compute it once instead of
    # once per node.
    centrality = nx.degree_centrality(G)

    node_data = []
    for node, data in G.nodes(data=True):
        node_info = {
            'Name': node,
            'Count': data.get(count_attr, 0),
            'Domain': data.get('domain', 'Unknown'),
            'Connections': G.degree(node),
            'Normalized Centrality': centrality[node]
        }

        # Include field if it exists (for subfields)
        field = data.get('field', '')
        if field:
            node_info['Field'] = field

        node_data.append(node_info)

    if node_data:
        df = pd.DataFrame(node_data)
    else:
        # An empty graph still needs the columns used below.
        df = pd.DataFrame(columns=['Name', 'Count', 'Domain', 'Connections',
                                   'Normalized Centrality'])
    df = df.sort_values('Count', ascending=False).head(top_n)

    # Format the normalized centrality column
    df['Normalized Centrality'] = df['Normalized Centrality'].apply(lambda x: f"{x:.4f}")

    # Save to CSV
    df.to_csv(f"Top_{top_n}_{title.replace(' ', '_')}.csv", index=False)

    # NOTE(review): escape=False writes cell text verbatim, so names that
    # contain LaTeX special characters (&, _, %) will need manual escaping.
    latex_table = df.to_latex(index=False,
                              caption=f"Top {top_n} {title} by Frequency",
                              label=f"tab:top_{top_n}_{title.replace(' ', '_').lower()}",
                              longtable=True, escape=False)

    # Explicit UTF-8 so non-ASCII names are written the same way everywhere.
    with open(f"Top_{top_n}_{title.replace(' ', '_')}.tex", 'w', encoding='utf-8') as f:
        f.write(latex_table)

    return df

# Generate network statistics for academic publication
def generate_network_statistics(G, title):
    """
    Generate comprehensive network statistics for academic publications.

    Computes size, density, edge-weight, centrality, clustering, component,
    degree and domain (and, when present, field) metrics for ``G`` and writes
    two CSV files to the current working directory:
    ``{title}_statistics.csv`` and ``{title}_top_nodes.csv`` (spaces in
    ``title`` become underscores).

    Args:
        G: networkx graph. Edges may carry a ``weight`` attribute (a missing
            weight counts as 1); nodes may carry ``count``, ``domain`` and
            ``field`` attributes.
        title: Label used to build the output file names.

    Returns:
        A ``(stats_df, top_nodes_df)`` tuple. ``stats_df`` has the columns
        ``Metric`` and ``Value`` with float values rendered as 4-decimal
        strings. ``top_nodes_df`` lists up to five nodes by degree and by
        count, each rendered as ``"name (value)"``.
    """
    stats = {}
    n_nodes = G.number_of_nodes()

    # Basic graph metrics
    stats['Number of Nodes'] = n_nodes
    stats['Number of Edges'] = G.number_of_edges()
    stats['Graph Density'] = nx.density(G)

    # Edge weight statistics (edges without a weight count as 1)
    weights = [w for _, _, w in G.edges(data='weight', default=1)]
    stats['Min Edge Weight'] = min(weights) if weights else 0
    stats['Max Edge Weight'] = max(weights) if weights else 0
    stats['Mean Edge Weight'] = np.mean(weights) if weights else 0
    stats['Median Edge Weight'] = np.median(weights) if weights else 0

    # Centrality measures
    degree_centrality = nx.degree_centrality(G)
    stats['Max Degree Centrality'] = max(degree_centrality.values()) if degree_centrality else 0
    stats['Mean Degree Centrality'] = np.mean(list(degree_centrality.values())) if degree_centrality else 0

    betweenness_centrality = nx.betweenness_centrality(G)
    stats['Max Betweenness Centrality'] = max(betweenness_centrality.values()) if betweenness_centrality else 0
    stats['Mean Betweenness Centrality'] = np.mean(list(betweenness_centrality.values())) if betweenness_centrality else 0

    # Clustering: an average over zero nodes is undefined, so report 0 for an
    # empty graph like the other guarded metrics do.
    stats['Global Clustering Coefficient'] = nx.average_clustering(G) if n_nodes else 0

    # Connected components
    component_sizes = [len(component) for component in nx.connected_components(G)]
    stats['Number of Connected Components'] = len(component_sizes)

    # default=0 keeps max() from raising when there are no components
    largest_cc_size = max(component_sizes, default=0)
    stats['Largest Component Size'] = largest_cc_size
    stats['Largest Component Percentage'] = (largest_cc_size / n_nodes) * 100 if n_nodes else 0

    # Degree distribution statistics
    degrees = [d for _, d in G.degree()]
    stats['Min Degree'] = min(degrees) if degrees else 0
    stats['Max Degree'] = max(degrees) if degrees else 0
    stats['Mean Degree'] = np.mean(degrees) if degrees else 0
    stats['Median Degree'] = np.median(degrees) if degrees else 0

    # Domain diversity
    domain_counts = Counter(data.get('domain', 'Unknown') for _, data in G.nodes(data=True))
    top_domain = domain_counts.most_common(1)
    stats['Number of Domains'] = len(domain_counts)
    stats['Most Common Domain'] = top_domain[0][0] if top_domain else 'None'
    stats['Most Common Domain Count'] = top_domain[0][1] if top_domain else 0

    # If this is the subfield graph, include field diversity. Look at every
    # node rather than only the first one: the result must not depend on node
    # insertion order, and an empty graph must not raise StopIteration.
    if any('field' in data for _, data in G.nodes(data=True)):
        field_counts = Counter(data.get('field', 'Unknown') for _, data in G.nodes(data=True))
        top_field = field_counts.most_common(1)
        stats['Number of Fields'] = len(field_counts)
        stats['Most Common Field'] = top_field[0][0] if top_field else 'None'
        stats['Most Common Field Count'] = top_field[0][1] if top_field else 0

    # Top nodes by degree and count (sorted() is stable, so ties keep node order)
    top_nodes_degree = sorted(G.degree(), key=lambda x: x[1], reverse=True)[:5]

    node_counts = [(node, data.get('count', 0)) for node, data in G.nodes(data=True)]
    top_nodes_count = sorted(node_counts, key=lambda x: x[1], reverse=True)[:5]

    # Convert to DataFrame for nice formatting
    stats_df = pd.DataFrame(list(stats.items()), columns=['Metric', 'Value'])

    # Render floats with four decimals; leave ints and strings untouched
    stats_df['Value'] = stats_df['Value'].map(
        lambda value: f"{value:.4f}" if isinstance(value, float) else value
    )

    file_stem = title.replace(' ', '_')

    # Save statistics to CSV
    stats_df.to_csv(f"{file_stem}_statistics.csv", index=False)

    # Create a separate DataFrame for top nodes
    top_nodes_df = pd.DataFrame({
        'Top Nodes by Degree': [f"{node} ({degree})" for node, degree in top_nodes_degree],
        'Top Nodes by Count': [f"{node} ({count})" for node, count in top_nodes_count]
    })

    top_nodes_df.to_csv(f"{file_stem}_top_nodes.csv", index=False)

    return stats_df, top_nodes_df

# Function to generate LaTeX tables for academic papers
def generate_latex_tables(stats_df, top_nodes_df, title):
    """
    Generate LaTeX tables from the statistics DataFrames
    """
    # Main statistics table
    latex_stats = stats_df.to_latex(index=False, caption=f"Network Statistics for {title}", 
                               label=f"tab:{title.replace(' ', '_').lower()}_stats",
                               longtable=True, escape=False)
    
    # Top nodes table
    latex_top_nodes = top_nodes_df.to_latex(index=False, caption=f"Top Nodes in {title} Network", 
                                       label=f"tab:{title.replace(' ', '_').lower()}_top_nodes",
                                       longtable=True, escape=False)
    
    # Save to files
    with open(f"{title.replace(' ', '_')}_statistics.tex", 'w') as f:
        f.write(latex_stats)
        
    with open(f"{title.replace(' ', '_')}_top_nodes.tex", 'w') as f:
        f.write(latex_top_nodes)
    
    return latex_stats, latex_top_nodes

def _report_network(graph, kind):
    """Draw one network, print its statistics and write its CSV/LaTeX tables.

    ``kind`` is the display label ("Field" or "Subfield"); it is used to
    build the "<kind> Connections" title and the "<kind>s" table name.
    """
    title = f"{kind} Connections"

    draw_network(graph, title)

    stats_df, top_nodes_df = generate_network_statistics(graph, title)
    print(f"\n{kind} Network Statistics:")
    print(stats_df.to_string(index=False))

    print(f"\nTop Nodes in {kind} Network:")
    print(top_nodes_df.to_string(index=False))

    top_df = generate_top_nodes_table(graph, f"{kind}s", top_n=20)
    print(f"\nTop 20 {kind}s by Frequency:")
    print(top_df.to_string(index=False))

    generate_latex_tables(stats_df, top_nodes_df, title)


def main():
    """
    Run the full analysis: build, draw and report the field and subfield
    networks, measure cross-domain connections in the field network, and
    write a comparison summary to ``network_comparison_summary.csv`` and
    ``network_comparison_summary.tex``.

    Reads articles from ``all_articles.jsonl`` in the current working
    directory.
    """
    # Load the articles
    articles = load_data("all_articles.jsonl")
    print(f"Loaded {len(articles)} articles")

    # Field network: build, draw, report
    field_graph = create_field_graph(articles)
    print(f"Field graph created with {len(field_graph.nodes())} nodes and {len(field_graph.edges())} edges")
    _report_network(field_graph, "Field")

    # Subfield network: build, draw, report
    subfield_graph = create_subfield_graph(articles)
    print(f"\nSubfield graph created with {len(subfield_graph.nodes())} nodes and {len(subfield_graph.edges())} edges")
    _report_network(subfield_graph, "Subfield")

    # Analyze interdisciplinary connections
    print("\nAnalyzing interdisciplinary connections...")

    # Get domains for each node
    field_domains = {node: data.get('domain', 'Unknown')
                     for node, data in field_graph.nodes(data=True)}

    # Count cross-domain connections
    cross_domain_edges = 0
    domain_connections = {}

    for u, v in field_graph.edges():
        domain_u = field_domains.get(u, 'Unknown')
        domain_v = field_domains.get(v, 'Unknown')

        if domain_u != domain_v:
            cross_domain_edges += 1

            # Sort the pair so (A, B) and (B, A) are counted together
            domain_pair = tuple(sorted([domain_u, domain_v]))
            domain_connections[domain_pair] = domain_connections.get(domain_pair, 0) + 1

    # Calculate percentage of cross-domain connections; always a float so the
    # summary column is formatted uniformly even when there are no edges.
    n_field_edges = field_graph.number_of_edges()
    cross_domain_percentage = (cross_domain_edges / n_field_edges) * 100 if n_field_edges else 0.0

    print(f"Cross-domain connections: {cross_domain_edges} ({cross_domain_percentage:.2f}% of all connections)")

    # Top domain connections
    top_domain_connections = sorted(domain_connections.items(), key=lambda x: x[1], reverse=True)[:5]
    print("\nTop domain connections:")
    for (domain1, domain2), count in top_domain_connections:
        print(f"  {domain1} <-> {domain2}: {count} connections")

    # Create a summary table
    summary_data = {
        'Network Type': ['Field Network', 'Subfield Network'],
        'Nodes': [len(field_graph.nodes()), len(subfield_graph.nodes())],
        'Edges': [len(field_graph.edges()), len(subfield_graph.edges())],
        'Density': [nx.density(field_graph), nx.density(subfield_graph)],
        'Clustering Coefficient': [nx.average_clustering(field_graph), nx.average_clustering(subfield_graph)],
        'Connected Components': [nx.number_connected_components(field_graph), nx.number_connected_components(subfield_graph)],
        'Cross-Domain Edges %': [cross_domain_percentage, '-']
    }

    summary_df = pd.DataFrame(summary_data)

    # Density and clustering are ratios in [0, 1], not percentages: show them
    # with four decimals and no '%' suffix.
    for col in ['Density', 'Clustering Coefficient']:
        summary_df[col] = summary_df[col].map(lambda x: f"{float(x):.4f}")

    # Only the cross-domain share is an actual percentage; '-' is left as is.
    summary_df['Cross-Domain Edges %'] = summary_df['Cross-Domain Edges %'].map(
        lambda x: f"{x:.2f}%" if isinstance(x, float) else x
    )

    print("\nNetwork Comparison Summary:")
    print(summary_df.to_string(index=False))

    # Save summary to CSV and LaTeX
    summary_df.to_csv("network_comparison_summary.csv", index=False)

    # escape=True: the header and cells contain '%', which would otherwise
    # start a LaTeX comment and swallow the rest of the table row.
    latex_summary = summary_df.to_latex(index=False, caption="Comparison of Field and Subfield Networks",
                                        label="tab:network_comparison", escape=True)

    with open("network_comparison_summary.tex", 'w') as f:
        f.write(latex_summary)

    print("\nAll statistics have been saved to CSV and LaTeX files for use in academic publications.")

# Run the full analysis only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Loaded 244026 articles
Field graph created with 26 nodes and 317 edges

Field Network Statistics:
                        Metric             Value
               Number of Nodes                26
               Number of Edges               317
                 Graph Density            0.9754
               Min Edge Weight                 1
               Max Edge Weight             23850
              Mean Edge Weight          744.4006
            Median Edge Weight          142.0000
         Max Degree Centrality            1.0000
        Mean Degree Centrality            0.9754
    Max Betweenness Centrality            0.0012
   Mean Betweenness Centrality            0.0010
 Global Clustering Coefficient            0.9769
Number of Connected Components                 1
        Largest Component Size                26
  Largest Component Percentage          100.0000
                    Min Degree                21
                    Max Degree                25
                   M