In [23]:
import json
import pandas as pd
import numpy as np
from collections import defaultdict
from pathlib import Path
import glob
import os

class CrossFieldAnalyzer:
    """
    Analyze how topics from different fields co-occur on the same papers.

    The pipeline is: ``load_all_data`` -> ``analyze_cooperations`` ->
    ``calculate_metrics`` -> ``identify_clusters`` -> ``generate_report``.
    Each paper record is expected to be a dict with a ``"topics"`` list whose
    entries carry ``"display_name"``, ``"score"`` and a nested
    ``"field"["display_name"]``.
    """

    def __init__(self, data_folder="openalex_topics_results"):
        """
        Remember the folder that holds the ``highly_cited_articles_*.json`` files.
        """
        self.data_folder = Path(data_folder)

    def load_all_data(self):
        """
        Load and combine all available JSON files

        Returns:
            list: the concatenation of the lists stored in every matching file.

        Raises:
            FileNotFoundError: if no ``highly_cited_articles_*.json`` file exists
                in ``self.data_folder``.
        """
        # Sorted so that the order of the combined papers is reproducible;
        # raw glob order depends on the file system.
        files = sorted(self.data_folder.glob("highly_cited_articles_*.json"))

        if not files:
            raise FileNotFoundError("No article files found")

        all_papers = []
        for file_path in files:
            print(f"Loading data from {file_path.name}")
            with open(file_path, 'r', encoding='utf-8') as f:
                all_papers.extend(json.load(f))

        print(f"Loaded total of {len(all_papers)} papers from {len(files)} files")
        return all_papers

    def analyze_cooperations(self, papers, min_score=0.3):
        """
        Analyze cross-field cooperation patterns between topics

        Args:
            papers: iterable of paper dicts (see class docstring).
            min_score: topics with a score below this value are ignored.

        Returns:
            dict with the keys ``topic_cooperations`` and ``field_cooperations``
            (nested mappings name -> name -> summed score product),
            ``paper_counts`` (topic -> topic -> number of co-occurrences),
            ``topic_counts`` / ``field_counts`` (occurrences of each relevant
            topic / of each relevant topic's field), ``field_to_topics`` and
            ``topic_to_field``. Pair keys are stored in sorted order.
        """
        topic_cooperations = defaultdict(lambda: defaultdict(float))
        field_cooperations = defaultdict(lambda: defaultdict(float))
        topic_counts = defaultdict(int)
        field_counts = defaultdict(int)
        field_to_topics = defaultdict(set)
        topic_to_field = {}
        paper_counts = defaultdict(lambda: defaultdict(int))

        for paper in papers:
            # "topics" may be absent or explicitly null; both mean "no topics".
            # The score is read once so filtering and weighting always agree,
            # and a missing/null score counts as 0.
            relevant = []
            for topic in paper.get("topics") or []:
                score = topic.get("score") or 0
                if score >= min_score:
                    relevant.append((topic, score))

            # Update individual counts and mappings
            for topic, _ in relevant:
                topic_name = topic["display_name"]
                field_name = topic["field"]["display_name"]
                topic_counts[topic_name] += 1
                field_counts[field_name] += 1
                field_to_topics[field_name].add(topic_name)
                topic_to_field[topic_name] = field_name

            # Analyze cross-field topic cooperations (each unordered pair once)
            for i, (topic1, score1) in enumerate(relevant):
                for topic2, score2 in relevant[i + 1:]:
                    field1 = topic1["field"]["display_name"]
                    field2 = topic2["field"]["display_name"]

                    # Skip if topics are from the same field
                    if field1 == field2:
                        continue

                    # Sort names so (a, b) and (b, a) land in the same cell.
                    # Topic names and field names are sorted independently;
                    # the topic's field is recovered later via topic_to_field.
                    t1_name, t2_name = sorted(
                        [topic1["display_name"], topic2["display_name"]]
                    )
                    field1, field2 = sorted([field1, field2])

                    # Weight by topic scores
                    weight = score1 * score2

                    # Update cooperations
                    topic_cooperations[t1_name][t2_name] += weight
                    field_cooperations[field1][field2] += weight
                    paper_counts[t1_name][t2_name] += 1

        return {
            'topic_cooperations': topic_cooperations,
            'field_cooperations': field_cooperations,
            'topic_counts': topic_counts,
            'field_counts': field_counts,
            'field_to_topics': field_to_topics,
            'topic_to_field': topic_to_field,
            'paper_counts': paper_counts
        }

    def calculate_metrics(self, analysis_results):
        """
        Calculate cooperation metrics

        Args:
            analysis_results: the dict returned by ``analyze_cooperations``.

        Returns:
            tuple ``(topic_metrics, field_metrics)``: two lists of dicts, each
            sorted by ``normalized_weight`` in descending order. The normalized
            weight is the summed weight divided by the smaller of the two
            occurrence counts (0 when that count is 0).
        """
        topic_metrics = []
        field_metrics = []

        # Calculate topic-level metrics
        topic_coops = analysis_results['topic_cooperations']
        topic_counts = analysis_results['topic_counts']
        topic_to_field = analysis_results['topic_to_field']
        paper_counts = analysis_results['paper_counts']

        for topic1, partners in topic_coops.items():
            field1 = topic_to_field[topic1]
            for topic2, weight in partners.items():
                field2 = topic_to_field[topic2]
                if field1 == field2:
                    continue

                total_possible = min(topic_counts[topic1], topic_counts[topic2])
                topic_metrics.append({
                    'topic1': topic1,
                    'topic2': topic2,
                    'field1': field1,
                    'field2': field2,
                    'weight': weight,
                    'paper_count': paper_counts[topic1][topic2],
                    'normalized_weight': weight / total_possible if total_possible > 0 else 0,
                    'topic1_count': topic_counts[topic1],
                    'topic2_count': topic_counts[topic2]
                })

        # Calculate field-level metrics
        field_coops = analysis_results['field_cooperations']
        field_counts = analysis_results['field_counts']

        for field1, partners in field_coops.items():
            for field2, weight in partners.items():
                if field1 == field2:
                    continue

                total_possible = min(field_counts[field1], field_counts[field2])
                field_metrics.append({
                    'field1': field1,
                    'field2': field2,
                    'weight': weight,
                    'normalized_weight': weight / total_possible if total_possible > 0 else 0,
                    'field1_count': field_counts[field1],
                    'field2_count': field_counts[field2]
                })

        # Sort by normalized weight
        topic_metrics.sort(key=lambda x: x['normalized_weight'], reverse=True)
        field_metrics.sort(key=lambda x: x['normalized_weight'], reverse=True)

        return topic_metrics, field_metrics

    def identify_clusters(self, metrics, threshold=0.3, min_cluster_size=3):
        """
        Identify clusters of closely cooperating topics or fields

        Two items are linked when a metric with ``normalized_weight >= threshold``
        connects them. Clusters are grown by depth-first traversal over those
        links. For topic metrics a cluster holds at most one topic per field;
        a topic that cannot join a cluster for that reason stays available and
        may seed or join a later cluster.

        Args:
            metrics: topic metrics or field metrics from ``calculate_metrics``.
            threshold: minimum normalized weight for a link.
            min_cluster_size: clusters with fewer members are discarded.

        Returns:
            list of clusters (lists of item names), largest first.
        """
        # The kind of the first metric decides how the traversal treats items.
        topic_mode = bool(metrics) and 'topic1' in metrics[0]

        items = set()
        item_to_field = {}

        # Build item sets and field mappings
        for m in metrics:
            if 'topic1' in m:
                items.add(m['topic1'])
                items.add(m['topic2'])
                item_to_field[m['topic1']] = m['field1']
                item_to_field[m['topic2']] = m['field2']
            elif m['field1'] != m['field2']:
                items.add(m['field1'])
                items.add(m['field2'])

        item_list = sorted(items)
        item_idx = {item: i for i, item in enumerate(item_list)}

        # Sparse adjacency: only pairs that actually appear in `metrics` with a
        # sufficient weight are linked (a dense zero-filled matrix would link
        # every pair whenever threshold <= 0 and needs O(n^2) memory).
        neighbors = defaultdict(set)
        for m in metrics:
            if not m['normalized_weight'] >= threshold:
                continue
            if 'topic1' in m:
                first, second = m['topic1'], m['topic2']
                # For topics, check they're from different fields
                if item_to_field[first] == item_to_field[second]:
                    continue
            else:
                first, second = m['field1'], m['field2']
                # For fields, ensure they're different
                if first == second:
                    continue
            i, j = item_idx[first], item_idx[second]
            neighbors[i].add(j)
            neighbors[j].add(i)

        # Find clusters
        clusters = []
        visited = set()

        for start in range(len(item_list)):
            if start in visited:
                continue

            cluster = []
            cluster_fields = set()  # Track fields in cluster
            stack = [start]

            while stack:
                node = stack.pop()
                if node in visited:
                    continue
                item = item_list[node]

                if topic_mode:
                    field = item_to_field[item]
                    if field in cluster_fields:
                        # Another topic of this field joined the cluster after
                        # this one was queued. Do NOT mark it visited: it must
                        # remain available for a later cluster instead of
                        # silently disappearing from the result.
                        continue
                    cluster_fields.add(field)

                visited.add(node)
                cluster.append(item)

                # Add neighbors in ascending index order so that results are
                # deterministic.
                for j in sorted(neighbors.get(node, ())):
                    if j in visited:
                        continue
                    # Check if adding would maintain field diversity
                    if topic_mode and item_to_field[item_list[j]] in cluster_fields:
                        continue
                    stack.append(j)

            if len(cluster) >= min_cluster_size:
                clusters.append(cluster)

        # Sort clusters by size
        clusters.sort(key=len, reverse=True)
        return clusters

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of *values* as a float, or 0.0 if empty."""
        # np.mean([]) would return nan and emit a RuntimeWarning.
        return float(np.mean(values)) if values else 0.0

    def generate_report(self, topic_metrics, field_metrics, topic_clusters, field_clusters):
        """
        Generate a comprehensive report of the cooperation analysis

        Args:
            topic_metrics, field_metrics: sorted lists from ``calculate_metrics``.
            topic_clusters, field_clusters: results of ``identify_clusters``.

        Returns:
            dict with ``topic_analysis`` and ``field_analysis`` sections, each
            holding the top 20 cooperations, the clusters and summary
            statistics. Averages are 0.0 when the metric list is empty.
        """
        report = {
            'topic_analysis': {
                'top_cooperations': topic_metrics[:20],
                'clusters': topic_clusters,
                'summary_stats': {
                    'total_pairs': len(topic_metrics),
                    'avg_weight': self._mean([m['normalized_weight'] for m in topic_metrics]),
                    'avg_papers': self._mean([m['paper_count'] for m in topic_metrics])
                }
            },
            'field_analysis': {
                'top_cooperations': field_metrics[:20],
                'clusters': field_clusters,
                'summary_stats': {
                    'total_pairs': len(field_metrics),
                    'avg_weight': self._mean([m['normalized_weight'] for m in field_metrics])
                }
            }
        }
        return report

def print_report(report):
    """
    Write a human-readable summary of a cooperation report to stdout.

    The report is expected to have the layout produced by
    ``CrossFieldAnalyzer.generate_report``: a ``topic_analysis`` and a
    ``field_analysis`` section, each with ``top_cooperations``, ``clusters``
    and ``summary_stats``. At most the first ten cooperations of each
    section are shown.
    """
    def _show_clusters(groups):
        # One numbered heading per cluster, then its members comma-separated.
        for number, members in enumerate(groups, start=1):
            print(f"\nCluster {number}:")
            print("  " + ", ".join(members))

    print("\n=== Cross-Field Cooperation Analysis Report ===\n")

    print("Top Topic Cooperations:")
    topic_pairs = report['topic_analysis']['top_cooperations'][:10]
    for rank, pair in enumerate(topic_pairs, start=1):
        print(f"\n{rank}. {pair['topic1']} ({pair['field1']}) × {pair['topic2']} ({pair['field2']})")
        print(f"   Papers: {pair['paper_count']}")
        print(f"   Normalized Weight: {pair['normalized_weight']:.3f}")

    print("\nTopic Clusters:")
    _show_clusters(report['topic_analysis']['clusters'])

    print("\nField Cooperations:")
    field_pairs = report['field_analysis']['top_cooperations'][:10]
    for rank, pair in enumerate(field_pairs, start=1):
        print(f"\n{rank}. {pair['field1']} × {pair['field2']}")
        print(f"   Normalized Weight: {pair['normalized_weight']:.3f}")

    print("\nField Clusters:")
    _show_clusters(report['field_analysis']['clusters'])

    print("\nSummary Statistics:")
    topic_summary = report['topic_analysis']['summary_stats']
    field_summary = report['field_analysis']['summary_stats']
    print(f"Total Topic Pairs: {topic_summary['total_pairs']}")
    print(f"Average Topic Normalized Weight: {topic_summary['avg_weight']:.3f}")
    print(f"Average Papers per Topic Pair: {topic_summary['avg_papers']:.1f}")
    print(f"Total Field Pairs: {field_summary['total_pairs']}")
    print(f"Average Field Normalized Weight: {field_summary['avg_weight']:.3f}")

# Usage example: run the full pipeline (load -> analyze -> metrics ->
# clusters -> report) and then try to save a network visualization.
if __name__ == "__main__":
    # Uses the default data folder "openalex_topics_results"
    analyzer = CrossFieldAnalyzer()
    
    # Load and analyze all data
    papers = analyzer.load_all_data()
    analysis_results = analyzer.analyze_cooperations(papers)
    topic_metrics, field_metrics = analyzer.calculate_metrics(analysis_results)
    
    # Identify clusters (default threshold 0.3, minimum cluster size 3)
    topic_clusters = analyzer.identify_clusters(topic_metrics)
    field_clusters = analyzer.identify_clusters(field_metrics)
    
    # Generate and print report
    report = analyzer.generate_report(topic_metrics, field_metrics, 
                                    topic_clusters, field_clusters)
    print_report(report)
    
    # Create visualization
    # NOTE(review): no function named `visualize_cooperations` is defined in
    # the visible cells, so this line raises NameError unless it is defined
    # elsewhere. A later cell calls `visualize_network(report)` instead --
    # confirm which name is intended.
    svg_content = visualize_cooperations(report)
    
    # Save visualization as SVG artifact
    with open('topic_network.svg', 'w', encoding='utf-8') as f:
        f.write(svg_content)

Loading data from highly_cited_articles_2024_20250221_222754.json
Loading data from highly_cited_articles_2024_20250221_222818.json
Loading data from highly_cited_articles_2024_20250221_222842.json
Loading data from highly_cited_articles_2024_20250221_222907.json
Loading data from highly_cited_articles_2024_20250221_222933.json
Loading data from highly_cited_articles_2024_20250221_223000.json
Loading data from highly_cited_articles_2024_20250221_223026.json
Loading data from highly_cited_articles_2024_20250221_223052.json
Loading data from highly_cited_articles_2024_20250221_223119.json
Loading data from highly_cited_articles_2024_20250221_223149.json
Loading data from highly_cited_articles_2024_20250221_223218.json
Loading data from highly_cited_articles_2024_20250221_223247.json
Loading data from highly_cited_articles_2024_20250221_223315.json
Loading data from highly_cited_articles_2024_20250221_223343.json
Loading data from highly_cited_articles_2024_20250221_223412.json
Loading da

In [59]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from io import BytesIO

def create_network_visualization(topic_metrics, min_edge_weight=0, top_n=25):
    """
    Create a network visualization of topic cooperations

    Nodes are topics coloured by their field; an edge links two cooperating
    topics and its width is proportional to the pair's paper count.

    Args:
        topic_metrics: list of topic metric dicts with the keys ``topic1``,
            ``topic2``, ``field1``, ``field2``, ``normalized_weight`` and
            ``paper_count``; assumed to be ordered best-first.
        min_edge_weight: only pairs whose ``normalized_weight`` is strictly
            greater than this value are drawn.
        top_n: how many leading entries of ``topic_metrics`` to consider.

    Returns:
        str: the figure rendered as SVG markup.

    Raises:
        ValueError: if no cooperation passes the filter, so there is nothing
            to draw.
    """
    # Collect the edges to draw and the field of every node that appears on
    # one. Fields are taken only from drawn edges so the legend never lists
    # a field without a node.
    edge_weights = {}
    node_fields = {}
    for metric in topic_metrics[:top_n]:
        if metric['normalized_weight'] > min_edge_weight:
            edge_weights[(metric['topic1'], metric['topic2'])] = metric['paper_count']
            node_fields[metric['topic1']] = metric['field1']
            node_fields[metric['topic2']] = metric['field2']

    # Fail early with a clear message, before any figure is allocated.
    if not edge_weights:
        raise ValueError("No topic cooperations to visualize")

    # Initialize the graph
    G = nx.Graph()
    for (topic1, topic2), paper_count in edge_weights.items():
        G.add_edge(topic1, topic2, weight=paper_count)

    # Create color mapping for fields
    unique_fields = sorted(set(node_fields.values()))
    colors = plt.cm.tab20(np.linspace(0, 1, len(unique_fields)))
    field_to_color = dict(zip(unique_fields, colors))

    # Create plot with extra space for legend
    fig = plt.figure(figsize=(20, 23))
    try:
        plt.subplots_adjust(top=0.75, bottom=0.2)

        # Compute layout with extra spacing
        pos = nx.spring_layout(G, k=0.6, iterations=50)

        # Draw edges first; widths are scaled relative to the heaviest edge.
        # `or 1` guards against division by zero if every paper count is 0.
        edge_list = list(edge_weights)
        max_edge_weight = max(edge_weights.values()) or 1
        widths = [2 * edge_weights[edge] / max_edge_weight for edge in edge_list]
        nx.draw_networkx_edges(G, pos,
                               edgelist=edge_list,
                               width=widths,
                               alpha=0.4,
                               edge_color='gray')

        # Draw nodes
        node_list = list(G.nodes())
        nx.draw_networkx_nodes(G, pos,
                               nodelist=node_list,
                               node_color=[field_to_color[node_fields[node]]
                                           for node in node_list],
                               node_size=1000,
                               alpha=0.7)

        # Offset labels above the nodes and truncate them to 40 characters
        label_pos = {node: (x, y + 0.05) for node, (x, y) in pos.items()}
        labels = {node: (node[:40] + '...') if len(node) > 40 else node
                  for node in node_list}
        nx.draw_networkx_labels(G, label_pos, labels, font_size=8, font_color='black')

        # Create legend elements
        legend_elements = [plt.Line2D([0], [0],
                                      marker='o',
                                      color='w',
                                      label=field,
                                      markerfacecolor=field_to_color[field],
                                      markersize=10,
                                      markeredgecolor='black')
                           for field in unique_fields]

        # Calculate number of columns for legend
        ncol = min(5, len(unique_fields))  # Maximum 5 columns

        # Add legend at the bottom to ensure it's not on the right
        plt.figlegend(handles=legend_elements,
                      title='Academic Fields',
                      title_fontsize=12,
                      fontsize=10,
                      loc='lower center',
                      ncol=ncol,
                      borderaxespad=0,
                      mode="expand")

        # Set title; report the number of cooperations actually drawn
        plt.title(f'Top {len(edge_weights)} Cross-Field Topic Cooperations\n' +
                  '(Edge width represents number of papers, Node colors represent academic fields)',
                  pad=50, fontsize=14)

        # Remove axes
        plt.axis('off')

        # Convert to SVG
        img_stream = BytesIO()
        plt.savefig(img_stream, format='svg', bbox_inches='tight', dpi=300)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)

    return img_stream.getvalue().decode()

def visualize_network(report):
    """
    Render the report's top topic cooperations as a network graph.

    Passes ``report['topic_analysis']['top_cooperations']`` to
    ``create_network_visualization`` and returns whatever that call returns.
    """
    return create_network_visualization(report['topic_analysis']['top_cooperations'])


In [60]:
    # Create visualization from the `report` built in an earlier cell;
    # visualize_network returns the figure as SVG markup
    svg_content = visualize_network(report)
    
    # Save visualization as SVG artifact (written to the current working
    # directory; an existing topic_network.svg is overwritten)
    with open('topic_network.svg', 'w', encoding='utf-8') as f:
        f.write(svg_content)