In [None]:
# Imports
import sys
from pathlib import Path
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from collections import Counter

# Add src to path
sys.path.insert(0, str(Path.cwd().parent))

from src.data_collection.paper_collector import ArXivCollector, Paper
from src.analysis.paper_analyzer import PaperAnalyzer, PaperAnalysis
from src.analysis.knowledge_extractor import KnowledgeExtractor, ResearchGap, Hypothesis

# Configure plotting
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

print("âœ… Imports successful!")

## 1. Load Data

Load papers, analyses, and knowledge extraction results from a pipeline run.

In [None]:
# Set your data directory and run name
DATA_DIR = Path("../data/results")
RUN_NAME = "graphene_thermal_conduc_20241231_120000"  # Fallback; change to your run name

SUMMARY_SUFFIX = "_SUMMARY"


def list_available_runs(data_dir):
    """Return the run names found in ``data_dir``, oldest first.

    A run is recognised by its ``<run>_SUMMARY.json`` file. Runs are ordered by
    the summary file's modification time (file name breaks ties), so the last
    element is the most recently written run. ``glob`` alone yields files in
    arbitrary order, which is why an explicit sort is needed.

    Args:
        data_dir: Directory (str or Path) containing the pipeline results.

    Returns:
        List of run-name strings with the trailing ``_SUMMARY`` removed.
    """
    summary_files = sorted(
        Path(data_dir).glob(f"*{SUMMARY_SUFFIX}.json"),
        key=lambda p: (p.stat().st_mtime, p.name),
    )
    # Strip only the trailing suffix so a run name that happens to contain
    # "_SUMMARY" elsewhere is left intact.
    return [p.stem[:-len(SUMMARY_SUFFIX)] for p in summary_files]


# Or list available runs
if DATA_DIR.exists():
    available_runs = list_available_runs(DATA_DIR)
    if available_runs:
        print("Available runs:")
        for run in available_runs:
            print(f"  - {run}")
        RUN_NAME = available_runs[-1]  # Use most recent (by summary-file mtime)
        print(f"\nUsing: {RUN_NAME}")
    else:
        print("No runs found. Run collect_and_analyze.py first!")
else:
    print(f"Data directory not found: {DATA_DIR}")

In [None]:
# Load papers saved by the pipeline run selected above
collector = ArXivCollector()
papers = collector.load_papers(f"{RUN_NAME}_papers")

print(f"📚 Loaded {len(papers)} papers")
# Guard the preview: indexing papers[0] on an empty run would raise IndexError
# and hide the real problem (wrong RUN_NAME or missing data).
if papers:
    print(f"   First paper: {papers[0].title[:80]}...")
else:
    print("   No papers loaded - check RUN_NAME and the data directory.")

In [None]:
# Load the per-paper analyses saved by the pipeline run
analyzer = PaperAnalyzer()
analyses = analyzer.load_analyses(f"{RUN_NAME}_analyses")

print(f"🤖 Loaded {len(analyses)} analyses")
# np.mean of an empty list yields nan plus a RuntimeWarning; report that case explicitly.
if analyses:
    avg_relevance = np.mean([a.relevance_score for a in analyses])
    print(f"   Avg relevance: {avg_relevance:.2f}/10")
else:
    print("   Avg relevance: n/a (no analyses loaded)")

In [None]:
# Load knowledge graph
extractor = KnowledgeExtractor()
graph = extractor.load_knowledge_graph(f"{RUN_NAME}_knowledge_graph")

# Statistics are requested after loading; presumably they describe the graph
# just loaded into `extractor` - confirm against KnowledgeExtractor.
stats = extractor.get_graph_statistics()
print("🧠 Knowledge Graph:")
print(f"   {stats['total_nodes']} nodes, {stats['total_edges']} edges")
print(f"   {stats['num_materials']} materials, {stats['num_properties']} properties")

In [None]:
# Load gaps and hypotheses
# Each JSON file holds a list of dicts whose keys are passed as keyword
# arguments to the corresponding class. Files are closed before the objects
# are built, so a constructor error cannot be confused with an I/O error.
with open(DATA_DIR / f"{RUN_NAME}_gaps.json", 'r', encoding='utf-8') as f:
    gaps_data = json.load(f)
gaps = [ResearchGap(**g) for g in gaps_data]

with open(DATA_DIR / f"{RUN_NAME}_hypotheses.json", 'r', encoding='utf-8') as f:
    hyp_data = json.load(f)
hypotheses = [Hypothesis(**h) for h in hyp_data]

print(f"🎯 Loaded {len(gaps)} gaps and {len(hypotheses)} hypotheses")

## 2. Explore Papers

Analyze the collected papers: publication dates, authors, categories.

In [None]:
# Tabular view of the collected papers, built by the collector
papers_df = collector.get_papers_dataframe(papers)

print("Papers DataFrame shape:", papers_df.shape)
papers_df.head(5)

In [None]:
# Publication timeline: one bar per calendar day, in chronological order
fig, ax = plt.subplots(figsize=(12, 5))

papers_per_day = (
    papers_df['published_date']
    .dt.date
    .value_counts()
    .sort_index()
)
papers_per_day.plot.bar(ax=ax)

ax.set_title('Papers by Publication Date', fontsize=14, fontweight='bold')
ax.set_ylabel('Number of Papers')
ax.set_xlabel('Date')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Top categories: flatten the per-paper category lists, then count
all_categories = [cat for cats in papers_df['categories'] for cat in cats]

cat_counts = Counter(all_categories)
top_cats = dict(cat_counts.most_common(10))

fig, ax = plt.subplots(figsize=(10, 6))
category_names = list(top_cats)
category_totals = list(top_cats.values())
ax.barh(category_names, category_totals)
ax.set_xlabel('Number of Papers')
ax.set_title('Top 10 arXiv Categories', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print(f"Total unique categories: {len(cat_counts)}")

In [None]:
# Top authors: flatten the per-paper author lists, then count appearances
all_authors = [name for authors in papers_df['authors'] for name in authors]

author_counts = Counter(all_authors)
top_authors = dict(author_counts.most_common(15))

print("Top 15 Most Prolific Authors:")
rank = 0
for author, count in top_authors.items():
    rank += 1
    print(f"{rank:2d}. {author:40s} ({count} papers)")

## 3. Analyze Paper Analyses

Explore the AI-generated analyses: relevance scores, research types, extracted entities.

In [None]:
# Tabular view of the AI-generated analyses, built by the analyzer
analyses_df = analyzer.get_analysis_dataframe(analyses)

print("Analyses DataFrame shape:", analyses_df.shape)
analyses_df.head(5)

In [None]:
# Relevance score distribution: histogram (left) and box plot (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

relevance = analyses_df['relevance_score']
mean_relevance = relevance.mean()

# Left panel: histogram with the mean marked by a dashed line
hist_ax.hist(relevance, bins=20, edgecolor='black', alpha=0.7)
hist_ax.axvline(
    mean_relevance,
    color='red',
    linestyle='--',
    label=f'Mean: {mean_relevance:.2f}',
)
hist_ax.set_title('Relevance Score Distribution', fontsize=12, fontweight='bold')
hist_ax.set_xlabel('Relevance Score (0-10)')
hist_ax.set_ylabel('Frequency')
hist_ax.legend()

# Right panel: box plot of the same column
analyses_df.boxplot(column='relevance_score', ax=box_ax)
box_ax.set_title('Relevance Score Box Plot', fontsize=12, fontweight='bold')
box_ax.set_ylabel('Relevance Score')

plt.tight_layout()
plt.show()

print("Relevance Statistics:")
print(relevance.describe())

In [None]:
# Share of papers per research type, drawn as a pie chart
type_counts = analyses_df['research_type'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
type_counts.plot.pie(ax=ax, startangle=90, autopct='%1.1f%%')
ax.set_ylabel('')  # drop the column-name label pandas adds to pie charts
ax.set_title('Research Type Distribution', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Number of papers per maturity level, drawn as horizontal bars
maturity_counts = analyses_df['maturity_level'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
maturity_counts.plot.barh(ax=ax)
ax.set_xlabel('Number of Papers')
ax.set_title('Research Maturity Level', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Top papers by relevance
print("\n🌟 Top 10 Most Relevant Papers:\n")
top_papers = analyses_df.nlargest(10, 'relevance_score')

# iterrows() yields the DataFrame's index label, which after nlargest() is the
# paper's original row label, not its position in the ranking. Number the
# entries with enumerate so the list always reads 1..10.
for rank, (_, row) in enumerate(top_papers.iterrows(), start=1):
    print(f"{rank}. [{row['relevance_score']:.1f}/10] {row['title'][:80]}...")
    print(f"   Type: {row['research_type']}, Materials: {row['materials'][:50]}...")
    print()

In [None]:
# Entity extraction statistics: one histogram per entity kind
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (column, entity label, extra hist kwargs); the first panel keeps the default colour
panel_specs = (
    ('num_materials', 'Materials', {}),
    ('num_properties', 'Properties', {'color': 'orange'}),
    ('num_methods', 'Methods', {'color': 'green'}),
)

for panel_ax, (column, entity, extra_kwargs) in zip(axes, panel_specs):
    panel_ax.hist(analyses_df[column], bins=15, edgecolor='black', alpha=0.7, **extra_kwargs)
    panel_ax.set_title(f'{entity} per Paper')
    panel_ax.set_xlabel(f'Number of {entity}')

# Only the left-most panel carries the y-axis label
axes[0].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 4. Visualize Knowledge Graph

Explore the knowledge graph structure and relationships.

In [None]:
# Graph statistics: overall size and density
print("Knowledge Graph Statistics:")
print(f"  Nodes: {graph.number_of_nodes()}")
print(f"  Edges: {graph.number_of_edges()}")
print(f"  Density: {nx.density(graph):.4f}")

# Node type breakdown: nodes without a 'type' attribute are counted as 'unknown'
node_types = Counter(
    attrs.get('type', 'unknown') for _, attrs in graph.nodes(data=True)
)

print("\nNode Types:")
for ntype, count in node_types.most_common():
    print(f"  {ntype}: {count}")

In [None]:
# Most connected nodes: rank every node by degree and keep the first 20
degree_dict = dict(graph.degree())
by_degree_desc = sorted(degree_dict.items(), key=lambda item: item[1], reverse=True)
top_nodes = by_degree_desc[:20]

print("Top 20 Most Connected Nodes:\n")
for rank, (node, degree) in enumerate(top_nodes, start=1):
    attrs = graph.nodes[node]
    node_type = attrs.get('type', 'unknown')
    freq = attrs.get('frequency', 0)
    print(f"{rank:2d}. {node:30s} | {node_type:10s} | {degree:3d} connections | {freq:2d} papers")

In [None]:
# Visualize subgraph of top materials and their properties
from matplotlib.patches import Patch

LAYOUT_SEED = 42  # fixed seed so the layout (and the figure) is reproducible across runs

# Fill colour per node type; any other type is drawn gray
TYPE_COLORS = {
    'material': 'lightblue',
    'property': 'lightcoral',
    'method': 'lightgreen',
}
OTHER_COLOR = 'gray'

materials = [n for n, d in graph.nodes(data=True) if d.get('type') == 'material']
# Use .get(..., 0) so a material node without a 'frequency' attribute sorts
# last instead of raising KeyError.
top_materials = sorted(
    materials,
    key=lambda m: graph.nodes[m].get('frequency', 0),
    reverse=True,
)[:5]

# Get connected nodes
subgraph_nodes = set(top_materials)
for mat in top_materials:
    neighbors = list(graph.neighbors(mat))
    # First 10 neighbours in the graph's adjacency order (not ranked) to keep the plot readable
    subgraph_nodes.update(neighbors[:10])

subgraph = graph.subgraph(subgraph_nodes)

# Draw graph
fig, ax = plt.subplots(figsize=(16, 12))

pos = nx.spring_layout(subgraph, k=2, iterations=50, seed=LAYOUT_SEED)

# Color nodes by type
node_colors = [
    TYPE_COLORS.get(subgraph.nodes[node].get('type', 'unknown'), OTHER_COLOR)
    for node in subgraph.nodes()
]

# Draw
nx.draw_networkx_nodes(subgraph, pos, node_color=node_colors, node_size=500, alpha=0.8, ax=ax)
nx.draw_networkx_edges(subgraph, pos, alpha=0.2, ax=ax)
nx.draw_networkx_labels(subgraph, pos, font_size=8, ax=ax)

ax.set_title(f'Knowledge Graph: Top {len(top_materials)} Materials and Connected Entities',
             fontsize=14, fontweight='bold')
ax.axis('off')

# Legend
legend_elements = [
    Patch(facecolor='lightblue', label='Materials'),
    Patch(facecolor='lightcoral', label='Properties'),
    Patch(facecolor='lightgreen', label='Methods'),
]
# Explain gray nodes only when some are actually drawn
if OTHER_COLOR in node_colors:
    legend_elements.append(Patch(facecolor=OTHER_COLOR, label='Other'))
ax.legend(handles=legend_elements, loc='upper left')

plt.tight_layout()
plt.show()

## 5. Research Patterns

Analyze frequent patterns and co-occurrences.

In [None]:
# Find patterns, then list the ten most frequent entries of each entity kind
patterns = extractor.find_frequent_patterns(min_frequency=2)

# (heading to print, key into `patterns`)
ranking_sections = (
    ("Top Materials:", 'top_materials'),
    ("\nTop Properties:", 'top_properties'),
    ("\nTop Methods:", 'top_methods'),
)

for heading, pattern_key in ranking_sections:
    print(heading)
    for rank, (entity, freq) in enumerate(patterns[pattern_key][:10], start=1):
        print(f"{rank:2d}. {entity:40s} ({freq} mentions)")

In [None]:
# Material-Property co-occurrence: the 15 most frequent pairs
print("\nTop Material-Property Pairs:")
leading_pairs = patterns['material_property_pairs'][:15]
for rank, (pair, freq) in enumerate(leading_pairs, start=1):
    print(f"{rank:2d}. {pair:60s} ({freq} co-occurrences)")

In [None]:
# Visualize top materials (skipped entirely when no materials were found)
leading_materials = patterns['top_materials'][:10]
if leading_materials:
    mats, freqs = zip(*leading_materials)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(mats, freqs)
    ax.set_xlabel('Frequency')
    ax.set_title('Top 10 Materials by Frequency', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

## 6. Research Gaps

Explore identified research gaps and their priorities.

In [None]:
print(f"Total Research Gaps Identified: {len(gaps)}\n")

# Sort by priority and confidence (both descending); unrecognised priorities rank lowest
priority_order = {'high': 3, 'medium': 2, 'low': 1}


def _gap_sort_key(gap):
    """Return (priority rank, confidence) for ordering research gaps."""
    return (priority_order.get(gap.priority, 0), gap.confidence)


sorted_gaps = sorted(gaps, key=_gap_sort_key, reverse=True)

print("Top Research Gaps:\n")
for position, gap in enumerate(sorted_gaps[:10], start=1):
    header = f"{position}. [{gap.priority.upper()} priority, {gap.confidence:.0%} confidence]"
    print(header)
    print(f"   {gap.description}")
    # Show at most five related materials / properties, when present
    if gap.related_materials:
        shown_materials = ', '.join(gap.related_materials[:5])
        print(f"   Materials: {shown_materials}")
    if gap.related_properties:
        shown_properties = ', '.join(gap.related_properties[:5])
        print(f"   Properties: {shown_properties}")
    print()

In [None]:
# Gap statistics
gap_priorities = Counter(g.priority for g in gaps)
gap_confidences = [g.confidence for g in gaps]

# Counter keeps first-seen order, so plotting it directly makes the bar order
# depend on the data. Use a fixed high -> medium -> low order, followed by any
# unexpected priority values, so charts are comparable between runs.
known_priorities = ['high', 'medium', 'low']
priority_labels = [p for p in known_priorities if p in gap_priorities]
priority_labels += [p for p in gap_priorities if p not in known_priorities]
priority_totals = [gap_priorities[p] for p in priority_labels]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Priority distribution (plain lists rather than dict views, labels as strings)
axes[0].bar([str(p) for p in priority_labels], priority_totals)
axes[0].set_title('Research Gaps by Priority', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Count')

# Confidence distribution
axes[1].hist(gap_confidences, bins=10, edgecolor='black', alpha=0.7)
axes[1].set_title('Gap Confidence Distribution', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Confidence')
axes[1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 7. Research Hypotheses

Examine AI-generated research hypotheses.

In [None]:
print(f"Total Hypotheses Generated: {len(hypotheses)}\n")

# Sort by novelty and feasibility (both descending); unrecognised feasibility ranks lowest
feasibility_order = {'high': 3, 'medium': 2, 'low': 1}


def _hypothesis_sort_key(hypothesis):
    """Return (novelty score, feasibility rank) for ordering hypotheses."""
    return (hypothesis.novelty_score, feasibility_order.get(hypothesis.feasibility, 0))


sorted_hyp = sorted(hypotheses, key=_hypothesis_sort_key, reverse=True)

print("Top Research Hypotheses:\n")
for position, hyp in enumerate(sorted_hyp[:10], start=1):
    print(f"{position}. [Novelty: {hyp.novelty_score:.1f}/10, Feasibility: {hyp.feasibility}]")
    print(f"   Statement: {hyp.statement}")
    rationale_preview = hyp.rationale[:150]
    print(f"   Rationale: {rationale_preview}...")
    # Show at most five materials and three methods, when present
    if hyp.materials_involved:
        shown_materials = ', '.join(hyp.materials_involved[:5])
        print(f"   Materials: {shown_materials}")
    if hyp.suggested_methods:
        shown_methods = ', '.join(hyp.suggested_methods[:3])
        print(f"   Methods: {shown_methods}")
    print()

In [None]:
# Hypothesis statistics
hyp_feasibilities = Counter(h.feasibility for h in hypotheses)
hyp_novelties = [h.novelty_score for h in hypotheses]

# Counter keeps first-seen order, so plotting it directly makes the bar order
# depend on the data. Use a fixed high -> medium -> low order, followed by any
# unexpected feasibility values.
known_levels = ['high', 'medium', 'low']
feasibility_labels = [lvl for lvl in known_levels if lvl in hyp_feasibilities]
feasibility_labels += [lvl for lvl in hyp_feasibilities if lvl not in known_levels]
feasibility_totals = [hyp_feasibilities[lvl] for lvl in feasibility_labels]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Feasibility distribution (plain lists rather than dict views, labels as strings)
axes[0].bar([str(lvl) for lvl in feasibility_labels], feasibility_totals)
axes[0].set_title('Hypotheses by Feasibility', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Count')

# Novelty distribution
axes[1].hist(hyp_novelties, bins=10, edgecolor='black', alpha=0.7, color='purple')
axes[1].set_title('Hypothesis Novelty Scores', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Novelty Score (0-10)')
axes[1].set_ylabel('Frequency')
# np.mean([]) is nan (with a RuntimeWarning); only mark the mean when there is data.
if hyp_novelties:
    mean_novelty = np.mean(hyp_novelties)
    axes[1].axvline(mean_novelty, color='red', linestyle='--',
                    label=f'Mean: {mean_novelty:.2f}')
    axes[1].legend()

plt.tight_layout()
plt.show()

## 8. Custom Queries

Run custom queries on the data.

In [None]:
# Find papers about a specific material
material_of_interest = "graphene"  # Change this

# Case-insensitive substring match against each analysis's material names
material_query = material_of_interest.lower()
relevant_analyses = [
    a for a in analyses
    if any(material_query in m.lower() for m in a.materials)
]

print(f"Papers mentioning '{material_of_interest}': {len(relevant_analyses)}\n")

# Preview the first five matches (bullet character restored from mis-encoded text)
for analysis in relevant_analyses[:5]:
    print(f"• {analysis.title[:70]}...")
    print(f"  Relevance: {analysis.relevance_score:.1f}/10")
    print(f"  Properties: {', '.join(analysis.properties[:5])}")
    print()

In [None]:
# Find papers studying a specific property
property_of_interest = "thermal conductivity"  # Change this

# Case-insensitive substring match against each analysis's property names
property_query = property_of_interest.lower()
relevant_analyses = [
    a for a in analyses
    if any(property_query in p.lower() for p in a.properties)
]

print(f"Papers studying '{property_of_interest}': {len(relevant_analyses)}\n")

# Preview the first five matches (bullet character restored from mis-encoded text)
for analysis in relevant_analyses[:5]:
    print(f"• {analysis.title[:70]}...")
    print(f"  Materials: {', '.join(analysis.materials[:5])}")
    print(f"  Methods: {', '.join(analysis.methods[:3])}")
    print()

In [None]:
# Find computational papers (exact match on the research_type label)
computational = [a for a in analyses if a.research_type == 'computational']

print(f"Computational Papers: {len(computational)}\n")

# Preview the first five matches (bullet character restored from mis-encoded text)
for analysis in computational[:5]:
    print(f"• {analysis.title[:70]}...")
    print(f"  Methods: {', '.join(analysis.methods[:5])}")
    print()

## 9. Summary & Next Steps

Key insights and recommendations.

In [None]:
# Final roll-up of the run. This cell reads names created by earlier cells
# (papers, papers_df, cat_counts, analyses, analyses_df, stats, patterns, gaps,
# hypotheses), so run the notebook top to bottom before executing it.
# The emoji, bullets and the ">=" sign were mis-encoded in the original strings
# and are restored here.

# Count with generators rather than building throwaway lists
high_relevance_count = sum(1 for a in analyses if a.relevance_score >= 7.0)
high_priority_gap_count = sum(1 for g in gaps if g.priority == 'high')
highly_novel_count = sum(1 for h in hypotheses if h.novelty_score >= 7.0)

print("=" * 80)
print("SUMMARY")
print("=" * 80)

print("\n📚 Data Collection:")
print(f"   • {len(papers)} papers collected from arXiv")
print(f"   • Date range: {papers_df['published_date'].min()} to {papers_df['published_date'].max()}")
print(f"   • {len(cat_counts)} unique categories")

print("\n🤖 AI Analysis:")
print(f"   • {len(analyses)} papers analyzed")
print(f"   • Average relevance: {analyses_df['relevance_score'].mean():.2f}/10")
print(f"   • High relevance (≥7.0): {high_relevance_count} papers")

print("\n🧠 Knowledge Extraction:")
print(f"   • {stats['total_nodes']} entities in knowledge graph")
print(f"   • {stats['num_materials']} materials, {stats['num_properties']} properties")
print(f"   • {len(patterns['material_property_pairs'])} material-property relationships")

print("\n🎯 Research Opportunities:")
print(f"   • {len(gaps)} research gaps identified")
print(f"   • {high_priority_gap_count} high-priority gaps")
print(f"   • {len(hypotheses)} research hypotheses generated")
print(f"   • {highly_novel_count} highly novel hypotheses")

print("\n💡 Recommended Next Steps:")
print("   1. Review high-priority research gaps in detail")
print("   2. Evaluate top hypotheses for experimental/computational feasibility")
print("   3. Identify collaboration opportunities based on author networks")
print("   4. Focus on understudied material-property combinations")
print("   5. Consider computational approaches for materials with limited experimental data")

print("\n" + "=" * 80)