In [25]:
# Install required packages
!pip install pandas numpy plotly anthropic ipywidgets pyarrow nbformat -q
# ‚ö†Ô∏è IMPORTANT: After running this cell for the first time, RESTART THE KERNEL (Kernel > Restart Kernel)
# Then run all cells from the beginning for plotly visualizations to work properly



# Phase 3: Trends Analysis and AI Insights

This notebook analyzes research trends over time and generates AI-powered insights using Claude.

## Setup & Data Loading

In [26]:
import ast
import json
import os
import time
import warnings
from getpass import getpass
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Silence library warnings so they do not clutter cell output.
# NOTE(review): this hides every warning category, including deprecations.
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Anthropic API key: never hardcode secrets in a notebook.
# The key is read from the environment; if it is missing, ask for it with a
# hidden prompt so it is never stored in the file or shown in cell output.
# NOTE: any key that was previously committed in this cell must be treated as
# compromised and revoked.
if not os.environ.get('ANTHROPIC_API_KEY'):
    os.environ['ANTHROPIC_API_KEY'] = getpass("Anthropic API key (input hidden): ").strip()

print("✅ All imports successful")
if os.environ['ANTHROPIC_API_KEY']:
    print("✅ Anthropic API key configured")
else:
    print("⚠️ ANTHROPIC_API_KEY is empty - LLM cells will fail until it is set")

‚úÖ All imports successful
‚úÖ Anthropic API key configured


In [27]:
# Read the processed paper table and report its basic coverage
data_path = Path('../data/processed')
papers_df = pd.read_parquet(data_path / 'papers.parquet')

first_year = papers_df['year'].min()
last_year = papers_df['year'].max()
citation_total = papers_df['citation_count'].sum()

print(f"Loaded {len(papers_df)} papers")
print(f"Years: {first_year} - {last_year}")
print(f"Total citations: {citation_total:,}")
papers_df.head()

Loaded 19523 papers
Years: 2018 - 2023
Total citations: 184,928


Unnamed: 0,id,scopus_id,doi,title,abstract,year,citation_count,authors,affiliations,references,abstract_length,num_authors,num_references,summary,subject_areas
0,201800001,85060936020,10.23919/PIERS.2018.8597669,Flexible Printed Active Antenna for Digital Te...,This paper presents the development of a flexi...,2018,1,Pratumsiri T.;Janpugdee P.,,85006043726;85046336244;85060914424;85046368249,1199,2,4,,"[Electrical and Electronic Engineering, Electr..."
1,201800002,85052201238,10.1016/j.ces.2018.08.042,Parametric study of hydrogen production via so...,Computational fluid dynamics was applied for s...,2018,21,Phuakpunk K.;Chalermsinsuwan B.;Putivisutisak ...,,,957,4,0,,"[Chemistry (all), Chemical Engineering (all), ..."
2,201800003,85051498032,10.1016/j.apsusc.2018.08.059,Superhydrophobic coating from fluoroalkylsilan...,A superhydrophobic/superoleophilic mesh was su...,2018,37,Saengkaew J.;Le D.;Samart C.;Sawada H.;Nishida...,Hirosaki University;Chulalongkorn University;T...,,1082,8,0,,"[Chemistry (all), Condensed Matter Physics, Ph..."
3,201800004,85050678366,10.1016/j.aca.2018.07.045,Electrochemical impedance-based DNA sensor usi...,A label-free electrochemical DNA sensor based ...,2018,68,Teengam P.;Siangproh W.;Tuantranont A.;Vilaiva...,Chulalongkorn University;Thailand National Ele...,,1668,6,0,,"[Analytical Chemistry, Biochemistry, Environme..."
4,201800005,85059846549,10.17512/pjms.2018.18.2.24,Evaluation of outsourcing transportation contr...,This paper aims to develop an approach to iden...,2018,15,Setamanit S.-O.,,54349103890;0033235290;69649096182;85043470598...,884,1,20,,"[Business and International Management, Strate..."


In [28]:
# Load treemap and hierarchy data (separate files).
# The encoding is passed explicitly so the read does not depend on the
# platform default (for example cp1252 on Windows).
with open(data_path / 'treemap_data.json', 'r', encoding='utf-8') as f:
    treemap_data = json.load(f)

with open(data_path / 'subject_hierarchy.json', 'r', encoding='utf-8') as f:
    subject_hierarchy = json.load(f)

print("Treemap structure:")
print(f"  Labels: {len(treemap_data['labels'])}")
print(f"  Parents: {len(treemap_data['parents'])}")
print("\nSubject hierarchy:")
print(f"  Top-level topics: {len(subject_hierarchy)}")
print("\nTop-level topics:")
# Only the first five keys are listed to keep the output short.
for topic in list(subject_hierarchy.keys())[:5]:
    print(f"  - {topic}")

Treemap structure:
  Labels: 85
  Parents: 85

Subject hierarchy:
  Top-level topics: 2

Top-level topics:
  - asjc_codes
  - subject_groups


## 1. Big Topic Analysis: Topic Shift Over Time

Analyze how research focus has shifted across major domains from 2018-2023.

In [29]:
# Extract Level 1 topics (major domains).
# The JSON file wraps the topic -> subtopics mapping under 'subject_groups'.
# Unwrap only while the wrapper key is still present, so that re-running this
# cell does not raise KeyError on the already-unwrapped mapping.
if 'subject_groups' in subject_hierarchy:
    subject_hierarchy = subject_hierarchy['subject_groups']

level1_topics = list(subject_hierarchy.keys())
print(f"Found {len(level1_topics)} Level 1 topics:")
for topic in level1_topics:
    print(f"  - {topic} ({len(subject_hierarchy[topic])} subtopics)")

Found 8 Level 1 topics:
  - Medicine & Health (25 subtopics)
  - Life Sciences (11 subtopics)
  - Computer Science & AI (9 subtopics)
  - Engineering (9 subtopics)
  - Materials & Chemistry (8 subtopics)
  - Physics (7 subtopics)
  - Environmental Science (7 subtopics)
  - Other (0 subtopics)


In [30]:
# Debug: Check data structure
# NOTE(review): leftover diagnostic cell. It inspects `papers_df` and
# `subject_hierarchy` and prints what it finds.
print("=== Subject Areas (from papers) ===")
# Subject areas of the first paper; the recorded output shows a numpy.ndarray.
sample_subjects = papers_df['subject_areas'].iloc[0]
print(f"Type: {type(sample_subjects)}")
# Show at most three entries.
print(f"Sample: {sample_subjects[:3] if len(sample_subjects) > 3 else sample_subjects}")

print("\n=== Subject Hierarchy Structure ===")
print(f"Top-level keys: {list(subject_hierarchy.keys())}")
print(f"\nChecking 'subject_groups' structure:")
# This branch runs only while `subject_hierarchy` still has a 'subject_groups'
# key. In the recorded run the top-level keys were already the Level 1 topic
# names, so nothing was printed from inside it.
if 'subject_groups' in subject_hierarchy:
    subject_groups = subject_hierarchy['subject_groups']
    print(f"  Type: {type(subject_groups)}")
    print(f"  Keys (Level 1 topics): {list(subject_groups.keys())}")
    first_topic = list(subject_groups.keys())[0]
    print(f"\n  Sample: '{first_topic}' -> {subject_groups[first_topic][:3] if len(subject_groups[first_topic]) > 3 else subject_groups[first_topic]}")
    
# Printed unconditionally, i.e. even when the check above found no wrapper key.
print("\nüîç The issue: subject_hierarchy needs to be subject_hierarchy['subject_groups']!")

=== Subject Areas (from papers) ===
Type: <class 'numpy.ndarray'>
Sample: ['Electrical and Electronic Engineering'
 'Electronic, Optical and Magnetic Materials']

=== Subject Hierarchy Structure ===
Top-level keys: ['Medicine & Health', 'Life Sciences', 'Computer Science & AI', 'Engineering', 'Materials & Chemistry', 'Physics', 'Environmental Science', 'Other']

Checking 'subject_groups' structure:

üîç The issue: subject_hierarchy needs to be subject_hierarchy['subject_groups']!


In [31]:
# Function to map papers to their Level 1 topic
def get_level1_topic(subject_areas, hierarchy):
    """
    Given a paper's subject areas, find its Level 1 topic(s).

    Args:
        subject_areas: The paper's subject areas as a list/tuple/set, a numpy
            array, or a string holding a Python list literal. ``None``, NaN
            and any other scalar are treated as "no subject areas".
        hierarchy: Mapping of Level 1 topic name -> collection of subtopic names.

    Returns:
        Alphabetically sorted list of the Level 1 topics whose name or whose
        subtopics match one of the subject areas. A paper may belong to
        several topics; the list is empty when nothing matches, when the input
        is missing/unparseable, or when any single entry is missing.
    """
    # Strings hold a serialized list, e.g. "['Genetics', 'Oncology']".
    if isinstance(subject_areas, str):
        try:
            subject_areas = ast.literal_eval(subject_areas)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []

    if isinstance(subject_areas, np.ndarray):
        subject_areas = subject_areas.tolist()

    # Whatever is not a plain collection by now (None, NaN, pd.NA, numbers, a
    # bare string literal, ...) carries no usable subject areas.
    if not isinstance(subject_areas, (list, tuple, set, frozenset)):
        return []

    if not subject_areas:
        return []

    # Same rule the array branch always applied: a record that contains a
    # missing entry is left unclassified. Checking element by element avoids
    # the ambiguous truth value of pd.isna() on a whole list.
    if any(
        not isinstance(subject, str) and pd.api.types.is_scalar(subject) and pd.isna(subject)
        for subject in subject_areas
    ):
        return []

    # NOTE(review): matching is by exact name. In the recorded run only
    # 7003 of 19523 papers matched; names such as 'Chemistry (all)' may not
    # appear in the hierarchy under that spelling - confirm against the data.
    topics = set()
    for subject in subject_areas:
        for level1, subtopics in hierarchy.items():
            if subject == level1 or subject in subtopics:
                topics.add(level1)

    # Sorted so the result does not depend on set iteration order.
    return sorted(topics)

# Tag every paper with the Level 1 topics of its subject areas
def _level1_for(areas):
    return get_level1_topic(areas, subject_hierarchy)

papers_df['level1_topics'] = papers_df['subject_areas'].apply(_level1_for)

# How many papers received at least one topic?
has_topic = papers_df['level1_topics'].apply(len) > 0
papers_with_topics = papers_df[has_topic]
print(f"Papers with Level 1 topics: {len(papers_with_topics)} / {len(papers_df)}")

preview_columns = ['title', 'year', 'subject_areas', 'level1_topics']
papers_df[preview_columns].head(10)

Papers with Level 1 topics: 7003 / 19523


Unnamed: 0,title,year,subject_areas,level1_topics
0,Flexible Printed Active Antenna for Digital Te...,2018,"[Electrical and Electronic Engineering, Electr...",[]
1,Parametric study of hydrogen production via so...,2018,"[Chemistry (all), Chemical Engineering (all), ...",[]
2,Superhydrophobic coating from fluoroalkylsilan...,2018,"[Chemistry (all), Condensed Matter Physics, Ph...",[]
3,Electrochemical impedance-based DNA sensor usi...,2018,"[Analytical Chemistry, Biochemistry, Environme...","[Materials & Chemistry, Life Sciences]"
4,Evaluation of outsourcing transportation contr...,2018,"[Business and International Management, Strate...",[]
5,The phenotypic and mutational spectrum of Thai...,2018,[Genetics],[Life Sciences]
6,Predicting judicial decisions of criminal case...,2018,"[Decision Sciences (miscellaneous), Informatio...",[Computer Science & AI]
7,PH Variation as a Simple and Selective Pathway...,2018,"[Materials Science (all), Condensed Matter Phy...",[]
8,Effect of Zr addition on the microstructure an...,2018,"[Chemistry (all), Condensed Matter Physics, Su...",[]
9,Applying Text Mining for Classifying Disease f...,2018,"[Artificial Intelligence, Computer Networks an...",[Computer Science & AI]


In [32]:
# Create year-by-topic distribution.
# explode() yields one row per (paper, topic), so a paper with several topics
# counts once for each of them. Papers with an empty topic list explode to a
# missing value and are dropped. This replaces a row-by-row iterrows() loop.
topic_year_df = (
    papers_df[['year', 'level1_topics', 'citation_count']]
    .explode('level1_topics')
    .dropna(subset=['level1_topics'])
    .rename(columns={'level1_topics': 'topic', 'citation_count': 'citations'})
    .assign(papers=1)
)

print(f"Total topic-year entries: {len(topic_year_df)}")

if topic_year_df.empty:
    print("⚠️ No topics found! Check if papers have level1_topics assigned.")
    print(f"Sample paper topics: {papers_df['level1_topics'].head()}")

# Aggregate by year and topic. Built unconditionally so that later cells always
# find `topic_trends`, even when it is empty.
topic_trends = topic_year_df.groupby(['year', 'topic']).agg({
    'papers': 'sum',
    'citations': 'sum'
}).reset_index()

if not topic_year_df.empty:
    print("Topic trends data shape:", topic_trends.shape)
    display(topic_trends.head(10))

Total topic-year entries: 8072
Topic trends data shape: (36, 4)


Unnamed: 0,year,topic,papers,citations
0,2018,Computer Science & AI,137,607
1,2018,Engineering,65,1799
2,2018,Environmental Science,55,1398
3,2018,Life Sciences,272,5176
4,2018,Materials & Chemistry,150,4462
5,2018,Medicine & Health,394,8833
6,2019,Computer Science & AI,235,1998
7,2019,Engineering,74,1033
8,2019,Environmental Science,65,781
9,2019,Life Sciences,269,4063


In [33]:
# Stacked area chart: number of papers per Level 1 topic and year
pivot_data = topic_trends.pivot(index='year', columns='topic', values='papers').fillna(0)

# One trace per topic; sharing a stackgroup stacks the areas on top of each other.
area_traces = [
    go.Scatter(
        x=pivot_data.index,
        y=pivot_data[topic_name],
        mode='lines',
        name=topic_name,
        stackgroup='one',
        hovertemplate='<b>%{fullData.name}</b><br>Year: %{x}<br>Papers: %{y}<extra></extra>'
    )
    for topic_name in pivot_data.columns
]

fig = go.Figure()
for area_trace in area_traces:
    fig.add_trace(area_trace)

# Legend is placed to the right of the plot area.
legend_right = dict(orientation='v', yanchor='top', y=1, xanchor='left', x=1.05)
fig.update_layout(
    title='Research Topic Shift: Publication Distribution Over Time (2018-2023)',
    xaxis_title='Year',
    yaxis_title='Number of Papers',
    hovermode='x unified',
    height=600,
    showlegend=True,
    legend=legend_right
)

fig.show()

In [34]:
# Line chart of each topic's share of that year's papers
yearly_totals = (
    topic_trends
    .groupby('year')['papers']
    .sum()
    .reset_index()
    .rename(columns={'papers': 'total_papers'})
)

# Attach the yearly total to every (year, topic) row, then convert to percent.
topic_trends_pct = topic_trends.merge(yearly_totals, on='year')
paper_share = topic_trends_pct['papers'] / topic_trends_pct['total_papers']
topic_trends_pct['percentage'] = paper_share * 100

axis_labels = {'percentage': 'Percentage of Papers (%)', 'year': 'Year', 'topic': 'Research Domain'}
fig = px.line(
    topic_trends_pct,
    x='year',
    y='percentage',
    color='topic',
    markers=True,
    title='Research Topic Distribution: Percentage Share Over Time',
    labels=axis_labels
)

fig.update_layout(height=600, hovermode='x unified')
fig.show()

## 2. Subtopic Analysis: Paper Count vs Citation Trends

For specific subtopics, analyze the relationship between publication volume and citation impact.

In [35]:
# Flatten the hierarchy into one list of subtopic names (Level 2+)
all_subtopics = [
    name
    for child_names in subject_hierarchy.values()
    for name in child_names
]

print(f"Total subtopics: {len(all_subtopics)}")
print("\nSample subtopics:")
for sample_name in all_subtopics[:10]:
    print(f"  - {sample_name}")

Total subtopics: 76

Sample subtopics:
  - Medicine
  - Surgery
  - Infectious Diseases
  - Pharmacology
  - Neurology
  - Oncology
  - Immunology
  - Cardiology
  - Pediatrics
  - Gastroenterology


In [36]:
# Function to get subtopic trends
def get_subtopic_trends(subtopic_name, papers_df):
    """
    Get yearly paper count and citation totals for a specific subtopic.

    Args:
        subtopic_name: Subject-area name to look for (exact match).
        papers_df: DataFrame with the columns 'subject_areas', 'year', 'id'
            and 'citation_count'.

    Returns:
        DataFrame with the columns ['year', 'paper_count', 'total_citations'],
        one row per year, or ``None`` when no paper lists the subtopic.
    """
    def has_subtopic(subject_areas):
        """Return True if this paper's subject areas contain the subtopic."""
        # Strings hold a serialized list, e.g. "['Genetics', 'Oncology']".
        if isinstance(subject_areas, str):
            try:
                subject_areas = ast.literal_eval(subject_areas)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return False

        if isinstance(subject_areas, np.ndarray):
            subject_areas = subject_areas.tolist()

        # None, NaN and other scalars are not collections: treat them as
        # "no subject areas" instead of failing on the membership test.
        if not isinstance(subject_areas, (list, tuple, set, frozenset)):
            return False

        return subtopic_name in subject_areas

    # astype(bool) keeps the mask boolean even for an empty frame.
    matches = papers_df['subject_areas'].apply(has_subtopic).astype(bool)
    subtopic_papers = papers_df[matches]

    if len(subtopic_papers) == 0:
        return None

    # Aggregate by year
    yearly_stats = subtopic_papers.groupby('year').agg({
        'id': 'count',  # Paper count (non-null ids)
        'citation_count': 'sum'  # Total citations
    }).reset_index()
    yearly_stats.columns = ['year', 'paper_count', 'total_citations']

    return yearly_stats

# Smoke-test the helper on the first subtopic of the hierarchy
sample_subtopic = all_subtopics[0]
sample_trends = get_subtopic_trends(subtopic_name=sample_subtopic, papers_df=papers_df)
print(f"Trends for '{sample_subtopic}':", sample_trends, sep="\n")

Trends for 'Medicine':
None


In [37]:
# Count papers per subtopic, skipping subtopics that match no paper
subtopic_counts = {}
for candidate in all_subtopics:
    candidate_trends = get_subtopic_trends(candidate, papers_df)
    if candidate_trends is None:
        continue
    subtopic_counts[candidate] = candidate_trends['paper_count'].sum()

# Keep the 20 subtopics with the most papers, largest first
ranked_subtopics = sorted(subtopic_counts.items(), key=lambda pair: pair[1], reverse=True)
top_subtopics = ranked_subtopics[:20]

print("Top 20 Subtopics by Paper Count:")
for rank, (name, n_papers) in enumerate(top_subtopics, start=1):
    print(f"{rank:2d}. {name}: {n_papers} papers")

Top 20 Subtopics by Paper Count:
 1. Infectious Diseases: 718 papers
 2. Organic Chemistry: 597 papers
 3. Biochemistry: 596 papers
 4. Molecular Biology: 500 papers
 5. Analytical Chemistry: 480 papers
 6. Pharmacology: 466 papers
 7. Artificial Intelligence: 463 papers
 8. Immunology: 395 papers
 9. Pollution: 371 papers
10. Environmental Engineering: 368 papers
11. Biotechnology: 334 papers
12. Software: 330 papers
13. Information Systems: 321 papers
14. Surgery: 319 papers
15. Catalysis: 319 papers
16. Microbiology: 306 papers
17. Hardware and Architecture: 276 papers
18. Oncology: 262 papers
19. Genetics: 247 papers
20. Gastroenterology: 213 papers


In [38]:
# Visualize subtopic trends: Dual-axis plot (Papers vs Citations)
def plot_subtopic_trends(subtopic_name, papers_df):
    """
    Draw yearly paper counts and total citations for one subtopic.

    Paper counts use the primary y-axis and citations the secondary one. After
    the figure is shown, a short text summary is printed. When
    ``get_subtopic_trends`` yields no data, a message is printed and nothing
    is drawn. Returns None in every case.
    """
    trends = get_subtopic_trends(subtopic_name, papers_df)

    nothing_to_plot = trends is None or len(trends) == 0
    if nothing_to_plot:
        print(f"No data available for '{subtopic_name}'")
        return

    paper_color = '#9C27B0'
    citation_color = '#E91E63'
    years = trends['year']
    paper_counts = trends['paper_count']
    citation_totals = trends['total_citations']

    papers_trace = go.Scatter(
        x=years,
        y=paper_counts,
        name='Paper Count',
        mode='lines+markers',
        line=dict(color=paper_color, width=3),
        marker=dict(size=10),
        hovertemplate='<b>Papers</b><br>Year: %{x}<br>Count: %{y}<extra></extra>'
    )
    citations_trace = go.Scatter(
        x=years,
        y=citation_totals,
        name='Total Citations',
        mode='lines+markers',
        line=dict(color=citation_color, width=3, dash='dash'),
        marker=dict(size=10, symbol='diamond'),
        hovertemplate='<b>Citations</b><br>Year: %{x}<br>Count: %{y:,}<extra></extra>'
    )

    # A single subplot with a secondary y-axis for the citation series
    fig = make_subplots(
        specs=[[{"secondary_y": True}]],
        subplot_titles=[f"Research Trends: {subtopic_name}"]
    )
    fig.add_trace(papers_trace, secondary_y=False)
    fig.add_trace(citations_trace, secondary_y=True)

    # Axis titles are coloured like the series they belong to
    fig.update_xaxes(title_text="Year")
    fig.update_yaxes(title_text="<b>Number of Papers</b>", secondary_y=False, title_font=dict(color=paper_color))
    fig.update_yaxes(title_text="<b>Total Citations</b>", secondary_y=True, title_font=dict(color=citation_color))

    fig.update_layout(
        height=500,
        hovermode='x unified',
        legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='center', x=0.5)
    )

    fig.show()

    # Text summary; growth compares the last listed year with the first one
    total_papers = paper_counts.sum()
    total_citations = citation_totals.sum()
    growth_pct = ((paper_counts.iloc[-1] / paper_counts.iloc[0]) - 1) * 100
    print(f"\nSummary for '{subtopic_name}':")
    print(f"  Total papers: {total_papers}")
    print(f"  Total citations: {total_citations:,}")
    print(f"  Avg citations per paper: {total_citations / total_papers:.1f}")
    print(f"  Paper growth rate: {growth_pct:.1f}%")

# Draw the trend plot for the three subtopics with the most papers
for subtopic, _ in top_subtopics[0:3]:
    plot_subtopic_trends(subtopic_name=subtopic, papers_df=papers_df)


Summary for 'Infectious Diseases':
  Total papers: 718
  Total citations: 5,958
  Avg citations per paper: 8.3
  Paper growth rate: 8.1%



Summary for 'Organic Chemistry':
  Total papers: 597
  Total citations: 7,568
  Avg citations per paper: 12.7
  Paper growth rate: 39.1%



Summary for 'Biochemistry':
  Total papers: 596
  Total citations: 6,820
  Avg citations per paper: 11.4
  Paper growth rate: 15.4%


## 3. AI-Powered Insights & Recommendations

Use LLM to generate insights based on trend analysis and current context.

In [39]:
# Prepare trend summary for LLM
def prepare_trend_summary(topic_trends_df, top_n=10, start_year=2018, end_year=2023):
    """
    Create a plain-text summary of topic trends for use as LLM input.

    Args:
        topic_trends_df: DataFrame with the columns 'year', 'topic', 'papers'.
        top_n: Currently unused; kept so existing calls keep working.
        start_year: Baseline year for the growth figures (default 2018).
        end_year: Comparison year for the growth figures (default 2023).

    Returns:
        Multi-line string with the overall totals and growth, followed by one
        growth line per topic. Topics without papers in ``start_year`` are
        omitted because no growth rate can be computed for them.
    """
    def papers_in(frame, year):
        """Total number of papers recorded in `frame` for `year`."""
        return frame.loc[frame['year'] == year, 'papers'].sum()

    total_start = papers_in(topic_trends_df, start_year)
    total_end = papers_in(topic_trends_df, end_year)

    summary_lines = [
        f"=== RESEARCH TREND ANALYSIS ({start_year}-{end_year}) ===",
        "",
        f"Total papers in {start_year}: {total_start}",
        f"Total papers in {end_year}: {total_end}",
    ]

    # Guard the baseline: dividing by zero would print 'inf%' or raise.
    if total_start > 0:
        summary_lines.append(f"Overall growth: {((total_end / total_start - 1) * 100):.1f}%")
    else:
        summary_lines.append(f"Overall growth: n/a (no papers in {start_year})")
    summary_lines.append("")

    # Topic-level growth rates
    summary_lines.append("GROWTH RATES BY TOPIC:")
    for topic in topic_trends_df['topic'].unique():
        topic_data = topic_trends_df[topic_trends_df['topic'] == topic]
        papers_start = papers_in(topic_data, start_year)
        papers_end = papers_in(topic_data, end_year)
        if papers_start > 0:
            growth = ((papers_end / papers_start) - 1) * 100
            summary_lines.append(f"  {topic}: {growth:+.1f}% ({papers_start} → {papers_end} papers)")

    return "\n".join(summary_lines)

# Build the text summary once; the insights cell further down reuses it
trend_summary = prepare_trend_summary(topic_trends_df=topic_trends)
print(trend_summary, end="\n")

=== RESEARCH TREND ANALYSIS (2018-2023) ===

Total papers in 2018: 1073
Total papers in 2023: 1209
Overall growth: 12.7%

GROWTH RATES BY TOPIC:
  Computer Science & AI: +8.8% (137 ‚Üí 149 papers)
  Engineering: +27.7% (65 ‚Üí 83 papers)
  Environmental Science: +80.0% (55 ‚Üí 99 papers)
  Life Sciences: +11.8% (272 ‚Üí 304 papers)
  Materials & Chemistry: +16.7% (150 ‚Üí 175 papers)
  Medicine & Health: +1.3% (394 ‚Üí 399 papers)


In [47]:
# LLM Insights Generator (using Anthropic Claude)
import os

def generate_insights(trend_summary, api_key=None, model="claude-3-5-haiku-20241022", max_tokens=1024):
    """
    Generate free-text insights about the trend summary using Claude.

    Args:
        trend_summary: Text produced by ``prepare_trend_summary``.
        api_key: Anthropic API key. When None, the ANTHROPIC_API_KEY
            environment variable is used.
        model: Model identifier sent to the API. Model ids are retired over
            time (the API answers 404 for an unknown id), so this can be
            overridden without editing the function.
        max_tokens: Upper bound for the length of the generated answer.

    Returns:
        The generated text, or None when no key is available, the anthropic
        package is missing, or the request fails (the reason is printed).
    """
    # Check for API key
    if api_key is None:
        api_key = os.getenv('ANTHROPIC_API_KEY')

    if not api_key:
        print("⚠️ No API key provided. Skipping LLM insights generation.")
        print("Set ANTHROPIC_API_KEY environment variable or provide api_key parameter.")
        return None

    try:
        # Imported here so a missing package is reported instead of crashing.
        from anthropic import Anthropic

        client = Anthropic(api_key=api_key)

        prompt = f"""You are a research trend analyst. Based on the following research publication trends from 2018-2023, provide:

1. Key observations about topic shifts and emerging areas
2. Recommendations for researchers on promising research directions
3. Topics that may be under-explored but have high potential
4. Connections to current global events or technological advancements

Trend Data:
{trend_summary}

Provide a concise, actionable analysis (300-500 words)."""

        message = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            messages=[{"role": "user", "content": prompt}]
        )

        return message.content[0].text

    except ImportError:
        print("⚠️ Anthropic library not installed. Run: pip install anthropic")
        return None
    except Exception as e:
        # Deliberate best effort: the notebook keeps running without insights.
        print(f"❌ Error generating insights: {e}")
        return None

# Ask the model for insights and print them between separator rules
print("Generating AI insights...")
insights = generate_insights(trend_summary)

if insights:
    rule = "=" * 80
    for output_line in ("\n" + rule, "AI-GENERATED INSIGHTS & RECOMMENDATIONS", rule, insights, rule):
        print(output_line)

Generating AI insights...
‚ùå Error generating insights: Error code: 404 - {'type': 'error', 'error': {'type': 'not_found_error', 'message': 'model: claude-3-5-sonnet-20241022'}, 'request_id': 'req_011CVsN1YhY8wzhCoM4xXX3i'}


## 4. Interactive Exploration

Explore different topics and generate custom visualizations.

In [41]:
# Make sure the anthropic SDK can be imported, installing it on demand
try:
    from anthropic import Anthropic
except ImportError:
    print("Installing anthropic...")
    import subprocess
    import sys
    # Use the kernel's own interpreter so the package lands in its environment
    pip_command = [sys.executable, "-m", "pip", "install", "anthropic"]
    subprocess.check_call(pip_command)
    from anthropic import Anthropic
    print("‚úÖ Anthropic installed")
else:
    print("‚úÖ Anthropic library available")

‚úÖ Anthropic library available


### Use Case 1: Pre-computed Topic Insights

This function generates insights that will be **saved to database** and displayed instantly when users click treemap topics.

In [42]:
from anthropic import Anthropic

# Initialize Anthropic client
# Module-level client shared by generate_topic_insights and
# generate_search_insights below. os.environ[...] raises KeyError when
# ANTHROPIC_API_KEY is not set.
client = Anthropic(api_key=os.environ['ANTHROPIC_API_KEY'])

def generate_topic_insights(topic_name, paper_titles, year_trends, citation_trends):
    """
    Use Case 1: build pre-computed insights for one treemap topic.

    Args:
        topic_name: Name of the subtopic
        paper_titles: List of paper titles in this topic (the first 10 are
            included in the prompt)
        year_trends: Dict of year -> paper count
        citation_trends: Dict of year -> citation count

    Returns:
        Text of the model's first content block; the prompt asks for JSON.
    """
    # Render each part of the context as its own text block
    title_block = "\n".join(f"- {title}" for title in paper_titles[:10])
    publication_block = "\n".join(
        f"{year}: {count} papers" for year, count in sorted(year_trends.items())
    )
    citation_block = "\n".join(
        f"{year}: {count} citations" for year, count in sorted(citation_trends.items())
    )

    trend_context = (
        f"\nTopic: {topic_name}\n"
        "\n"
        f"Recent Papers ({len(paper_titles)} total):\n"
        f"{title_block}\n"
        "\n"
        "Publication Trends:\n"
        f"{publication_block}\n"
        "\n"
        "Citation Trends:\n"
        f"{citation_block}\n"
    )

    response_format = """Provide a JSON response with:
1. "trend_summary": 2-3 sentence summary of publication and citation trends
2. "key_themes": List of 3-5 emerging themes
3. "recommendations": 3 specific recommendations for future research
4. "momentum": "rising", "stable", or "declining"

Keep it concise and actionable. Focus on what researchers should know."""

    prompt = (
        "Analyze this AI research topic and provide actionable insights.\n\n"
        + trend_context
        + "\n\n"
        + response_format
    )

    # Claude 3.5 Haiku: fast, cost-effective insights
    user_turn = {"role": "user", "content": prompt}
    message = client.messages.create(
        model="claude-3-5-haiku-20241022",
        max_tokens=1000,
        temperature=0.3,
        messages=[user_turn]
    )

    return message.content[0].text

# Try the generator on a small hand-written topic
print("Testing topic insights generation...")
sample_topic = dict(
    topic_name="Neural Architecture Search",
    paper_titles=[
        "AutoML for Deep Learning",
        "Efficient Neural Architecture Search",
        "One-Shot Neural Architecture Search"
    ],
    year_trends={2021: 45, 2022: 67, 2023: 89},
    citation_trends={2021: 234, 2022: 567, 2023: 891}
)
sample_insights = generate_topic_insights(**sample_topic)

rule = "=" * 80
print("\n" + rule)
print("SAMPLE INSIGHTS:")
print(rule)
print(sample_insights)
print(rule)

Testing topic insights generation...

SAMPLE INSIGHTS:
{
    "trend_summary": "Neural Architecture Search (NAS) is experiencing accelerating academic interest, with both publication and citation volumes growing significantly year-over-year. The 45% annual growth in publications and 73% growth in citations suggest robust and expanding research momentum.",
    
    "key_themes": [
        "One-shot architecture exploration",
        "Automated machine learning efficiency",
        "Computational cost reduction in neural design",
        "Transfer learning in architecture search",
        "Meta-learning for neural network optimization"
    ],
    
    "recommendations": [
        "Develop more computationally efficient NAS algorithms that reduce training overhead",
        "Explore cross-domain architecture transfer techniques",
        "Create benchmark datasets specifically for neural architecture evaluation"
    ],
    
    "momentum": "rising"
}


In [43]:
def generate_search_insights(query, search_results):
    """
    Use Case 2: Generate real-time insights for search queries

    Args:
        query: User's search query
        search_results: List of paper dicts returned from search. The keys
            read are 'title', 'year' and 'citations' (falling back to
            'citation_count', the column name used by the paper table).

    Returns:
        Text of the model's first content block; the prompt asks for JSON.
    """
    def is_positive(value):
        """True for values greater than zero; False for None/NaN/non-numbers."""
        try:
            return value is not None and value > 0
        except TypeError:
            return False

    # Only the first eight titles are shown to the model
    titles = [paper.get('title', '') for paper in search_results[:8]]

    # Papers without a year are left out of the distribution; mixing '' with
    # integer years would make the index unsortable.
    years = [
        paper.get('year') for paper in search_results
        if paper.get('year') is not None and paper.get('year') != ''
    ]
    year_dist = pd.Series(years, dtype=object).value_counts().sort_index()

    # Average over papers that have been cited at least once
    citations = [
        paper.get('citations', paper.get('citation_count', 0))
        for paper in search_results
    ]
    cited = [count for count in citations if is_positive(count)]
    avg_citations = f"{np.mean(cited):.1f}" if cited else "n/a"

    context = f"""
Search Query: "{query}"

Found {len(search_results)} papers

Top Papers:
{chr(10).join([f"- {title}" for title in titles])}

Distribution:
{chr(10).join([f"{year}: {count} papers" for year, count in year_dist.items()])}

Average Citations: {avg_citations}
"""

    prompt = f"""Analyze these search results and provide insights for researchers.

{context}

Provide a JSON response with:
1. "relevance_summary": What are the main themes in these results?
2. "key_papers": 3 papers that seem most influential (from the titles)
3. "research_directions": 2-3 emerging directions based on these papers
4. "search_tips": 1-2 tips to refine the search or explore related topics

Be specific and actionable. Help the researcher understand what they found."""

    # Use Haiku for fast response (2-3 seconds)
    message = client.messages.create(
        model="claude-3-5-haiku-20241022",
        max_tokens=1000,
        temperature=0.3,
        messages=[{
            "role": "user",
            "content": prompt
        }]
    )

    return message.content[0].text

# Try the search-insight generator on a few well-known papers
print("Testing search insights generation...")

sample_results = [
    dict(title="Attention Is All You Need", year=2017, citations=50000),
    dict(title="BERT: Pre-training of Deep Bidirectional Transformers", year=2018, citations=40000),
    dict(title="GPT-3: Language Models are Few-Shot Learners", year=2020, citations=30000),
    dict(title="Vision Transformer (ViT)", year=2020, citations=20000),
    dict(title="Self-Attention Mechanisms in NLP", year=2019, citations=5000),
]

sample_search_insights = generate_search_insights(
    "transformer attention mechanisms",
    sample_results
)

rule = "=" * 80
print("\n" + rule)
print("SAMPLE SEARCH INSIGHTS:")
print(rule)
print(sample_search_insights)
print(rule)

Testing search insights generation...

SAMPLE SEARCH INSIGHTS:
{
    "relevance_summary": "These search results capture the evolution of transformer attention mechanisms across natural language processing and computer vision, highlighting the transformative impact of self-attention techniques on machine learning models.",
    
    "key_papers": [
        "Attention Is All You Need",
        "BERT: Pre-training of Deep Bidirectional Transformers", 
        "GPT-3: Language Models are Few-Shot Learners"
    ],
    
    "research_directions": [
        "Cross-modal attention mechanisms (applying transformer techniques beyond NLP)",
        "Few-shot and zero-shot learning using pre-trained transformer models",
        "Scaling transformer architectures for increasingly complex tasks"
    ],
    
    "search_tips": [
        "Expand search to include 'transformer variants' and 'attention ablation studies'",
        "Look into implementation papers that provide practical insights on attenti

In [44]:
# After running topic modeling and trend analysis...
# This cell shows how to batch-process top topics

def batch_generate_topic_insights(df_papers, top_n=10, request_delay=0.5):
    """
    Generate insights for top N topics and prepare them for storage.

    NOTE: topic selection is still a placeholder. Every iteration uses the
    first 20 rows of ``df_papers`` and a generated name ("Topic_<id>").

    Args:
        df_papers: DataFrame with the columns 'title', 'year' and either
            'citation_count' (the name used by the paper table) or 'citations'.
        top_n: Number of topics to process.
        request_delay: Seconds to wait after each successful request as a
            simple rate limit; 0 disables the pause.

    Returns:
        DataFrame with one row per successfully processed topic (topic_id,
        topic_name, insights, generated_at, num_papers, year_range).

    Raises:
        KeyError: if ``df_papers`` has neither citation column.
    """
    # The function used to read 'citations' only, which does not exist in the
    # paper table and raised KeyError before any request was made.
    if 'citation_count' in df_papers.columns:
        citation_col = 'citation_count'
    elif 'citations' in df_papers.columns:
        citation_col = 'citations'
    else:
        raise KeyError("df_papers needs a 'citation_count' or 'citations' column")

    print(f"Generating insights for top {top_n} topics...")
    print("Using Claude 3.5 Haiku (fast, cost-effective)")
    print()

    insights_data = []

    # Example: Process top topics
    # In real code, loop through actual topic clusters
    for topic_id in range(top_n):
        # Get papers in this topic (placeholder)
        topic_papers = df_papers.head(20)  # Replace with actual topic filtering

        # Extract trends
        year_trends = topic_papers['year'].value_counts().to_dict()
        citation_trends = topic_papers.groupby('year')[citation_col].sum().to_dict()
        paper_titles = topic_papers['title'].head(10).tolist()

        # Generate insights; a failed topic is reported and skipped
        try:
            topic_name = f"Topic_{topic_id}"  # Replace with actual topic name
            insights_json = generate_topic_insights(
                topic_name=topic_name,
                paper_titles=paper_titles,
                year_trends=year_trends,
                citation_trends=citation_trends
            )

            insights_data.append({
                'topic_id': topic_id,
                'topic_name': topic_name,
                'insights': insights_json,
                'generated_at': pd.Timestamp.now(),
                'num_papers': len(topic_papers),
                'year_range': f"{topic_papers['year'].min()}-{topic_papers['year'].max()}"
            })

            print(f"✓ Topic {topic_id}: Generated")
            if request_delay:
                time.sleep(request_delay)  # Rate limiting

        except Exception as e:
            print(f"✗ Topic {topic_id}: Failed - {e}")

    # Convert to DataFrame
    df_insights = pd.DataFrame(insights_data)

    print(f"\n{'='*80}")
    print(f"Generated insights for {len(df_insights)} topics")
    print("Ready to save to database")
    print(f"{'='*80}")

    return df_insights

# Uncomment to run batch processing
# (batch_generate_topic_insights calls generate_topic_insights once per topic
# and the second line writes the result to a JSON file)
# df_topic_insights = batch_generate_topic_insights(papers_df, top_n=10)
# df_topic_insights.to_json('../data/topic_insights.json', orient='records', indent=2)

# Status messages only: as written, this cell generates no insights.
print("‚ÑπÔ∏è  Batch processing function ready")
print("Run after completing topic analysis in cells above")

‚ÑπÔ∏è  Batch processing function ready
Run after completing topic analysis in cells above


### Implementation Notes

**For Frontend Integration:**

1. **Treemap Topics (Use Case 1)**
   - Pre-generate insights for top 10 subtopics in this notebook
   - Save to `data/topic_insights.json`
   - Backend loads insights from JSON file
   - Display instantly when user clicks topic (no API call)

2. **Search Results (Use Case 2)**
   - Backend endpoint: `/api/search/insights`
   - Takes search results as input
   - Calls `generate_search_insights()` asynchronously
   - Frontend shows loading animation while waiting (~2-3 seconds)
   - Use optimistic UI pattern: show papers immediately, insights load after

**Cost Estimation (Claude 3.5 Haiku):**
- Use Case 1: ~10 topics × $0.002 = $0.02 (one-time)
- Use Case 2: ~$0.002 per search (real-time)
- Very affordable for this use case!

### Batch Processing for Treemap Topics

Generate insights for the **top 10 subtopics** and save them to `data/topic_insights.json` for instant display.

### Use Case 2: Real-time Search Insights

This function generates insights **on-demand** when users search. It runs asynchronously and shows a loading animation.

In [45]:
# Interactive widget for exploring subtopics
from ipywidgets import interact, Dropdown

# Dropdown entries show "<name> (<count> papers)" and carry the plain name as value
subtopic_dropdown = Dropdown(
    options=[(f"{name} ({count} papers)", name) for name, count in top_subtopics],
    description='Subtopic:',
    style={'description_width': 'initial'}
)

def explore_subtopic(subtopic):
    """Draw the dual-axis trend plot for the subtopic selected in the dropdown."""
    plot_subtopic_trends(subtopic, papers_df)

# interact() returns the wrapped function; the trailing semicolon keeps its
# repr out of the cell output.
interact(explore_subtopic, subtopic=subtopic_dropdown);

interactive(children=(Dropdown(description='Subtopic:', options=(('Infectious Diseases (718 papers)', 'Infecti‚Ä¶

<function __main__.explore_subtopic(subtopic)>

## 5. Export Results for Web Integration

In [46]:
# Export big topic trends
output_path = Path('../data/trends')
# parents=True also creates missing parent folders instead of failing.
output_path.mkdir(parents=True, exist_ok=True)

# Save big topic trends
topic_trends.to_json(output_path / 'big_topic_trends.json', orient='records', indent=2)
print(f"✅ Saved big topic trends to {output_path / 'big_topic_trends.json'}")

# Save top subtopic trends (the ten subtopics with the most papers)
subtopic_trends_data = {}
for subtopic, _ in top_subtopics[:10]:
    trends = get_subtopic_trends(subtopic, papers_df)
    if trends is not None:
        subtopic_trends_data[subtopic] = trends.to_dict('records')

# Text files are written as UTF-8 explicitly so the result does not depend on
# the platform default encoding.
with open(output_path / 'subtopic_trends.json', 'w', encoding='utf-8') as f:
    json.dump(subtopic_trends_data, f, indent=2)
print(f"✅ Saved subtopic trends to {output_path / 'subtopic_trends.json'}")

# Save insights if generated (`insights` is None when the LLM call was
# skipped or failed)
if insights:
    with open(output_path / 'ai_insights.txt', 'w', encoding='utf-8') as f:
        f.write(insights)
    print(f"✅ Saved AI insights to {output_path / 'ai_insights.txt'}")

print("\n✨ All results exported successfully!")

‚úÖ Saved big topic trends to ..\data\trends\big_topic_trends.json
‚úÖ Saved subtopic trends to ..\data\trends\subtopic_trends.json

‚ú® All results exported successfully!


## Next Steps

1. **Web Integration**: Create backend API endpoints to serve trend data
2. **Frontend Dashboard**: Build interactive visualizations in the web app
3. **Real-time Insights**: Integrate LLM API for on-demand insights generation
4. **Custom Analysis**: Allow users to select specific topics and time ranges