In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patheffects as path_effects
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
from matplotlib.ticker import FuncFormatter
import matplotlib.font_manager as fm
from matplotlib.gridspec import GridSpec
import re
import io
import csv

In [None]:
def load_data(filename):
    """Load yearly publication counts from a CSV file.

    Args:
        filename: Path (or any other source accepted by ``pandas.read_csv``)
            of a CSV whose header contains 'Year' and 'Count' columns.

    Returns:
        A new DataFrame holding exactly two columns: 'Year' and
        'Publications' (the 'Count' column, renamed).

    Raises:
        ValueError: If 'Year' or 'Count' is missing from the CSV header. The
            message lists the missing columns and the columns that were found.
        Exception: Any error raised while reading the file is printed and
            re-raised unchanged.
    """
    required = ['Year', 'Count']
    try:
        df = pd.read_csv(filename)

        missing = [col for col in required if col not in df.columns]
        if missing:
            # Say exactly what is wrong so the user can fix the file without
            # having to open it first.
            raise ValueError(
                "CSV file must contain 'Year' and 'Count' columns; "
                f"missing: {missing}, found: {list(df.columns)}"
            )
    except Exception as e:
        # Single diagnostic line per failure, then let the caller decide.
        print(f"Error loading data: {e}")
        raise

    # Column selection and rename both return new objects, so the frame read
    # from disk is never aliased by the result.
    return df[required].rename(columns={'Count': 'Publications'})

def create_visualization(data):
    """Plot yearly publication counts, annotate key events and print metrics.

    Prints summary metrics (total, average annual growth, 5-year growth, peak
    year and peak count), draws a filled line chart of publications per year
    with annotations for a fixed set of key events, saves the figure to
    'cancer_publications_trend.png' in the current working directory and then
    displays it with ``plt.show()``.

    Args:
        data: DataFrame with a 'Year' column and a 'Publications' column. Row
            order and index labels do not matter; a chronologically sorted
            copy is used internally and the caller's frame is not modified.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    if data.empty:
        raise ValueError("Cannot visualize an empty dataset")

    # pct_change and the iloc-based growth figure are only meaningful in
    # chronological order, and a fresh 0..n-1 index keeps positional and
    # label-based access in agreement. sort_values returns a new frame.
    data = data.sort_values('Year', kind='mergesort').reset_index(drop=True)
    x = data['Year']
    y = data['Publications']

    # Compute metrics
    total_pubs = y.sum()
    avg_annual_growth = np.mean(y.pct_change().dropna() * 100)
    peak_year = data.loc[y.idxmax(), 'Year']
    peak_pubs = y.max()
    # Growth relative to the row five positions before the last one; that row
    # only exists when there are at least six rows.
    if len(y) >= 6:
        recent_growth = (y.iloc[-1] / y.iloc[-6] - 1) * 100
        recent_growth_text = f"{recent_growth:.1f}%"
    else:
        recent_growth_text = "n/a"

    # Print metrics
    print(f"Total Publications: {total_pubs:,}")
    print(f"Avg Annual Growth: {avg_annual_growth:.1f}%")
    print(f"5-Year Growth: {recent_growth_text}")
    print(f"Peak Year: {peak_year}")
    print(f"Peak Publications: {peak_pubs:,}")

    fig, ax_main = plt.subplots(figsize=(14, 8))
    fig.patch.set_facecolor('white')
    ax_main.set_facecolor('white')

    colors = ["#a8dadc", "#f4a261", "#e5989b", "#b5ead7", "#ffb5a7"]
    cmap = LinearSegmentedColormap.from_list("pastel_cmap", colors, N=100)

    # Main data: outline, filled area down to zero, then the data points.
    ax_main.plot(x, y, color='black', linewidth=1, alpha=0.8)
    # One call fills the whole area under the curve; no per-segment loop and
    # no dependence on the Series index labels.
    ax_main.fill_between(x, 0, y, color="#a8dadc", alpha=0.6)
    ax_main.scatter(x, y, color='black', s=20, alpha=0.9,
                    edgecolor=cmap(0.8), linewidth=1)

    ax_main.set_xlabel('Year', fontsize=14, color='black')
    ax_main.set_ylabel('Number of Publications', fontsize=14, color='black')
    ax_main.tick_params(axis='both', colors='black', labelsize=14)
    # Take the year range from the data so the title cannot go stale.
    first_year = int(x.min())
    last_year = int(x.max())
    fig.suptitle(
        f'Cancer Research Publications Over Time ({first_year} - {last_year})',
        fontsize=16, color='black')
    fig.text(0.9, 0.01, "Source: PubMed.gov", ha='right', fontsize=10, color='gray')
    ax_main.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
    # One year of padding on each side of the x range.
    ax_main.set_xlim(x.min() - 1, x.max() + 1)
    # Thin gray frame around the plot area.
    for spine in ax_main.spines.values():
        spine.set_edgecolor('gray')
        spine.set_linewidth(0.5)

    # Key events with staggered vertical offsets
    key_events = {
        1990: "Human Genome Project\nbegins",
        2003: "Human Genome Project\ncompleted",
        2011: "Cancer Genome \nAtlas milestone"
    }

    # Offsets are fractions of the peak count, cycled per event so that
    # neighbouring labels sit at different heights.
    used_offsets = [0.12, 0.20, 0.24, 0.15]
    for i, (year, event) in enumerate(key_events.items()):
        matches = data.loc[x == year, 'Publications']
        if matches.empty:
            # Event year is outside the data; nothing to point at.
            continue
        y_value = matches.iloc[0]
        offset = used_offsets[i % len(used_offsets)] * peak_pubs
        ax_main.annotate(event,
                         xy=(year, y_value),
                         xytext=(year, y_value + offset),
                         ha='center',
                         fontsize=14,
                         arrowprops=dict(arrowstyle='->', color='black', alpha=0.7),
                         color='black')

    plt.tight_layout()
    plt.savefig('cancer_publications_trend.png', dpi=300, bbox_inches='tight')
    plt.show()

# Main execution
def main(csv_file=None):
    """Load the publication counts from ``csv_file``, report them and plot them.

    Args:
        csv_file: Location of the CSV file; it is passed straight to
            ``load_data``.
    """
    print("Loading data...")
    publications = load_data(csv_file)

    # Short textual summary before the figure is drawn.
    row_count = len(publications)
    print(f"Analyzing {row_count} years of cancer publication data")

    years = publications['Year']
    print(f"Year range: {years.min()} - {years.max()}")

    grand_total = publications['Publications'].sum()
    print(f"Total publications: {grand_total:,}")

    print("Creating visualization...")
    create_visualization(publications)
    print("Visualization completed and saved as 'cancer_publications_trend.png'")

# Script entry point: run the full pipeline on the PubMed yearly-results CSV
# that sits next to this file.
if __name__ == "__main__":
    csv_path = "./PubMed_Timeline_Results_by_Year.csv"
    main(csv_path)