In [7]:
import requests
import time
from collections import defaultdict
import json
from datetime import datetime
import os

class USAArticlesCategorizer:
    def __init__(self, email):
        self.base_url = "https://api.openalex.org/works"
        self.headers = {
            "User-Agent": f"ArticlesCategorizer/{email}",
            "Accept": "application/json"
        }
        self.output_dir = "openalex_topics_results"
        os.makedirs(self.output_dir, exist_ok=True)
        
    def fetch_articles(self, year=2024, page_size=200):
        """
        Fetch articles from USA institutions for a specific year with at least 5 citations
        """
        articles = []
        cursor = "*"
        
        while True:
            params = {
                "filter": (
                    f"publication_year:{year},"
                    "institutions.country_code:US,"
                    "type:article,"
                    "cited_by_count:>5"  # Filter for articles with more than 5 citations
                ),
                "per_page": page_size,
                "cursor": cursor,
                "select": "id,title,topics,authorships,institutions_distinct_count,cited_by_count"
            }
            
            try:
                response = requests.get(self.base_url, params=params, headers=self.headers)
                print(f"Request URL: {response.url}")
                
                if response.status_code != 200:
                    print(f"Error fetching data: {response.status_code}")
                    print(f"Error details: {response.text}")
                    break
                    
                data = response.json()
                
                # Check if we got any results
                results = data.get("results", [])
                if results:
                    articles.extend(results)
                    
                    # Print meta information for debugging
                    meta = data.get("meta", {})
                    print(f"Page results: {len(results)}")
                    print(f"Total results from API: {meta.get('count', 0)}")
                    print(f"Current page: {meta.get('page', 0)}")
                    
                    # Save raw data periodically
                    if len(articles) % 1000 == 0:
                        self.save_raw_data(articles, year)
                        print(f"Fetched {len(articles)} highly-cited articles so far...")
                
                cursor = data.get("meta", {}).get("next_cursor")
                if not cursor:
                    break
                    
                time.sleep(0.1)  # Rate limiting
                
            except Exception as e:
                print(f"Error during fetch: {str(e)}")
                break
        
        # Save final raw data
        self.save_raw_data(articles, year)
        print(f"Final count: {len(articles)} highly-cited articles fetched")
        return articles
    
    def save_raw_data(self, articles, year):
        """Save raw article data to JSON file"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = os.path.join(self.output_dir, f"highly_cited_articles_{year}_{timestamp}.json")
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2)
        print(f"Saved raw data to {filename}")
    
    def categorize_articles(self, articles):
        """
        Categorize articles by their top-level research topics
        """
        categories = defaultdict(list)
        topic_mapping = {}  # Store topic ID to name mapping
        
        for article in articles:
            topics = article.get("topics", [])
            if not topics:
                categories["Uncategorized"].append({
                    "article": article,
                    "topic_id": None,
                    "citations": article.get("cited_by_count", 0)
                })
                continue
                
            topics.sort(key=lambda x: x.get("score", 0), reverse=True)
            top_topic = topics[0]
            topic_id = top_topic.get("id")
            topic_name = top_topic.get("display_name", "Uncategorized")
            
            # Store topic mapping
            topic_mapping[topic_id] = topic_name
            
            categories[topic_name].append({
                "article": article,
                "topic_id": topic_id,
                "citations": article.get("cited_by_count", 0)
            })
        
        return categories, topic_mapping
    
    def generate_report(self, categories, topic_mapping):
        """
        Generate a summary report of the categorized highly-cited articles
        """
        total_articles = sum(len(articles) for articles in categories.values())
        
        report = {
            "total_highly_cited_articles": total_articles,
            "categories": {},
            "top_institutions": defaultdict(int),
            "topic_mapping": topic_mapping
        }
        
        for category, articles in categories.items():
            topic_id = articles[0]["topic_id"] if articles and articles[0]["topic_id"] else "uncategorized"
            
            category_stats = {
                "topic_id": topic_id,
                "article_count": len(articles),
                "percentage": (len(articles) / total_articles) * 100,
                "average_citations": sum(art["citations"] for art in articles) / len(articles),
                "sample_titles": [art["article"]["title"] for art in articles[:3]]
            }
            
            for article_data in articles:
                article = article_data["article"]
                for authorship in article.get("authorships", []):
                    for institution in authorship.get("institutions", []):
                        if institution.get("country_code") == "US":
                            report["top_institutions"][institution.get("display_name", "Unknown")] += 1
            
            report["categories"][category] = category_stats
        
        report["top_institutions"] = dict(
            sorted(report["top_institutions"].items(), 
                  key=lambda x: x[1], 
                  reverse=True)[:10]
        )
        
        return report
    
    def save_report(self, report, year):
        """Save report to JSON format"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        json_filename = os.path.join(self.output_dir, f"highly_cited_report_{year}_{timestamp}.json")
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)
        print(f"Saved report to {json_filename}")

In [8]:
# Imports
import sys
import logging
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from article_analyzer import USAArticlesCategorizer

# Setup logging for notebook
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    stream=sys.stdout
)
logger = logging.getLogger(__name__)

# Configuration
EMAIL = "ofir.arbili@gmail.com"  # Replace with your email
YEAR = 2024
PAGE_SIZE = 50  # Start with smaller page size for testing

# Initialize the categorizer
logger.info(f"Initializing categorizer with email: {EMAIL}")
categorizer = USAArticlesCategorizer(EMAIL)

# Fetch articles
logger.info(f"Fetching articles for year {YEAR}")
articles = categorizer.fetch_articles(year=YEAR, page_size=PAGE_SIZE)

if articles:
    logger.info(f"Successfully fetched {len(articles)} articles")
    
    # Categorize articles
    logger.info("Categorizing articles by topics...")
    categories, topic_mapping = categorizer.categorize_articles(articles)
    
    # Generate report
    logger.info("Generating report...")
    report = categorizer.generate_report(categories, topic_mapping)
    
    # Save report
    logger.info("Saving report...")
    categorizer.save_report(report, YEAR)
    
    # Analysis and Visualization
    
    # 1. Create DataFrame for topics
    topics_df = pd.DataFrame([
        {
            'topic': topic,
            'topic_id': stats['topic_id'],
            'count': stats['article_count'],
            'percentage': stats['percentage']
        }
        for topic, stats in report['categories'].items()
    ])
    
    # Sort by count
    topics_df = topics_df.sort_values('count', ascending=False)
    
    # 2. Create DataFrame for institutions
    institutions_df = pd.DataFrame([
        {'institution': inst, 'count': count}
        for inst, count in report['top_institutions'].items()
    ]).sort_values('count', ascending=False)
    
    # Plotting
    plt.style.use('seaborn')
    
    # 1. Top Topics Bar Plot
    plt.figure(figsize=(15, 8))
    sns.barplot(data=topics_df.head(10), x='count', y='topic')
    plt.title(f'Top 10 Topics in US Research Publications ({YEAR})')
    plt.xlabel('Number of Articles')
    plt.ylabel('Topic')
    plt.tight_layout()
    plt.show()
    
    # 2. Topics Pie Chart
    plt.figure(figsize=(12, 12))
    plt.pie(topics_df.head(10)['percentage'], 
            labels=topics_df.head(10)['topic'],
            autopct='%1.1f%%')
    plt.title(f'Distribution of Top 10 Topics ({YEAR})')
    plt.axis('equal')
    plt.show()
    
    # 3. Top Institutions Bar Plot
    plt.figure(figsize=(15, 8))
    sns.barplot(data=institutions_df.head(10), x='count', y='institution')
    plt.title(f'Top 10 Contributing US Institutions ({YEAR})')
    plt.xlabel('Number of Articles')
    plt.ylabel('Institution')
    plt.tight_layout()
    plt.show()
    
    # Print Summary Statistics
    print("\nSummary Statistics:")
    print(f"Total Articles: {report['total_articles']}")
    print(f"Number of Topics: {len(topics_df)}")
    print(f"Number of Contributing Institutions: {len(institutions_df)}")
    
    print("\nTop 5 Topics:")
    for _, row in topics_df.head().iterrows():
        print(f"{row['topic']} ({row['topic_id']}): {row['count']} articles ({row['percentage']:.1f}%)")
    
    print("\nTop 5 Institutions:")
    for _, row in institutions_df.head().iterrows():
        print(f"{row['institution']}: {row['count']} articles")
    
    # Save processed data
    topics_df.to_csv(f'topics_analysis_{YEAR}.csv', index=False)
    institutions_df.to_csv(f'institutions_analysis_{YEAR}.csv', index=False)
    
    logger.info("Analysis complete!")
else:
    logger.error("No articles were fetched.")

2025-02-21 22:27:28,587 - INFO - Initializing categorizer with email: ofir.arbili@gmail.com
2025-02-21 22:27:28,588 - INFO - Fetching articles for year 2024
Request URL: https://api.openalex.org/works?filter=publication_year%3A2024%2Cinstitutions.country_code%3AUS%2Ctype%3Aarticle%2Ccited_by_count%3A%3E5&per_page=50&cursor=%2A&select=id%2Ctitle%2Ctopics%2Cauthorships%2Cinstitutions_distinct_count%2Ccited_by_count
Page results: 50
Total results from API: 27814
Current page: None
Request URL: https://api.openalex.org/works?filter=publication_year%3A2024%2Cinstitutions.country_code%3AUS%2Ctype%3Aarticle%2Ccited_by_count%3A%3E5&per_page=50&cursor=IlsxMDAuMCwgMTY0LCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzQzOTA5Njc3MjEnXSI%3D&select=id%2Ctitle%2Ctopics%2Cauthorships%2Cinstitutions_distinct_count%2Ccited_by_count
Page results: 50
Total results from API: 27814
Current page: None
Request URL: https://api.openalex.org/works?filter=publication_year%3A2024%2Cinstitutions.country_code%3AUS%2Ctype%3Aarticle

OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

In [4]:
# # Imports
# import sys
# import logging
# from datetime import datetime
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# # from article_analyzer import USAArticlesCategorizer

# # Setup logging for notebook
# logging.basicConfig(
#     level=logging.INFO,
#     format='%(asctime)s - %(levelname)s - %(message)s',
#     stream=sys.stdout
# )
# logger = logging.getLogger(__name__)

# # Configuration
# EMAIL = "ofir.arbili@gmail.com"  # Replace with your email
# YEAR = 2024
# PAGE_SIZE = 50  # Start with smaller page size for testing

# # Initialize the categorizer
# logger.info(f"Initializing categorizer with email: {EMAIL}")
# categorizer = USAArticlesCategorizer(EMAIL)

# # Fetch articles
# logger.info(f"Fetching articles for year {YEAR}")
# articles = categorizer.fetch_articles(year=YEAR, page_size=PAGE_SIZE)

# if articles:
#     logger.info(f"Successfully fetched {len(articles)} articles")
    
#     # Categorize articles
#     logger.info("Categorizing articles by topics...")
#     categories, topic_mapping = categorizer.categorize_articles(articles)
    
#     # Generate report
#     logger.info("Generating report...")
#     report = categorizer.generate_report(categories, topic_mapping)
    
#     # Save report
#     logger.info("Saving report...")
#     categorizer.save_report(report, YEAR)
    
#     # Analysis and Visualization
    
#     # 1. Create DataFrame for topics
#     topics_df = pd.DataFrame([
#         {
#             'topic': topic,
#             'topic_id': stats['topic_id'],
#             'count': stats['article_count'],
#             'percentage': stats['percentage']
#         }
#         for topic, stats in report['categories'].items()
#     ])
    
#     # Sort by count
#     topics_df = topics_df.sort_values('count', ascending=False)
    
#     # 2. Create DataFrame for institutions
#     institutions_df = pd.DataFrame([
#         {'institution': inst, 'count': count}
#         for inst, count in report['top_institutions'].items()
#     ]).sort_values('count', ascending=False)
    
#     # Plotting
#     plt.style.use('seaborn')
    
#     # 1. Top Topics Bar Plot
#     plt.figure(figsize=(15, 8))
#     sns.barplot(data=topics_df.head(10), x='count', y='topic')
#     plt.title(f'Top 10 Topics in US Research Publications ({YEAR})')
#     plt.xlabel('Number of Articles')
#     plt.ylabel('Topic')
#     plt.tight_layout()
#     plt.show()
    
#     # 2. Topics Pie Chart
#     plt.figure(figsize=(12, 12))
#     plt.pie(topics_df.head(10)['percentage'], 
#             labels=topics_df.head(10)['topic'],
#             autopct='%1.1f%%')
#     plt.title(f'Distribution of Top 10 Topics ({YEAR})')
#     plt.axis('equal')
#     plt.show()
    
#     # 3. Top Institutions Bar Plot
#     plt.figure(figsize=(15, 8))
#     sns.barplot(data=institutions_df.head(10), x='count', y='institution')
#     plt.title(f'Top 10 Contributing US Institutions ({YEAR})')
#     plt.xlabel('Number of Articles')
#     plt.ylabel('Institution')
#     plt.tight_layout()
#     plt.show()
    
#     # Print Summary Statistics
#     print("\nSummary Statistics:")
#     print(f"Total Articles: {report['total_articles']}")
#     print(f"Number of Topics: {len(topics_df)}")
#     print(f"Number of Contributing Institutions: {len(institutions_df)}")
    
#     print("\nTop 5 Topics:")
#     for _, row in topics_df.head().iterrows():
#         print(f"{row['topic']} ({row['topic_id']}): {row['count']} articles ({row['percentage']:.1f}%)")
    
#     print("\nTop 5 Institutions:")
#     for _, row in institutions_df.head().iterrows():
#         print(f"{row['institution']}: {row['count']} articles")
    
#     # Save processed data
#     topics_df.to_csv(f'topics_analysis_{YEAR}.csv', index=False)
#     institutions_df.to_csv(f'institutions_analysis_{YEAR}.csv', index=False)
    
#     logger.info("Analysis complete!")
# else:
#     logger.error("No articles were fetched.")

In [5]:
!pwd

/sise/liorrk-group/OrDanOfir/Miki
