<a href="https://colab.research.google.com/github/Tar-ive/Dashboard/blob/main/dashboard_CADS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests

# First page (10 per page) of works for OpenAlex author a5088154684, most cited first.
url = "https://api.openalex.org/works?page=1&filter=authorships.author.id:a5088154684&sort=cited_by_count:desc&per_page=10"

try:
    # Always pass a timeout: without one, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
    data = response.json()
    # Now you can work with the 'data' dictionary, which contains the parsed JSON response
    print(data)

except requests.exceptions.RequestException as e:
    # Covers connection errors, timeouts and the HTTPError raised above.
    print(f"Error querying the URL: {e}")


{'meta': {'count': 47, 'db_response_time_ms': 30, 'page': 1, 'per_page': 10, 'groups_count': None}, 'results': [{'id': 'https://openalex.org/W2802664165', 'doi': 'https://doi.org/10.1111/insr.12269', 'title': 'Statistical Medical Fraud Assessment: Exposition to an Emerging Field', 'display_name': 'Statistical Medical Fraud Assessment: Exposition to an Emerging Field', 'publication_year': 2018, 'publication_date': '2018-05-04', 'ids': {'openalex': 'https://openalex.org/W2802664165', 'doi': 'https://doi.org/10.1111/insr.12269', 'mag': '2802664165'}, 'language': 'en', 'primary_location': {'is_oa': False, 'landing_page_url': 'https://doi.org/10.1111/insr.12269', 'pdf_url': None, 'source': {'id': 'https://openalex.org/S17395904', 'display_name': 'International Statistical Review', 'issn_l': '0306-7734', 'issn': ['0306-7734', '1751-5823'], 'is_oa': False, 'is_in_doaj': False, 'is_indexed_in_scopus': True, 'is_core': True, 'host_organization': 'https://openalex.org/P4310320595', 'host_organiz

In [None]:
# ORCID iD (full URL form) of the author; used below to build the OpenAlex works filter.
orcid = 'https://orcid.org/0000-0002-0965-3521'


In [None]:
def build_author_works_url(orcid):
    """Return the OpenAlex ``works`` URL listing one author's non-paratext works.

    Args:
        orcid: The author's ORCID, inserted verbatim into the ``author.orcid`` filter.

    Returns:
        The request URL whose ``filter`` parameter holds both clauses joined by a comma.
    """
    api_root = 'https://api.openalex.org'
    resource = 'works'

    # Each clause has the form "<field>:<value>".
    clauses = [f'author.orcid:{orcid}', 'is_paratext:false']
    filter_expression = ','.join(clauses)

    return f'{api_root}/{resource}?filter={filter_expression}'

# Build the filtered works URL for the author and show it on its own line.
author_works_url = build_author_works_url(orcid=orcid)
print('complete URL with filters:', author_works_url, sep='\n')

complete URL with filters:
https://api.openalex.org/works?filter=author.orcid:https://orcid.org/0000-0002-0965-3521,is_paratext:false


In [None]:
import requests

def get_all_citations(works_url, timeout=30):
    """Collect ``cited_by_count`` for every work returned by an OpenAlex works query.

    Pages through the result set with cursor pagination: the first request uses
    ``cursor=*`` and each later request uses the ``meta.next_cursor`` value of
    the previous page, until that value is empty.

    Args:
        works_url: OpenAlex ``works`` URL; may already carry query parameters.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        List of citation counts, in the order the API returned the works.

    Raises:
        requests.exceptions.RequestException: On connection problems, timeouts
            or HTTP error statuses.
    """
    citation_counts = []
    cursor = '*'
    while cursor:
        # Passing the cursor via ``params`` lets requests URL-encode it and
        # append it to whatever query string works_url already has.
        response = requests.get(works_url, params={'cursor': cursor}, timeout=timeout)
        # Fail loudly on an HTTP error rather than treating the error body as a results page.
        response.raise_for_status()
        page_with_results = response.json()

        # extract citation count from every work on this page
        citation_counts.extend(
            work['cited_by_count'] for work in page_with_results['results']
        )

        # update cursor to meta.next_cursor; a falsy value ends the loop
        cursor = page_with_results['meta']['next_cursor']

    return citation_counts

# Gather every citation count for the author's works and show them as one comma-separated line.
citation_counts = get_all_citations(author_works_url)
print("complete list of citation counts:\n" + ', '.join(map(str, citation_counts)))

complete list of citation counts:
13, 36, 35, 19, 31, 29, 0, 19, 18, 11, 18, 17, 15, 15, 13, 12, 11, 10, 4, 11, 8, 7, 7, 7, 6, 3, 3, 0, 0, 2, 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0


In [None]:
# Append a sort clause so the API returns works ordered by descending citation count.
sort_value = 'cited_by_count:desc'
author_works_sorted_url = f'{author_works_url}&sort={sort_value}'

print('complete URL with sort:', author_works_sorted_url, sep='\n')

complete URL with sort:
https://api.openalex.org/works?filter=author.orcid:https://orcid.org/0000-0002-0965-3521,is_paratext:false&sort=cited_by_count:desc


In [None]:
# Same pagination helper, now fed the sorted query.
sorted_citation_counts = get_all_citations(author_works_sorted_url)
print("complete list of sorted citation counts:\n" + ', '.join(map(str, sorted_citation_counts)))

complete list of sorted citation counts:
36, 35, 31, 29, 19, 19, 18, 18, 17, 15, 15, 13, 13, 12, 11, 11, 11, 10, 8, 7, 7, 7, 6, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0


In [12]:
import requests
import time
import pandas as pd
import json
from typing import List, Dict, Optional

def call_openalex_api(endpoint, params=None, timeout=30):
    """Make one GET request to an OpenAlex endpoint and return the parsed JSON.

    Args:
        endpoint: Path under ``https://api.openalex.org/`` (e.g. ``'works'``).
        params: Optional dict of query-string parameters.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the server
        answers with an HTTP error status, or the body is not valid JSON.
    """
    base_url = f"https://api.openalex.org/{endpoint}"
    headers = {'User-Agent': 'mailto:your_email@example.com'}  # Replace with your email

    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # Network/HTTP failures and undecodable bodies are reported and turned
        # into None; programming errors are left to propagate.
        print(f"Error calling {endpoint} API: {str(e)}")
        return None
    finally:
        # Rate limiting: pause after every attempt, failed ones included, so a
        # caller looping over requests cannot hammer the API.
        time.sleep(0.2)

def reconstruct_abstract(inverted_index: Dict) -> str:
    """
    Rebuild plain abstract text from an OpenAlex-style inverted index.

    Args:
        inverted_index: Mapping of each word to the list of positions at which
            it occurs; an empty or None value is accepted.

    Returns:
        The words joined by single spaces in ascending position order, or an
        empty string when there is no index.
    """
    if not inverted_index:
        return ""

    # Flatten the mapping into (position, word) pairs, one per occurrence.
    occurrences = [
        (position, token)
        for token, positions in inverted_index.items()
        for position in positions
    ]

    # Order by position only; the sort is stable, so pairs sharing a position
    # keep the order in which they were collected.
    ordered = sorted(occurrences, key=lambda pair: pair[0])

    return " ".join(token for _, token in ordered)

def get_researcher_works(researcher_id):
    """Get all works for a researcher using cursor pagination.

    Args:
        researcher_id: OpenAlex author ID, either bare (``A5088154684``,
            ``a5088154684`` or ``5088154684``) or as an ``openalex.org`` URL.

    Returns:
        List of work dicts as returned by the API. The list may be partial or
        empty if a request fails part-way through; failures are printed, not raised.
    """
    # Clean the ID - extract just the ID part (e.g., A5088154684).
    # Trailing slashes are dropped first so a URL ending in "/" still yields the ID.
    clean_id = researcher_id.strip().rstrip('/')
    if 'openalex.org/' in clean_id:
        clean_id = clean_id.split('/')[-1]

    # Ensure it starts with a capital A. A lowercase "a" prefix is normalised
    # instead of being prefixed again (which would produce "Aa...").
    if clean_id[:1] in ('A', 'a'):
        clean_id = f"A{clean_id[1:]}"
    else:
        clean_id = f"A{clean_id}"

    print(f"Fetching works for researcher: {clean_id}")

    base_params = {
        'filter': f'author.id:{clean_id}',
        'per-page': 200,
        'sort': 'cited_by_count:desc'
    }

    all_works = []
    cursor = '*'  # Initial cursor

    while cursor:
        try:
            params = {**base_params, 'cursor': cursor}

            print(f"Fetching works... (current total: {len(all_works)})")
            response = call_openalex_api('works', params)

            # call_openalex_api returns None on failure; stop and keep what we have.
            if not response or 'results' not in response:
                print("No response or no results")
                break

            works = response['results']
            if not works:
                print("No more works found")
                break

            all_works.extend(works)
            print(f"Retrieved {len(works)} works in this batch")

            # Get next cursor from meta
            cursor = response.get('meta', {}).get('next_cursor')

            if not cursor:  # No more pages
                print("No more pages available")
                break

        except Exception as e:
            # Best-effort: report the problem and return the works gathered so far.
            print(f"Error fetching works for researcher {clean_id}: {str(e)}")
            break

    print(f"Total works retrieved: {len(all_works)}")
    return all_works

def extract_work_data(works: List[Dict]) -> List[Dict]:
    """
    Flatten raw OpenAlex work records into simple per-work summaries.

    Args:
        works: List of work dictionaries from OpenAlex API

    Returns:
        One dictionary per input work holding its ID, title, reconstructed
        abstract, topic list, year, DOI, citation count, open-access fields and
        primary source name, plus the derived ``has_abstract`` and ``num_topics``.
    """
    summaries = []

    for record in works:
        # Abstract: delivered as an inverted index, so rebuild the text.
        abstract_text = reconstruct_abstract(record.get('abstract_inverted_index', {}))

        # Topics: only a list is accepted, and only its non-empty dict entries are kept.
        raw_topics = record.get('topics', [])
        if isinstance(raw_topics, list):
            topic_summaries = [
                {
                    'id': entry.get('id', ''),
                    'display_name': entry.get('display_name', ''),
                    'score': entry.get('score', 0) or 0,  # None score becomes 0
                }
                for entry in raw_topics
                if entry and isinstance(entry, dict)
            ]
        else:
            topic_summaries = []

        # Open access: fall back to (False, '') unless a non-empty dict is present.
        open_access = record.get('open_access', {})
        has_oa_block = bool(open_access) and isinstance(open_access, dict)
        is_oa = open_access.get('is_oa', False) if has_oa_block else False
        oa_status = open_access.get('oa_status', '') if has_oa_block else ''

        # Primary source name: walk primary_location -> source, tolerating gaps.
        location = record.get('primary_location')
        venue = location.get('source') if location and isinstance(location, dict) else None
        source_name = venue.get('display_name', '') if venue and isinstance(venue, dict) else ''

        summaries.append({
            'work_id': record.get('id', ''),
            'title': record.get('display_name', ''),
            'abstract': abstract_text,
            'topics': topic_summaries,
            'publication_year': record.get('publication_year', 0),
            'doi': record.get('doi', ''),
            'citations': record.get('cited_by_count', 0) or 0,  # None count becomes 0
            'is_open_access': is_oa,
            'oa_status': oa_status,
            'source_name': source_name,
            'has_abstract': bool(abstract_text),
            'num_topics': len(topic_summaries)
        })

    return summaries

def save_to_files(data: List[Dict], researcher_id: str, file_prefix: str = "tahir_ekin"):
    """
    Save the extracted data to CSV and JSON files in the current working directory.

    Writes ``<file_prefix>_<id>_works.csv`` and ``<file_prefix>_<id>_works.json``,
    plus ``<file_prefix>_<id>_topics.csv`` (one row per work/topic pair) when at
    least one work has topics.

    Args:
        data: Extracted work data; each item needs ``work_id``, ``title`` and
            ``topics`` (a list of dicts with ``id``, ``display_name``, ``score``).
        researcher_id: Researcher ID for filename (bare ID or openalex.org URL)
        file_prefix: Leading part of every output filename. The default keeps
            the original ``tahir_ekin_...`` names.

    Returns:
        DataFrame built from ``data`` (the table written to the works CSV).
    """

    # Create DataFrame
    df = pd.DataFrame(data)

    # Clean researcher ID for filename; strip trailing slashes so a URL ending
    # in "/" does not produce an empty ID.
    trimmed_id = researcher_id.rstrip('/')
    clean_id = trimmed_id.split('/')[-1] if 'openalex.org/' in trimmed_id else trimmed_id
    stem = f"{file_prefix}_{clean_id}"

    # Save to CSV (the nested 'topics' list ends up as its string form here)
    csv_filename = f"{stem}_works.csv"
    df.to_csv(csv_filename, index=False)
    print(f"Data saved to {csv_filename}")

    # Save to JSON for topics preservation
    json_filename = f"{stem}_works.json"
    with open(json_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Data saved to {json_filename}")

    # Create a separate topics DataFrame for analysis: one row per (work, topic)
    topics_data = [
        {
            'work_id': work['work_id'],
            'work_title': work['title'],
            'topic_id': topic['id'],
            'topic_name': topic['display_name'],
            'topic_score': topic['score']
        }
        for work in data
        for topic in work['topics']
    ]

    # Skip the topics file entirely when no work carries any topic.
    if topics_data:
        topics_df = pd.DataFrame(topics_data)
        topics_csv = f"{stem}_topics.csv"
        topics_df.to_csv(topics_csv, index=False)
        print(f"Topics data saved to {topics_csv}")

    return df

def main():
    """
    Fetch, extract, save and summarise Tahir Ekin's works.

    Returns:
        Tuple ``(extracted_data, df)``: the list of per-work dictionaries and
        the DataFrame returned by ``save_to_files``. When no works are found
        the result is ``([], pd.DataFrame())``.
    """
    # Tahir Ekin's researcher ID
    author_id = "A5088154684"  # or "https://openalex.org/A5088154684"
    banner_rule = "=" * 45

    print("Getting Tahir Ekin's Works from OpenAlex")
    print(banner_rule)
    print("IMPORTANT: Update the email in call_openalex_api() function!")
    print("Change 'your_email@example.com' to your actual email")
    print(banner_rule)

    # Step 1: download every work for the author
    works = get_researcher_works(author_id)
    if not works:
        print("No works found for this researcher.")
        return [], pd.DataFrame()

    # Step 2: reduce the raw records to the fields of interest
    print("\nExtracting work data...")
    extracted_data = extract_work_data(works)

    # Step 3: write the CSV/JSON files
    print("\nSaving data...")
    df = save_to_files(extracted_data, author_id)

    # Step 4: headline numbers
    print("\nSummary Statistics:")
    print("=" * 20)
    n_works = len(extracted_data)
    print(f"Total works: {n_works}")

    n_with_abstract = sum(1 for item in extracted_data if item['has_abstract'])
    print(f"Works with abstracts: {n_with_abstract}")
    print(f"Works without abstracts: {n_works - n_with_abstract}")

    total_citations = sum(item['citations'] for item in extracted_data)
    print(f"Total citations: {total_citations}")
    if extracted_data:
        avg_citations = total_citations / n_works
    else:
        avg_citations = 0
    print(f"Average citations per work: {avg_citations:.1f}")

    n_open_access = sum(1 for item in extracted_data if item['is_open_access'])
    print(f"Open access works: {n_open_access}")

    avg_topics = 0 if df.empty else df['num_topics'].mean()
    print(f"Average topics per work: {avg_topics:.2f}")

    # Year distribution (years of 0 or NaN are ignored)
    if 'publication_year' in df.columns:
        years = df['publication_year']
        if not years.isna().all():
            valid_years = years[years > 0]
            if not valid_years.empty:
                year_counts = valid_years.value_counts().sort_index()
                print(f"\nPublication years: {year_counts.index.min()} to {year_counts.index.max()}")
                print(f"Most productive year: {year_counts.idxmax()} ({year_counts.max()} works)")

    # Five most cited works
    print("\nTop 5 Most Cited Works:")
    print("-" * 25)
    ranked = sorted(extracted_data, key=lambda item: item['citations'], reverse=True)
    for rank, entry in enumerate(ranked[:5], start=1):
        print(f"\n{rank}. {entry['title']}")
        print(f"   Citations: {entry['citations']}")
        print(f"   Year: {entry['publication_year']}")
        print(f"   Open Access: {entry['is_open_access']}")
        print(f"   Source: {entry['source_name']}")
        if entry['abstract']:
            # Only the first 150 characters of the abstract are shown.
            print(f"   Abstract: {entry['abstract'][:150]}...")

    return extracted_data, df

# Run the extraction
if __name__ == "__main__":
    # IMPORTANT: Update the email in the call_openalex_api function!
    # Change 'your_email@example.com' to your actual email
    # main() returns (list of per-work dicts, DataFrame); both are bound here
    # as module-level names.
    extracted_data, df = main()

Getting Tahir Ekin's Works from OpenAlex
IMPORTANT: Update the email in call_openalex_api() function!
Change 'your_email@example.com' to your actual email
Fetching works for researcher: A5088154684
Fetching works... (current total: 0)
Retrieved 47 works in this batch
Fetching works... (current total: 47)
No more works found
Total works retrieved: 47

Extracting work data...

Saving data...
Data saved to tahir_ekin_A5088154684_works.csv
Data saved to tahir_ekin_A5088154684_works.json
Topics data saved to tahir_ekin_A5088154684_topics.csv

Summary Statistics:
Total works: 47
Works with abstracts: 29
Works without abstracts: 18
Total citations: 388
Average citations per work: 8.3
Open access works: 13
Average topics per work: 2.53

Publication years: 2013 to 2025
Most productive year: 2019 (11 works)

Top 5 Most Cited Works:
-------------------------

1. Statistical Medical Fraud Assessment: Exposition to an Emerging Field
   Citations: 36
   Year: 2018
   Open Access: True
   Source: Int

In [13]:
# Read the topics CSV back in. save_to_files() wrote it relative to the working
# directory, so the same relative name is used here instead of a hard-coded
# absolute '/content/...' path that only resolves when the working directory is /content.
topic_df = pd.read_csv('tahir_ekin_A5088154684_topics.csv')

In [20]:
# Number of rows in the topics table with a missing topic name.
topic_df['topic_name'].isna().sum()

np.int64(0)

In [21]:
# Read the works CSV back in. save_to_files() wrote it relative to the working
# directory, so the same relative name is used here instead of a hard-coded
# absolute '/content/...' path that only resolves when the working directory is /content.
works_df = pd.read_csv('tahir_ekin_A5088154684_works.csv')

In [25]:
# Lift the column-width limit so long cell values are not truncated, then dump the topics column.
pd.options.display.max_colwidth = None
print(works_df['topics'])

0                  [{'id': 'https://openalex.org/T11652', 'display_name': 'Imbalanced Data Classification Techniques', 'score': 0.9913}, {'id': 'https://openalex.org/T11443', 'display_name': 'Advanced Statistical Process Monitoring', 'score': 0.9864}, {'id': 'https://openalex.org/T11871', 'display_name': 'Advanced Statistical Methods and Models', 'score': 0.9824}]
1                             [{'id': 'https://openalex.org/T10734', 'display_name': 'Information and Cyber Security', 'score': 0.9991}, {'id': 'https://openalex.org/T11045', 'display_name': 'Privacy, Security, and Data Protection', 'score': 0.9881}, {'id': 'https://openalex.org/T13243', 'display_name': 'Innovation in Digital Healthcare Systems', 'score': 0.9815}]
2                                               [{'id': 'https://openalex.org/T13910', 'display_name': 'Computational and Text Analysis Methods', 'score': 0.998}, {'id': 'https://openalex.org/T11819', 'display_name': 'Data-Driven Disease Surveillance', 'score': 0.98