In [2]:
import pandas as pd
import requests
from io import StringIO

# Scimagojr

In [4]:
# data for 2024 (most recent year)
url = "https://www.scimagojr.com/journalrank.php?year=2025&out=xls"
response = requests.get(url)

df = pd.read_csv(StringIO(response.text), sep=';')

# clean column names
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

print(len(df))
df.head()

31136


Unnamed: 0,rank,sourceid,title,type,issn,publisher,open_access,open_access_diamond,sjr,sjr_best_quartile,...,ref._/_doc.,%female,overton,sdg,country,region,publisher.1,coverage,categories,areas
0,1,28773,Ca-A Cancer Journal for Clinicians,journal,"15424863, 00079235",John Wiley and Sons Inc,No,No,145004,Q1,...,6288,4821,4,37,United States,Northern America,John Wiley and Sons Inc,1950-2025,Hematology (Q1); Oncology (Q1),Medicine
1,2,19434,MMWR Recommendations and Reports,journal,"10575987, 15458601",Centers for Disease Control and Prevention (CDC),Yes,No,41754,Q1,...,27533,7593,1,5,United States,Northern America,Centers for Disease Control and Prevention (CDC),1990-2024,Epidemiology (Q1); Health Information Manageme...,Environmental Science; Health Professions; Med...
2,3,20315,Nature Reviews Molecular Cell Biology,journal,"14710072, 14710080",Nature Research,No,No,37353,Q1,...,9245,4122,0,15,United Kingdom,Western Europe,Nature Research,2000-2025,Cell Biology (Q1); Molecular Biology (Q1),"Biochemistry, Genetics and Molecular Biology"
3,4,29431,Quarterly Journal of Economics,journal,"00335533, 15314650",Oxford University Press,No,No,35995,Q1,...,6979,2518,35,27,United Kingdom,Western Europe,Oxford University Press,1886-2025,Economics and Econometrics (Q1),"Economics, Econometrics and Finance"
4,5,20425,Nature Reviews Drug Discovery,journal,"14741784, 14741776",Nature Research,No,No,30506,Q1,...,3566,2667,1,58,United Kingdom,Western Europe,Nature Research,2002-2025,Drug Discovery (Q1); Medicine (miscellaneous) ...,"Medicine; Pharmacology, Toxicology and Pharmac..."


In [5]:
df.to_csv('sjr_journals_2024.csv', index=False)

# Web of Science

In [None]:
#!pip install fake-useragent

Collecting fake-useragent
  Downloading fake_useragent-2.2.0-py3-none-any.whl.metadata (17 kB)
Downloading fake_useragent-2.2.0-py3-none-any.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fake-useragent
Successfully installed fake-useragent-2.2.0


In [None]:
import os
import time
import random
import requests
import re
from bs4 import BeautifulSoup
from pprint import pprint
from fake_useragent import UserAgent
from tqdm.auto import tqdm

In [None]:
# check adequateness
session = requests.session()
ua = UserAgent()
headers = {'User-Agent': ua.random}
response = session.get('https://wos-journal.info/?jsearch=&page=2', headers=headers)
print(response.status_code)

200


In [None]:
# base URL pattern
base_url = "https://wos-journal.info/?jsearch=&page={}"

journal_links = []

In [None]:
# process all 23711 journals
for page in tqdm(range(0, 2371), desc="Processing pages"):
    url = base_url.format(page)
    response = session.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Failed to fetch page {page}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # find all journal pages
    journal_pages = soup.find_all('div', class_=lambda x: x and x.startswith('cardj'))

    for page in journal_pages:
        link = page.find('a', href=lambda href: href and 'journalid' in href)
        full_url = 'https://wos-journal.info' + link['href'][1:]
        # print(full_url)
        journal_links.append(full_url)

    time.sleep(random.choice([0.5, 0.4, 0.3]))

Processing pages:   0%|          | 0/2371 [00:00<?, ?it/s]

In [None]:
# forgot the last one
journal_links.append("https://wos-journal.info/journalid/23550")

In [None]:
print(f"Total links collected: {len(journal_links)}")

Total links collected: 23711


In [None]:
journals = []

for i, page in tqdm(enumerate(journal_links), desc="Processing pages"):
    try:
        response = session.get(page, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        # data for each journal
        journal_data = {}

        # title
        title_div = soup.find('div', class_='title title2 col-12 col-md-12 h5 py-5')
        if title_div and title_div.find('a'):
          title_text = title_div.find('a').text.strip()
          journal_data['title'] = title_text

        journal_data['url'] = page

        # journal features mapping
        features = {
            'Abbreviation': 'abbreviation',
            'ISSN:': 'issn',
            'eISSN:': 'eissn',
            'Category:': 'category',
            'WoS Core Citation Indexes:': 'wos_index',
            'Journal Impact Factor (JIF):': 'jif',
            '5-year Impact Factor:': '5_year_jif',
            'Open Access Support:': 'open_access',
            'Country:': 'country',
            'Status in WoS core:': 'wos_status',
            'Publisher:': 'publisher'
        }

        # find all features
        titles = soup.find_all('div', class_='title')

        for title_div in titles:
            title_text = title_div.text.strip()
            content_div = title_div.find_next_sibling('div', class_='content')

            if content_div:
                content_text = content_div.text.strip()

                # check each feature
                for feature, col_name in features.items():
                    if feature in title_text:
                        journal_data[col_name] = content_text

                # best_ranking
                if 'Best ranking' in title_text:
                    best_ranking = content_text.split('║')[0].strip()
                    journal_data['best_ranking'] = best_ranking

                    # percentage for quartile
                    percentage_span = content_div.find('span', class_='badge badge-primary')

                    if percentage_span:
                        percentage_text = percentage_span.text.strip().replace('Percentage rank: ', '')
                        journal_data['percentage'] = percentage_text

        journals.append(journal_data)

    except Exception as e:
        print(f"Error with {page}: {e}")
        continue

    time.sleep(random.choice([0.4, 0.3, 0.2]))

Processing pages: 0it [00:00, ?it/s]

In [None]:
journals

[{'title': 'CA-A CANCER JOURNAL FOR CLINICIANS',
  'url': 'https://wos-journal.info/journalid/1',
  'abbreviation': 'CA-CANCER J CLIN',
  'issn': '1542-4863',
  'eissn': '1542-4863',
  'category': 'ONCOLOGY - SCIE',
  'wos_index': 'SCIE - Science Citation Index Expanded',
  'jif': '232.4',
  '5_year_jif': '353',
  'best_ranking': 'ONCOLOGY',
  'percentage': '99.7%',
  'open_access': 'Fully Open Access ― It may take a publication fee. For more info, check it on DOAJ.ORG',
  'country': 'UNITED STATES',
  'wos_status': 'Active',
  'publisher': 'N/A'},
 {'title': 'NATURE REVIEWS MICROBIOLOGY',
  'url': 'https://wos-journal.info/journalid/13',
  'abbreviation': 'NAT REV MICROBIOL',
  'issn': '1740-1534',
  'eissn': '1740-1534',
  'category': 'MICROBIOLOGY - SCIE',
  'wos_index': 'SCIE - Science Citation Index Expanded',
  'jif': '103.3',
  '5_year_jif': '99.1',
  'best_ranking': 'MICROBIOLOGY',
  'percentage': '99.4%',
  'open_access': 'Hybrid and Open Access Support',
  'country': 'ENGLAND

In [None]:
df = pd.DataFrame(journals)
print(df)

df.to_csv('wos_journals_data.csv', index=False)

                                               title  \
0                 CA-A CANCER JOURNAL FOR CLINICIANS   
1                        NATURE REVIEWS MICROBIOLOGY   
2                      NATURE REVIEWS DRUG DISCOVERY   
3              NATURE REVIEWS MOLECULAR CELL BIOLOGY   
4                   Kidney International Supplements   
5                                             LANCET   
6                           Nature Reviews Materials   
7                   Nature Reviews Clinical Oncology   
8                    NEW ENGLAND JOURNAL OF MEDICINE   
9                 Nature Reviews Earth & Environment   
10                             NATURE REVIEWS CANCER   
11                                  World Psychiatry   
12                                ANNALS OF ONCOLOGY   
13                      Living Reviews in Relativity   
14                         NATURE REVIEWS IMMUNOLOGY   
15                    Nature Reviews Disease Primers   
16                                     Nature En

In [None]:
print(df['publisher'])