In [28]:
import json
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Scopus API key.
# NOTE(review): supply the key through the SCOPUS_API_KEY environment variable.
# The literal below is kept only as a backward-compatible fallback; a key that
# has been committed to source should be treated as exposed and rotated.
API_KEY = os.environ.get('SCOPUS_API_KEY') or 'a17167505f5d6799ad4cf9c9f28de7f1'

# Search query: (software/CS topic) AND (education/teaching) AND (LLM terms),
# matched against title, abstract and keywords.
query = (
    'TITLE-ABS-KEY('
    '("software engineering" OR "programming" OR "software development"'
    ' OR "computer science" OR "computer engineering")'
    ' AND ("education" OR "teaching")'
    ' AND ("LLM" OR "large language model")'
    ')'
)

# Base URL for the Scopus search endpoint
base_url = "https://api.elsevier.com/content/search/scopus"

# Headers for the API request: authenticate and ask for a JSON response
headers = {
    'X-ELS-APIKey': API_KEY,
    'Accept': 'application/json',
}

# Parameters for the API request
params = {
    'query': query,
    'view': 'STANDARD',
    'start': 0,   # Offset of the first result to return
    'count': 25,  # Page size; get_metadata advances 'start' by this amount
}

# Function to get metadata
def get_metadata(base_url, params, headers):
    """Fetch every page of a Scopus search and return the raw result entries.

    Args:
        base_url: URL of the search endpoint.
        params: Query parameters for the first page. The dict is copied, so
            the caller's object is not modified while paging.
        headers: HTTP headers sent with every request.

    Returns:
        A list with the ``entry`` dicts of all pages, in response order.

    Raises:
        requests.HTTPError: If any page responds with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    # Page on a private copy; advancing 'start' must not leak into the caller.
    page_params = dict(params)
    all_data = []
    pbar = tqdm(desc="Fetching papers", unit="paper")
    try:
        while True:
            # A timeout keeps a stalled connection from hanging the run.
            response = requests.get(
                base_url, params=page_params, headers=headers, timeout=30
            )
            response.raise_for_status()
            results = response.json().get('search-results', {})

            # NOTE(review): an empty result set appears to come back as a
            # single placeholder entry carrying an 'error' key rather than
            # an empty list -- confirm against the API docs. Such entries
            # have no 'eid' and are dropped here.
            entries = [
                entry for entry in results.get('entry', [])
                if 'error' not in entry
            ]
            all_data.extend(entries)
            pbar.update(len(entries))

            has_next = any(
                link.get('@ref') == 'next' for link in results.get('link', [])
            )
            # Stop on the last page; also stop on an empty page so that a
            # stray 'next' link cannot cause an endless loop.
            if not entries or not has_next:
                break

            step = int(page_params.get('count') or len(entries))
            page_params['start'] = int(page_params.get('start', 0)) + step
    finally:
        # Close the bar even when a request raises.
        pbar.close()
    return all_data

# Function to parse the metadata
def parse_metadata(entries):
    """Build one flat record per search entry that carries a DOI.

    Entries without a ``prism:doi`` value are skipped. For every other
    entry the Scopus record link is looked up and handed to
    ``get_abstract_bibtex`` to obtain the abstract, BibTeX text and year.

    Args:
        entries: Iterable of raw entry dicts from the search response.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``doi``,
        ``abstract``, ``year``, ``num_pages``, ``paper_type`` and ``bibtex``.
    """
    records = []
    for item in tqdm(entries, desc="Processing papers", unit="paper"):
        eid = item['eid']
        doi = item.get('prism:doi', 'N/A')
        if doi != 'N/A':
            title = item.get('dc:title')

            # Use the first link flagged as the Scopus record page, if any.
            url = 'N/A'
            for link in item.get('link', []):
                if link['@ref'] == 'scopus':
                    url = link['@href']
                    break

            abstract, bibtex, year = get_abstract_bibtex(eid, url, title)
            num_pages = calculate_num_pages(item.get('prism:pageRange', 'N/A'))
            paper_type = item.get('subtypeDescription', 'N/A')

            records.append({
                'title': title,
                'url': url,
                'doi': doi,
                'abstract': abstract,
                'year': year,
                'num_pages': num_pages,
                'paper_type': paper_type,
                'bibtex': bibtex,
            })
    return records

# Function to get abstract, bibtex, and year using BeautifulSoup and regex
def get_abstract_bibtex(eid, url, title):
    """Scrape a record page for abstract, authors and year; build a BibTeX entry.

    Args:
        eid: Record identifier; used as the BibTeX citation key with ``:``
            replaced by ``_``.
        url: Address of the record page. When empty or ``'N/A'`` nothing is
            downloaded and every scraped field stays ``'N/A'``.
        title: Title written into the BibTeX entry.

    Returns:
        A tuple ``(abstract, bibtex, year)`` of strings. Fields that could
        not be found on the page are ``'N/A'``.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    abstract = 'N/A'
    author_list = 'N/A'
    year = 'N/A'

    # parse_metadata passes 'N/A' when an entry has no Scopus link. Requesting
    # that value would raise and abort the whole run, so skip the download.
    if url and url != 'N/A':
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # A timeout keeps one unresponsive page from stalling the batch.
        response = requests.get(url, headers=request_headers, timeout=30)
        response.raise_for_status()
        content = response.text
        soup = BeautifulSoup(content, 'html.parser')

        # Extract abstract using regex
        abstract_match = re.search(
            r'abstractSection.*?>(.*?)</section>', content, re.DOTALL
        )
        if abstract_match:
            # '[^>]*' also strips tags that span several lines, which the
            # previous '.*?' pattern (without DOTALL) left in the text.
            abstract = re.sub(r'<[^>]*>', '', abstract_match.group(1)).strip()

        # Extract authors
        authors = [
            author.get_text(strip=True)
            for author in soup.find_all('a', {'class': 'authorName'})
        ]
        if authors:
            # BibTeX separates individual names with ' and '; a comma is
            # reserved for the "Last, First" form inside one name.
            author_list = ' and '.join(authors)

        # Extract year using BeautifulSoup
        journal_info = soup.find('span', {'id': 'journalInfo'})
        if journal_info:
            # Restrict to 19xx/20xx so other four-digit numbers in the
            # text (e.g. page numbers) are not mistaken for the year.
            year_match = re.search(r'\b(?:19|20)\d{2}\b', journal_info.get_text())
            if year_match:
                year = year_match.group(0)

    # Construct bibtex; only title, author and year are known at this point.
    bibtex = (
        f"@article{{{eid.replace(':', '_')},\n"
        f"  title={{{title}}},\n"
        f"  author={{{author_list}}},\n"
        f"  journal={{N/A}},\n"
        f"  year={{{year}}},\n"
        f"  volume={{N/A}},\n"
        f"  pages={{N/A}},\n"
        f"  doi={{N/A}}\n"
        f"}}"
    )
    return abstract, bibtex, year

# Function to calculate number of pages from page range
def calculate_num_pages(page_range, default=4):
    """Return the inclusive page count described by a ``start-end`` range.

    Args:
        page_range: Text such as ``'10-15'``; an en or em dash is accepted as
            the separator too. May be ``None``, empty or ``'N/A'``.
        default: Value returned when no page count can be derived.

    Returns:
        ``end - start + 1`` for a well-formed ascending numeric range,
        otherwise ``default``.
    """
    if not page_range:
        return default

    # Treat typographic dashes like the plain hyphen.
    text = str(page_range).replace('\u2013', '-').replace('\u2014', '-')
    first, separator, last = text.partition('-')
    if not separator:
        return default

    try:
        start, end = int(first), int(last)
    except ValueError:
        # Non-numeric pages ('e12-e20', 'xii-xv') or extra dashes ('1-2-3').
        return default

    # A reversed range would give a zero or negative count; do not guess.
    if end < start:
        return default
    return end - start + 1

# Get metadata
# Run the pipeline: query the search API, then enrich each entry that has a DOI
entries = get_metadata(base_url, params, headers)
papers_metadata = parse_metadata(entries)

# Tabulate the records and write them to disk without the index column
df = pd.DataFrame(papers_metadata)
df.to_csv('Scopus.csv', index=False)

print(
    f"Metadata extraction complete. {len(papers_metadata)} papers extracted. "
    "Check the Scopus.csv file."
)


Fetching papers: 230paper [00:06, 35.75paper/s]
Processing papers: 100%|██████████| 230/230 [03:24<00:00,  1.12paper/s]

Metadata extraction complete. 193 papers extracted. Check the Scopus.csv file.



