# ดึงข้อมูลของ AJ มาสร้าง CSV

In [None]:
import os
import json
import csv

# Define the folder containing the files and the output CSV file path
folder_path = './2018'  # Change this to your folder path
consolidated_csv_path = './2018_consolidated_data.csv'  # Path to save the consolidated CSV

# Input files are named by consecutive 9-digit IDs, 201800000 through
# 201802791 inclusive. range() excludes its stop value, hence the "+ 1".
first_file_id = 201800000
last_file_id = 201802791
files = [f"{folder_path}/{str(i).zfill(9)}" for i in range(first_file_id, last_file_id + 1)]

# Prepare the consolidated data list and headers
consolidated_data = []
headers = ["Author Name", "Affiliation", "Country", "City", "Organization", "Affiliation ID", "Publication Year", "Publication Month", "Publication Day"]


def as_dict(value):
    """Return *value* unchanged if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def as_list(value):
    """Normalize a JSON value that may be absent, a single item, or a list.

    The input JSON stores one-element collections as a bare object rather
    than a one-element list, so every multi-valued field is funnelled
    through this helper before iteration.
    """
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def extract_author_rows(json_data):
    """Build one CSV row per author found in a parsed record.

    Args:
        json_data: The object returned by ``json.load`` for one input file.

    Returns:
        A list of rows, each a 9-element list ordered like ``headers``.
        Missing fields are represented by empty strings; a record without
        authors yields an empty list.
    """
    response = as_dict(as_dict(json_data).get("abstracts-retrieval-response"))
    item = as_dict(response.get("item"))
    head = as_dict(as_dict(item.get("bibrecord")).get("head"))

    # The publication date belongs to the record, not to an author, so it is
    # read once instead of once per author.
    date_sort = as_dict(as_dict(item.get("ait:process-info")).get("ait:date-sort"))
    year = date_sort.get("@year", "")
    month = date_sort.get("@month", "")
    day = date_sort.get("@day", "")

    rows = []
    for group in as_list(head.get("author-group")):
        if not isinstance(group, dict):
            continue

        # A missing or non-dict affiliation degrades to empty fields.
        affiliation = as_dict(group.get("affiliation"))
        country = affiliation.get("country", "")
        city = affiliation.get("city", "")
        affiliation_text = affiliation.get("$", "")

        organization_parts = []
        for org in as_list(affiliation.get("organization")):
            name = org.get("$", "") if isinstance(org, dict) else org
            organization_parts.append("" if name is None else str(name))
        organization = ", ".join(organization_parts)

        # 'affiliation-id' may be a dict or a list; only the first ID is kept.
        affiliation_ids = as_list(affiliation.get("affiliation-id"))
        first_id = affiliation_ids[0] if affiliation_ids else None
        affiliation_id = first_id.get("@afid", "") if isinstance(first_id, dict) else ""

        for author in as_list(group.get("author")):
            if not isinstance(author, dict):
                continue
            author_name = as_dict(author.get("preferred-name")).get("ce:indexed-name", "")
            rows.append([author_name, affiliation_text, country, city, organization, affiliation_id, year, month, day])

    return rows


# Process each file in the folder
for file_path in files:
    if not os.path.isfile(file_path):
        continue  # Gaps in the ID sequence are expected; skip silently
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            json_data = json.load(file)  # Load JSON content
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f"Error decoding JSON in file: {file_path}")
            continue
    consolidated_data.extend(extract_author_rows(json_data))

# Write the consolidated data to a single CSV file
with open(consolidated_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(headers)  # Write header row
    writer.writerows(consolidated_data)  # Write data rows

print(f"Consolidated CSV saved at: {consolidated_csv_path}")


# Fetch data from the Scopus API


In [None]:
import requests
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import cycle
import random
from collections import defaultdict

# List of API keys; the collection function below cycles through them so
# consecutive requests are sent with different keys.
# NOTE(review): these credentials are hard-coded in source. Anyone with access
# to this file can use them; consider loading them from an environment
# variable or an untracked config file and rotating the keys listed here.
API_KEYS = [
    "e7dc17913a45bb7b0cfd5926ab1fbdd8",
    "abae376cf14a3091414d66dacbe18fe3",
    "960ced6009eae3d945d254da8e4c0dcc",
    "7be1d3f08dd6c1a087ea9422bf8e523f",
    "e64cca60e1887864c71fb79b76e25abf",
    "be419565f4a17dff5a054c6fcaedb531",
    "2e2c7367371b610cac9d27f8e511126f",
    "75576f83731f57b5ca57ca1579bb8598",
    "0e1815b9396a9d6f73270b2c824e1fd8",
    "7a47ff373dde6f7663c001445ee31b45",
    "1d50f79df44135f08e3df50fa1bc0a87",
    "0b3d8082db2b122049f699685a4691c6",
    "f4568289911bca8a90f480909926646e",
    "857f2d1976752b58e1a8db0c423452af",
    "3fb826ae005c4e4c9ba6a00482436c3b",
    "294d0182245ffbe46765ce9a2fe5f50b"
]

# Endpoint that every search request is sent to.
BASE_URL = 'https://api.elsevier.com/content/search/scopus'
count = 25  # Number of records per request (also the step between 'start' offsets)
max_records_per_subject_area = 3000  # Max records per subject area (per year: offsets restart for each year)
all_data = []  # Store all fetched data (module-level accumulator filled by the collection function)

# List of subject areas; each code is substituted into the SUBJAREA ( "..." )
# clause of the search query.
subject_areas = [
    "MULT", "AGRI", "ARTS", "BIOC", "BUSI", "CENG", "CHEM", "COMP",
    "DECI", "EART", "ECON", "ENER", "ENGI", "ENVI", "IMMU", "MATE",
    "MATH", "MEDI", "NEUR", "NURS", "PHAR", "PHYS", "PSYC", "SOCI",
    "VETE", "DENT", "HEAL"
]

# Function to fetch data for a subject area and year
def fetch_data(api_key, subject_area, start, year, retries=3, timeout=30):
    """Fetch one page of search results for a subject area and year.

    Args:
        api_key: Key sent in the ``X-ELS-APIKey`` request header.
        subject_area: Subject-area code placed in the ``SUBJAREA`` clause.
        start: Offset of the first record of the requested page.
        year: Publication year placed in the ``PUBYEAR`` clause.
        retries: Maximum number of attempts when the server answers 429.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block a worker thread indefinitely.

    Returns:
        The list of result entries, each tagged with ``queried_subject_area``
        and ``queried_year``. An empty list is returned on any failure
        (network error, non-200 status, unparsable body, retries exhausted);
        the failure is reported with ``print`` rather than raised.
    """
    headers = {'X-ELS-APIKey': api_key}
    query = f'''ALL ( . ) AND PUBYEAR = {year} AND SUBJAREA ( "{subject_area}" )'''
    params = {'query': query, 'count': count, 'start': start, 'view': 'STANDARD'}

    for _ in range(retries):
        try:
            response = requests.get(BASE_URL, headers=headers, params=params, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"Request failed for {subject_area} (start={start}, year={year}): {e}")
            return []

        if response.status_code == 429:
            wait_time = random.randint(5, 15)  # Random wait time between retries
            print(f"Rate limit hit. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
            continue

        if response.status_code != 200:
            print(f"Error fetching data for {subject_area} (start={start}, year={year}): {response.status_code}")
            return []

        try:
            data = response.json()
        except ValueError as e:
            # Depending on the installed requests version a malformed body
            # raises either a plain ValueError or a RequestException subclass
            # that also derives from ValueError; this clause covers both.
            print(f"Request failed for {subject_area} (start={start}, year={year}): {e}")
            return []

        entries = data.get('search-results', {}).get('entry', [])
        for entry in entries:
            entry['queried_subject_area'] = subject_area
            entry['queried_year'] = year  # Add year to the entry for reference
        return entries

    # Every attempt was answered with 429.
    print(f"Giving up on {subject_area} (start={start}, year={year}) after {retries} rate-limited attempts")
    return []

# Function to collect data for all years and subject areas
def collect_data_for_years_and_subjects(subject_areas, start_year, end_year):
    """Fetch every result page for each year and subject area.

    One task is scheduled per (year, subject area, page offset) and run on a
    thread pool. Entries returned by the tasks are appended to the
    module-level ``all_data`` list; progress and an ETA estimate are printed
    after each finished task.

    Args:
        subject_areas: Iterable of subject-area codes to query.
        start_year: First publication year to query (inclusive).
        end_year: Last publication year to query (inclusive).

    Returns:
        None. Results are accumulated in ``all_data``.
    """
    api_key_pool = cycle(API_KEYS)  # Cycle through API keys

    # Enumerate the tasks up front so the progress total is exactly the number
    # of submitted requests, even when the per-area maximum is not a multiple
    # of the page size.
    tasks = []
    for year in range(start_year, end_year + 1):
        for subject_area in subject_areas:
            for start in range(0, max_records_per_subject_area, count):
                tasks.append((subject_area, start, year))

    total_tasks = len(tasks)
    if total_tasks == 0:
        print("No tasks to process.")
        return

    completed_tasks = 0

    with ThreadPoolExecutor(max_workers=10) as executor:  # Adjusted max_workers for better control
        # Start the clock before submitting: workers begin running as soon as
        # the first task is queued.
        start_time = time.time()
        futures = [
            executor.submit(fetch_data, next(api_key_pool), subject_area, start, year)
            for subject_area, start, year in tasks
        ]

        for future in as_completed(futures):
            try:
                entries = future.result()
            except Exception as e:
                # A single failing page must not discard the pages already
                # fetched or abort the remaining ones.
                print(f"Task failed: {e}")
                entries = []
            if entries:
                all_data.extend(entries)
            completed_tasks += 1
            elapsed_time = time.time() - start_time
            progress = (completed_tasks / total_tasks) * 100
            eta = (elapsed_time / completed_tasks) * (total_tasks - completed_tasks)
            print(f"Progress: {progress:.2f}%, ETA: {eta:.2f}s")

# Collect data for the years 2018-2023 and all subject areas
collect_data_for_years_and_subjects(subject_areas, 2018, 2023)

# Group the fetched entries by title so that rows belonging to the same
# title end up next to each other in the output.
grouped_by_title = defaultdict(list)

for entry in all_data:
    # "or" fallbacks guard against keys that are present with a null value.
    title = entry.get('dc:title') or ''
    authors = (entry.get('dc:creator') or '').split(',')
    subject_area = entry.get('queried_subject_area', '')
    queried_year = entry.get('queried_year', '')  # Year the entry was queried for

    affiliations = entry.get('affiliation') or []
    if isinstance(affiliations, dict):  # A single affiliation may arrive unwrapped
        affiliations = [affiliations]
    countries = [
        aff['affiliation-country']
        for aff in affiliations
        if isinstance(aff, dict) and isinstance(aff.get('affiliation-country'), str) and aff['affiliation-country']
    ]

    grouped_by_title[title].append({
        'Authors': [author.strip() for author in authors],
        'Source Title': entry.get('prism:publicationName', ''),
        'DOI': entry.get('prism:doi', ''),
        'EID': entry.get('eid', ''),
        'Year': queried_year,
        'Abstract': entry.get('dc:description', ''),
        'Subjected Area': subject_area,
        'Countries': countries
    })

# Flatten grouped records into one row per (record, country) and drop rows
# that are identical in every column (for example, two affiliations of the same
# paper located in the same country). Entries without any country produce no row.
output_columns = ['Source Title', 'DOI', 'EID', 'Year', 'Abstract', 'Subjected Area', 'Country']
unique_records = []
seen_rows = set()
for title, entries in grouped_by_title.items():
    for record in entries:
        for country in record['Countries']:
            row = {
                'Source Title': record['Source Title'],
                'DOI': record['DOI'],
                'EID': record['EID'],
                'Year': record['Year'],
                'Abstract': record['Abstract'],
                'Subjected Area': record['Subjected Area'],
                'Country': country.strip()
            }
            # repr() keeps the key hashable even if a field holds a list/dict.
            row_key = tuple(repr(row[column]) for column in output_columns)
            if row_key in seen_rows:
                continue
            seen_rows.add(row_key)
            unique_records.append(row)

# Convert to DataFrame and save as CSV. Passing the columns explicitly keeps
# the header row in the file even when no records were collected.
df = pd.DataFrame(unique_records, columns=output_columns)
output_csv = 'scopus_data_2018-2023(3000).csv'
df.to_csv(output_csv, index=False, encoding='utf-8')

print(f"Cleaned data saved to '{output_csv}'")


Progress: 0.01%, ETA: 17111.52s
Progress: 0.01%, ETA: 9322.26s
Progress: 0.02%, ETA: 7830.44s
Progress: 0.02%, ETA: 6706.31s
Progress: 0.03%, ETA: 5425.39s
Progress: 0.03%, ETA: 6582.52s
Progress: 0.04%, ETA: 5764.81s
Progress: 0.04%, ETA: 5627.39s
Progress: 0.05%, ETA: 5007.19s
Progress: 0.05%, ETA: 4531.87s
Progress: 0.06%, ETA: 4262.44s
Progress: 0.06%, ETA: 4065.18s
Progress: 0.07%, ETA: 3921.23s
Progress: 0.07%, ETA: 3666.18s
Progress: 0.08%, ETA: 3431.22s
Progress: 0.08%, ETA: 3822.87s
Progress: 0.09%, ETA: 3970.85s
Progress: 0.09%, ETA: 3779.87s
Progress: 0.10%, ETA: 3838.71s
Progress: 0.10%, ETA: 3741.43s
Progress: 0.11%, ETA: 3563.08s
Progress: 0.11%, ETA: 3505.38s
Progress: 0.12%, ETA: 3784.37s
Progress: 0.12%, ETA: 3777.04s
Progress: 0.13%, ETA: 3765.62s
Progress: 0.13%, ETA: 3717.33s
Progress: 0.14%, ETA: 3751.15s
Progress: 0.14%, ETA: 3689.12s
Progress: 0.15%, ETA: 3961.76s
Progress: 0.15%, ETA: 3938.05s
Progress: 0.16%, ETA: 3894.06s
Progress: 0.16%, ETA: 3812.02s
Progres