In [345]:
import os
from pyspark.sql import SparkSession ,  Row , SQLContext 
from pyspark.sql.functions import col,concat_ws, expr, explode, lit, when, from_json , StructType, StringType, to_json, udf, regexp_extract
from pyspark.sql.types import StructType, StructField, StringType
import requests
import json

In [21]:
# Build (or reuse) the SparkSession shared by every cell below: local master,
# application name "Spark Tutorial", Spark UI port set to 4040.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Spark Tutorial")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

In [340]:
# Root folder that holds one sub-folder per year (replace with your own path).
# Alternative location used while testing: './test'
base_dir = "./Raw_data/test/"

# Destination path handed to the CSV writer (replace with your own path).
output_file = "./output"

# Year folder names to ingest: "2018" through "2022" inclusive.
# For a quick single-year run, narrow the range to range(2022, 2023).
years = list(map(str, range(2018, 2023)))

# Holds the combined yearly DataFrame; stays None until a year folder is read.
combined_df = None

# Columns pulled out of each record. Every path below is relative to the
# top-level "abstracts-retrieval-response" object and is paired with the flat
# alias it receives in the resulting DataFrame.
#
# The nested entries at the end of the list (authors, affiliation, subject
# areas, keywords, index terms, references) are selected whole; individual
# sub-fields such as author names, subject-area codes or affiliation city are
# not extracted here.
selected_columns = [
    col(f"abstracts-retrieval-response.{path}").alias(alias)
    for path, alias in [
        # plain (scalar) columns
        ("language.@xml:lang", "language"),
        ("coredata.srctype", "source_type"),
        ("coredata.prism:doi", "prism:doi"),
        ("coredata.prism:coverDate", "cover_date"),
        ("coredata.prism:aggregationType", "aggregation_type"),
        ("coredata.source-id", "source_id"),
        ("coredata.citedby-count", "citedby_count"),
        ("coredata.prism:volume", "volume"),
        ("coredata.subtype", "subtype"),
        ("coredata.dc:title", "title"),
        ("coredata.prism:issueIdentifier", "issue_identifier"),
        ("coredata.subtypeDescription", "subtype_description"),
        ("coredata.prism:publicationName", "publication_name"),
        ("coredata.prism:startingPage", "starting_page"),
        ("coredata.prism:endingPage", "ending_page"),
        ("coredata.dc:identifier", "identifier"),
        ("coredata.dc:publisher", "publisher"),
        # array columns
        ("coredata.dc:creator.author", "authors"),
        ("affiliation", "affiliation"),
        ("subject-areas.subject-area", "subject_area"),
        ("authkeywords.author-keyword", "authkeyword"),
        ("idxterms.mainterm", "idxterm"),
        ("item.bibrecord.tail.bibliography.reference", "reference_itemid"),
    ]
]


# Gather every year folder that exists, reporting the ones that are missing.
year_dirs = []
for year in years:
    year_dir = os.path.join(base_dir, year)  # Path to this year's folder

    if os.path.exists(year_dir):
        print(f"Processing directory: {year_dir}")
        year_dirs.append(year_dir)
    else:
        print(f"Directory does not exist: {year_dir}")

# Read all folders in ONE call so Spark infers a single merged schema.
# Reading each year separately and joining the frames with the positional
# `union` breaks as soon as the inferred nested struct types differ between
# years; a single read avoids that and needs no union at all.
# When no folder exists, combined_df is left untouched.
if year_dirs:
    combined_df = (
        spark.read.option("multiline", True)
        .json(year_dirs)
        .select(*selected_columns)
    )

In [None]:
# Download article metadata from the Scopus APIs: search one result at a time,
# then fetch the abstract record of every hit that carries a DOI and store it
# as its own JSON file.

# Scopus Search API endpoint and abstract-by-DOI endpoint
search_base_url = "https://api.elsevier.com/content/search/scopus"
abstract_base_url = "https://api.elsevier.com/content/abstract/doi/"

# The API key is read from the environment so that it never lives in the
# notebook. NOTE(review): the key that used to be hardcoded here should be
# treated as leaked and rotated.
api_key = os.environ.get("ELSEVIER_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the ELSEVIER_API_KEY environment variable to your Elsevier API key "
        "before running this cell."
    )

# Query for searching articles
search_query = "TITLE-ABS-KEY(chulalongkorn)"

# Headers for authentication
headers = {
    "X-ELS-APIKey": api_key,
    "Accept": "application/json"
}

# Parameters for Scopus search
search_params = {
    "query": search_query,
    "count": 1,  # Fetch only 1 result per page
}

# Seconds to wait for each HTTP request; without a timeout a stalled
# connection would block the notebook indefinitely.
request_timeout = 30

# Create the output folder if it doesn't exist
output_folder = "Additional_Data"
os.makedirs(output_folder, exist_ok=True)

# Maximum number of results to fetch
max_results = 1000  # Adjust as needed
total_results = 0

# Fetch results one at a time
for i in range(max_results):
    search_params['start'] = i  # Increment by 1 for each article

    # Send the request for the current article
    search_response = requests.get(
        search_base_url,
        headers=headers,
        params=search_params,
        timeout=request_timeout,
    )

    if search_response.status_code == 200:
        search_data = search_response.json()
        entries = search_data.get("search-results", {}).get("entry", [])

        if not entries:
            print(f"No more articles found after {total_results} results.")
            break

        # Extract the DOI and fetch details for the article
        result = entries[0]  # Since count=1, there is only one entry
        doi = result.get("prism:doi")

        if doi:
            # Retrieve metadata for the article
            abstract_url = f"{abstract_base_url}{doi}"
            abstract_response = requests.get(
                abstract_url,
                headers=headers,
                timeout=request_timeout,
            )

            if abstract_response.status_code == 200:
                article_data = abstract_response.json()

                # Save the article's data as a separate JSON file; the file
                # number is the 1-based search position, so skipped articles
                # leave gaps in the numbering.
                file_path = os.path.join(output_folder, f"article_{i + 1}.json")
                with open(file_path, "w", encoding="utf-8") as json_file:
                    json.dump(article_data, json_file, ensure_ascii=False, indent=4)

                total_results += 1
            else:
                print(f"Failed to retrieve metadata for DOI {doi}: {abstract_response.status_code}")
        else:
            print(f"No DOI found for article {i + 1}. Skipping.")
    else:
        # A failed search request stops the whole download loop.
        print(f"Failed to retrieve data for article {i + 1}: {search_response.status_code}")
        break

print(f"All data has been saved to the '{output_folder}' folder. Fetched a total of {total_results} articles.")


In [341]:
# Load the JSON files stored under ./Additional_Data and flatten them to the
# scalar columns listed below.
Additional_file_dir = "./Additional_Data"

addidtional_df = (
    spark.read
    .option("multiline", True)
    .json(Additional_file_dir)
)

# Paths are relative to "abstracts-retrieval-response.coredata"; each one is
# paired with its flat alias. The nested author and affiliation columns are
# not selected for this data set.
selected_columns = [
    col(f"abstracts-retrieval-response.coredata.{field}").alias(alias)
    for field, alias in [
        ("srctype", "source_type"),
        ("prism:doi", "prism:doi"),
        ("prism:coverDate", "cover_date"),
        ("prism:aggregationType", "aggregation_type"),
        ("source-id", "source_id"),
        ("citedby-count", "citedby_count"),
        ("prism:volume", "volume"),
        ("subtype", "subtype"),
        ("dc:title", "title"),
        ("prism:issueIdentifier", "issue_identifier"),
        ("subtypeDescription", "subtype_description"),
        ("prism:publicationName", "publication_name"),
        ("prism:startingPage", "starting_page"),
        ("prism:endingPage", "ending_page"),
        ("dc:identifier", "identifier"),
        ("dc:publisher", "publisher"),
    ]
]

addidtional_df = addidtional_df.select(*selected_columns)

In [None]:
# Append the additional records to the yearly records, matching columns by
# name; allowMissingColumns lets the two frames carry different column sets.
# combined_df is still None when no year folder was found, in which case the
# additional records alone form the result instead of raising AttributeError.
if combined_df is None:
    result_df = addidtional_df
else:
    result_df = combined_df.unionByName(addidtional_df, allowMissingColumns=True)

In [35]:
# Save the extracted data to CSV.
# Spark's CSV writer rejects array/struct/map columns, so every column of such
# a type is serialised to a JSON string first; all other columns pass through
# unchanged. Column names are backtick-quoted because some contain ':'.
# NOTE(review): this exports combined_df (yearly data only), not result_df —
# confirm that is the intended frame.
try:
    csv_ready_df = combined_df.select(
        *[
            to_json(col(f"`{field.name}`")).alias(field.name)
            if field.dataType.typeName() in ("array", "struct", "map")
            else col(f"`{field.name}`")
            for field in combined_df.schema.fields
        ]
    )
    # coalesce(1) collapses the output to a single partition before writing.
    csv_ready_df.coalesce(1).write.option("header", True).mode('overwrite').csv(output_file)
    print("Data extraction complete! Check the output folder.")
except Exception as e:
    print(f"Error: {e}")

In [344]:
# Preview the combined result: first 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out here).
result_df.show(n=20, truncate=True)