In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import col

In [None]:
# Build the Spark session used by every later cell.
# getOrCreate() hands back an already-running session when one exists,
# so re-running this cell does not start a second one.
spark = (
    SparkSession.builder
    .master('local')                   # run Spark locally, no cluster
    .appName('Spark Tutorial')
    .config('spark.ui.port', '4040')   # port for the Spark web UI
    .getOrCreate()
)

In [None]:
# --- Paths -------------------------------------------------------------------
base_dir = "./Raw_data"  # Replace with your directory path
output_file = "./output"  # Replace with your desired output path

# One sub-folder of base_dir per publication year, named by the year itself
# ("2018" ... "2022"; the range end is exclusive).
years = list(map(str, range(2018, 2023)))

# Accumulator for the per-year DataFrames; stays None until a year is loaded.
combined_df = None

# (nested JSON path, flat output column name) pairs, in output column order.
_FIELD_ALIASES = (
    ("abstracts-retrieval-response.language.@xml:lang", "language"),
    ("abstracts-retrieval-response.coredata.srctype", "source_type"),
    ("abstracts-retrieval-response.coredata.eid", "eid"),
    ("abstracts-retrieval-response.coredata.prism:coverDate", "cover_date"),
    ("abstracts-retrieval-response.coredata.prism:aggregationType", "aggregation_type"),
    ("abstracts-retrieval-response.coredata.source-id", "source_id"),
    ("abstracts-retrieval-response.coredata.citedby-count", "citedby_count"),
    ("abstracts-retrieval-response.coredata.prism:volume", "volume"),
    ("abstracts-retrieval-response.coredata.subtype", "subtype"),
    ("abstracts-retrieval-response.coredata.dc:title", "title"),
    ("abstracts-retrieval-response.coredata.prism:issueIdentifier", "issue_identifier"),
    ("abstracts-retrieval-response.coredata.subtypeDescription", "subtype_description"),
    ("abstracts-retrieval-response.coredata.prism:publicationName", "publication_name"),
    ("abstracts-retrieval-response.coredata.prism:startingPage", "starting_page"),
    ("abstracts-retrieval-response.coredata.prism:endingPage", "ending_page"),
    ("abstracts-retrieval-response.coredata.dc:identifier", "identifier"),
    ("abstracts-retrieval-response.coredata.dc:publisher", "publisher"),
)

# Column expressions that pull each nested field out under its flat alias.
selected_columns = [
    col(json_path).alias(flat_name) for json_path, flat_name in _FIELD_ALIASES
]

# Load each year's JSON folder, project it onto the shared column list, and
# stack the per-year results into a single DataFrame (`combined_df`).
for year in years:
    year_dir = os.path.join(base_dir, year)  # Construct the path to the year's folder

    # Use isdir rather than exists: a stray regular file named like a year
    # is then reported as a missing directory instead of being handed to
    # the JSON reader as if it were the year's folder.
    if not os.path.isdir(year_dir):
        print(f"Directory does not exist: {year_dir}")
        continue

    print(f"Processing directory: {year_dir}")

    # "multiline" lets the reader parse JSON documents that span several lines.
    year_df = spark.read.option("multiline", True).json(year_dir)
    year_df = year_df.select(*selected_columns)

    # union() pairs columns by position; every year is projected through the
    # same selected_columns list, so the positions always line up.
    if combined_df is None:
        combined_df = year_df
    else:
        combined_df = combined_df.union(year_df)

# Make the "nothing was loaded" case visible here, where its cause is obvious,
# rather than letting later cells fail on a None DataFrame.
if combined_df is None:
    print(f"Warning: no year directories were found under {base_dir}; combined_df is None.")

Processing directory: ./Raw_data\2018
Processing directory: ./Raw_data\2019
Data extraction complete! Check the output folder.


In [None]:
# Save the extracted data to CSV
try:
    # combined_df is still None when no input was loaded. Report that
    # directly instead of the opaque AttributeError that calling
    # .coalesce() on None would produce.
    if combined_df is None:
        raise ValueError("no data was loaded, so there is nothing to write")

    # coalesce(1) collapses the data to a single partition so the CSV output
    # is written as one part file; mode('overwrite') replaces any previous
    # output at the same path.
    (
        combined_df.coalesce(1)
        .write.option("header", True)
        .mode('overwrite')
        .csv(output_file)
    )
    print("Data extraction complete! Check the output folder.")
except Exception as e:
    # Best-effort cell: report the failure without stopping the notebook.
    print(f"Error: {e}")

In [11]:
# Show the number of rows in the combined DataFrame.
combined_df.count()

5874