In [None]:
import findspark

# findspark.init() makes the pyspark package importable, so it has to run
# before the pyspark imports below.
findspark.init()

from pyspark.sql import SQLContext, SparkSession

# Spark master URL; "local" runs Spark in-process on this machine.
spark_url = "local"

# Configure the session step by step, then create it (or reuse the one that
# is already running in this kernel).
session_builder = (
    SparkSession.builder
    .master(spark_url)
    .appName("Spark Tutorial")
    .config("spark.ui.port", "4040")  # port on which the Spark web UI is served
)
spark = session_builder.getOrCreate()

# Underlying SparkContext of the session.
sc = spark.sparkContext

In [None]:
from pyspark.sql.functions import col

# Folder holding the input JSON files.
input_folder = "./Raw_data/2022/"  # Replace with your directory path
# NOTE: despite its name this is a directory; Spark writes its part-*.csv
# file(s) inside it.
output_file = "./output/kuy/"  # Replace with your desired output path

# Read every JSON file in the folder. `multiline` lets Spark parse records
# that span several lines (by default it expects one JSON record per line).
df = spark.read.option("multiline", True).json(input_folder)

df.printSchema()

# Top-level struct under which all extracted fields are addressed.
root = "abstracts-retrieval-response"

# Fields taken from `<root>.coredata`, mapped to the flat column name used in
# the CSV. Dict order is the output column order.
coredata_fields = {
    "srctype": "source_type",
    "eid": "eid",
    "prism:coverDate": "cover_date",
    "prism:aggregationType": "aggregation_type",
    "source-id": "source_id",
    "citedby-count": "citedby_count",
    "prism:volume": "volume",
    "subtype": "subtype",
    "dc:title": "title",
    "prism:issueIdentifier": "issue_identifier",
    "subtypeDescription": "subtype_description",
    "prism:publicationName": "publication_name",
    "prism:startingPage": "starting_page",
    "prism:endingPage": "ending_page",
    "dc:identifier": "identifier",
    "dc:publisher": "publisher",
}

# Column expressions: the language code first, then the coredata fields.
selected_columns = [
    col(f"{root}.language.@xml:lang").alias("language"),
    *(
        col(f"{root}.coredata.{field}").alias(alias)
        for field, alias in coredata_fields.items()
    ),
]

# Errors are deliberately left to propagate: a failed select (e.g. a field
# missing from the inferred schema) or a failed write should stop the run with
# a full traceback instead of being reduced to a printed message while later
# cells carry on without the output.
extracted_df = df.select(*selected_columns)
extracted_df.show()  # Display the data for validation

# Save the extracted data as a single CSV part file with a header row.
# mode("overwrite") replaces any previous export in `output_file`, so the cell
# can be re-run; Spark's default save mode raises if the path already exists.
(
    extracted_df.coalesce(1)
    .write.mode("overwrite")
    .option("header", True)
    .csv(output_file)
)
print("Data extraction complete! Check the output folder.")