In [1]:
from pyspark.sql import SparkSession ,  Row , SQLContext 
from pyspark.sql.functions import col,concat_ws, expr, explode, lit, when, from_json , StructType, StringType, to_json, udf, regexp_extract, count, sum, first, collect_list
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, IntegerType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import requests
import json
import os

In [3]:
# spark = SparkSession.builder\
#         .master("local[*]") \
#         .appName('Data Engineer & Data Science Component')\
#         .config('spark.ui.port', '4041')\
#         .config("spark.executor.memory", "4g") \
#         .getOrCreate()

spark = SparkSession.builder\
        .master("local") \
        .appName('Data Engineer & Data Science Component')\
        .config('spark.ui.port', '4040')\
        .getOrCreate()

PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.

Fetch More article from Scopus

In [None]:
# Define your Scopus Search API endpoint
search_base_url = "https://api.elsevier.com/content/search/scopus"
abstract_doi_url = "https://api.elsevier.com/content/abstract/doi/"
abstract_scopus_url = "https://api.elsevier.com/content/abstract/scopus_id/"
abstract_eid_url = "https://api.elsevier.com/content/abstract/eid/"

# Your API Key
api_key = "01d58db39c61dd0939ce840a1321fbc0"

# Query for searching articles
search_query = "TITLE-ABS-KEY(chulalongkorn) AND (PUBYEAR < 2018 OR PUBYEAR > 2023)"

# Headers for authentication
headers = {
    "X-ELS-APIKey": api_key,
    "Accept": "application/json"
}

# Parameters for Scopus search
search_params = {
    "query": search_query,
    "count": 1,  # Fetch only 1 result per page
}

# Create the 'additional_file' folder if it doesn't exist
output_folder = "./Raw_data/Additional_Data"
os.makedirs(output_folder, exist_ok=True)

# Maximum number of results to fetch
max_results = 1000  # Adjust as needed
total_results = 0
travelled = 0

# Fetch results one at a time
while total_results < max_results:
    search_params['start'] = travelled  # Increment by 1 for each article
    
    # Send the request for the current article
    search_response = requests.get(search_base_url, headers=headers, params=search_params)
    
    if search_response.status_code == 200:
        search_data = search_response.json()
        entries = search_data.get("search-results", {}).get("entry", [])
        
        if not entries:
            print(f"No more articles found after {total_results} results.")
            break
        
        # Extract the DOI and fetch details for the article
        result = entries[0]  # Since count=1, there is only one entry
        doi = result.get("prism:doi")
        scopus_id = result.get("dc:identifier")
        eid = result.get("eid")
        
        if doi:
            # Retrieve metadata for the article
            abstract_url = f"{abstract_doi_url}{doi}"
            abstract_response = requests.get(abstract_url, headers=headers)
            
            if abstract_response.status_code == 200:
                article_data = abstract_response.json()
                
                # Save the article's data as a separate JSON file
                file_path = os.path.join(output_folder, f"article_{travelled + 1}.json")
                with open(file_path, "w", encoding="utf-8") as json_file:
                    json.dump(article_data, json_file, ensure_ascii=False, indent=4)
                    
                total_results += 1
            else:
                print(f"Failed to retrieve metadata for DOI {doi}: {abstract_response.status_code}")
                
        elif scopus_id:
            # Retrieve metadata for the article
            abstract_url = f"{abstract_scopus_url}{scopus_id}"
            abstract_response = requests.get(abstract_url, headers=headers)
            
            if abstract_response.status_code == 200:
                article_data = abstract_response.json()
                
                # Save the article's data as a separate JSON file
                file_path = os.path.join(output_folder, f"article_{travelled + 1}.json")
                with open(file_path, "w", encoding="utf-8") as json_file:
                    json.dump(article_data, json_file, ensure_ascii=False, indent=4)
                    
                total_results += 1
            else:
                print(f"Failed to retrieve metadata for Scopus ID {scopus_id}: {abstract_response.status_code}")
        elif eid:
            # Retrieve metadata for the article
            abstract_url = f"{abstract_eid_url}{eid}"
            abstract_response = requests.get(abstract_url, headers=headers)
            
            if abstract_response.status_code == 200:
                article_data = abstract_response.json()
                
                # Save the article's data as a separate JSON file
                file_path = os.path.join(output_folder, f"article_{travelled + 1}.json")
                with open(file_path, "w", encoding="utf-8") as json_file:
                    json.dump(article_data, json_file, ensure_ascii=False, indent=4)
                    
                total_results += 1
            else:
                print(f"Failed to retrieve metadata for EID {eid}: {abstract_response.status_code}")
        else:
            print(f"No DOI found for article {travelled + 1}. Skipping.")
    else:
        print(f"Failed to retrieve data for article {travelled + 1}: {search_response.status_code}")
        break
    travelled += 1

print(f"All data has been saved to the '{output_folder}' folder. Fetched a total of {total_results} articles.")


Read all json

In [4]:
base_dir = "./Raw_data"  # Replace with your directory path
base_dir = './test'

df = spark.read.option("multiline", True).option("recursiveFileLookup", True).json(base_dir)

In [10]:
# Adjusted column selection
base_data_selected_columns = [
    #normal columns
    col("abstracts-retrieval-response.language.@xml:lang").alias("language"), #
    col("abstracts-retrieval-response.coredata.srctype").alias("source_type"),
    col("abstracts-retrieval-response.coredata.prism:doi").alias("prism:doi"),
    col("abstracts-retrieval-response.coredata.prism:coverDate").alias("cover_date"),
    col("abstracts-retrieval-response.coredata.prism:aggregationType").alias("aggregation_type"),
    col("abstracts-retrieval-response.coredata.source-id").alias("source_id"),
    col("abstracts-retrieval-response.coredata.citedby-count").alias("citedby_count"),
    col("abstracts-retrieval-response.coredata.prism:volume").alias("volume"),
    col("abstracts-retrieval-response.coredata.subtype").alias("subtype"),
    col("abstracts-retrieval-response.coredata.dc:title").alias("title"),
    col("abstracts-retrieval-response.coredata.prism:issueIdentifier").alias("issue_identifier"),
    col("abstracts-retrieval-response.coredata.subtypeDescription").alias("subtype_description"),
    col("abstracts-retrieval-response.coredata.prism:publicationName").alias("publication_name"),
    col("abstracts-retrieval-response.coredata.prism:startingPage").alias("starting_page"),
    col("abstracts-retrieval-response.coredata.prism:endingPage").alias("ending_page"),
    col("abstracts-retrieval-response.coredata.dc:identifier").alias("identifier"),
    col("abstracts-retrieval-response.coredata.dc:publisher").alias("publisher"),

    #array columns
    col("abstracts-retrieval-response.coredata.dc:creator.author").alias("authors"),
    col("abstracts-retrieval-response.affiliation").alias("affiliation"),
    col("abstracts-retrieval-response.subject-areas.subject-area").alias("subject_area"), #
    col("abstracts-retrieval-response.authkeywords.author-keyword").alias("authkeyword"), #
    col("abstracts-retrieval-response.idxterms.mainterm").alias("idxterm"), #
    col("abstracts-retrieval-response.item.bibrecord.tail.bibliography.reference").alias("reference_itemid"), #
    col("abstracts-retrieval-response.item.bibrecord.item-info.itemidlist.itemid").alias("itemid"), #

    
    # col("abstracts-retrieval-response.authors.author[0].preferred-name.ce:given-name").alias("author_given_name"),
    # col("abstracts-retrieval-response.authors.author[0].preferred-name.ce:indexed-name").alias("author_indexed_name"),
    # col("abstracts-retrieval-response.authors.author[0].ce:degrees").alias("author_degrees"),
    # col("abstracts-retrieval-response.authors.author[0].@_fa").alias("author_fa"),
    # col("abstracts-retrieval-response.authors.author[0].@auid").alias("author_auid"),
    
    # col("abstracts-retrieval-response.subject-areas.subject-area[0].@_fa").alias("subject_area_fa"),
    # col("abstracts-retrieval-response.subject-areas.subject-area[0].$").alias("subject_area_name"),
    # col("abstracts-retrieval-response.subject-areas.subject-area[0].@code").alias("subject_area_code"),
    # col("abstracts-retrieval-response.subject-areas.subject-area[0].@abbrev").alias("subject_area_abbrev"),
    
    # col("abstracts-retrieval-response.authkeywords.author-keyword[0].@_fa").alias("authkeyword_fa"),
    # col("abstracts-retrieval-response.authkeywords.author-keyword[0].$").alias("authkeyword"),
    
    # col("abstracts-retrieval-response.idxterms.mainterm[0].$").alias("idxterm"),
    # col("abstracts-retrieval-response.idxterms.mainterm[0].@weight").alias("idxterm_weight"),
    # col("abstracts-retrieval-response.idxterms.mainterm[0].@candidate").alias("idxterm_candidate"),
    
    # col("abstracts-retrieval-response.affiliation[0].affiliation-city").alias("affiliation_city"),
    # col("abstracts-retrieval-response.affiliation[0].@id").alias("affiliation_id"),
    # col("abstracts-retrieval-response.affiliation[0].affilname").alias("affiliation_name"),
    # col("abstracts-retrieval-response.affiliation[0].affiliation-country").alias("affiliation_country"),
    
    # col("abstracts-retrieval-response.item.bibrecord.tail.bibliography.reference[0].ref-info.refd-itemidlist.itemid[1].$").alias("reference_itemid")
]

additional_data_selected_columns = [
    #normal columns
    col("abstracts-retrieval-response.coredata.srctype").alias("source_type"),
    col("abstracts-retrieval-response.coredata.prism:doi").alias("prism:doi"),
    col("abstracts-retrieval-response.coredata.prism:coverDate").alias("cover_date"),
    col("abstracts-retrieval-response.coredata.prism:aggregationType").alias("aggregation_type"),
    col("abstracts-retrieval-response.coredata.source-id").alias("source_id"),
    col("abstracts-retrieval-response.coredata.citedby-count").alias("citedby_count"),
    col("abstracts-retrieval-response.coredata.prism:volume").alias("volume"),
    col("abstracts-retrieval-response.coredata.subtype").alias("subtype"),
    col("abstracts-retrieval-response.coredata.dc:title").alias("title"),
    col("abstracts-retrieval-response.coredata.prism:issueIdentifier").alias("issue_identifier"),
    col("abstracts-retrieval-response.coredata.subtypeDescription").alias("subtype_description"),
    col("abstracts-retrieval-response.coredata.prism:publicationName").alias("publication_name"),
    col("abstracts-retrieval-response.coredata.prism:startingPage").alias("starting_page"),
    col("abstracts-retrieval-response.coredata.prism:endingPage").alias("ending_page"),
    col("abstracts-retrieval-response.coredata.dc:identifier").alias("identifier"),
    col("abstracts-retrieval-response.coredata.dc:publisher").alias("publisher"),

    #array columns
    col("abstracts-retrieval-response.coredata.dc:creator.author").alias("authors"),
    col("abstracts-retrieval-response.affiliation").alias("affiliation"),
]



selected_df = df.select(*base_data_selected_columns)

Topic Distribution Over Time

In [21]:
#Topic Distribution Over Time

Topic_Distribution_Columns = [
    col("abstracts-retrieval-response.subject-areas.subject-area").alias('subject_area'), #ดูจำนวน subject แต่ละอัน ในแต่ละปี
    col("abstracts-retrieval-response.coredata.prism:coverDate").alias('date'), # ดูปี
    col("abstracts-retrieval-response.coredata.prism:aggregationType").alias('aggregationType') #  เผื่อดูแยกแต่ละประเภทการตีพิมพ์
]

Topic_Distribution_df = df.select(*Topic_Distribution_Columns)

#Topic_Distribution_df.select('subject_area').show(10, False)
Topic_Distribution_df_exploded = Topic_Distribution_df.withColumn('subject_area',explode(col('subject_area').getItem('$')))

Topic_Distribution_df_year = Topic_Distribution_df_exploded.withColumn('date',regexp_extract(col('date'),r'(\d{4})',1))

Topic_Distribution_df_year.show()

# Topic_Distribution_df_exploded.groupBy('subject_area').count().orderBy('count', ascending=False).show(10, False)


# Topic_Distribution_df_year.filter(col('subject_area').isNull()).show(10, False)


+--------------------+----+---------------+
|        subject_area|date|aggregationType|
+--------------------+----+---------------+
|           Neurology|2022|        Journal|
|Neurology (clinical)|2022|        Journal|
|        Biochemistry|2020|        Journal|
|  Molecular Medicine|2020|        Journal|
|   Molecular Biology|2020|        Journal|
|Pharmaceutical Sc...|2020|        Journal|
|      Drug Discovery|2020|        Journal|
|Clinical Biochemi...|2020|        Journal|
|   Organic Chemistry|2020|        Journal|
|Renewable Energy,...|2022|        Journal|
|     Fuel Technology|2022|        Journal|
|Condensed Matter ...|2022|        Journal|
|Energy Engineerin...|2022|        Journal|
|          Geophysics|2021|        Journal|
|Earth and Planeta...|2021|        Journal|
|   Multidisciplinary|2023|        Journal|
|Public Health, En...|2023|        Journal|
|Immunology and Al...|2022|        Journal|
|          Immunology|2022|        Journal|
|   Multidisciplinary|2023|     

In [54]:
output_file = "./output/Topic_Distribution_Data"  # Replace with your desired output path
# Save the extracted data to CSV
try:
    Topic_Distribution_df_year.coalesce(1).write.option("header", True).mode('overwrite').csv(output_file)
    print("Data extraction complete! Check the output folder.")
except Exception as e:
    print(f"Error: {e}")

Data extraction complete! Check the output folder.


Citation network

In [None]:
#abstracts-retrieval-response.item.bibrecord.item-info.itemidlist.itemid //เอา SGR เป็นหลัก (ใช้ดู reference)
#abstracts-retrieval-response.item.bibrecord.tail.bibliography.reference[0].ref-info.refd-itemidlist.itemid //เอา SGR เป็นหลัก (ใช้ดู reference)
Citation_Network_Columns = [
    col("abstracts-retrieval-response.coredata.dc:title").alias('title'),
    col("abstracts-retrieval-response.item.bibrecord.item-info.itemidlist.itemid").alias('SGR_id'), #ดูจำนวน subject แต่ละอัน ในแต่ละปี
]
Citation_Network_df = df.select(*Citation_Network_Columns)

Citation_Network_df_exploded = Citation_Network_df.withColumn('SGR_id',explode(col('SGR_id')))

filtered_df = Citation_Network_df_exploded.filter(col("SGR_id").getItem('@idtype') == ("SGR"))

Title_SGR_ID_df = filtered_df.withColumn('SGR_id', col('SGR_id').getItem('$'))

Title_SGR_ID_df.show(10, False)



+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|title                                                                                                                                                                                            |SGR_id     |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|Nuclear imaging for localization and surgical outcome prediction in epilepsy: A review of latest discoveries and future perspectives                                                             |85145388556|
|Syntheses and anti-HIV and human cluster of differentiation 4 (CD4) down-modulating potencies of pyridine-fused cyclotriazadisulfonamide (CADA) compounds              

In [104]:
#abstracts-retrieval-response.item.bibrecord.tail.bibliography.reference[0].ref-info.refd-itemidlist.itemid //เอา SGR เป็นหลัก (ใช้ดู reference)
Reference_Column = [
    col("abstracts-retrieval-response.coredata.dc:title").alias('title'),
    col("abstracts-retrieval-response.item.bibrecord.tail.bibliography.reference").alias('Ref'),
]

Reference_df = df.select(*Reference_Column)
Reference_df_exploded = Reference_df.withColumn('Ref',explode(col('Ref')))

Reference_df_id = Reference_df_exploded.withColumn('Ref', col('Ref').getItem('ref-info').getItem('refd-itemidlist').getItem('itemid'))


schema = ArrayType(StructType([
    StructField("$", StringType(), True),
    StructField("@idtype", StringType(), True)
]))
Reference_df_id_json = Reference_df_id.withColumn('Ref', from_json(col('Ref'), schema))

Reference_df_id_exploded = Reference_df_id_json.withColumn('Ref',explode(col('Ref')))

filtered_df_SGR = Reference_df_id_exploded.filter(col("Ref").getItem('@idtype') == ("SGR"))

Reference_df_SGR = filtered_df_SGR.withColumn('Ref', col('Ref').getItem('$'))

Connection_df = Reference_df_SGR.join(Title_SGR_ID_df, Reference_df_SGR.title == Title_SGR_ID_df.title, 'left').drop(Title_SGR_ID_df.title).select('title','SGR_id','Ref')

Connection_df.show(10, False)

+------------------------------------------------------------------------------------------------------------------------------------+-----------+-----------+
|title                                                                                                                               |SGR_id     |Ref        |
+------------------------------------------------------------------------------------------------------------------------------------+-----------+-----------+
|Nuclear imaging for localization and surgical outcome prediction in epilepsy: A review of latest discoveries and future perspectives|85145388556|85056435462|
|Nuclear imaging for localization and surgical outcome prediction in epilepsy: A review of latest discoveries and future perspectives|85145388556|33749638426|
|Nuclear imaging for localization and surgical outcome prediction in epilepsy: A review of latest discoveries and future perspectives|85145388556|85109038660|
|Nuclear imaging for localization and surgical

In [106]:
output_file_1 = "./output/Citation_network_title_id"  
output_file_2 = "./output/Citation_network_ref"  
# Save the extracted data to CSV
try:
    Title_SGR_ID_df.coalesce(1).write.option("header", True).mode('overwrite').csv(output_file_1)
    Connection_df.coalesce(1).write.option("header", True).mode('overwrite').csv(output_file_2)
    print("Data extraction complete! Check the output folder.")
except Exception as e:
    print(f"Error: {e}")

Data extraction complete! Check the output folder.


Journal Impact Prediction

In [33]:
#abstracts-retrieval-response.coredata.prism:publicationName // แบ่งตามชื่อวารสาร
#abstracts-retrieval-response.coredata.prism:aggregationType
#abstracts-retrieval-response.coredata.citedby-count // เอามาคำนวณ impact factor
#Impact factor คือ จำนวณ citation ทั้งหมดของวารสารนั้นๆ / จำนวน article ที่อยู่ในวารสารนั้น 

#Impact Factor (IF)= 
#Number of Articles in Journal / Sum of Citations for All Articles in Journal

Journal_Impact_Column = [
    col("abstracts-retrieval-response.coredata.dc:identifier").alias('identifier'),
    col("abstracts-retrieval-response.coredata.prism:publicationName").alias('publication_name'),
    col("abstracts-retrieval-response.coredata.prism:aggregationType").alias('aggregationType'),
    col("abstracts-retrieval-response.coredata.citedby-count").alias('citedby_count'),
    #เพิ่มเติม เป็นตัวแปรต้นสำหรับ predict
    col("abstracts-retrieval-response.subject-areas.subject-area").alias('subject_area'),
    col("abstracts-retrieval-response.affiliation").alias('affiliation'),
    

]
Journal_Impact_df = df.select(*Journal_Impact_Column)

# Journal_Impact_df.groupBy('publication_name').count().orderBy('count', ascending=False).show(10, False)
Journal_Impact_df = Journal_Impact_df.withColumn(
    'citedby_count', col('citedby_count').cast('int')
)

# เก็บไว้ก่อน
# Journey_Count_By_Publication_Name = Journal_Impact_df.groupBy('publication_name').agg(
#     sum('citedby_count').alias('total_citedby_count'),                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
#     count('citedby_count').alias('number_of_articles')
#     ).orderBy('total_citedby_count', ascending=False)
# Journey_Count_By_Publication_Name.show(10, False)

In [37]:
#start encoding subject_area
Journal_Impact_df_exploded = Journal_Impact_df.withColumn('subject_area',explode(col('subject_area').getItem('$')))


Journal_Impact_df_grouped = Journal_Impact_df_exploded.groupBy('publication_name').agg(
    collect_list("subject_area").alias("subject_area_list"),
)

cv_subject = CountVectorizer(inputCol="subject_area_list", outputCol="subject_area_vector", binary=True)


encode_subject_df = cv_subject.fit(Journal_Impact_df_grouped).transform(Journal_Impact_df_grouped).drop('subject_area_list').join(Journal_Impact_df, 'publication_name', 'left').drop('subject_area')

#start encoding affiliation
schema = ArrayType(StructType([
    StructField("affiliation-city", StringType(), True),
    StructField("@id", StringType(), True),
    StructField("affilname", StringType(), True),
    StructField("affiliation-country", StringType(), True),
    StructField("@href", StringType(), True),
]))
encode_subject_df_json = encode_subject_df.withColumn('affiliation', from_json(col('affiliation'), schema))

encode_subject_df_json_exploded = encode_subject_df_json.withColumn('affiliation',explode(col('affiliation').getItem('affilname')))

encode_subject_df_json_grouped = encode_subject_df_json_exploded.groupBy('publication_name').agg(
    collect_list("affiliation").alias("affiliation_list"),
)

cv_affiliation = CountVectorizer(inputCol="affiliation_list", outputCol="affiliation_vector", binary=True)

encode_affiliation_df = cv_affiliation.fit(encode_subject_df_json_grouped).transform(encode_subject_df_json_grouped).drop('affiliation_list').join(encode_subject_df_json, 'publication_name', 'left').drop('affiliation')


#start encoding aggregationType & publication_name
indexer_aggregation = StringIndexer(inputCol="aggregationType", outputCol="aggregationType_index")
encoder_aggregation = OneHotEncoder(inputCol="aggregationType_index", outputCol="aggregationType_vector")

indexer_publication = StringIndexer(inputCol="publication_name", outputCol="publication_name_index")
encoder_publication = OneHotEncoder(inputCol="publication_name_index", outputCol="publication_name_vector")

indexer_identifier = StringIndexer(inputCol="identifier", outputCol="identifier_index")
encoder_identifier = OneHotEncoder(inputCol="identifier_index", outputCol="identifier_vector")

pipeline = Pipeline(stages=[indexer_aggregation, indexer_publication , indexer_identifier , encoder_aggregation, encoder_publication , encoder_identifier])

encode_df = pipeline.fit(encode_affiliation_df).transform(encode_affiliation_df).drop('aggregationType' , 'aggregationType_index', 'publication_name' , 'publication_name_index', 'identifier', 'identifier_index')

encode_df.show(10, False)



+-----------------------------------+------------------------------------+-------------+----------------------+-----------------------+-----------------+
|affiliation_vector                 |subject_area_vector                 |citedby_count|aggregationType_vector|publication_name_vector|identifier_vector|
+-----------------------------------+------------------------------------+-------------+----------------------+-----------------------+-----------------+
|(80,[0],[1.0])                     |(68,[23,30,37,47],[1.0,1.0,1.0,1.0])|0            |(3,[1],[1.0])         |(38,[5],[1.0])         |(47,[37],[1.0])  |
|(80,[0,56],[1.0,1.0])              |(68,[5,9],[1.0,1.0])                |8            |(3,[0],[1.0])         |(38,[6],[1.0])         |(47,[17],[1.0])  |
|(80,[0,21,59,71],[1.0,1.0,1.0,1.0])|(68,[9,16,19,21],[1.0,1.0,1.0,1.0]) |68           |(3,[0],[1.0])         |(38,[7],[1.0])         |(47,[1],[1.0])   |
|(80,[0],[1.0])                     |(68,[3,21,39],[1.0,1.0,1.0])        |0 

Data Science Component

In [None]:
feature_columns = ['affiliation_vector', 'subject_area_vector', 
                   'aggregationType_vector', 'publication_name_vector', 
                   'identifier_vector']

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(encode_df)
data = data.select("features", "citedby_count")

train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

gbt = GBTRegressor(featuresCol="features", labelCol="citedby_count", maxIter=100)
model = gbt.fit(train_data)

predictions = model.transform(test_data)

Root Mean Squared Error (RMSE): 8.216508340554153
R2 Score: -0.3733746009680414


In [43]:
evaluator = RegressionEvaluator(labelCol="citedby_count", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

r2_evaluator = RegressionEvaluator(labelCol="citedby_count", predictionCol="prediction", metricName="r2")
r2 = r2_evaluator.evaluate(predictions)
print(f"R2 Score: {r2}")


Root Mean Squared Error (RMSE): 8.216508340554153
R2 Score: -0.3733746009680414


In [None]:
param_grid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [5, 10]) \
    .addGrid(gbt.maxIter, [50, 100]) \
    .build()

crossval = CrossValidator(estimator=gbt,
                          estimatorParamMaps=param_grid,
                          evaluator=evaluator,
                          numFolds=3)

cv_model = crossval.fit(train_data)

In [None]:
rf_model = cv_model.stages[-1]  # Last stage in the pipeline
print("Feature Importances:", rf_model.featureImportances)

In [32]:
output_file = "./output/Data.csv"  # Replace with your desired output path
# Save the extracted data to CSV
try:
    selected_df.coalesce(1).write.option("header", True).mode('overwrite').csv(output_file)
    print("Data extraction complete! Check the output folder.")
except Exception as e:
    print(f"Error: {e}")

Error: [UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE] The CSV datasource doesn't support the column `authors` of the type "ARRAY<STRUCT<`@_fa`: STRING, `@auid`: STRING, `@seq`: STRING, affiliation: STRING, `author-url`: STRING, `ce:alias`: STRING, `ce:alt-name`: STRING, `ce:degrees`: STRING, `ce:given-name`: STRING, `ce:indexed-name`: STRING, `ce:initials`: STRING, `ce:suffix`: STRING, `ce:surname`: STRING, `preferred-name`: STRUCT<`ce:given-name`: STRING, `ce:indexed-name`: STRING, `ce:initials`: STRING, `ce:surname`: STRING>>>".


In [None]:

selected_df.show(5)

+--------+-----------+--------------------+----------+----------------+-----------+-------------+------+-------+--------------------+----------------+-------------------+--------------------+-------------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|language|source_type|           prism:doi|cover_date|aggregation_type|  source_id|citedby_count|volume|subtype|               title|issue_identifier|subtype_description|    publication_name|starting_page|ending_page|          identifier|           publisher|             authors|         affiliation|        subject_area|         authkeyword|             idxterm|    reference_itemid|              itemid|
+--------+-----------+--------------------+----------+----------------+-----------+-------------+------+-------+--------------------+----------------+-------------------+----------------

In [9]:
selected_df.printSchema()

root
 |-- language: string (nullable = true)
 |-- source_type: string (nullable = true)
 |-- prism:doi: string (nullable = true)
 |-- cover_date: string (nullable = true)
 |-- aggregation_type: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- citedby_count: string (nullable = true)
 |-- volume: string (nullable = true)
 |-- subtype: string (nullable = true)
 |-- title: string (nullable = true)
 |-- issue_identifier: string (nullable = true)
 |-- subtype_description: string (nullable = true)
 |-- publication_name: string (nullable = true)
 |-- starting_page: string (nullable = true)
 |-- ending_page: string (nullable = true)
 |-- identifier: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- @_fa: string (nullable = true)
 |    |    |-- @auid: string (nullable = true)
 |    |    |-- @seq: string (nullable = true)
 |    |    |-- affiliation: string (nu