In [12]:
import sys
import pyspark
import time
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import concat_ws
from delta import DeltaTable, configure_spark_with_delta_pip

In [13]:
# All HDFS locations used by this notebook share the same root directory.
hdfs_root = "hdfs:///Dat500_Group09"
spark_result_dir = f"{hdfs_root}/spark_result"

# input: the CSV part files that are read into the arXiv DataFrame
csv_file = f"{hdfs_root}/output_meta/part*"
# output files
delta_preprocessing_file = f"{spark_result_dir}/preprocessing"
delta_training_file = f"{spark_result_dir}/training"
delta_testing_file = f"{spark_result_dir}/testing"

In [14]:
# Build the Spark session for this notebook.
# The session runs on YARN with the Delta Lake SQL extension and catalog enabled,
# and requests 12 executors with 1g of memory each.
# The chain is wrapped in parentheses so that every option is a self-contained
# line: options can be added, removed or commented out without having to keep
# trailing backslashes consistent.
builder = (
    pyspark.sql.SparkSession.builder.appName("Arxiv_test_preprocessing")
    .master("yarn")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.executor.instances", "12")
    .config("spark.executor.memory", "1g")
)

# Settings that were tried during tuning and are currently disabled
# (kept here for reference; add them as `.config(key, value)` lines above):
#   spark.databricks.delta.optimizeWrites.enabled                              = "true"
#   spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite      = "true"
#   spark.databricks.delta.properties.defaults.autoOptimize.autoCompact        = "true"
#   spark.databricks.delta.properties.defaults.delta.targetFileSize            = "104857600" / "128m"
#   spark.databricks.delta.autoCompact.maxFileSize
#   spark.driver.memory                                                        = "4g" / "2g"
#   spark.driver.maxResultSize                                                 = "2g"
#   spark.sql.shuffle.partitions                                               = "32" / "200"
#
# Byte values for the delta.targetFileSize configuration:
#   104857600 = 100MB
#   134217728 = 128MB
#   268435456 = 256MB
# Also tried after session creation:
#   spark.conf.set("delta.targetFileSize", "512MB")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [15]:
# Create data schema
#====================================================================
# Every metadata field is declared as a nullable string column; the tuple
# below fixes the column order.
schema_field_names = (
    "id",
    "authors",
    "title",
    "abstract",
    "journal_ref",
    "category",
    "update_date",
)
dbschema = StructType(
    [StructField(field_name, StringType(), True) for field_name in schema_field_names]
)

In [16]:
# Import csv file for the data
#====================================================================
# Fields in the part files are separated by "::" and there is no header row.
# NOTE(review): `schema=dbschema` is passed through `options()`, not through
# `DataFrameReader.schema()`; it does not appear to take effect, because the
# next cell still addresses the columns as `_c0` ... `_c6`. Switching to
# `.schema(dbschema)` would have to be done together with that rename cell.
try:
    arxiv_df = spark.read.options(delimiter="::", header=False, schema=dbschema).csv(csv_file)
except Exception as exc:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit are not swallowed, and the underlying cause is reported.
    print(f"Error: Could not read the data for this file {csv_file}")
    print(f"Cause: {exc!r}")
    sys.exit(1)


In [17]:
# Rename the positional CSV columns (_c0 ... _c6) to the arXiv metadata field names
#====================================================================
arxiv_column_names = ["id", "authors", "title", "abstract", "journal_ref", "category", "update_date"]
arxiv_df = arxiv_df.selectExpr(
    *[f"_c{position} as {name}" for position, name in enumerate(arxiv_column_names)]
)



In [18]:
# "category" holds space-separated category codes; keep the first one as "category1".
arxiv_df = arxiv_df.withColumn(
    "category1",
    split(col("category"), " ")[0],
)

In [19]:
# Preview the first three rows to check the renamed columns and the new "category1" column.
arxiv_df.show(n=3)

+----------+--------------------+--------------------+--------------------+-----------+---------------+------------+---------+
|        id|             authors|               title|            abstract|journal_ref|       category| update_date|category1|
+----------+--------------------+--------------------+--------------------+-----------+---------------+------------+---------+
|2210.02280|     Minoru Wakimoto|Mock theta functi...| In this paper, w...|       None|math.NT math.RT|2022-10-11\t|  math.NT|
|2210.02281|P. Jameson Graber...|On monotonicity c...| In this paper we...|       None|math.AP math.OC|2022-10-06\t|  math.AP|
|2210.02282|Cornelia Ott, Hed...|Covering Properti...| The sum-rank met...|       None|  cs.IT math.IT|2022-10-06\t|    cs.IT|
+----------+--------------------+--------------------+--------------------+-----------+---------------+------------+---------+
only showing top 3 rows



In [None]:
# Old method (disabled): the category table used to be read from a CSV file.
# The statements are wrapped in a triple-quoted string, so none of them run;
# the category table is now loaded from a Delta table in a later cell.
'''print("Add new column for arxiv data to recategorize the articles")
csv_category = "hdfs:///Dat500_Group09/input/category.csv"
# Add new column for arxiv data to recategorize the articles
category_df =spark.read.options(delimiter=",", header=True, inferSchema=True).csv(csv_category)   ''' 

Add new column for arxiv data to recategorize the articles


                                                                                

In [20]:
# Load the category lookup table from its Delta location and print its row count
delta_category_file = "hdfs:///Dat500_Group09/spark_result/category"
category_df = spark.read.format("delta").load(delta_category_file)
print(category_df.count())

# Primary category = first space-separated token of the "category" column
arxiv_df = arxiv_df.withColumn("category1", split(col("category"), " ")[0])

# Inner join where "category1" in arxiv_df equals "sub_category" in category_df.
# category_df is wrapped in broadcast() to mark it for a broadcast join.
arxiv_joined_df = arxiv_df.join(
    broadcast(category_df),
    on=arxiv_df["category1"] == category_df["sub_category"],
    how="inner",
)

# Keep the article fields plus the matched main category;
# the lookup table's "description" column is exposed as "main_topic".
arxiv_df = arxiv_joined_df.select(
    "id",
    "authors",
    "title",
    "abstract",
    "category",
    "update_date",
    "main_category",
    col("description").alias("main_topic"),
)

# Row count after the join (shown as the cell result)
arxiv_df.count()

                                                                                

155


                                                                                

2090053

23/04/26 03:09:50 ERROR YarnClientSchedulerBackend: YARN application has exited unexpectedly with state KILLED! Check the YARN application logs for more details.
23/04/26 03:09:50 ERROR YarnClientSchedulerBackend: Diagnostics message: Application application_1679580022279_0234 was killed by user ubuntu at 192.168.9.208
23/04/26 03:09:50 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_28_40 !
23/04/26 03:09:50 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_28_2 !
23/04/26 03:09:50 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_32_2 !
23/04/26 03:09:50 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_32_40 !
23/04/26 03:09:50 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_28_4 !
23/04/26 03:09:50 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_32_4 !
23/04/26 03:09:50 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_28_7 !
23/04/26 03:09:50 WARN Blo

In [16]:
)

# NOTE(review): this cell repeats the join from the `In [20]` cell without the
# broadcast() wrapper, and its saved output shows different columns than the
# select below -- it looks like a stale earlier version; confirm before reuse.

# Take the first space-separated category code from the "category" column
arxiv_df = arxiv_df.withColumn("category1", split(col("category"), " ").getItem(0))

# Inner-join the two data frames where "category1" in arxiv_df equals "sub_category" in category_df
arxiv_joined_df = arxiv_df.join(category_df, arxiv_df.category1 == category_df.sub_category, "inner")

# Select the desired columns from the joined data frame; "description" is renamed to "main_topic"
# (earlier column list, kept for reference:)
#select("id", "title", "abstract", "authers_num", "article_date", "main_category")
arxiv_df = arxiv_joined_df.select("id", "authors", "title", "abstract", "category", "update_date", "main_category", col("description").alias("main_topic"))

# Preview the first three rows of the result
arxiv_df.show(3) 

+----------+--------------------+--------------------+-----------------+--------------------+----------------+
|        id|               title|            abstract|     sub_category|            category|   main_category|
+----------+--------------------+--------------------+-----------------+--------------------+----------------+
|2210.02280|Mock theta functi...| In this paper, w...|          math.NT|     math.NT math.RT|     Mathematics|
|2210.02281|On monotonicity c...| In this paper we...|          math.AP|     math.AP math.OC|     Mathematics|
|2210.02282|Covering Properti...| The sum-rank met...|            cs.IT|       cs.IT math.IT|Computer Science|
|2210.02283|Detection of long...| We report and qu...|cond-mat.mes-hall|   cond-mat.mes-hall|         Physics|
|2210.02284|Unsupervised Sent...| Measuring Senten...|            cs.CL|               cs.CL|Computer Science|
|2210.02285|Enhanced Soft Lim...| In flat space, t...|           hep-th|              hep-th|         Physics|
|

In [27]:
import requests
from bs4 import BeautifulSoup

# Scrape the arXiv category taxonomy page and build `category_df`, a Spark
# DataFrame with one row per (main_category, sub_category, description).

# the main categories & sub categories are listed at this url
url = 'https://arxiv.org/category_taxonomy'
# Bound the wait and fail loudly on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# Find the main categories section of the page
main_category = soup.find_all('h2', {'class': 'accordion-head'})

# Collect all the main categories into a dictionary: heading text -> list of
# (sub_category, description) tuples. Headings are stripped so the keys match
# the plain names used in `prefix_to_main` below.
category_dict = {category.text.strip(): [] for category in main_category}

# Add all sub categories into a list of (code, description) tuples
Sub_categories_list = []
for div in soup.find_all('div', class_='columns divided'):
    h4 = div.find('h4')
    if h4 is None:
        # block without a heading: nothing to extract
        continue
    heading_tokens = h4.text.split()
    # skip empty headings and the "Category ..." heading, as before
    if not heading_tokens or heading_tokens[0] == "Category":
        continue
    sub_category = heading_tokens[0]
    # the <span> inside the heading carries the description wrapped in parentheses;
    # when it is absent the description stays None
    span = h4.find('span')
    description = span.text.strip('()') if span is not None else None
    Sub_categories_list.append((sub_category, description))

# Archive prefix (the text before the first '.') -> main category heading.
# Every prefix that is not listed here is filed under 'Physics'.
prefix_to_main = {
    'cs': 'Computer Science',
    'econ': 'Economics',
    'eess': 'Electrical Engineering and Systems Science',
    'math': 'Mathematics',
    'q-bio': 'Quantitative Biology',
    'q-fin': 'Quantitative Finance',
    'stat': 'Statistics',
}

# Add all sub categories to the related main category
for cat in Sub_categories_list:
    prefix = cat[0].split('.')[0]
    main = prefix_to_main.get(prefix, 'Physics')
    # setdefault: a heading missing from the page creates a new bucket instead
    # of failing with "'NoneType' object has no attribute 'append'"
    category_dict.setdefault(main, []).append(cat)

# Flatten the dictionary into (main_category, sub_category, description) tuples
# and create the category data frame from them
category_tuples = [
    (main, sub[0], sub[1])
    for main, subList in category_dict.items()
    for sub in subList
]

category_df = spark.createDataFrame(category_tuples, ["main_category", "sub_category", "description"])
    

In [38]:
# Preview the first five rows of the scraped category table.
category_df.show(n=5)

[Stage 19:>                                                         (0 + 1) / 1]

+----------------+------------+--------------------+
|   main_category|sub_category|         description|
+----------------+------------+--------------------+
|Computer Science|       cs.AI|Artificial Intell...|
|Computer Science|       cs.AR|Hardware Architec...|
|Computer Science|       cs.CC|Computational Com...|
|Computer Science|       cs.CE|Computational Eng...|
|Computer Science|       cs.CG|Computational Geo...|
+----------------+------------+--------------------+
only showing top 5 rows



                                                                                

In [36]:
# Flatten category_dict into ONE flat list of strings, laid out as
# [main, sub_code, description, main, sub_code, description, ...].
categorylist = [
    field
    for main, subList in category_dict.items()
    for sub in subList
    for field in (main, sub[0], sub[1])
]

Computer Science [('cs.AI', 'Artificial Intelligence'), ('cs.AR', 'Hardware Architecture'), ('cs.CC', 'Computational Complexity'), ('cs.CE', 'Computational Engineering, Finance, and Science'), ('cs.CG', 'Computational Geometry'), ('cs.CL', 'Computation and Language'), ('cs.CR', 'Cryptography and Security'), ('cs.CV', 'Computer Vision and Pattern Recognition'), ('cs.CY', 'Computers and Society'), ('cs.DB', 'Databases'), ('cs.DC', 'Distributed, Parallel, and Cluster Computing'), ('cs.DL', 'Digital Libraries'), ('cs.DM', 'Discrete Mathematics'), ('cs.DS', 'Data Structures and Algorithms'), ('cs.ET', 'Emerging Technologies'), ('cs.FL', 'Formal Languages and Automata Theory'), ('cs.GL', 'General Literature'), ('cs.GR', 'Graphics'), ('cs.GT', 'Computer Science and Game Theory'), ('cs.HC', 'Human-Computer Interaction'), ('cs.IR', 'Information Retrieval'), ('cs.IT', 'Information Theory'), ('cs.LG', 'Machine Learning'), ('cs.LO', 'Logic in Computer Science'), ('cs.MA', 'Multiagent Systems'), ('

In [30]:
# Peek at elements 1-4 of the flat list built in the previous cell.
categorylist[1:5]

['cs.AI', 'Artificial Intelligence', 'Computer Science', 'cs.AR']

In [None]:
# NOTE(review): `sub_categories` is not assigned in any cell visible in this
# notebook (the scraping cell builds `Sub_categories_list`) -- confirm which
# variable is meant before running this cell on a fresh kernel.
print(sub_categories[1:10])
# Find the main categories section of the page
main_category = soup.find_all('h2', {'class': 'accordion-head'})
# One (name, 'ArXiv', '') tuple per main category heading
main_categories = [(category.text.strip(), 'ArXiv', '') for category in main_category]

print(main_categories[1:10])
# Combine the main and sub categories into a list
#categories = main_categories + sub_categories



In [25]:
# NOTE(review): in the visible cells `categories` is only assigned inside a
# commented-out line, so this cell presumably relies on state from an earlier
# kernel session -- confirm before re-running.
print(categories[1:5])

[('Economics', 'ArXiv', ''), ('Electrical Engineering and Systems Science', 'ArXiv', ''), ('Mathematics', 'ArXiv', ''), ('Physics', 'ArXiv', '')]


In [None]:
# Create a data frame from the list of categories
#category_df = spark.createDataFrame(categories, schema)

# Show the data frame
#category_df.show()
