In [13]:
import sys
import pyspark
import time
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import col, split, when, count
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, LinearSVC, NaiveBayes
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import StringIndexer, RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import concat_ws
from delta import DeltaTable, configure_spark_with_delta_pip

# Notebook-wide settings.
# Target partition count; a later cell repartitions the data only when this is > 0.
partition = 0
# Weight of the test split passed to randomSplit (training gets 1 - test_ratio).
test_ratio = 0.2
# Set to 1 to print the training/testing row counts after the split.
bprint = 1
# Glob of the input part files that are read as "::"-delimited CSV.
filepath = 'hdfs:///Dat500_Group09/input/output_meta/part*'
# Path to the Delta table.
delta_table_path = "hdfs:///Dat500_Group09/spark_result/final_result/arxiv_meta"


# Alternative input/output locations (disabled):
#filepath = 'hdfs:///Dat500_Group09/output_meta/part*'
#delta_table_path = "hdfs:///Dat500_Group09/result/arxiv_sample"

In [14]:
def Create_ML_pipline(ML_model = "LR"):
    """Build the (unfitted) text-classification pipeline.

    Stages, in order:
      1. StringIndexer:    main_category -> label
      2. RegexTokenizer:   text -> tokens (split on the pattern "\\W")
      3. StopWordsRemover: tokens -> filtered_tokens
      4. CountVectorizer:  filtered_tokens -> vectorize_features
      5. IDF:              vectorize_features -> features
      6. the classifier selected by ``ML_model`` (features/label columns)

    Args:
        ML_model: Classifier key. "LR" -> LogisticRegression(maxIter=100),
            "RF" -> RandomForestClassifier(numTrees=500, maxDepth=5),
            "NB" -> NaiveBayes(modelType="multinomial").

    Returns:
        A ``pyspark.ml.Pipeline`` holding the six stages above.

    Raises:
        ValueError: If ``ML_model`` is not one of the supported keys.
    """
    # Validate first: previously an unknown key left the classifier variable
    # unbound and failed later with an UnboundLocalError.
    supported_models = ("LR", "RF", "NB")
    if ML_model not in supported_models:
        raise ValueError(
            f"Unsupported ML_model {ML_model!r}; expected one of {supported_models}"
        )

    # Convert the main_category column to a numeric label.
    labelIndexer = StringIndexer(inputCol="main_category", outputCol="label")

    # Tokenize the text on non-word characters.
    regexTokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W")

    # Drop stop words from the token list.
    stopWordsRemover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")

    # TF-IDF: term counts followed by inverse-document-frequency weighting.
    countVectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="vectorize_features")
    idf = IDF(inputCol="vectorize_features", outputCol="features")

    if ML_model == 'LR':  # logistic regression classifier
        ML_Model = LogisticRegression(featuresCol="features", labelCol="label", maxIter=100)
    elif ML_model == 'RF':  # random forest classifier
        ML_Model = RandomForestClassifier(numTrees=500, maxDepth=5, labelCol="label", featuresCol="features")
    else:  # 'NB' (guaranteed by the validation above): Naive Bayes classifier
        ML_Model = NaiveBayes(modelType="multinomial", labelCol="label", featuresCol="features")

    # Assemble the stages in execution order.
    pipeline = Pipeline(stages=[labelIndexer, regexTokenizer, stopWordsRemover, countVectorizer, idf, ML_Model])

    return pipeline


In [15]:
  # Set the configuration properties for Delta tables
# Session builder: YARN master, Delta Lake SQL extension + catalog, 8 executors x 2g.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("Arxiv_Classification")
    .master('yarn')
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config('spark.executor.instances', '8')
    .config("spark.executor.memory", "2g")
)
# Settings tried during experiments and currently disabled:
#   .config("spark.sql.shuffle.partitions", "32")   (also tried "200")
#   .config('spark.driver.memory', '4g')
#   .config('spark.driver.maxResultSize', '4g')
#   .config('spark.executor.instances', '12')
#   .config("spark.executor.memory", "1g")
#   .config("spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite", "true")
#   .config("spark.databricks.delta.properties.defaults.autoOptimize.autoCompact", "true")
# delta.targetFileSize reference values (disabled):
#   104857600 = 100MB
#   134217728 = 128MB
#   268435456 = 256 for 1TB data
#   spark.conf.set("delta.targetFileSize", "128MB")

# Let the delta helper finish configuring the builder, then get or create the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

23/04/22 10:17:37 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
23/04/22 10:17:45 WARN Client: Same path resource file:///home/ubuntu/.ivy2/jars/io.delta_delta-core_2.12-2.3.0.jar added multiple times to distributed cache.
23/04/22 10:17:45 WARN Client: Same path resource file:///home/ubuntu/.ivy2/jars/io.delta_delta-storage-2.3.0.jar added multiple times to distributed cache.
23/04/22 10:17:45 WARN Client: Same path resource file:///home/ubuntu/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar added multiple times to distributed cache.


In [12]:
# Stopping the session in the middle of the notebook makes every later cell fail on
# "Restart & Run All", so it is opt-in: set the flag to True and run this cell by hand
# when the session is no longer needed.
STOP_SPARK_SESSION = False
if STOP_SPARK_SESSION:
    spark.stop()

In [16]:
# Schema for the dataset: every column is declared as a nullable string.
dbschema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "id",
        "authors",
        "title",
        "abstract",
        "journal_ref",
        "category",
        "update_date",
    )
])

In [17]:
# Load the "::"-delimited, header-less part files into a DataFrame.
# NOTE(review): `schema=dbschema` is passed as a reader *option*; the following cell still
# renames the default _c0.._c6 columns, so the schema does not appear to be applied here.
# Switching to `.schema(dbschema)` would require updating that rename in the same change.
try:
    arxiv_df = spark.read.options(delimiter="::", header=False, schema=dbschema).csv(filepath)
except Exception:
    # Report which path failed, then re-raise: swallowing the error would leave
    # `arxiv_df` undefined and later cells would fail with an unrelated NameError.
    print(f"Error: Could not read the data for this file {filepath}")
    raise

                                                                                

In [18]:
# Map the positional CSV column names (_c0 .. _c6) onto the Arxiv metadata field names.
column_names = ["id", "authors", "title", "abstract", "journal_ref", "category", "update_date"]
arxiv_df = arxiv_df.select(
    [col(f"_c{position}").alias(name) for position, name in enumerate(column_names)]
)

arxiv_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- journal_ref: string (nullable = true)
 |-- category: string (nullable = true)
 |-- update_date: string (nullable = true)



In [19]:
# Mark the renamed DataFrame for caching so the later cells that derive from it
# can reuse it. The call returns the DataFrame itself, which is what the cell displays.
arxiv_df.cache() #Cache the data

DataFrame[id: string, authors: string, title: string, abstract: string, journal_ref: string, category: string, update_date: string]

In [7]:
# Repartition only when an explicit partition count is configured; otherwise keep the frame as is.
arxiv_df = arxiv_df.repartition(partition) if partition > 0 else arxiv_df
current_partitions = arxiv_df.rdd.getNumPartitions()
print("The number of partition for the Data:", current_partitions)

The number of partition for the Data: 27


In [20]:
# Report the settings the running session was started with.
executer = spark.conf.get("spark.executor.instances")
print("number of executer in the cluster:", executer)

shuffle_partitions = spark.conf.get("spark.sql.shuffle.partitions")
print("the No of shuffle.partitions", shuffle_partitions)

executor_memory = spark.conf.get("spark.executor.memory")
print("spark.executor.memory", executor_memory)

# Disabled lookups:
#print('spark.driver.maxResultSize', spark.conf.get("spark.driver.maxResultSize"))
#print("spark.driver.memory",  spark.conf.get("spark.driver.memory"))

number of executer in the cluster: 8
the No of shuffle.partitions 200
spark.executor.memory 2g


In [21]:
from pyspark import SparkConf

# List every setting held by a newly constructed SparkConf, one "key: value" pair per line.
conf = SparkConf()
config_map = conf.getAll()
for key, value in config_map:
    print(key, value, sep=": ")

spark.executor.memory: 2g
spark.yarn.dist.jars: file:///home/ubuntu/.ivy2/jars/io.delta_delta-core_2.12-2.3.0.jar,file:///home/ubuntu/.ivy2/jars/io.delta_delta-storage-2.3.0.jar,file:///home/ubuntu/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar
spark.submit.pyFiles: /home/ubuntu/.ivy2/jars/io.delta_delta-core_2.12-2.3.0.jar,/home/ubuntu/.ivy2/jars/io.delta_delta-storage-2.3.0.jar,/home/ubuntu/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar
spark.app.name: Arxiv_Classification
spark.executor.instances: 8
spark.jars.packages: io.delta:delta-core_2.12:2.3.0
spark.master: yarn
spark.sql.extensions: io.delta.sql.DeltaSparkSessionExtension
spark.ui.proxyBase: /proxy/application_1679580022279_0122
spark.yarn.isPython: true
spark.submit.deployMode: client
spark.yarn.dist.pyFiles: file:///home/ubuntu/.ivy2/jars/io.delta_delta-core_2.12-2.3.0.jar,file:///home/ubuntu/.ivy2/jars/io.delta_delta-storage-2.3.0.jar,file:///home/ubuntu/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar
spark.repl.local.jars: file://

In [10]:
# Check whether Adaptive Query Execution is switched on.
# getOrCreate() hands back the already-running session when there is one (this cell's
# output shows the "Using an existing Spark session" warning).
spark = SparkSession.builder.appName("MyApp").getOrCreate()
aqe_enabled = spark.conf.get("spark.sql.adaptive.enabled")
aqe_state = 'enabled' if aqe_enabled == 'true' else 'disabled'
print(f"Adaptive Query Execution is {aqe_state}")


23/04/18 16:05:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
Adaptive Query Execution is enabled


In [22]:
# Derive a coarse subject label from the part of `category` before the first ".";
# any prefix that is not listed below falls back to "physics".
archive_prefix = split(arxiv_df.category, "\\.")[0]
archive_labels = [
    ("cs", "computer science"),
    ("math", "mathematics"),
    ("econ", "economics"),
    ("eess", "electrical engineering"),
    ("q-bio", "quantitative biology"),
    ("q-fin", "quantitative finance"),
    ("stat", "statistics"),
]

# Build the same when(...).when(...) chain, one branch per table entry, in order.
first_prefix, first_label = archive_labels[0]
main_category_expr = when(archive_prefix == first_prefix, first_label)
for prefix, label in archive_labels[1:]:
    main_category_expr = main_category_expr.when(archive_prefix == prefix, label)

arxiv_df = arxiv_df.withColumn("main_category", main_category_expr.otherwise("physics"))

In [11]:
# Preview the first three rows without truncating long cell values.
arxiv_df.show(n=3, truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+----------+--------------------------------------------------+----------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+---------------+------------+----------------+
|id        |authors                                           |title                                               |abs

                                                                                

In [23]:
# Keep only the columns used for the machine learning model.
ml_columns = ["id", "title", "abstract", "main_category"]
clean_arxiv_df = arxiv_df.select(*ml_columns)

clean_arxiv_df.show(n=3, truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+----------+----------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+
|id        |title                                               |abstract                                                                                                                                          

                                                                                

In [24]:
# Build the model input: title and abstract joined by a single space in a new `text` column.
title_and_abstract = concat_ws(' ', col('title'), col('abstract'))
clean_arxiv_df = clean_arxiv_df.withColumn('text', title_and_abstract)


In [25]:
# Build the unfitted pipeline with the function's default classifier key ("LR").
pipeline = Create_ML_pipline()

In [26]:
# Split into training and test sets weighted by `test_ratio`, using a fixed seed (24).
split_weights = [1 - test_ratio, test_ratio]
trainingData, testData = clean_arxiv_df.randomSplit(split_weights, seed=24)

In [27]:
# Optional report of the split sizes; each count() runs a Spark job.
if bprint == 1:
    separator = "=" * 100
    print(separator)
    print("Training Data size: ", trainingData.count())
    print("Testing Data size: ", testData.count())
    print(separator)



                                                                                

Training Data size:  1763556




Testing Data size:  440363


                                                                                

In [28]:
# Fit every pipeline stage on the training split; the result is the fitted model.
# This is the long-running step: the recorded log output below covers about half an hour.
ML_model = pipeline.fit(trainingData)

                                                                                

23/04/22 10:25:38 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB




23/04/22 10:27:03 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB


                                                                                

23/04/22 10:27:06 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:28:34 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:28:36 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:30:22 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 20:>                                                         (0 + 4) / 4]

23/04/22 10:30:23 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/04/22 10:30:23 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

23/04/22 10:30:24 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/04/22 10:30:24 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
23/04/22 10:30:24 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:30:41 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:30:43 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:31:00 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:31:01 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:31:18 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:31:20 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:31:36 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:31:38 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:31:55 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:31:56 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:32:16 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:32:18 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:32:34 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:32:36 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:32:52 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:32:54 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:33:11 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:33:13 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:33:30 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:33:32 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:33:48 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:33:50 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:34:06 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:34:09 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:34:25 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:34:28 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:34:44 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:34:47 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:35:02 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:35:05 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:35:21 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:35:22 WARN BlockManager: Asked to remove block broadcast_69_piece2, which does not exist
23/04/22 10:35:23 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:35:39 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:35:41 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:35:58 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:36:00 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:36:15 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:36:18 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:36:34 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:36:36 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:36:53 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:36:55 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:37:12 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:37:14 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:37:30 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:37:33 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:37:49 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:37:51 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:38:07 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:38:10 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:38:27 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:38:29 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:38:47 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:38:49 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:39:07 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:39:09 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:39:26 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:39:28 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:39:46 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:39:48 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:40:06 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:40:08 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:40:27 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:40:30 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:40:48 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:40:51 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:41:10 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:41:12 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:41:32 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:41:34 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:41:54 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:41:57 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:42:16 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:42:18 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:42:40 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:42:42 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:43:00 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:43:02 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:43:20 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:43:21 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:43:39 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:43:42 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:43:59 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:44:01 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:44:19 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:44:22 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:44:40 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:44:42 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:45:00 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:45:02 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:45:20 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:45:21 WARN BlockManager: Asked to remove block broadcast_159, which does not exist
23/04/22 10:45:21 WARN BlockManager: Asked to remove block broadcast_159_piece0, which does not exist
23/04/22 10:45:23 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:45:40 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:45:43 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:46:01 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:46:03 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:46:21 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:46:23 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:46:41 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:46:43 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:47:02 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:47:04 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:47:26 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:47:28 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:47:50 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:47:59 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 128:>                                                        (0 + 3) / 4]

23/04/22 10:48:16 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:48:18 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:48:35 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:48:38 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:48:55 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:48:58 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:49:15 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:49:17 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:49:35 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:49:37 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:49:54 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:49:57 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:50:15 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:50:17 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 142:>                                                        (0 + 4) / 4]

23/04/22 10:50:35 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:50:37 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:50:54 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:50:57 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:51:14 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:51:16 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:51:33 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:51:36 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:51:54 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:51:57 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:52:15 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:52:17 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:52:35 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:52:37 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:52:54 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:52:57 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:53:16 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:53:19 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:53:37 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:53:39 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:53:58 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:54:01 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:54:18 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:54:20 WARN BlockManager: Asked to remove block broadcast_237_piece4, which does not exist
23/04/22 10:54:20 WARN BlockManager: Asked to remove block broadcast_237_piece3, which does not exist
23/04/22 10:54:20 WARN BlockManager: Asked to remove block broadcast_237_piece1, which does not exist
23/04/22 10:54:20 WARN BlockManager: Asked to remove block broadcast_237_piece2, which does not exist
23/04/22 10:54:21 WARN BlockManagerMaster: Failed to remove broadcast 237 with removeFromMaster = true - Block broadcast_237_piece2 does not exist
org.apache.spark.SparkException: Block broadcast_237_piece2 does not exist
	at org.apache.spark.errors.SparkCoreErrors$.blockDoesNotExistError(SparkCoreErrors.scala:234)
	at org.apache.spark.storage.BlockInfoManager.blockInfo(BlockInfoManager.scala:237)
	at org.apache.spark.storage.BlockInfoManager.removeBlock(BlockInfoManager.scala:500)
	at org.apache.spark.storage.BlockManager.removeBlockInternal(BlockManager.scala:2011)
	at org.apache.spa



23/04/22 10:54:39 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:54:41 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:54:59 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:55:04 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:55:22 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:55:24 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:55:41 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:55:44 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:56:02 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:56:05 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:56:22 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:56:25 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:56:45 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:56:47 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:57:06 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:57:09 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:57:28 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:57:30 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:57:47 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:57:49 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:58:10 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:58:14 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:58:32 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:58:34 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:58:53 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:58:57 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:59:17 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:59:19 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:59:36 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:59:39 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 10:59:55 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 10:59:58 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 11:00:15 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 11:00:18 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 11:00:37 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 11:00:38 WARN BlockManager: Asked to remove block broadcast_291_piece4, which does not exist
23/04/22 11:00:38 WARN BlockManager: Asked to remove block broadcast_291, which does not exist
23/04/22 11:00:38 WARN BlockManager: Asked to remove block broadcast_291_piece2, which does not exist
23/04/22 11:00:38 WARN BlockManager: Asked to remove block broadcast_291_piece0, which does not exist
23/04/22 11:00:39 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 11:00:56 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 11:00:58 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 11:01:21 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 11:01:23 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 11:01:44 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 11:01:46 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 11:02:06 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 11:02:09 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 11:02:27 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 11:02:31 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 11:02:58 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 11:03:00 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 11:03:17 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 11:03:19 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 11:03:44 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 11:03:47 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 11:04:07 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/22 11:04:18 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/22 11:04:36 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

In [18]:
# Persist ML_model to HDFS; overwrite() replaces whatever is already stored at the path
pipelinePath = 'hdfs:///Dat500_Group09/result/ML_model'
model_writer = ML_model.write().overwrite()
model_writer.save(pipelinePath)

                                                                                

23/04/21 04:13:20 WARN TaskSetManager: Stage 229 contains a task of very large size (4849 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/04/21 04:13:22 WARN TaskSetManager: Stage 233 contains a task of very large size (4184 KiB). The maximum recommended task size is 1000 KiB.
23/04/21 04:13:23 WARN TaskSetManager: Stage 237 contains a task of very large size (16725 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [None]:
# load the saved machine learning model
#from pyspark.ml import PipelineModel
#savedPipelineModel = PipelineModel.load(pipelinePath)


In [16]:
# Make predictions on the testing data.
# transform() returns a new DataFrame; the cells below read its "id",
# "main_category", "label" and "prediction" columns.
df_Prediction = ML_model.transform(testData)

In [17]:
# Keep only the paper id, the original main_category (as text and as indexed
# label) and the predicted label, then preview the first three rows
output_columns = ["id", "main_category", "label", "prediction"]
df_Prediction = df_Prediction.select(*output_columns)
df_Prediction.show(n=3)

23/04/21 05:35:45 WARN DAGScheduler: Broadcasting large task binary with size 22.7 MiB


[Stage 223:>                                                        (0 + 1) / 1]

+----------+----------------+-----+----------+
|        id|   main_category|label|prediction|
+----------+----------------+-----+----------+
|2210.02287|computer science|  2.0|       2.0|
|2210.02293|         physics|  0.0|       0.0|
|2210.02298|         physics|  0.0|       5.0|
+----------+----------------+-----+----------+
only showing top 3 rows



                                                                                

In [18]:
# Evaluate the model using accuracy (metricName="accuracy"), i.e. how often
# "prediction" matches "label" in df_Prediction. Note: this is NOT the F1 score;
# pass metricName="f1" to the evaluator if F1 is wanted instead.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(df_Prediction)
print("accuracy = %g" % accuracy)

23/04/21 05:35:57 WARN DAGScheduler: Broadcasting large task binary with size 22.7 MiB




accuracy = 0.859028


                                                                                

In [30]:
# Measure classification accuracy of the predictions against the indexed labels
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="label",
)
accuracy = evaluator.evaluate(df_Prediction)
print("Accuracy = %g" % accuracy)

23/04/18 16:34:43 WARN DAGScheduler: Broadcasting large task binary with size 22.7 MiB




Accuracy = 0.848034


                                                                                

In [19]:
# Upsert the predictions into the Delta table at delta_table_path:
#   - no Delta table at the path yet -> write df_Prediction as a new table
#   - table already present          -> merge df_Prediction into it, keyed on id
if not DeltaTable.isDeltaTable(spark, delta_table_path):
    # The path does not hold a Delta table, so create it from the predictions.
    # Alternative kept for reference: add .partitionBy("main_category") before save().
    print("Create delta table first time")
    df_Prediction.write.format("delta").save(delta_table_path)
else:
    print("update delta table")

    # Columns refreshed on rows whose id is already in the table
    update_columns = {
        "label": "updates.label",
        "prediction": "updates.prediction",
    }
    # Columns written for ids not yet in the table (all four output columns)
    insert_columns = {
        "id": "updates.id",
        "main_category": "updates.main_category",
        **update_columns,
    }

    deltaTable = DeltaTable.forPath(spark, delta_table_path)
    # Alternative match condition kept for reference:
    #   "target.id = updates.id and target.main_category = updates.main_category"
    merge_builder = deltaTable.alias("target").merge(
        source=df_Prediction.alias("updates"),
        condition="target.id = updates.id",
    )
    merge_builder = merge_builder.whenMatchedUpdate(set=update_columns)
    merge_builder = merge_builder.whenNotMatchedInsert(values=insert_columns)
    merge_builder.execute()

                                                                                

update delta table
23/04/21 05:40:32 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

23/04/21 05:40:54 WARN DAGScheduler: Broadcasting large task binary with size 22.7 MiB




23/04/21 05:41:37 WARN DAGScheduler: Broadcasting large task binary with size 22.7 MiB


                                                                                

In [21]:
# Read the Delta table back from delta_table_path and preview ten rows
delta_reader = spark.read.format("delta")
df = delta_reader.load(delta_table_path)
df.show(n=10)

                                                                                

+---------+----------------+-----+----------+
|       id|   main_category|label|prediction|
+---------+----------------+-----+----------+
|0704.0079|     mathematics|  1.0|       1.0|
|0704.0153|         physics|  0.0|       1.0|
|0704.0192|         physics|  0.0|       0.0|
|0704.0202|         physics|  0.0|       0.0|
|0704.0217|computer science|  2.0|       2.0|
|0704.0233|         physics|  0.0|       0.0|
|0704.0280|         physics|  0.0|       0.0|
|0704.0304|computer science|  2.0|       5.0|
|0704.0306|         physics|  0.0|       0.0|
|0704.0451|         physics|  0.0|       0.0|
+---------+----------------+-----+----------+
only showing top 10 rows



In [22]:
# Total number of rows in df (the DataFrame loaded from delta_table_path above)
df.count()

440363