In [1]:
import sys
import pyspark
import time
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import col, split, when, count
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, LinearSVC, NaiveBayes
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import StringIndexer, RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import concat_ws
from delta import DeltaTable, configure_spark_with_delta_pip

# ---- Run configuration -------------------------------------------------
# Input files (glob over the part* files) and the Delta table location.
filepath = 'hdfs:///Dat500_Group09/output_meta/part*'
delta_table_path = "hdfs:///Dat500_Group09/result/arxiv_meta"

# Tunables read by later cells:
#   partition  - when > 0 the loaded DataFrame is repartitioned to this count
#   test_ratio - fraction of rows assigned to the test split
#   bprint     - 1 prints the train/test sizes, any other value skips it
partition = 0
test_ratio = 0.2
bprint = 1


#filepath = 'hdfs:///Dat500_Group09/output_meta/part*'
#delta_table_path = "hdfs:///Dat500_Group09/result/arxiv_sample"   

 

In [2]:
def Create_ML_pipline(ML_model = "LR"):
  """Build the unfitted text-classification pipeline.

  Stages, in order: index ``main_category`` into a numeric ``label``,
  split ``text`` on non-word characters, drop stop words, turn the tokens
  into TF-IDF ``features`` (CountVectorizer followed by IDF), and finally
  the selected classifier.

  Args:
    ML_model: Case-sensitive classifier code. ``"LR"`` (default) selects
      logistic regression, ``"RF"`` a random forest and ``"NB"`` a
      multinomial naive Bayes model.

  Returns:
    A ``Pipeline`` that still has to be fitted.

  Raises:
    ValueError: If ``ML_model`` is not one of the supported codes.
  """
  # Validate first: an unknown code used to leave the classifier variable
  # unassigned and fail later with an unrelated UnboundLocalError.
  supported_models = ("LR", "RF", "NB")
  if ML_model not in supported_models:
    raise ValueError(
      f"Unsupported ML_model {ML_model!r}; expected one of {supported_models}"
    )

  # Convert the main_category column to numeric using StringIndexer
  labelIndexer = StringIndexer(inputCol="main_category", outputCol="label")

  # Tokenize on every non-word character
  regexTokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W")

  # Remove stop words from the token list
  stopWordsRemover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")

  # TF-IDF: term counts first, then inverse-document-frequency weighting
  countVectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="vectorize_features")
  idf = IDF(inputCol="vectorize_features", outputCol="features")

  if ML_model == 'LR':    # logistic regression
    classifier = LogisticRegression(featuresCol="features", labelCol="label", maxIter=100)
  elif ML_model == 'RF':  # random forest
    classifier = RandomForestClassifier(numTrees=500, maxDepth=5, labelCol="label", featuresCol="features")
  else:                   # 'NB': multinomial naive Bayes
    classifier = NaiveBayes(modelType="multinomial", labelCol="label", featuresCol="features")

  # Assemble the stages in execution order
  pipeline = Pipeline(stages=[labelIndexer, regexTokenizer, stopWordsRemover, countVectorizer, idf, classifier])

  return pipeline


In [3]:
  # Set the configuration properties for Delta tables
# Session builder for the YARN cluster with the Delta Lake SQL extension
# and catalog registered, and 2g of driver memory.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("Arxiv_Classification")
    .master('yarn')
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config('spark.driver.memory', '2g')
)

# Let the delta helper finish the builder configuration, then start the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
  

:: loading settings :: url = jar:file:/home/ubuntu/.local/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-454edaf2-fbb5-45ff-800a-32e512fd7a99;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.3.0 in central
	found io.delta#delta-storage;2.3.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 209ms :: artifacts dl 10ms
	:: modules in use:
	io.delta#delta-core_2.12;2.3.0 from central in [default]
	io.delta#delta-storage;2.3.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   

23/04/21 02:16:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/21 02:17:02 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
23/04/21 02:17:13 WARN Client: Same path resource file:///home/ubuntu/.ivy2/jars/io.delta_delta-core_2.12-2.3.0.jar added multiple times to distributed cache.
23/04/21 02:17:13 WARN Client: Same path resource file:///home/ubuntu/.ivy2/jars/io.delta_delta-storage-2.3.0.jar added multiple times to distributed cache.
23/04/21 02:17:13 WARN Client: Same path resource file:///home/ubuntu/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar added multiple times to distributed cache.


In [8]:
#spark.stop()

In [4]:
# Schema of the arXiv metadata: seven columns, each a nullable string.
_arxiv_field_names = (
    "id",
    "authors",
    "title",
    "abstract",
    "journal_ref",
    "category",
    "update_date",
)
dbschema = StructType(
    [StructField(field_name, StringType(), True) for field_name in _arxiv_field_names]
)

In [5]:
# Load the "::"-delimited, header-less metadata files into a DataFrame.
# NOTE(review): `schema=dbschema` is handed over as a reader *option* rather
# than through `.schema(...)`. The rename cell further down still addresses
# the columns as _c0.._c6, so the schema does not appear to be applied here.
# Confirm before switching to `.schema(dbschema)`, since that would change
# the column names the rename cell expects.
try:
    arxiv_df = spark.read.options(delimiter="::", header=False, schema=dbschema).csv(filepath)
except Exception:
    # Say which path failed, then re-raise: swallowing the error would leave
    # `arxiv_df` undefined and make later cells fail with a misleading NameError.
    print(f"Error: Could not read the data for this file {filepath}")
    raise

                                                                                

In [7]:
# Repartition only when an explicit partition count was configured;
# otherwise keep the layout produced by the reader.
arxiv_df = arxiv_df.repartition(partition) if partition > 0 else arxiv_df
current_partitions = arxiv_df.rdd.getNumPartitions()
print("The number of partition for the Data:", current_partitions)

The number of partition for the Data: 27


In [8]:
# Report the configured executor count. A default is supplied because the
# key is not guaranteed to be set (for example when executors are allocated
# dynamically), and a lookup without a default raises in that case.
executer = spark.conf.get("spark.executor.instances", "not set")
print("number of executer in the cluster:",executer)

number of executer in the cluster 12


In [6]:
from pyspark import SparkConf

# Print every key/value pair held by a newly constructed SparkConf.
# NOTE(review): this builds a fresh SparkConf instead of asking the running
# session for its configuration - confirm it reports what is intended.
conf = SparkConf()
config_map = conf.getAll()
for entry in config_map:
    print(f"{entry[0]}: {entry[1]}")

spark.yarn.dist.jars: file:///home/ubuntu/.ivy2/jars/io.delta_delta-core_2.12-2.3.0.jar,file:///home/ubuntu/.ivy2/jars/io.delta_delta-storage-2.3.0.jar,file:///home/ubuntu/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar
spark.submit.pyFiles: /home/ubuntu/.ivy2/jars/io.delta_delta-core_2.12-2.3.0.jar,/home/ubuntu/.ivy2/jars/io.delta_delta-storage-2.3.0.jar,/home/ubuntu/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar
spark.app.name: Arxiv_Classification
spark.ui.proxyBase: /proxy/application_1679580022279_0108
spark.jars.packages: io.delta:delta-core_2.12:2.3.0
spark.master: yarn
spark.sql.extensions: io.delta.sql.DeltaSparkSessionExtension
spark.yarn.isPython: true
spark.submit.deployMode: client
spark.yarn.dist.pyFiles: file:///home/ubuntu/.ivy2/jars/io.delta_delta-core_2.12-2.3.0.jar,file:///home/ubuntu/.ivy2/jars/io.delta_delta-storage-2.3.0.jar,file:///home/ubuntu/.ivy2/jars/org.antlr_antlr4-runtime-4.8.jar
spark.app.submitTime: 1682043419069
spark.repl.local.jars: file:///home/ubuntu/.ivy2

In [10]:
# Report whether Adaptive Query Execution is switched on.
# The already-running `spark` session is used directly: requesting another
# session named "MyApp" from the builder only returned the existing one
# (Spark logs "Using an existing Spark session") under a misleading name.
aqe_enabled = spark.conf.get("spark.sql.adaptive.enabled")
print(f"Adaptive Query Execution is {'enabled' if aqe_enabled == 'true' else 'disabled'}")


23/04/18 16:05:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
Adaptive Query Execution is enabled


In [7]:
# Give the positional reader columns (_c0 .. _c6) their arXiv metadata names.
renamed_columns = [
    col(f"_c{position}").alias(field_name)
    for position, field_name in enumerate(
        ["id", "authors", "title", "abstract", "journal_ref", "category", "update_date"]
    )
]
arxiv_df = arxiv_df.select(*renamed_columns)

arxiv_df.printSchema()


root
 |-- id: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- journal_ref: string (nullable = true)
 |-- category: string (nullable = true)
 |-- update_date: string (nullable = true)



In [8]:
# Derive a readable main category from the part of `category` before the
# first dot; anything that is not one of the listed prefixes becomes "physics".
archive_prefix = split(arxiv_df.category, "\\.")[0]

main_category_expr = when(archive_prefix == "cs", "computer science")
for prefix_code, category_label in (
    ("math", "mathematics"),
    ("econ", "economics"),
    ("eess", "electrical engineering"),
    ("q-bio", "quantitative biology"),
    ("q-fin", "quantitative finance"),
    ("stat", "statistics"),
):
    main_category_expr = main_category_expr.when(archive_prefix == prefix_code, category_label)

arxiv_df = arxiv_df.withColumn("main_category", main_category_expr.otherwise("physics"))

In [13]:
# Preview the first three rows without truncating long cell values.
arxiv_df.show(n=3, truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+----------+--------------------------------------------------+----------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+---------------+------------+----------------+
|id        |authors                                           |title                                               |abs

                                                                                

In [9]:
# Keep only the columns the machine-learning pipeline needs.
ml_columns = ["id", "title", "abstract", "main_category"]
clean_arxiv_df = arxiv_df.select(*ml_columns)

clean_arxiv_df.show(n=3, truncate=False)

+----------+----------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+
|id        |title                                               |abstract                                                                                                                                          

In [10]:
# Build the model input: title and abstract joined by a single space.
title_and_abstract = concat_ws(' ', col('title'), col('abstract'))
clean_arxiv_df = clean_arxiv_df.withColumn('text', title_and_abstract)


In [15]:
# Build the pipeline with the default classifier code, spelled out explicitly.
pipeline = Create_ML_pipline(ML_model="LR")

In [26]:

# Delta table locations for the two splits.
TrainingFile = '/Dat500_Group09/input/training'
TestingFile = '/Dat500_Group09/input/testing'

# Seeded random split into training and testing sets.
test_ratio = 0.2
split_weights = [1-test_ratio, test_ratio]
trainingData, testData = clean_arxiv_df.randomSplit(split_weights, seed=24)

# Write the training split as a Delta table only if none exists at that path yet.
training_table_exists = DeltaTable.isDeltaTable(spark, TrainingFile)
if not training_table_exists:
    trainingData.write.format("delta").save(TrainingFile)

In [11]:
# Seeded train/test split; the seed keeps the split reproducible across runs.
trainingData, testData = clean_arxiv_df.randomSplit(
    weights=[1-test_ratio, test_ratio],
    seed=24,
)

In [12]:
# Optional report of the split sizes; each count() triggers a Spark job.
if bprint == 1:
    separator = "="*100
    print(separator)
    for size_label, split_df in (
        ("Training Data size: ", trainingData),
        ("Testing Data size: ", testData),
    ):
        print(size_label, split_df.count())
    print(separator)



                                                                                

Training Data size:  1763556




Testing Data size:  440363


                                                                                

In [16]:
# Fit every pipeline stage on the training split; the result is the fitted model.
ML_model = pipeline.fit(dataset=trainingData)

                                                                                

23/04/21 02:37:35 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB




23/04/21 02:41:38 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB


                                                                                

23/04/21 02:41:41 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:45:59 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:46:01 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:51:05 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:51:07 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/04/21 02:51:07 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/04/21 02:51:07 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/04/21 02:51:07 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
23/04/21 02:51:08 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:51:48 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:51:50 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:52:29 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:52:31 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:53:12 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:53:14 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:53:54 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:53:56 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:54:37 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:54:39 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:55:20 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:55:23 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:56:03 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:56:05 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:56:45 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:56:47 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:57:27 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:57:30 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:58:09 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:58:11 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:58:51 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:58:53 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 02:59:33 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 02:59:35 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:00:16 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:00:18 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:00:58 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:01:00 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:01:40 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:01:42 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:02:22 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:02:24 WARN BlockManager: Asked to remove block broadcast_76, which does not exist
23/04/21 03:02:24 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:03:05 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:03:07 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:03:47 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:03:50 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:04:29 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:04:31 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:05:11 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:05:13 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:05:53 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:05:55 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 64:>                                                         (0 + 2) / 4]

23/04/21 03:06:35 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:06:37 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:07:16 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:07:19 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:07:59 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:08:01 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:08:42 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:08:44 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:09:23 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:09:26 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:10:06 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:10:09 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:10:48 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:10:51 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:11:30 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:11:33 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:12:13 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:12:15 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:12:54 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:12:57 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:13:37 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:13:39 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:14:19 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:14:21 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 88:>                                                         (0 + 2) / 4]

23/04/21 03:15:01 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:15:03 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:15:43 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:15:45 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:16:25 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:16:27 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:17:06 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:17:08 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:17:48 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:17:50 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:18:30 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:18:32 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:19:11 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:19:13 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:19:53 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:19:56 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:20:35 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:20:38 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:21:17 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:21:19 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 108:>                                                        (0 + 2) / 4]

23/04/21 03:22:00 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:22:02 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:22:41 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:22:44 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:23:24 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:23:26 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 114:>                                                        (0 + 2) / 4]

23/04/21 03:24:06 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:24:09 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:24:48 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:24:51 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:25:31 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:25:33 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:26:13 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:26:15 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:26:55 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:26:57 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 124:>                                                        (0 + 2) / 4]

23/04/21 03:27:37 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:27:39 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:28:19 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:28:21 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:29:01 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:29:03 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:29:43 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:29:46 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:30:25 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:30:28 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:31:08 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:31:10 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:31:49 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:31:52 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:32:31 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:32:34 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:33:14 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:33:16 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:33:56 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:33:59 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:34:38 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:34:42 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:35:21 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:35:23 WARN BlockManager: Asked to remove block broadcast_217_piece3, which does not exist
23/04/21 03:35:24 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 148:>                                                        (0 + 2) / 4]

23/04/21 03:36:04 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:36:06 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:36:46 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:36:48 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 152:>                                                        (0 + 0) / 4]

23/04/21 03:37:29 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:37:31 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:38:10 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:38:13 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:38:52 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:38:54 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:39:34 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:39:37 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:40:16 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:40:19 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:40:58 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:41:01 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:41:41 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:41:43 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 166:>                                                        (0 + 2) / 4]

23/04/21 03:42:23 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:42:26 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:43:05 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:43:07 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:43:47 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:43:49 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:44:28 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:44:32 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:45:12 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:45:14 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:45:53 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:45:55 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:46:35 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:46:38 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:47:18 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:47:20 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:48:00 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:48:02 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 184:>                                                        (0 + 2) / 4]

23/04/21 03:48:42 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:48:44 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:49:24 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:49:27 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:50:06 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:50:08 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:50:47 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:50:50 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:51:30 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:51:32 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:52:11 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:52:14 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 196:>                                                        (0 + 2) / 4]

23/04/21 03:52:54 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:52:56 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:53:36 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:53:38 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:54:17 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:54:19 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:54:59 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:55:02 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:55:41 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:55:44 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 206:>                                                        (0 + 2) / 4]

23/04/21 03:56:23 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:56:26 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:57:05 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:57:08 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 210:>                                                        (0 + 2) / 4]

23/04/21 03:57:48 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:57:51 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:58:31 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:58:33 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:59:12 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:59:15 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 03:59:55 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 03:59:57 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB




23/04/21 04:00:37 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

23/04/21 04:00:39 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 220:>                                                        (0 + 2) / 4]

23/04/21 04:01:19 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


                                                                                

In [18]:
# Persist the model held in `ML_model` to HDFS; overwrite() replaces whatever
# was previously saved under the same path.
pipelinePath = 'hdfs:///Dat500_Group09/result/ML_model'
model_writer = ML_model.write().overwrite()
model_writer.save(pipelinePath)

                                                                                

23/04/21 04:13:20 WARN TaskSetManager: Stage 229 contains a task of very large size (4849 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/04/21 04:13:22 WARN TaskSetManager: Stage 233 contains a task of very large size (4184 KiB). The maximum recommended task size is 1000 KiB.
23/04/21 04:13:23 WARN TaskSetManager: Stage 237 contains a task of very large size (16725 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [None]:
# Optional: load the saved machine learning model back from `pipelinePath`.
# Disabled on purpose; uncomment both lines below to use it.
#from pyspark.ml import PipelineModel
#savedPipelineModel = PipelineModel.load(pipelinePath)


In [19]:
# Make predictions on the testing data.
# transform() returns a new DataFrame; the following cell reads the "label" and
# "prediction" columns from it.
df_Prediction = ML_model.transform(testData)

In [20]:
# Narrow the predictions down to the article id, the original category (as text
# and as its indexed `label`) and the predicted label, then preview three rows.
result_columns = ["id", "main_category", "label", "prediction"]
df_Prediction = df_Prediction.select(*result_columns)
df_Prediction.show(3)

23/04/21 04:17:30 WARN DAGScheduler: Broadcasting large task binary with size 22.7 MiB


[Stage 240:>                                                        (0 + 1) / 1]

+----------+----------------+-----+----------+
|        id|   main_category|label|prediction|
+----------+----------------+-----+----------+
|2210.02287|computer science|  2.0|       2.0|
|2210.02293|         physics|  0.0|       0.0|
|2210.02298|         physics|  0.0|       5.0|
+----------+----------------+-----+----------+
only showing top 3 rows



                                                                                

In [21]:
# Evaluate the model using accuracy: the evaluator is configured with
# metricName="accuracy" (not F1) and compares `prediction` against `label`.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(df_Prediction)
print("accuracy = %g" % accuracy)

23/04/21 04:17:52 WARN DAGScheduler: Broadcasting large task binary with size 22.7 MiB




accuracy = 0.857801


                                                                                

In [30]:
# Evaluate the performance of the model: accuracy of `prediction` against `label`.
# NOTE(review): this repeats the accuracy evaluation of the preceding cell and its
# execution count is out of sequence, so the output recorded below it may come
# from a different run -- confirm, and consider dropping one of the two cells.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(df_Prediction)
print("Accuracy = %g" % accuracy)

23/04/18 16:34:43 WARN DAGScheduler: Broadcasting large task binary with size 22.7 MiB




Accuracy = 0.848034


                                                                                

In [22]:
# Persist the predictions to the Delta table at `delta_table_path`:
#   * no Delta table there yet -> create it from the current predictions
#   * table already exists     -> upsert (merge) the predictions, keyed on `id`
if not DeltaTable.isDeltaTable(spark, delta_table_path):
    print("Create delta table first time")
    prediction_writer = df_Prediction.write.format("delta")
    # Variant kept for reference: partition the table with
    # prediction_writer.partitionBy("main_category") before saving.
    prediction_writer.save(delta_table_path)
else:
    print("update delta table")
    deltaTable = DeltaTable.forPath(spark, delta_table_path)

    # Rows are paired on `id` only; a stricter variant would additionally
    # require target.main_category = updates.main_category.
    match_on_id = "target.id = updates.id"

    # For ids already present in the table, refresh label and prediction.
    refreshed_columns = {
        "label": "updates.label",
        "prediction": "updates.prediction",
    }
    # For ids not yet present, insert the complete row.
    inserted_columns = {
        "id": "updates.id",
        "main_category": "updates.main_category",
        "label": "updates.label",
        "prediction": "updates.prediction",
    }

    merge_builder = (
        deltaTable.alias("target")
        .merge(source=df_Prediction.alias("updates"), condition=match_on_id)
        .whenMatchedUpdate(set=refreshed_columns)
        .whenNotMatchedInsert(values=inserted_columns)
    )
    merge_builder.execute()

Create delta table first time
23/04/21 04:24:54 WARN DAGScheduler: Broadcasting large task binary with size 23.0 MiB


                                                                                

23/04/21 04:26:31 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [23]:
# Read the Delta table back from `delta_table_path` and preview its first ten rows.
delta_reader = spark.read.format("delta")
df = delta_reader.load(delta_table_path)
df.show(10)

                                                                                

+----------+----------------+-----+----------+
|        id|   main_category|label|prediction|
+----------+----------------+-----+----------+
|1710.08686|         physics|  0.0|       0.0|
|1710.08690|         physics|  0.0|       0.0|
|1710.08696|         physics|  0.0|       0.0|
|1710.08697|     mathematics|  1.0|       1.0|
|1710.08709|         physics|  0.0|       0.0|
|1710.08710|         physics|  0.0|       0.0|
|1710.08718|         physics|  0.0|       0.0|
|1710.08721|computer science|  2.0|       4.0|
|1710.08722|     mathematics|  1.0|       1.0|
|1710.08724|     mathematics|  1.0|       1.0|
+----------+----------------+-----+----------+
only showing top 10 rows



In [24]:
# Number of rows in `df`, the DataFrame loaded from the Delta table.
df.count()

440363

In [23]:
# NOTE(review): same expression as the preceding cell, but with an earlier
# execution count and a different recorded result -- one of the two outputs is
# probably left over from another run; confirm which one reflects the current table.
df.count()

                                                                                

880472