In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

In [13]:
from kafka import KafkaConsumer, KafkaProducer
import json
import numpy as np
import pandas as pd
import os
from transformers import AutoModelForTokenClassification, AutoTokenizer
import pickle

from pathlib import Path
import sys
sys.path.append(r'F:\Studies\Third_year\Big_data\Final_Code')
from final_src.config import Config

In [18]:
import final_src

In [20]:
scala_version = '2.12'
spark_version = '3.5.5'

packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}',
    'org.apache.kafka:kafka-clients:2.8.0'
]
spark = SparkSession.builder.master("local").appName("NER_ABSA_Streaming")\
                    .config("spark.jars.packages", ",".join(packages)) \
                    .config("spark.driver.memory", "4g") \
                    .config("spark.executor.memory", "4g") \
                    .getOrCreate()
spark

In [21]:
drinkplaces_aspects = ['DRINK#QUALITY',
 'DRINK#VARIETY',
 'ENVIRONMENT#CLEANLINESS',
 'ENVIRONMENT#AMBIENCE',
 'FOOD#QUALITY',
 'LOCATION',
 'PRICE',
 'SERVICE#ORDER',
 'SERVICE#STAFF']

hotels_aspects = ['HOTEL#LOCATION',
 'HOTEL#QUALITY',
 'HOTEL#FACILITIES',
 'HOTEL#STYLE',
 'WIFI',
 'PRICE',
 'ROOM#QUALITY',
 'ROOM#STYLE',
 'ROOM#FACILITIES',
 'ROOM#SOUND',
 'ROOM#VIEW',
 'ROOM#ATMOSPHERE',
 'ROOM#CLEANLINESS',
 'SERVICE#STAFF',
 'SERVICE#CHECKIN']

restaurants_aspects = ['LOCATION',
 'PRICE',
 'FOOD#QUALITY',
 'FOOD#VARIETY',
 'FOOD#PRESENTATION',
 'FOOD#FRESHNESS',
 'DRINK#QUALITY',
 'ENVIRONMENT#CLEANLINESS',
 'ENVIRONMENT#AMBIENCE',
 'SERVICE#STAFF',
 'SERVICE#ORDER']

eateries_aspects = ['LOCATION',
 'PRICE',
 'FOOD#QUALITY',
 'FOOD#VARIETY',
 'DRINK#QUALITY',
 'DRINK#VARIETY',
 'ENVIRONMENT#CLEANLINESS',
 'ENVIRONMENT#AMBIENCE',
 'SERVICE#STAFF',
 'SERVICE#ORDER']

attractions_aspects = ['LOCATION',
 'PRICE',
 'SERVICE#STAFF',
 'ENVIRONMENT#SCENERY',
 'ENVIRONMENT#ATMOSPHERE',
 'EXPERIENCE#ACTIVITY']

rents_aspects = ['LOCATION', 'PRICE', 'SERVICE#RENTING', 'SERVICE#STAFF', 'VEHICLE#QUALITY']

tours_aspects = ['LOCATION',
 'PRICE',
 'SERVICE#STAFF',
 'EXPERIENCE#ACTIVITY',
 'ENVIRONMENT#SCENERY',
 'ENVIRONMENT#ATMOSPHERE']

campings_aspects = ['LOCATION#DISTANCE',
 'LOCATION#ACCESSIBILITY',
 'SERVICE#STAFF',
 'ENVIRONMENT#SCENERY',
 'ENVIRONMENT#WEATHER',
 'ENVIRONMENT#ATMOSPHERE']

sentiment_map = {
    1: 'NEGATIVE',
    2: 'NEUTRAL',
    3: 'POSITIVE'
}

In [22]:
import torch
label_list = ['B-TOUR', 'B-RENT', 'B-RESTAURANT', 'I-RESTAURANT', 'B-ATTRACTION', 'I-TOUR', 'I-EATERY', 'O', 'B-HOTEL', 'I-HOTEL', 'I-ATTRACTION', 'I-CAMPING', 'B-EATERY', 'B-DRINKPLACE', 'I-RENT', 'B-CAMPING', 'I-DRINKPLACE']
label_map = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

def merge_subwords(tokens, labels):
    merged_tokens = []
    merged_labels = []

    current_token = ""
    current_label = None

    for token, label in zip(tokens, labels):
        if token.endswith('@@'):  # Subword token
            current_token += token[:-2]  # Bỏ @@ và nối vào
            if current_label is None:
                current_label = label  # Lấy label đầu tiên (B- hoặc I-)
        else:
            current_token += token  # Token đầy đủ, nối vào
            if current_label is None:
                current_label = label
            merged_tokens.append(current_token)
            merged_labels.append(current_label)
            current_token = ""
            current_label = None

    # Nếu còn token cuối
    if current_token:
        merged_tokens.append(current_token)
        merged_labels.append(current_label)

    return merged_tokens, merged_labels


def predict_ner(text, model, tokenizer, id2label):
    # Tokenize
    # text = text.lower()
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        is_split_into_words=False
    )

    # Predict
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

    # Decode tokens và labels
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = [id2label[label_id.item()] for label_id in predictions[0]]
    # print(labels)
    new_tokens, new_lables = merge_subwords(tokens[1:-1], labels[1:-1])
    # Hiển thị sạch
    # for token, label in zip(new_tokens, new_lables):
    #     # token_clean = token.replace("▁", "") if "▁" in token else token
    #     # print(f"{token_clean}\t{label}")
    #     print(f"{token}\t{label}")
    return new_tokens, new_lables

def extract_location_and_domain(tokens, labels):
    target_labels = ['B-TOUR', 'B-RENT', 'B-RESTAURANT', 
                     'B-ATTRACTION', 'B-HOTEL', 'B-EATERY',
                     'B-DRINKPLACE', 'B-CAMPING']
    
    extracted_tokens = []
    domain = None
    extracting = False

    for token, label in zip(tokens, labels):
        if not extracting:
            if label in target_labels:
                extracted_tokens.append(token)
                domain = label.replace('B-', '')  # Lấy domain
                extracting = True
        else:
            if label.startswith('I-'):
                extracted_tokens.append(token)
            else:
                break  # Gặp O hoặc B- khác thì dừng

    if extracted_tokens:
        text = ' '.join(extracted_tokens)
        return {"text": text, "domain": domain}
    else:
        return {"text": None, "domain": None}


In [23]:
def decode_prediction(pred_array, aspects, sentiment_map):
    pred_array = np.array(pred_array).flatten() 

    result = {}
    for aspect, sentiment_id in zip(aspects, pred_array):
        if sentiment_id != 0: 
            result[aspect] = sentiment_map[sentiment_id]

    return result

# decode_prediction(ypred_single, aspects, sentiment_map)

In [24]:
def save_to_csv(output_csv_path, text, place_extracted, domain_extracted, aspect_result):

    record_result = {
        'text': text,
        'place_extracted': place_extracted,
        'domain_extracted': domain_extracted,
        'aspect_result': json.dumps(aspect_result, ensure_ascii=False)
    }

    df_new = pd.DataFrame([record_result])

    if not os.path.exists(output_csv_path):
        df_new.to_csv(output_csv_path, index=False)
    else:
        df_new.to_csv(output_csv_path, mode='a', header=False, index=False)

In [25]:
config = Config()

In [26]:
import json

def ner_absa_udf(text):
    global ner_model, ner_tokenizer
    global attractions_model, attractions_vectorizer
    global hotels_model, hotels_vectorizer
    global restaurants_model, restaurants_vectorizer
    global drinkplaces_model, drinkplaces_vectorizer
    global eateries_model, eateries_vectorizer
    global rents_model, rents_vectorizer
    global tours_model, tours_vectorizer
    global campings_model, campings_vectorizer

    # Load mô hình NER nếu chưa có
    if 'ner_model' not in globals():
        from transformers import AutoModelForTokenClassification, AutoTokenizer
        config = Config()
        ner_model = AutoModelForTokenClassification.from_pretrained(config.MODEL_NER_PATH)
        ner_tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NER_PATH)

    # Load từng mô hình ABSA nếu chưa có
    def load_absa_model(path, global_name_prefix):
        if f'{global_name_prefix}_model' not in globals():
            import pickle
            with open(path, 'rb') as f:
                package = pickle.load(f)
                globals()[f'{global_name_prefix}_model'] = package['model']
                globals()[f'{global_name_prefix}_vectorizer'] = package['vectorizer']

    config = Config()
    load_absa_model(config.ATTRACTIONS_ABSA_MODEL, 'attractions')
    load_absa_model(config.HOTELS_ABSA_MODEL, 'hotels')
    load_absa_model(config.RESTAURANTS_ABSA_MODEL, 'restaurants')
    load_absa_model(config.DRINKPLACES_ABSA_MODEL, 'drinkplaces')
    load_absa_model(config.EATERIES_ABSA_MODEL, 'eateries')
    load_absa_model(config.RENTS_ABSA_MODEL, 'rents')
    load_absa_model(config.TOURS_ABSA_MODEL, 'tours')
    load_absa_model(config.CAMPINGS_ABSA_MODEL, 'campings')

    # Gọi mô hình dự đoán
    predict_tokens, predict_labels = predict_ner(text, ner_model, ner_tokenizer, id2label)
    ner_result = extract_location_and_domain(predict_tokens, predict_labels)
    place_extracted = ner_result['text']
    domain_extracted = ner_result['domain']

    # Xử lý domain và gọi mô hình tương ứng
    domain_mapping = {
        'TOUR': 'tours',
        'RENT': 'rents',
        'RESTAURANT': 'restaurants',
        'ATTRACTION': 'attractions',
        'HOTEL': 'hotels',
        'EATERY': 'eateries',
        'DRINKPLACE': 'drinkplaces',
        'CAMPING': 'campings'
    }

    if domain_extracted in domain_mapping:
        model = globals()[f"{domain_mapping[domain_extracted]}_model"]
        vectorizer = globals()[f"{domain_mapping[domain_extracted]}_vectorizer"]
        vector = vectorizer.transform([text])
        prediction = model.predict(vector)
        aspect_result = decode_prediction(prediction, eval(f"{domain_mapping[domain_extracted]}_aspects"), sentiment_map)
    else:
        place_extracted = ''
        domain_extracted = ''
        aspect_result = {}

    return json.dumps({
        "place": place_extracted,
        "domain": domain_extracted,
        "aspect_result": aspect_result
    }, ensure_ascii=False)


In [27]:
# Đăng ký UDF
ner_absa_predict_udf = udf(ner_absa_udf, StringType())

# Đọc dữ liệu từ Kafka
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", ",".join(config.KAFKA_SERVERS)) \
    .option("subscribe", config.KAFKA_TOPIC_COMMENTS) \
    .option("startingOffsets", "earliest") \
    .load()

In [28]:
df_text = df.selectExpr("CAST(value AS STRING) as json_str")
df_parsed = df_text.selectExpr("from_json(json_str, 'text STRING') as data").select("data.text")

# Dự đoán hàng loạt (batch)
df_with_prediction = df_parsed.withColumn("ner_absa_result", ner_absa_predict_udf(col("text")))


In [29]:
query = df_with_prediction.writeStream \
    .outputMode("append") \
    .format("csv") \
    .option("path", r"F:\Studies\Third_year\Big_data\Final_Code\Result") \
    .option("checkpointLocation", r"F:\Studies\Third_year\Big_data\Final_Code\Checkpoint") \
    .start()

In [31]:
query.awaitTermination()

StreamingQueryException: [STREAM_FAILED] Query [id = 3648b729-3d18-4d5c-b303-be28cf8aca76, runId = 7f3809c9-7543-4869-972a-ee41f92a4aea] terminated with exception: Job aborted due to stage failure: Task 0 in stage 3.0 failed 1 times, most recent failure: Lost task 0.0 in stage 3.0 (TID 3) (LAPTOP-5DQQ5LDG.lan executor driver): org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to F:\Studies\Third_year\Big_data\Final_Code\Result.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:775)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:420)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeWrite$2(FileFormatWriter.scala:252)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "F:\software\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1231, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "F:\software\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1067, in read_udfs
    udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "F:\software\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 529, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "F:\software\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 90, in read_command
    command = serializer._read_with_length(file)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "F:\software\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 174, in _read_with_length
    return self.loads(obj)
           ^^^^^^^^^^^^^^^
  File "F:\software\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 472, in loads
    return cloudpickle.loads(obj, encoding=encoding)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'final_src'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.writeWithIterator(FileFormatDataWriter.scala:91)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:403)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:410)
	... 12 more

Driver stacktrace:

In [None]:
# Đăng ký UDF
ner_absa_predict_udf = udf(ner_absa_udf, StringType())

# Đọc dữ liệu từ Kafka
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", ",".join(config.KAFKA_SERVERS)) \
    .option("subscribe", config.KAFKA_TOPIC_COMMENTS) \
    .option("startingOffsets", "earliest") \
    .load()

# Giải mã và chọn trường text
df_text = df.selectExpr("CAST(value AS STRING) as json_str")
df_parsed = df_text.selectExpr("from_json(json_str, 'text STRING') as data").select("data.text")

# Dự đoán hàng loạt (batch)
df_with_prediction = df_parsed.withColumn("ner_absa_result", ner_absa_predict_udf(col("text")))

# Ghi kết quả ra CSV
query = df_with_prediction.writeStream \
    .outputMode("append") \
    .format("csv") \
    .option("path", r"F:\Studies\Third_year\Big_data\Final_Code\Result") \
    .option("checkpointLocation", r"F:\Studies\Third_year\Big_data\Final_Code\Checkpoint") \
    .start()

query.awaitTermination()

StreamingQueryException: [STREAM_FAILED] Query [id = 3648b729-3d18-4d5c-b303-be28cf8aca76, runId = 9c0c6f82-d001-4f82-ad7e-59009b820816] terminated with exception: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1) (LAPTOP-5DQQ5LDG.lan executor driver): org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to F:\Studies\Third_year\Big_data\Final_Code\Result.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:775)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:420)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeWrite$2(FileFormatWriter.scala:252)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "F:\software\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1231, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "F:\software\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1067, in read_udfs
    udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "F:\software\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 529, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "F:\software\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 90, in read_command
    command = serializer._read_with_length(file)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "F:\software\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 174, in _read_with_length
    return self.loads(obj)
           ^^^^^^^^^^^^^^^
  File "F:\software\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 472, in loads
    return cloudpickle.loads(obj, encoding=encoding)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'final_src'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.writeWithIterator(FileFormatDataWriter.scala:91)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:403)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:410)
	... 12 more

Driver stacktrace: