In [1]:
import findspark
findspark.init()
from utils.s3_process import  read_key
from utils.s3_process_nlp import read_csv_from_s3, get_latest_s3_object_version
from utils.clean_text import clean_text_column

from utils.mlflow_func import get_latest_model_version, get_model_version_by_stage
import datetime
import yaml
import mlflow
import os 
import time

# Module-level defaults for the MLflow tracking server and user name.
# set_up_mlflow_tracking() later overwrites both variables with the values
# from the loaded config, and additionally sets MLFLOW_TRACKING_PASSWORD.
# NOTE(review): these hard-coded values duplicate what the config provides —
# consider dropping them and relying on the config only.
os.environ['MLFLOW_TRACKING_URI']="https://dagshub.com/TranChucThien/kltn-sentiment-monitoring-mlops.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME']="TranChucThien"


# Additional required libraries
import nltk
from nltk.corpus import stopwords

from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
# Other imports...

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col
# 
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param

# 
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import sys
from multiprocessing import Process
import logging

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
from pyspark.sql.types import DoubleType

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/thientran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/thientran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
def create_pipeline(max_epochs=10, lr=0.003, batch_size=8, pooling_strategy="SUM",
                    enable_output_logs=True, output_logs_path="classifier_logs"):
    """Build the (unfitted) Spark NLP ELMo text-classification pipeline.

    Stages, in order: document assembly -> tokenization -> normalization
    (lowercasing) -> stop-word removal -> lemmatization (pretrained
    "lemma_antbnc") -> ELMo word embeddings (pretrained "elmo", "en") ->
    sentence embeddings -> ClassifierDLApproach -> Finisher.

    The pipeline reads the input column "text" and the label column "label";
    the Finisher writes its output to the column "prediction".

    Calling the function without arguments builds exactly the configuration
    that used to be hard-coded here.

    Args:
        max_epochs: Number of training epochs for the classifier.
        lr: Learning rate for the classifier.
        batch_size: Training batch size for the classifier.
        pooling_strategy: Pooling strategy passed to SentenceEmbeddings
            (the grid-search cell of this notebook uses "AVERAGE").
        enable_output_logs: Whether the classifier writes training logs.
        output_logs_path: Directory for the classifier training logs.

    Returns:
        pyspark.ml.Pipeline: The assembled, not yet fitted pipeline.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    # `Tokenizer` must resolve to the Spark NLP annotator here: the star import
    # from sparknlp.annotator comes after the pyspark.ml.feature import of the
    # same name and therefore shadows it.
    tokenizer = (
        Tokenizer()
        .setInputCols(["document"])
        .setOutputCol("token")
    )

    normalizer = (
        Normalizer()
        .setInputCols(["token"])
        .setOutputCol("normalized")
        .setLowercase(True)
    )

    stop_words_cleaner = (
        StopWordsCleaner()
        .setInputCols(["normalized"])
        .setOutputCol("cleanTokens")
        .setCaseSensitive(False)
    )

    lemmatizer = (
        LemmatizerModel.pretrained("lemma_antbnc")
        .setInputCols(["cleanTokens"])
        .setOutputCol("lemmatized")
    )

    word_embeddings_elmo = (
        ElmoEmbeddings.pretrained("elmo", "en")
        .setInputCols(["document", "lemmatized"])
        .setOutputCol("embeddings")
    )

    sentence_embeddings = (
        SentenceEmbeddings()
        .setInputCols(["document", "embeddings"])
        .setOutputCol("sentence_embeddings")
        .setPoolingStrategy(pooling_strategy)
    )

    classifier = (
        ClassifierDLApproach()
        .setInputCols(["sentence_embeddings"])
        .setOutputCol("category")
        .setLabelColumn("label")
        .setMaxEpochs(max_epochs)
        .setLr(lr)
        .setBatchSize(batch_size)
        .setEnableOutputLogs(enable_output_logs)
        .setOutputLogsPath(output_logs_path)
    )

    # Parenthesised chain: the original ended this expression with a dangling
    # backslash followed by a whitespace-only line, which only parsed by luck.
    finisher = (
        Finisher()
        .setInputCols(["category"])
        .setOutputCols(["prediction"])
        .setCleanAnnotations(False)
    )

    pipeline_elmo = Pipeline(stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stop_words_cleaner,
        lemmatizer,
        word_embeddings_elmo,
        sentence_embeddings,
        classifier,
        finisher,
    ])
    return pipeline_elmo


def tune_model(pipeline, train_data, use_hashing=True, vectorizer=None, hashingTF=None, lr=None):
    """Grid-search a pipeline with 3-fold cross-validation, scored by F1.

    The grid covers the feature-extraction size (``hashingTF.numFeatures`` when
    ``use_hashing`` is truthy, otherwise ``vectorizer.vocabSize``) and the
    regularisation strength ``lr.regParam``.

    Args:
        pipeline: Estimator to tune; passed to CrossValidator as ``estimator``.
        train_data: DataFrame to fit on; the evaluator reads the columns
            "Label" and "prediction".
        use_hashing: Selects which feature stage is tuned (see above).
        vectorizer: Stage exposing ``vocabSize``; required when
            ``use_hashing`` is falsy.
        hashingTF: Stage exposing ``numFeatures``; required when
            ``use_hashing`` is truthy.
        lr: Stage exposing ``regParam``; always required.

    Returns:
        The fitted cross-validator model returned by ``CrossValidator.fit``.

    Raises:
        ValueError: If a stage needed to build the grid was not supplied.
    """
    # Fail early with a clear message instead of an AttributeError on None
    # halfway through building the grid.
    if use_hashing:
        if hashingTF is None:
            raise ValueError("tune_model: `hashingTF` is required when use_hashing is true")
    elif vectorizer is None:
        raise ValueError("tune_model: `vectorizer` is required when use_hashing is false")
    if lr is None:
        raise ValueError("tune_model: `lr` is required to tune regParam")

    f1_evaluator = MulticlassClassificationEvaluator(labelCol="Label", predictionCol="prediction", metricName="f1")

    grid_builder = ParamGridBuilder()
    if use_hashing:
        grid_builder = grid_builder.addGrid(hashingTF.numFeatures, [1000, 5000, 10000])
    else:
        grid_builder = grid_builder.addGrid(vectorizer.vocabSize, [5000, 10000])
    grid_builder = grid_builder.addGrid(lr.regParam, [0.0, 0.01, 0.1])
    param_maps = grid_builder.build()

    crossval = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=param_maps,
        evaluator=f1_evaluator,
        numFolds=3
    )

    best_model = crossval.fit(train_data)

    return best_model


def evaluator(prediction1, label_col="label", prediction_col="prediction"):
    """Compute and print accuracy, weighted precision/recall and F1.

    Args:
        prediction1: DataFrame containing the label and prediction columns.
        label_col: Name of the ground-truth column (cast to double here).
        prediction_col: Name of the predicted-label column (cast to double here).

    Returns:
        tuple: ``(accuracy, precision, recall, f1)`` where precision and recall
        are the "weightedPrecision" / "weightedRecall" metrics.
    """
    # Keep only the two columns the evaluator reads and cache them: each
    # evaluate() call is a separate Spark action, so without caching the whole
    # upstream plan (including model inference) would be recomputed four times.
    scored = prediction1.select(
        col(label_col).cast(DoubleType()).alias(label_col),
        col(prediction_col).cast(DoubleType()).alias(prediction_col),
    ).cache()

    try:
        # Named differently from the enclosing function to avoid shadowing it.
        metric_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol=prediction_col)
        accuracy = metric_evaluator.evaluate(scored, {metric_evaluator.metricName: "accuracy"})
        precision = metric_evaluator.evaluate(scored, {metric_evaluator.metricName: "weightedPrecision"})
        recall = metric_evaluator.evaluate(scored, {metric_evaluator.metricName: "weightedRecall"})
        f1 = metric_evaluator.evaluate(scored, {metric_evaluator.metricName: "f1"})
    finally:
        # Release the cached rows whether or not evaluation succeeded.
        scored.unpersist()

    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-score:  {f1:.4f}")
    return accuracy, precision, recall, f1

def data_distribution(data, label_col="Label"):
    """Show and return the per-label row counts with their percentage share.

    Args:
        data: DataFrame to summarise.
        label_col: Column to group by.

    Returns:
        DataFrame with the label column, "count" and "percentage", ordered by
        the label column.
    """
    n_rows = data.count()
    share_of_total = (col("count") / n_rows) * 100
    distribution = (
        data.groupBy(label_col)
        .count()
        .withColumn("percentage", share_of_total)
        .orderBy(label_col)
    )
    distribution.show()
    return distribution

def load_config(config_path="configs/config.yaml"):
    """Read a YAML file and return its parsed content.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file.

    Raises:
        Exception: Any error raised while opening or parsing the file is
        logged and then re-raised unchanged.
    """
    try:
        with open(config_path, "r") as stream:
            parsed = yaml.safe_load(stream)
    except Exception as e:
        logging.error(f"Failed to load config: {e}")
        raise
    else:
        return parsed


def set_up_mlflow_tracking(config, config_secret):
    """Export the MLflow credentials and tracking URI, then point MLflow at it.

    Reads ``config_secret['mlflow']['password']`` plus
    ``config['mlflow']['tracking_uri']`` and ``config['mlflow']['username']``,
    stores them in the MLFLOW_TRACKING_* environment variables and calls
    ``mlflow.set_tracking_uri`` with the exported URI.
    """
    env = os.environ
    env["MLFLOW_TRACKING_PASSWORD"] = config_secret['mlflow']['password']

    mlflow_settings = config['mlflow']
    env['MLFLOW_TRACKING_URI'] = mlflow_settings['tracking_uri']
    env['MLFLOW_TRACKING_USERNAME'] = mlflow_settings['username']

    mlflow.set_tracking_uri(env['MLFLOW_TRACKING_URI'])
    logging.info("MLflow tracking setup complete.")
    
def load_dataset(config, config_secret, spark):
    """Read the dataset CSV from S3 and return it as a DataFrame.

    The object location is ``s3a://<config['s3']['bucket']>/<config['s3']['keys']['dataset']>``.
    AWS keys are read via ``read_key`` from the path stored in
    ``config_secret['aws']['access_key_path']``; the region comes from
    ``config['aws']['region']``. The first three rows are displayed as a
    sanity check before the DataFrame is returned.
    """
    s3_settings = config['s3']
    dataset_path = f"s3a://{s3_settings['bucket']}/{s3_settings['keys']['dataset']}"
    logging.info(f"Dataset path: {dataset_path}")

    # AWS credentials and region
    access_key, secret_key = read_key(config_secret['aws']['access_key_path'])
    region = config['aws']['region']

    logging.info("Reading CSV file from S3...")
    data = read_csv_from_s3(dataset_path, access_key, secret_key, region, spark)
    logging.info("Read csv file from S3 successfully.")
    data.show(3)

    return data


def get_data_version(config, config_secret):
    """Look up the latest S3 object version of the dataset.

    Despite sharing its set-up with ``load_dataset``, this function does not
    read the data: it only asks ``get_latest_s3_object_version`` for the
    version of the object at
    ``s3a://<config['s3']['bucket']>/<config['s3']['keys']['dataset']>``.

    Args:
        config: Parsed configuration providing the ``s3`` and ``aws`` sections.
        config_secret: Parsed secrets providing ``aws.access_key_path``.

    Returns:
        tuple: ``(version, dataset_path)`` where ``version`` is whatever
        ``get_latest_s3_object_version`` returns and ``dataset_path`` is the
        ``s3a://`` URI that was queried.
    """
    bucket = config['s3']['bucket']
    dataset_key = config['s3']['keys']['dataset']

    dataset_path = f"s3a://{bucket}/{dataset_key}"
    logging.info(f"Dataset path: {dataset_path}")

    # AWS credentials and region
    aws_access_key, aws_secret_key = read_key(config_secret['aws']['access_key_path'])
    aws_region = config['aws']['region']

    latest_version = get_latest_s3_object_version(
        s3_path=dataset_path,
        aws_access_key=aws_access_key,
        aws_secret_key=aws_secret_key,
        region=aws_region,
    )
    return latest_version, dataset_path

def split_data(data, train_ratio=0.8, seed=42):
    """Randomly split a DataFrame into train and validation parts.

    Args:
        data: DataFrame to split.
        train_ratio: Fraction of rows assigned to the training part; the
            remainder goes to validation. Must be strictly between 0 and 1.
        seed: Seed passed to ``randomSplit`` for a repeatable split.

    Returns:
        tuple: ``(train_data, validate_data)``.

    Raises:
        ValueError: If ``train_ratio`` is not strictly between 0 and 1.
    """
    if not 0.0 < train_ratio < 1.0:
        raise ValueError(f"train_ratio must be strictly between 0 and 1, got {train_ratio}")

    # Honour the caller's ratio and seed (previously hard-coded to 0.8/0.2 and 42).
    train_data, validate_data = data.randomSplit([train_ratio, 1.0 - train_ratio], seed=seed)
    logging.info("Data split completed.")

    print("Train data:")
    train_data.printSchema()
    train_data.show(3)

    print("Validate data:")
    validate_data.printSchema()
    validate_data.show(3)

    # Hand the split back to the caller; previously it was computed and discarded.
    return train_data, validate_data

In [3]:
# Configure logging within this process
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.info("Starting Text Classification Pipeline for ELMo model...")


# Load configuration (paths are relative to the notebook's working directory)
CONFIG_PATH = "../configs/config.yaml"
SECRETS_PATH = "../configs/secrets.yaml"
logging.info("Loading configuration from '%s'", CONFIG_PATH)
config = load_config(CONFIG_PATH)
config_secret = load_config(SECRETS_PATH)
logging.info("Successfully loaded configuration.")

# Spark session initialization.
# The session is built directly: the Spark NLP jar is already requested via
# spark.jars.packages. The previous code wrapped this builder in
# sparknlp.start(...), whose first positional parameter is `gpu`, not a
# session; the resulting second getOrCreate() merely returned this same
# session ("Using an existing Spark session" in the log output).
logging.info("Initializing Spark session...")
spark = (
    SparkSession.builder
    .appName("Spark NLP - ELMo Sentiment Classification")
    .config("spark.driver.memory", "16G")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262,com.johnsnowlabs.nlp:spark-nlp_2.12:5.3.3")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.driver.maxResultSize", "0")
    .getOrCreate()
)
logging.info("Spark Session with Spark NLP is ready.")

# Load dataset
logging.info("Loading dataset from S3...")
raw_data = load_dataset(config, config_secret, spark)
data_version, dataset_path = get_data_version(config, config_secret)

# Normalise to lowercase string columns "text" / "label" (selectExpr already
# aliases them, so the former withColumnRenamed calls were redundant) and drop
# rows labelled "3".
# NOTE(review): the reason for excluding label "3" is not visible here — confirm.
data = (
    raw_data
    .selectExpr("cast(Text as string) as text", "cast(Label as string) as label")
    .filter(col("label") != "3")
)
logging.info("Successfully loaded dataset.")

2025-06-04 10:25:27,344 - INFO - Starting Text Classification Pipeline for Olmo model...
2025-06-04 10:25:27,346 - INFO - Loading configuration from 'configs/config.yaml'
2025-06-04 10:25:27,373 - INFO - Successfully loaded configuration.
2025-06-04 10:25:27,374 - INFO - Initializing Spark session...
your 131072x1 screen size is bogus. expect trouble
25/06/04 10:25:29 WARN Utils: Your hostname, LE11-D5013 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/06/04 10:25:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/thientran/.local/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/thientran/.ivy2/cache
The jars for the packages stored in: /home/thientran/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bb1b2e9b-379b-4930-b6d4-c68a38695a9f;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in spark-list
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.3.3 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#common



25/06/04 10:25:43 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
2025-06-04 10:25:43,291 - INFO - Spark Session with Spark NLP is ready.
2025-06-04 10:25:43,292 - INFO - Loading dataset from S3...
2025-06-04 10:25:43,293 - INFO - Dataset path: s3a://tranchucthien-bucket/dataset/dataset.csv
2025-06-04 10:25:43,495 - INFO - Reading CSV file from S3...
25/06/04 10:25:43 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
2025-06-04 10:26:03,581 - INFO - Read csv file from S3 successfully.            
2025-06-04 10:26:05,089 - INFO - Dataset path: s3a://tranchucthien-bucket/dataset/dataset.csv


+-----+--------------------+
|Label|                Text|
+-----+--------------------+
|    1|im getting border...|
|    1|im coming borderl...|
|    1|im getting border...|
+-----+--------------------+
only showing top 3 rows



2025-06-04 10:26:06,597 - INFO - Successfully loaded dataset.


In [4]:
# Configure MLflow credentials from the loaded config, then select the
# experiment that the runs of this notebook are recorded under.
logging.info("Setting up MLflow experiment...")
set_up_mlflow_tracking(config, config_secret)
experiment_name = 'DL_Elmo_Text_Classification_Experiment'
mlflow.set_experiment(experiment_name)
logging.info("MLflow experiment set up successfully with name: %s", experiment_name)

2025-06-04 10:26:06,607 - INFO - Setting up MLflow experiment...
2025-06-04 10:26:06,610 - INFO - MLflow tracking setup complete.
2025-06-04 10:26:08,817 - INFO - MLflow experiment set up successfully with name: DL_Elmo_Text_Classification_Experiment


In [5]:
# Count once and reuse the result: every count() is a full Spark job over the
# dataset, and the previous code ran it a second time just for the log line.
total_count = data.count()
logging.info(f"Total samples: {total_count}")

2025-06-04 10:26:20,265 - INFO - Total samples: 59497                           


In [6]:
from functools import reduce
def k_fold_split(data, k=3, seed=42):
    """Split ``data`` into ``k`` equally weighted random parts.

    Args:
        data: Object exposing ``randomSplit(weights, seed=...)``.
        k: Number of parts.
        seed: Seed forwarded to ``randomSplit``.

    Returns:
        Whatever ``data.randomSplit`` returns for ``k`` weights of ``1/k`` each.
    """
    # Every fold gets the same weight.
    fold_weight = 1.0 / k
    equal_weights = [fold_weight] * k
    return data.randomSplit(equal_weights, seed=seed)

def cross_validate_custom(data, pipeline, k=3, seed=42):
    """Run k-fold cross-validation of ``pipeline`` on ``data``.

    For every fold, the pipeline is fitted on the union of the other folds and
    scored on the held-out fold with ``evaluator``.

    Args:
        data: DataFrame with the columns the pipeline and ``evaluator`` need
            (the evaluation reads "label" and "prediction").
        pipeline: Unfitted estimator exposing ``fit``.
        k: Number of folds; must be at least 2.
        seed: Seed forwarded to ``k_fold_split`` so the folds are repeatable.

    Returns:
        list: One ``(accuracy, precision, recall, f1)`` tuple per fold.

    Raises:
        ValueError: If ``k`` is smaller than 2 (no training folds would remain).
    """
    # Local import keeps the function usable even when the cell-level import
    # of `reduce` has not been executed.
    from functools import reduce

    if k < 2:
        raise ValueError(f"k must be at least 2 for cross-validation, got {k}")

    folds = k_fold_split(data, k, seed=seed)
    metrics = []

    for i, validate_data in enumerate(folds):
        logging.info(f"=== Fold {i + 1}/{k} ===")
        # Train on every fold except the held-out fold i.
        train_folds = [fold for j, fold in enumerate(folds) if j != i]
        train_data = reduce(lambda df1, df2: df1.union(df2), train_folds)

        model = pipeline.fit(train_data)
        # The "prediction" column is indexed with [0]: its first element is
        # used as the predicted label, as a string.
        prediction = model.transform(validate_data).withColumn("prediction", col("prediction")[0].cast("string"))
        accuracy, precision, recall, f1 = evaluator(prediction, label_col="label", prediction_col="prediction")
        metrics.append((accuracy, precision, recall, f1))

    return metrics


In [13]:
# Build the unfitted ELMo pipeline; this fetches the pretrained
# "lemma_antbnc" and "elmo" models (see the download messages in the output).
pipeline = create_pipeline()

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


In [None]:
# Scratch cell: hold out 10% of `data`, then split that hold-out 90/10 again.
train_data, validate_data = data.randomSplit([0.9, 0.1], seed=42)
a,b =  validate_data.randomSplit([0.9, 0.1], seed=42)
# NOTE(review): the label says "train data", but `b` is the 10% slice of
# `validate_data` (about 1% of `data`) — confirm which count was intended.
print("Count train data:", b.count())

In [14]:
metrics = cross_validate_custom(data, pipeline, k=5)  # 5-fold cross-validation; choose k as needed


2025-06-04 14:10:16,319 - INFO - === Fold 1/5 ===
2025-06-04 14:15:54.260265: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/9c16db13d6b4_classifier_dl9842502838296282843
2025-06-04 14:15:55.096785: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2025-06-04 14:15:55.096949: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/9c16db13d6b4_classifier_dl9842502838296282843
2025-06-04 14:15:56.114249: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2025-06-04 14:15:57.157219: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/9c16db13d6b4_classifier_dl9842502838296282843
2025-06-04 14:15:57.311500: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { ser

Training started - epochs: 10 - learning_rate: 0.003 - batch_size: 8 - training_examples: 47528 - classes: 3
Epoch 1/10 - 27.69s - loss: 7199.6724 - acc: 0.33767465 - batches: 5941
Epoch 2/10 - 27.50s - loss: 5642.485 - acc: 0.60122454 - batches: 5941
Epoch 3/10 - 27.71s - loss: 4953.927 - acc: 0.7173666 - batches: 5941
Epoch 4/10 - 30.40s - loss: 4633.6206 - acc: 0.7736282 - batches: 5941
Epoch 5/10 - 31.90s - loss: 4417.34 - acc: 0.8114585 - batches: 5941
Epoch 6/10 - 33.83s - loss: 4257.999 - acc: 0.83599144 - batches: 5941
Epoch 7/10 - 32.96s - loss: 4153.5117 - acc: 0.85490656 - batches: 5941
Epoch 8/10 - 35.45s - loss: 4081.1377 - acc: 0.86744654 - batches: 5941
Epoch 9/10 - 30.51s - loss: 4033.9739 - acc: 0.8763045 - batches: 5941
Epoch 10/10 - 30.55s - loss: 3999.4333 - acc: 0.8826797 - batches: 5941


2025-06-04 14:28:46,934 - INFO - === Fold 2/5 ===                               


Accuracy:  0.7952
Precision: 0.7953
Recall:    0.7952
F1-score:  0.7942


2025-06-04 14:34:12.560927: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/4fce709856d1_classifier_dl12574968973225978250
2025-06-04 14:34:12.693493: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2025-06-04 14:34:12.693566: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/4fce709856d1_classifier_dl12574968973225978250
2025-06-04 14:34:13.185148: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2025-06-04 14:34:14.255207: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/4fce709856d1_classifier_dl12574968973225978250
2025-06-04 14:34:14.430018: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1869118 microse

Training started - epochs: 10 - learning_rate: 0.003 - batch_size: 8 - training_examples: 47681 - classes: 3
Epoch 1/10 - 27.03s - loss: 7213.961 - acc: 0.33596897 - batches: 5961
Epoch 2/10 - 27.07s - loss: 7214.3447 - acc: 0.33598992 - batches: 5961
Epoch 3/10 - 27.81s - loss: 5816.062 - acc: 0.5653523 - batches: 5961
Epoch 4/10 - 29.90s - loss: 5011.707 - acc: 0.7068582 - batches: 5961
Epoch 5/10 - 30.18s - loss: 4724.38 - acc: 0.7635277 - batches: 5961
Epoch 6/10 - 34.81s - loss: 4492.475 - acc: 0.8011116 - batches: 5961
Epoch 7/10 - 36.17s - loss: 4350.3945 - acc: 0.82487416 - batches: 5961
Epoch 8/10 - 32.59s - loss: 4235.672 - acc: 0.8440436 - batches: 5961
Epoch 9/10 - 30.04s - loss: 4155.3413 - acc: 0.85677433 - batches: 5961
Epoch 10/10 - 29.08s - loss: 4094.221 - acc: 0.867219 - batches: 5961


2025-06-04 14:48:18,847 - INFO - === Fold 3/5 ===                               


Accuracy:  0.7910
Precision: 0.7907
Recall:    0.7910
F1-score:  0.7900


2025-06-04 14:53:44.607590: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/b49e1a200b76_classifier_dl9300862913484838846
2025-06-04 14:53:44.729043: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2025-06-04 14:53:44.729117: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/b49e1a200b76_classifier_dl9300862913484838846
2025-06-04 14:53:45.227057: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2025-06-04 14:53:46.100061: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/b49e1a200b76_classifier_dl9300862913484838846
2025-06-04 14:53:46.259171: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1651596 microsecon

Training started - epochs: 10 - learning_rate: 0.003 - batch_size: 8 - training_examples: 47468 - classes: 3
Epoch 1/10 - 30.26s - loss: 5658.344 - acc: 0.58530676 - batches: 5934
Epoch 2/10 - 33.58s - loss: 4957.1387 - acc: 0.709759 - batches: 5934
Epoch 3/10 - 32.24s - loss: 4600.983 - acc: 0.77243805 - batches: 5934
Epoch 4/10 - 28.95s - loss: 4364.447 - acc: 0.8136693 - batches: 5934
Epoch 5/10 - 30.59s - loss: 4201.317 - acc: 0.84200656 - batches: 5934
Epoch 6/10 - 31.13s - loss: 4092.3135 - acc: 0.8618532 - batches: 5934
Epoch 7/10 - 34.97s - loss: 4008.0015 - acc: 0.87622195 - batches: 5934
Epoch 8/10 - 29.99s - loss: 3947.448 - acc: 0.88471264 - batches: 5934
Epoch 9/10 - 28.42s - loss: 3909.2463 - acc: 0.8920656 - batches: 5934
Epoch 10/10 - 31.90s - loss: 3888.4165 - acc: 0.89724845 - batches: 5934


2025-06-04 15:06:15,506 - INFO - === Fold 4/5 ===                               


Accuracy:  0.8133
Precision: 0.8133
Recall:    0.8133
F1-score:  0.8133


2025-06-04 15:12:24.448600: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/0aeae0af1179_classifier_dl15331463003606686069
2025-06-04 15:12:24.582440: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2025-06-04 15:12:24.582508: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/0aeae0af1179_classifier_dl15331463003606686069
2025-06-04 15:12:25.133126: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2025-06-04 15:12:26.306516: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/0aeae0af1179_classifier_dl15331463003606686069
2025-06-04 15:12:26.471172: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2022582 microse

Training started - epochs: 10 - learning_rate: 0.003 - batch_size: 8 - training_examples: 47709 - classes: 3
Epoch 1/10 - 28.49s - loss: 7570.494 - acc: 0.2944994 - batches: 5964
Epoch 2/10 - 28.47s - loss: 7569.6284 - acc: 0.29452038 - batches: 5964
Epoch 3/10 - 31.63s - loss: 7569.4673 - acc: 0.29452038 - batches: 5964
Epoch 4/10 - 35.06s - loss: 7569.4214 - acc: 0.29452038 - batches: 5964
Epoch 5/10 - 30.67s - loss: 7569.4136 - acc: 0.29452038 - batches: 5964
Epoch 6/10 - 29.77s - loss: 7569.412 - acc: 0.29452038 - batches: 5964
Epoch 7/10 - 34.62s - loss: 7569.412 - acc: 0.29452038 - batches: 5964
Epoch 8/10 - 32.51s - loss: 7569.412 - acc: 0.29452038 - batches: 5964
Epoch 9/10 - 38.16s - loss: 7569.4116 - acc: 0.29452038 - batches: 5964
Epoch 10/10 - 30.08s - loss: 7569.4116 - acc: 0.29452038 - batches: 5964


2025-06-04 15:25:17,113 - INFO - === Fold 5/5 ===                               


Accuracy:  0.2991
Precision: 0.0895
Recall:    0.2991
F1-score:  0.1377


2025-06-04 15:30:35.951814: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/42a07aedb599_classifier_dl15316262005986145512
2025-06-04 15:30:36.065159: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2025-06-04 15:30:36.065228: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/42a07aedb599_classifier_dl15316262005986145512
2025-06-04 15:30:36.505904: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2025-06-04 15:30:37.273898: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/42a07aedb599_classifier_dl15316262005986145512
2025-06-04 15:30:37.421947: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1470142 microse

Training started - epochs: 10 - learning_rate: 0.003 - batch_size: 8 - training_examples: 47602 - classes: 3
Epoch 1/10 - 27.76s - loss: 7087.7466 - acc: 0.35737395 - batches: 5951
Epoch 2/10 - 27.26s - loss: 5273.9336 - acc: 0.6487605 - batches: 5951
Epoch 3/10 - 28.62s - loss: 4879.205 - acc: 0.7222059 - batches: 5951
Epoch 4/10 - 29.95s - loss: 4560.129 - acc: 0.7771429 - batches: 5951
Epoch 5/10 - 29.22s - loss: 4358.7744 - acc: 0.81334037 - batches: 5951
Epoch 6/10 - 29.64s - loss: 4205.1816 - acc: 0.8366386 - batches: 5951
Epoch 7/10 - 31.39s - loss: 4121.002 - acc: 0.8532983 - batches: 5951
Epoch 8/10 - 32.91s - loss: 4058.8323 - acc: 0.8655672 - batches: 5951
Epoch 9/10 - 30.35s - loss: 4002.6511 - acc: 0.8752521 - batches: 5951
Epoch 10/10 - 30.25s - loss: 3959.205 - acc: 0.8821008 - batches: 5951




Accuracy:  0.7938
Precision: 0.7943
Recall:    0.7938
F1-score:  0.7932


                                                                                

In [11]:
# Mean of each metric across folds; every entry of `metrics` is an
# (accuracy, precision, recall, f1) tuple.
fold_count = len(metrics)
avg_accuracy, avg_precision, avg_recall, avg_f1 = (
    sum(fold_scores[position] for fold_scores in metrics) / fold_count
    for position in range(4)
)

print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-score: {avg_f1:.4f}")

Average Accuracy: 0.8028
Average Precision: 0.8029
Average Recall: 0.8028
Average F1-score: 0.8023


In [5]:
from functools import reduce

def cross_validate_custom(data, pipeline, k=3, seed=42):
    """Run k-fold cross-validation of ``pipeline`` on ``data``.

    For every fold, the pipeline is fitted on the union of the other folds and
    scored on the held-out fold with ``evaluator``.

    Args:
        data: DataFrame with the columns the pipeline and ``evaluator`` need
            (the evaluation reads "label" and "prediction").
        pipeline: Unfitted estimator exposing ``fit``.
        k: Number of folds; must be at least 2.
        seed: Seed forwarded to ``k_fold_split`` so the folds are repeatable.

    Returns:
        list: One ``(accuracy, precision, recall, f1)`` tuple per fold.

    Raises:
        ValueError: If ``k`` is smaller than 2 (no training folds would remain).
    """
    # Local import keeps the function usable even when the cell-level import
    # of `reduce` has not been executed.
    from functools import reduce

    if k < 2:
        raise ValueError(f"k must be at least 2 for cross-validation, got {k}")

    folds = k_fold_split(data, k, seed=seed)
    metrics = []

    for i, validate_data in enumerate(folds):
        logging.info(f"=== Fold {i + 1}/{k} ===")
        # Train on every fold except the held-out fold i.
        train_folds = [fold for j, fold in enumerate(folds) if j != i]
        train_data = reduce(lambda df1, df2: df1.union(df2), train_folds)

        model = pipeline.fit(train_data)
        # The "prediction" column is indexed with [0]: its first element is
        # used as the predicted label, as a string.
        prediction = model.transform(validate_data).withColumn("prediction", col("prediction")[0].cast("string"))
        accuracy, precision, recall, f1 = evaluator(prediction, label_col="label", prediction_col="prediction")
        metrics.append((accuracy, precision, recall, f1))

    return metrics


def run_grid_search_cv(data, k_fold=3, lr_list=None, batch_size_list=None, epoch_list=None):
    """Grid-search ClassifierDL hyperparameters with k-fold cross-validation.

    Every (learning rate, batch size, max epochs) combination is evaluated
    with ``cross_validate_custom`` inside its own MLflow run, and the
    combination with the highest fold-averaged F1 is reported.

    Args:
        data: Dataset forwarded unchanged to ``cross_validate_custom``. The
            pipeline reads its text from the "text" column and its labels
            from the "label" column.
        k_fold: Number of cross-validation folds.
        lr_list: Learning rates to try; defaults to ``[0.003, 0.01]``.
        batch_size_list: Batch sizes to try; defaults to ``[8]``.
        epoch_list: Max-epoch values to try; defaults to ``[5]``.

    Returns:
        Tuple ``(best_params, best_f1)``. ``best_params`` is a dict with the
        keys "lr", "batch_size" and "epochs"; it is empty (and ``best_f1`` is
        0) only when no combination was evaluated.
    """
    lr_list = [0.003, 0.01] if lr_list is None else list(lr_list)
    batch_size_list = [8] if batch_size_list is None else list(batch_size_list)
    epoch_list = [5] if epoch_list is None else list(epoch_list)

    # These stages do not depend on the hyperparameters being searched, so
    # they are built once instead of once per combination. This avoids
    # re-loading the pretrained lemmatizer and ELMo models on every iteration.
    feature_stages = [
        DocumentAssembler().setInputCol("text").setOutputCol("document"),
        Tokenizer().setInputCols(["document"]).setOutputCol("token"),
        Normalizer().setInputCols(["token"]).setOutputCol("normalized").setLowercase(True),
        StopWordsCleaner().setInputCols(["normalized"]).setOutputCol("cleanTokens").setCaseSensitive(False),
        LemmatizerModel.pretrained("lemma_antbnc").setInputCols(["cleanTokens"]).setOutputCol("lemmatized"),
        ElmoEmbeddings.pretrained("elmo", "en").setInputCols(["document", "lemmatized"]).setOutputCol("embeddings"),
        SentenceEmbeddings().setInputCols(["document", "embeddings"]).setOutputCol("sentence_embeddings").setPoolingStrategy("AVERAGE"),
    ]
    finisher = Finisher().setInputCols(["category"]).setOutputCols(["prediction"]).setCleanAnnotations(False)

    best_f1 = 0
    best_params = {}

    for lr in lr_list:
        for batch_size in batch_size_list:
            for epochs in epoch_list:
                logging.info(f"Running CV for lr={lr}, batch_size={batch_size}, epochs={epochs}")

                # Only the classifier changes between combinations.
                classifier = ClassifierDLApproach() \
                    .setInputCols(["sentence_embeddings"]) \
                    .setOutputCol("category") \
                    .setLabelColumn("label") \
                    .setMaxEpochs(epochs) \
                    .setLr(lr) \
                    .setBatchSize(batch_size) \
                    .setEnableOutputLogs(False)

                pipeline = Pipeline(stages=feature_stages + [classifier, finisher])

                with mlflow.start_run(run_name=f"CV_lr{lr}_bs{batch_size}_ep{epochs}"):
                    # Log the hyperparameters before training so that a run
                    # which fails part-way is still identifiable in MLflow.
                    mlflow.log_param("lr", lr)
                    mlflow.log_param("batch_size", batch_size)
                    mlflow.log_param("maxEpochs", epochs)
                    mlflow.log_param("k_fold", k_fold)

                    metrics = cross_validate_custom(data, pipeline, k=k_fold)

                    # Average each metric over the per-fold
                    # (accuracy, precision, recall, f1) tuples.
                    n_folds = len(metrics)
                    avg_accuracy = sum(m[0] for m in metrics) / n_folds
                    avg_precision = sum(m[1] for m in metrics) / n_folds
                    avg_recall = sum(m[2] for m in metrics) / n_folds
                    avg_f1 = sum(m[3] for m in metrics) / n_folds

                    mlflow.log_metric("avg_accuracy", avg_accuracy)
                    mlflow.log_metric("avg_precision", avg_precision)
                    mlflow.log_metric("avg_recall", avg_recall)
                    mlflow.log_metric("avg_f1", avg_f1)

                    # Always accept the first evaluated combination so that
                    # best_params is populated even if every average F1 is 0.
                    if not best_params or avg_f1 > best_f1:
                        best_f1 = avg_f1
                        best_params = {"lr": lr, "batch_size": batch_size, "epochs": epochs}
                        logging.info(f"New best F1: {best_f1:.4f} with {best_params}")

    logging.info(f"Best result: F1={best_f1:.4f} with {best_params}")
    return best_params, best_f1

In [10]:
# Launch the hyperparameter grid search with 3-fold cross-validation and keep
# the winning parameter set together with its averaged F1 score.
best_params, best_f1 = run_grid_search_cv(data=data, k_fold=3)

2025-06-03 10:22:13,472 - INFO - Running CV for lr=0.003, batch_size=8, epochs=5


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


2025-06-03 10:22:21,883 - INFO - === Fold 1/3 ===
2025-06-03 10:27:26.782891: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/8849228df1f7_classifier_dl1773962856390756356
2025-06-03 10:27:26.840734: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2025-06-03 10:27:26.840815: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/8849228df1f7_classifier_dl1773962856390756356
2025-06-03 10:27:27.340335: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2025-06-03 10:27:28.571469: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/8849228df1f7_classifier_dl1773962856390756356
2025-06-03 10:27:28.756151: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { ser

Training started - epochs: 5 - learning_rate: 0.003 - batch_size: 8 - training_examples: 39687 - classes: 3
Epoch 1/5 - 23.90s - loss: 4492.515 - acc: 0.6418311 - batches: 4961
Epoch 2/5 - 23.36s - loss: 4153.958 - acc: 0.7221486 - batches: 4961
Epoch 3/5 - 22.92s - loss: 3917.5547 - acc: 0.7716446 - batches: 4961
Epoch 4/5 - 24.68s - loss: 3751.4656 - acc: 0.80357504 - batches: 4961
Epoch 5/5 - 27.88s - loss: 3636.135 - acc: 0.8250972 - batches: 4961


2025-06-03 10:42:35,005 - INFO - === Fold 2/3 ===                               


Accuracy:  0.7558
Precision: 0.7564
Recall:    0.7558
F1-score:  0.7539


2025-06-03 10:47:33.963171: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/0b9bff23548f_classifier_dl18019890689534691922
2025-06-03 10:47:34.025296: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2025-06-03 10:47:34.025352: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/0b9bff23548f_classifier_dl18019890689534691922
2025-06-03 10:47:34.614089: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2025-06-03 10:47:35.695078: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/0b9bff23548f_classifier_dl18019890689534691922
2025-06-03 10:47:35.848735: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1885572 microse

Training started - epochs: 5 - learning_rate: 0.003 - batch_size: 8 - training_examples: 39466 - classes: 3
Epoch 1/5 - 24.18s - loss: 4450.542 - acc: 0.63957024 - batches: 4934
Epoch 2/5 - 24.86s - loss: 4122.7783 - acc: 0.7191111 - batches: 4934
Epoch 3/5 - 24.17s - loss: 3880.0667 - acc: 0.7666988 - batches: 4934
Epoch 4/5 - 24.10s - loss: 3714.915 - acc: 0.80252886 - batches: 4934
Epoch 5/5 - 27.97s - loss: 3589.4832 - acc: 0.8256639 - batches: 4934


2025-06-03 11:02:59,410 - INFO - === Fold 3/3 ===                               


Accuracy:  0.7594
Precision: 0.7590
Recall:    0.7594
F1-score:  0.7582


2025-06-03 11:07:38.705725: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/86f3e562b209_classifier_dl14455062086615724094
2025-06-03 11:07:38.791451: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2025-06-03 11:07:38.791526: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/86f3e562b209_classifier_dl14455062086615724094
2025-06-03 11:07:39.513010: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2025-06-03 11:07:41.613112: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/86f3e562b209_classifier_dl14455062086615724094
2025-06-03 11:07:41.923166: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 3217453 microse

Training started - epochs: 5 - learning_rate: 0.003 - batch_size: 8 - training_examples: 39841 - classes: 3
Epoch 1/5 - 23.17s - loss: 4445.5645 - acc: 0.6431978 - batches: 4981
Epoch 2/5 - 22.74s - loss: 4073.2725 - acc: 0.7239458 - batches: 4981
Epoch 3/5 - 23.24s - loss: 3834.0652 - acc: 0.7714859 - batches: 4981
Epoch 4/5 - 23.16s - loss: 3674.495 - acc: 0.8063755 - batches: 4981
Epoch 5/5 - 23.72s - loss: 3563.5935 - acc: 0.82916665 - batches: 4981


                                                                                

Accuracy:  0.7617
Precision: 0.7633
Recall:    0.7617
F1-score:  0.7594


2025-06-03 11:22:19,965 - INFO - New best F1: 0.7571 with {'lr': 0.003, 'batch_size': 8, 'epochs': 5}


🏃 View run CV_lr0.003_bs8_ep5 at: https://dagshub.com/TranChucThien/kltn-sentiment-monitoring-mlops.mlflow/#/experiments/63/runs/52b972d7e0614b01a5f9650dc80dd2ad
🧪 View experiment at: https://dagshub.com/TranChucThien/kltn-sentiment-monitoring-mlops.mlflow/#/experiments/63


2025-06-03 11:22:20,654 - INFO - Running CV for lr=0.01, batch_size=8, epochs=5


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


2025-06-03 11:22:29,172 - INFO - === Fold 1/3 ===
2025-06-03 11:27:24.639050: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/abb87c442a4a_classifier_dl14414614010468942213
2025-06-03 11:27:24.793219: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2025-06-03 11:27:24.793296: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/abb87c442a4a_classifier_dl14414614010468942213
2025-06-03 11:27:25.360390: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2025-06-03 11:27:26.265248: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/abb87c442a4a_classifier_dl14414614010468942213
2025-06-03 11:27:26.447727: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { 

Training started - epochs: 5 - learning_rate: 0.01 - batch_size: 8 - training_examples: 39687 - classes: 3
Epoch 1/5 - 22.61s - loss: 6000.825 - acc: 0.3371796 - batches: 4961
Epoch 2/5 - 22.04s - loss: 6000.516 - acc: 0.3373056 - batches: 4961
Epoch 3/5 - 23.15s - loss: 6000.516 - acc: 0.3373056 - batches: 4961
Epoch 4/5 - 23.19s - loss: 6000.516 - acc: 0.3373056 - batches: 4961
Epoch 5/5 - 23.66s - loss: 6000.516 - acc: 0.3373056 - batches: 4961


2025-06-03 11:42:43,630 - INFO - === Fold 2/3 ===                               


Accuracy:  0.3382
Precision: 0.1144
Recall:    0.3382
F1-score:  0.1710


2025-06-03 11:48:12.559485: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/bc4ed32b338a_classifier_dl8706843278266446949
2025-06-03 11:48:12.651621: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2025-06-03 11:48:12.651695: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/bc4ed32b338a_classifier_dl8706843278266446949
2025-06-03 11:48:13.169785: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2025-06-03 11:48:14.143728: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/bc4ed32b338a_classifier_dl8706843278266446949
2025-06-03 11:48:14.314171: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1754697 microsecon

Training started - epochs: 5 - learning_rate: 0.01 - batch_size: 8 - training_examples: 39466 - classes: 3
Epoch 1/5 - 25.78s - loss: 5804.101 - acc: 0.36836103 - batches: 4934
Epoch 2/5 - 29.51s - loss: 5803.654 - acc: 0.36843705 - batches: 4934
Epoch 3/5 - 27.54s - loss: 5803.654 - acc: 0.36843705 - batches: 4934
Epoch 4/5 - 29.37s - loss: 5803.654 - acc: 0.36843705 - batches: 4934
Epoch 5/5 - 27.26s - loss: 5803.654 - acc: 0.36843705 - batches: 4934


2025-06-03 12:04:42,831 - INFO - === Fold 3/3 ===                               


Accuracy:  0.3644
Precision: 0.1328
Recall:    0.3644
F1-score:  0.1946


2025-06-03 12:10:36.070251: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/63b80be1792f_classifier_dl13265948584870888484
2025-06-03 12:10:36.196035: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2025-06-03 12:10:36.196258: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/63b80be1792f_classifier_dl13265948584870888484
2025-06-03 12:10:36.955887: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2025-06-03 12:10:37.959790: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/63b80be1792f_classifier_dl13265948584870888484
2025-06-03 12:10:38.177603: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2107361 microse

Training started - epochs: 5 - learning_rate: 0.01 - batch_size: 8 - training_examples: 39841 - classes: 3
Epoch 1/5 - 23.86s - loss: 5875.1333 - acc: 0.36666667 - batches: 4981
Epoch 2/5 - 23.50s - loss: 5874.5596 - acc: 0.36666667 - batches: 4981
Epoch 3/5 - 23.31s - loss: 5874.5596 - acc: 0.36666667 - batches: 4981
Epoch 4/5 - 22.53s - loss: 5874.5596 - acc: 0.36666667 - batches: 4981
Epoch 5/5 - 24.53s - loss: 5874.5596 - acc: 0.36666667 - batches: 4981


                                                                                

Accuracy:  0.3679
Precision: 0.1353
Recall:    0.3679
F1-score:  0.1979
🏃 View run CV_lr0.01_bs8_ep5 at: https://dagshub.com/TranChucThien/kltn-sentiment-monitoring-mlops.mlflow/#/experiments/63/runs/ef9fbf47519f413c954ee11f8fc9cc93
🧪 View experiment at: https://dagshub.com/TranChucThien/kltn-sentiment-monitoring-mlops.mlflow/#/experiments/63


2025-06-03 12:25:26,072 - INFO - Best result: F1=0.7571 with {'lr': 0.003, 'batch_size': 8, 'epochs': 5}


In [11]:
# Show the best hyperparameters and their fold-averaged F1, one per line.
print(best_params, best_f1, sep="\n")

{'lr': 0.003, 'batch_size': 8, 'epochs': 5}
0.7571472894732002
