### Setup

In [1]:
import logging

import synapse.ml.core
from synapse.ml.services.language import AnalyzeText

import pyspark.sql.functions as F

from pyspark.sql import DataFrame
from delta.tables import DeltaTable

StatementMeta(, ecf36333-226e-4966-88de-82dc8acfe62c, 3, Finished, Available, Finished)

In [None]:
%run Helpers

In [2]:
# setup_logger is not defined in this notebook; presumably it is brought in by
# the `%run Helpers` cell — TODO confirm.
logger = setup_logger()

StatementMeta(, ecf36333-226e-4966-88de-82dc8acfe62c, 4, Finished, Available, Finished)

In [3]:
# OneLake ABFS path of the raw comments table (read as Delta in the load cell).
RAW_COMMENTS_TABLE = "abfss://fd12376e-2797-4027-bb8e-42a3a8228a70@onelake.dfs.fabric.microsoft.com/77b89b44-1bcf-42fa-a9ac-7d0593123d3d/Tables/comments"
# OneLake ABFS path of the enhanced comments table that this notebook merges into.
ENHANCED_COMMENTS_TABLE = "abfss://fd12376e-2797-4027-bb8e-42a3a8228a70@onelake.dfs.fabric.microsoft.com/1ead427c-19d4-417e-bb8f-a68d9adc0f38/Tables/comments"

# OneLake ABFS path of the enhanced videos table (same location root as the
# enhanced comments table).
ENHANCED_VIDEOS_TABLE = "abfss://fd12376e-2797-4027-bb8e-42a3a8228a70@onelake.dfs.fabric.microsoft.com/1ead427c-19d4-417e-bb8f-a68d9adc0f38/Tables/videos"

StatementMeta(, ecf36333-226e-4966-88de-82dc8acfe62c, 5, Finished, Available, Finished)

### Load data

In [4]:
# Read the raw comments once and keep them cached for the steps below.
raw_comments = spark.read.format("delta").load(RAW_COMMENTS_TABLE).cache()
raw_comments.count()  # Materialize cache

# Ids of videos whose country is known. The filter runs BEFORE the projection:
# `country` is not part of the selected columns, so filtering after
# `.select("id")` only works through Spark's implicit missing-column resolution.
# NOTE(review): `videos` is not referenced by any later cell visible here —
# confirm whether comments were meant to be restricted to these videos.
videos = (
    spark.read.format("delta")
    .load(ENHANCED_VIDEOS_TABLE)
    .filter("country <> 'Unknown'")
    .select("id")
    .cache()
)
videos.count()  # Materialize cache

StatementMeta(, ecf36333-226e-4966-88de-82dc8acfe62c, 6, Finished, Available, Finished)

227

### Detect language

In [5]:
# Language-detection transformer: reads the comment text from `textDisplay`
# and writes the service reply into a `response` column.
model = (
    AnalyzeText()
    .setTextCol("textDisplay")
    .setKind("LanguageDetection")
    .setOutputCol("response")
)

# Surface the detected language name and ISO code as top-level columns, then
# cache so the transformed rows are reused by the cells that follow.
comments_with_lang = (
    model.transform(raw_comments)
    .withColumn("detectedLanguage", F.col("response.documents.detectedLanguage.name"))
    .withColumn("detectedLanguageIso", F.col("response.documents.detectedLanguage.iso6391Name"))
    .cache()
)
comments_with_lang.count()  # Persist transformed results

StatementMeta(, ecf36333-226e-4966-88de-82dc8acfe62c, 7, Finished, Available, Finished)

26088

In [6]:
# Language codes that are kept for sentiment scoring; comments detected in any
# other language are dropped by the filter below.
supported_languages = {
    'af', 'sq', 'am', 'ar', 'hy', 'as', 'az', 'eu', 'be', 'bn', 'bs', 'br',
    'bg', 'my', 'ca', 'zh', 'zh-hans', 'zh-hant', 'hr', 'cs', 'da', 'nl',
    'en', 'eo', 'et', 'fil', 'fi', 'fr', 'gl', 'ka', 'de', 'el', 'gu', 'ha',
    'he', 'hi', 'hu', 'id', 'ga', 'it', 'ja', 'jv', 'kn', 'kk', 'km', 'ko',
    'ku', 'ky', 'lo', 'la', 'lv', 'lt', 'mk', 'mg', 'ms', 'ml', 'mr', 'mn',
    'ne', 'no', 'or', 'om', 'ps', 'fa', 'pl', 'pt', 'pt-PT', 'pt-BR', 'pa',
    'ro', 'ru', 'sa', 'gd', 'sr', 'sd', 'si', 'sk', 'sl', 'so', 'es', 'su',
    'sw', 'sv', 'ta', 'te', 'th', 'tr', 'uk', 'ur', 'ug', 'uz', 'vi', 'cy',
    'fy', 'xh', 'yi',
}

# Keep supported-language comments, add an MD5 hash of the text (used later to
# detect edited comments), and trim to the columns needed downstream.
comments_supported = (
    comments_with_lang
    .where(F.col('detectedLanguageIso').isin(supported_languages))
    .withColumn("comment_hash", F.md5(F.col("textDisplay")))
    .select(
        "id",
        "textDisplay",
        "videoId",
        "publishedAt",
        "likeCount",
        "detectedLanguage",
        "detectedLanguageIso",
        "comment_hash",
        "_created_date",
        "_modified_date",
    )
)

StatementMeta(, ecf36333-226e-4966-88de-82dc8acfe62c, 8, Finished, Available, Finished)

### Score sentiment

In [7]:
# Split the target path into its parent folder and table name so the table can
# be looked up in the Spark catalog.
directory, table_name = ENHANCED_COMMENTS_TABLE.rsplit('/', 1)
all_databases = spark.catalog.listDatabases()

# Find the catalog database whose location is the table's parent folder.
database = next((db.name for db in all_databases if db.locationUri == directory), None)
if database is None:
    # Fail with an explicit message instead of a bare IndexError on an empty list.
    raise LookupError(f"No catalog database is registered at location '{directory}'")

if spark.catalog.tableExists(f"`{database}`.`{table_name}`"):
    logger.info(f"{table_name} table found")
    existing_comments = spark.read.format("delta").load(ENHANCED_COMMENTS_TABLE)
    # Anti-join on (id, comment_hash): keeps comments that are either absent
    # from the target table or present with a different text hash.
    new_or_modified_comments = comments_supported.join(
        existing_comments,
        [comments_supported.id == existing_comments.id, comments_supported.comment_hash == existing_comments.comment_hash],
        "left_anti"
    ).cache()
    new_or_modified_comments.count()  # Materialize the join
    logger.info("Created new_or_modified_comments from join")
else:
    # No target table yet: every supported comment is treated as new.
    new_or_modified_comments = comments_supported.cache()
    new_or_modified_comments.count()
    logger.info("Created new_or_modified_comments")

StatementMeta(, ecf36333-226e-4966-88de-82dc8acfe62c, 9, Finished, Available, Finished)

2025-03-16 22:40:40,509 - INFO - comments table found
2025-03-16 22:40:45,614 - INFO - Created new_or_modified_comments from join


In [8]:
# Nothing to score or merge: end the notebook run early with a status message.
if new_or_modified_comments.isEmpty():
    message = "No new comments identified"
    # An empty increment is a normal outcome of a re-run, not a failure, so it
    # is reported below ERROR level.
    logger.warning(message)
    notebookutils.notebook.exit(message)

StatementMeta(, ecf36333-226e-4966-88de-82dc8acfe62c, 10, Finished, Available, Finished)

In [9]:
# Sentiment transformer: same input/output columns as the language-detection
# step, but with the "SentimentAnalysis" kind.
# NOTE(review): no language is passed to the service here although
# `detectedLanguageIso` is available on every row — confirm the service default
# is acceptable for non-English comments.
model = (
    AnalyzeText()
    .setTextCol("textDisplay")
    .setKind("SentimentAnalysis")
    .setOutputCol("response")
)

# Score only the new or modified comments and flatten the sentiment label and
# the three confidence scores into top-level columns.
comments_with_sentiment = (
    model.transform(new_or_modified_comments)
    .withColumn("sentiment", F.col("response.documents.sentiment"))
    .withColumn("positiveConfidence", F.col("response.documents.confidenceScores.positive"))
    .withColumn("neutralConfidence", F.col("response.documents.confidenceScores.neutral"))
    .withColumn("negativeConfidence", F.col("response.documents.confidenceScores.negative"))
)


StatementMeta(, ecf36333-226e-4966-88de-82dc8acfe62c, 11, Finished, Available, Finished)

In [10]:
# Keep only the columns that are merged into the target table; the raw
# `response` struct and any other intermediate columns are dropped here.
comments_with_sentiment = comments_with_sentiment.select(
    [
        "id", "textDisplay", "videoId", "publishedAt", "likeCount",
        "detectedLanguage", "detectedLanguageIso", "comment_hash",
        "sentiment", "positiveConfidence", "neutralConfidence", "negativeConfidence",
        "_created_date", "_modified_date",
    ]
)


StatementMeta(, ecf36333-226e-4966-88de-82dc8acfe62c, 12, Finished, Available, Finished)

### Merge data

In [11]:
def merge_comments_data(comments_df: DataFrame, table_path: str) -> None:
    """
    Upsert scored comments into the Delta table at ``table_path``.

    Source and target rows are matched on ``id``. Matched rows have every
    column except ``id`` and ``_created_date`` overwritten from the source;
    unmatched source rows are inserted. After the merge, the inserted, updated
    and deleted row counts of the latest table commit are logged and a
    compaction is run on the table.

    Args:
        comments_df: The DataFrame containing the comments to merge.
        table_path: The Delta table ABFS path to merge into.

    Raises:
        Exception: Any failure (including a failed compaction) is logged and
            re-raised.
    """
    # Columns written on both update and insert, in target column order.
    shared_columns = [
        "textDisplay",
        "videoId",
        "publishedAt",
        "likeCount",
        "detectedLanguage",
        "detectedLanguageIso",
        "comment_hash",
        "sentiment",
        "positiveConfidence",
        "neutralConfidence",
        "negativeConfidence",
    ]
    # Updates leave `id` and `_created_date` untouched; inserts set everything.
    update_map = {name: f"source.{name}" for name in [*shared_columns, "_modified_date"]}
    insert_map = {
        name: f"source.{name}"
        for name in ["id", *shared_columns, "_created_date", "_modified_date"]
    }

    try:
        target_table = DeltaTable.forPath(spark, table_path)
        logger.info("Merging data started")
        merge_builder = target_table.alias("target").merge(
            comments_df.alias("source"),
            "target.id = source.id"
        )
        (
            merge_builder
            .whenMatchedUpdate(set=update_map)
            .whenNotMatchedInsert(values=insert_map)
            .execute()
        )
        logger.info("Merging data finished")

        # The most recent history entry is read for its operation metrics.
        latest_commit = target_table.history(1).collect()[0]
        operation_metrics = latest_commit["operationMetrics"]

        inserted_rows = int(operation_metrics.get("numTargetRowsInserted", 0))
        updated_rows = int(operation_metrics.get("numTargetRowsUpdated", 0))
        deleted_rows = int(operation_metrics.get("numTargetRowsDeleted", 0))

        logger.info(f"Rows inserted: {inserted_rows}")
        logger.info(f"Rows updated: {updated_rows}")
        logger.info(f"Rows deleted: {deleted_rows}")

        try:
            logger.info("Start optimize")
            target_table.optimize().executeCompaction()
            logger.info("Finished optimize")
        except Exception:
            # Re-raised so the outer handler logs the full details.
            logger.error("Failed to optimize")
            raise

    except Exception as e:
        logger.exception(f"Exception details: {str(e)}")
        raise
        


StatementMeta(, ecf36333-226e-4966-88de-82dc8acfe62c, 13, Finished, Available, Finished)

In [12]:
# Upsert the scored comments into the enhanced comments table.
merge_comments_data(comments_with_sentiment, ENHANCED_COMMENTS_TABLE)

StatementMeta(, ecf36333-226e-4966-88de-82dc8acfe62c, 14, Finished, Available, Finished)

2025-03-16 22:40:48,509 - INFO - Merging data started
2025-03-16 22:42:22,309 - INFO - Merging data finished
2025-03-16 22:42:23,085 - INFO - Rows inserted: 5970
2025-03-16 22:42:23,086 - INFO - Rows updated: 1
2025-03-16 22:42:23,087 - INFO - Rows deleted: 0
2025-03-16 22:42:23,087 - INFO - Start optimize
2025-03-16 22:42:24,700 - INFO - Finished optimize
