In [1]:
# Load every column of the cleaned news table into a Spark DataFrame
df = spark.sql(
    "SELECT * FROM bing_lake_db.tbl_latest_news"
)

StatementMeta(, c0b628ed-a7f5-48bc-9e11-6d20561500be, 3, Finished, Available, Finished)

In [2]:
# Importing the AnalyzeText model from SynapseML
import synapse.ml.core
from synapse.ml.services import AnalyzeText

StatementMeta(, eb165974-90f2-44c4-b19d-6e1cb31fc293, 4, Finished, Available, Finished)

In [4]:
# Instantiate the AnalyzeText transformer and configure it for sentiment analysis:
# it reads the "description" column, writes its output to "response",
# and reports errors in "error".
model = (
    AnalyzeText()
    .setKind("SentimentAnalysis")
    .setTextCol("description")
    .setOutputCol("response")
    .setErrorCol("error")
)

StatementMeta(, eb165974-90f2-44c4-b19d-6e1cb31fc293, 6, Finished, Available, Finished)

In [5]:
# Run the configured AnalyzeText model over the news DataFrame; the returned
# DataFrame carries the model's "response" and "error" output columns.
result = model.transform(df)

StatementMeta(, eb165974-90f2-44c4-b19d-6e1cb31fc293, 7, Finished, Available, Finished)

In [8]:
from pyspark.sql.functions import col

# Promote the sentiment nested inside the model response to its own
# top-level "sentiment" column.
sentiment_df = result.withColumn(
    "sentiment",
    col("response.documents.sentiment"),
)

# Keep only the news fields plus the sentiment: the raw model output and
# error columns are no longer needed downstream.
final_sentiment_df = sentiment_df.drop(
    "response",
    "error",
)

StatementMeta(, eb165974-90f2-44c4-b19d-6e1cb31fc293, 10, Finished, Available, Finished)

In [14]:
# TYPE 1 incremental load (upsert in place, no history kept) of the news data
# together with its sentiment into a Delta table.

from pyspark.sql.utils import AnalysisException

# Defined outside the try block because the except branch depends on it too.
table_name = 'bing_lake_db.sentiment_analysis'

try:
    # First run: create the table. With the default save mode, saveAsTable
    # raises if the table already exists, which routes us to the MERGE below.
    final_sentiment_df.write.format("delta").saveAsTable(table_name)

except AnalysisException:
    # NOTE(review): every AnalysisException lands here, not only
    # "table already exists" - an unrelated failure will surface as an error
    # from the MERGE statement instead.
    print("Table already exists")

    # Expose the new batch to Spark SQL so it can be used as the MERGE source.
    final_sentiment_df.createOrReplaceTempView("vw_sentiment_df_final")

    # Upsert keyed on url:
    #   * matched rows are overwritten only when a tracked column changed;
    #   * unmatched rows are inserted.
    # The change check uses the null-safe operator `<=>`: a plain `<>` yields
    # NULL when either side is NULL, so a value appearing or disappearing
    # would never be detected as a change. The OR-chain is parenthesised so
    # the whole expression is unambiguously the MATCHED condition.
    spark.sql(f""" MERGE INTO {table_name} target_table
                   USING vw_sentiment_df_final source_view

                   ON source_view.url = target_table.url

                   WHEN MATCHED AND (
                       NOT (source_view.title <=> target_table.title) OR
                       NOT (source_view.description <=> target_table.description) OR
                       NOT (source_view.category <=> target_table.category) OR
                       NOT (source_view.image <=> target_table.image) OR
                       NOT (source_view.provider <=> target_table.provider) OR
                       NOT (source_view.datePublished <=> target_table.datePublished)
                   )

                   THEN UPDATE SET *

                   WHEN NOT MATCHED THEN INSERT *

                """)

StatementMeta(, eb165974-90f2-44c4-b19d-6e1cb31fc293, 16, Finished, Available, Finished)

Table already exists
