# Imports

In [None]:
import synapse.ml.core
from synapse.ml.services.language import AnalyzeText
from pyspark.sql.functions import col

# Load text from Raw

In [None]:
# Load every column of the Raw.comments table into a Spark DataFrame.
comments = spark.sql(
    "SELECT * FROM Raw.comments"
)

In [None]:
# Count the comments as loaded, before the join and language filter below,
# and print the figure so it can be compared with later counts.
commentCount = comments.count()
print(commentCount)

In [None]:
# Fetch only the video Id, renamed to videoPrimaryId so it does not clash
# with column names on the comments DataFrame when the two are joined.
videos = spark.sql(
    "SELECT Id AS videoPrimaryId FROM Enhanced.videos"
)

Inner join the comments and videos tables to remove comments on videos that are not country specific.

In [None]:
# Keep only comments whose videoId matches a row in `videos`, then drop the
# join key so the result has the same columns as the original comments.
comments = (
    comments
    .join(videos, on=comments.videoId == videos.videoPrimaryId, how="inner")
    .drop("videoPrimaryId")
)

In [None]:
# Row count after the inner join with `videos`. Left as a bare trailing
# expression so the notebook shows the value as the cell output.
comments.count()

# Detect Language

In [None]:
# Configure the language-detection transformer: it reads the comment text
# from "textDisplay" and stores the service reply in "response".
model = (
    AnalyzeText()
    .setKind("LanguageDetection")
    .setTextCol("textDisplay")
    .setOutputCol("response")
)

# Run detection and surface the detected language name and its ISO code
# from the nested response as top-level columns.
result = (
    model.transform(comments)
    .withColumn(
        "detectedLanguage",
        col("response.documents.detectedLanguage.name"),
    )
    .withColumn(
        "detectedLanguageIso",
        col("response.documents.detectedLanguage.iso6391Name"),
    )
)


### Filter to supported languages

In [None]:
# Language codes that are kept for sentiment scoring; rows whose detected
# ISO code is not in this set are filtered out further down.
# NOTE(review): codes are matched verbatim against detectedLanguageIso —
# confirm the detector emits these exact spellings (e.g. "zh-hans", "pt-BR").
supportedLanguages = {
    "af", "sq", "am", "ar", "hy", "as", "az", "eu", "be", "bn",
    "bs", "br", "bg", "my", "ca", "zh", "zh-hans", "zh-hant", "hr", "cs",
    "da", "nl", "en", "eo", "et", "fil", "fi", "fr", "gl", "ka",
    "de", "el", "gu", "ha", "he", "hi", "hu", "id", "ga", "it",
    "ja", "jv", "kn", "kk", "km", "ko", "ku", "ky", "lo", "la",
    "lv", "lt", "mk", "mg", "ms", "ml", "mr", "mn", "ne", "no",
    "or", "om", "ps", "fa", "pl", "pt", "pt-PT", "pt-BR", "pa", "ro",
    "ru", "sa", "gd", "sr", "sd", "si", "sk", "sl", "so", "es",
    "su", "sw", "sv", "ta", "te", "th", "tr", "uk", "ur", "ug",
    "uz", "vi", "cy", "fy", "xh", "yi",
}

In [None]:
# Retain only the rows whose detected ISO code is one of the supported codes.
resultSupported = result.filter(
    col("detectedLanguageIso").isin(supportedLanguages)
)

In [None]:
# Persist the filtered comments as a Delta table named "commentsSupported",
# replacing any existing data and schema.
(
    resultSupported.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("commentsSupported")
)

### Score sentiment

In [None]:
# Reload the filtered comments from the saved table.
# NOTE(review): the table is saved above under the unqualified name
# "commentsSupported"; this read assumes that resolves to Enhanced — confirm.
resultSupported = spark.sql(
    "SELECT * FROM Enhanced.commentsSupported"
)

In [None]:
# Count the comments that remain after the supported-language filter.
resultCount = resultSupported.count()
print(resultCount)
# Show how many comments were dropped since the initial load. The initial
# count is stored in `commentCount`; the previous reference to
# `commentsCount` named a variable that is never defined and raised NameError.
# `commentCount` was taken before the join with videos, so this difference
# covers rows removed by both the join and the language filter.
commentCount - resultCount

In [None]:
# Configure the sentiment-analysis transformer over the same text column;
# the service reply is again written to "response".
model = (
    AnalyzeText()
    .setKind("SentimentAnalysis")
    .setTextCol("textDisplay")
    .setOutputCol("response")
)

# Score each comment and lift the sentiment label and the three confidence
# scores out of the nested response into top-level columns.
result = (
    model.transform(resultSupported)
    .withColumn("sentiment", col("response.documents.sentiment"))
    .withColumn(
        "positiveConfidence",
        col("response.documents.confidenceScores.positive"),
    )
    .withColumn(
        "neutralConfidence",
        col("response.documents.confidenceScores.neutral"),
    )
    .withColumn(
        "negativeConfidence",
        col("response.documents.confidenceScores.negative"),
    )
)


In [None]:
# Narrow the result to the comment fields plus the derived language and
# sentiment columns; the raw "response" struct and ISO code are left out.
result = result.select(
    "id",
    "textDisplay",
    "publishedAt",
    "likeCount",
    "videoId",
    "detectedLanguage",
    "sentiment",
    "positiveConfidence",
    "neutralConfidence",
    "negativeConfidence",
)

# Write Data

In [None]:
# Save the scored comments as a Delta table named "comments", replacing any
# existing data and schema.
(
    result.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("comments")
)

### Remove commentsSupported Table

In [None]:
%%sql
-- Remove the intermediate commentsSupported table written earlier in this
-- notebook; IF EXISTS keeps the cell from failing when it is already gone.
DROP TABLE IF EXISTS commentsSupported