## Starting Spark NLP

In [None]:
import findspark
# Run findspark's initialisation before any Spark-dependent package is imported.
findspark.init()

In [None]:
import sparknlp

# Obtain the `spark` handle through Spark NLP; the data-loading cell reads the CSV with it.
spark = sparknlp.start()

In [None]:
import comet_ml

# Initialise Comet before any CometLogger is created.
# NOTE(review): presumably this resolves the Comet credentials/config — confirm against the comet_ml docs.
comet_ml.init()

In [None]:
from sparknlp.logging.comet import CometLogger
# Logger for the first experiment (pretrained sentiment model); metrics logged
# until the matching `logger.end()` are sent through this object.
logger = CometLogger()
# Name the experiment so it can be told apart from the later 'CustomModel' run.
logger.experiment.set_name('PretrainedModel')

## Loading data

In [None]:
from pyspark.sql.functions import when, col

# Build the labelled dataset in a single chain:
#   1. read the CSV, treating its first row as the header;
#   2. drop the neutral rows (category 0);
#   3. turn the numeric category into a string label: 1 -> "positive",
#      every other remaining value -> "negative".
df = (
    spark.read.format("csv")
    .option("header", "true")
    .load("source/Twitter_Data.csv")
    .where(col("category") != 0)
    .withColumn(
        "sentiment",
        when(col("category") == 1, "positive").otherwise("negative"),
    )
)

In [None]:
# Number of rows left after the neutral rows were dropped.
df.count()

## Using a Pretrained pipeline

In [None]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Each stage reads the column(s) produced by the previous one:
#   clean_text -> document -> token -> normal -> result_sentiment -> final_sentiment

# Entry point: reads the raw `clean_text` column, writes `document`.
document = (
    DocumentAssembler()
    .setInputCol("clean_text")
    .setOutputCol("document")
)

# `document` -> `token`
token = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# `token` -> `normal`
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
)

# Pretrained sentiment model (default `pretrained()` arguments); consumes both
# `document` and `normal` and writes `result_sentiment`.
vivekn = (
    ViveknSentimentModel.pretrained()
    .setInputCols(["document", "normal"])
    .setOutputCol("result_sentiment")
)

# Exposes `result_sentiment` as the plain output column `final_sentiment`,
# which the evaluation cells read after `toPandas()`.
finisher = (
    Finisher()
    .setInputCols(["result_sentiment"])
    .setOutputCols("final_sentiment")
)

In [None]:
# Chain the stages in dependency order: each one consumes the output column of the stage before it.
pipeline = Pipeline().setStages([document, token, normalizer, vivekn, finisher])

Selecting input data

In [None]:
# Model input: only the tweet text column (the one the DocumentAssembler reads).
X = df.select('clean_text')

Fitting the pipeline and running it on the input data

In [None]:
# Fit the pipeline on the input frame to obtain a PipelineModel that can transform data.
pipelineModel = pipeline.fit(X)
# Apply the fitted pipeline to the same frame; `result` carries the `final_sentiment` column.
result = pipelineModel.transform(X)

Logging the pipeline parameters

## Logging Evaluation in Comet

In [None]:
from sklearn.metrics import classification_report

# Score the labelled rows directly instead of joining `result` back onto `df`
# by tweet text. The text is not a unique key: a tweet that occurs k times
# would come out of such a join k*k times (and could be paired with the label
# of a different copy), which distorts every metric below.
# Rows with a null `clean_text` are dropped explicitly because the previous
# inner join on that column never matched them either.
df_tot = pipelineModel.transform(df.dropna(subset=['clean_text']))
pandas_df = df_tot.toPandas()
# `final_sentiment` holds a list per row; flatten it to a single string label.
pandas_df['predicted_sentiment'] = [
    ','.join(map(str, annotations)) for annotations in pandas_df['final_sentiment']
]

report = classification_report(
    pandas_df['sentiment'],
    pandas_df['predicted_sentiment'],
    output_dict=True,
    labels=['positive', 'negative'],
)
for key, value in report.items():
    if isinstance(value, dict):
        # Per-class and averaged entries: a dict of precision/recall/f1-score/support.
        logger.log_metrics(value, prefix=key)
    else:
        # With output_dict=True the report may also contain a scalar entry
        # ("accuracy"); wrap it in a dict, the form log_metrics is given
        # everywhere else in this notebook.
        logger.log_metrics({key: value})

In [None]:
from sklearn.metrics import accuracy_score

# Overall share of rows whose predicted label equals the true label,
# logged to Comet under the key "accuracy".
accuracy = accuracy_score(
    y_true=pandas_df['sentiment'],
    y_pred=pandas_df['predicted_sentiment'],
)
logger.log_metrics({"accuracy": accuracy})

In [None]:
# Close the 'PretrainedModel' logger before a new one is created for the next experiment.
logger.end()

## Building a custom model

In [None]:
# Fresh logger for the second experiment; rebinding `logger` means the
# evaluation cells below log to this run.
logger = CometLogger()
logger.experiment.set_name('CustomModel')

In [None]:
# Trainable counterpart of the pretrained stage: same input/output columns,
# plus `sentiment` as the label column it is fitted against.
vivekn_custom = (
    ViveknSentimentApproach()
    .setInputCols(["document", "normal"])
    .setSentimentCol("sentiment")
    .setOutputCol("result_sentiment")
)

# Same stage order as before, with the trainable approach in place of the
# pretrained model. This rebinds `pipeline`.
pipeline = Pipeline().setStages(
    [document, token, normalizer, vivekn_custom, finisher]
)

## Training/Test split

In [None]:
# 80/20 train/test split. The seed is fixed so that a fresh "Restart & Run All"
# produces the same split, and therefore comparable metrics, on every run.
(training_set, test_set) = df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Keep only the text and its label for both partitions.
X_train = training_set.select('clean_text', 'sentiment')
X_test = test_set.select('clean_text', 'sentiment')

# Fit on the training partition, then predict on the held-out partition.
# `pipelineModel` and `result` are rebound here; the evaluation cells that
# follow read this `result`.
pipelineModel = pipeline.fit(X_train)
result = pipelineModel.transform(X_test)

## Logging Evaluation in Comet

In [None]:
# Bring only the true label and the prediction to the driver.
pandas_df = result.select('sentiment', 'final_sentiment').toPandas()
# `final_sentiment` holds a list per row; flatten it to a single string label.
pandas_df['predicted_sentiment'] = [
    ','.join(map(str, annotations)) for annotations in pandas_df['final_sentiment']
]

report = classification_report(
    pandas_df['sentiment'],
    pandas_df['predicted_sentiment'],
    output_dict=True,
    labels=['positive', 'negative'],
)
for key, value in report.items():
    if isinstance(value, dict):
        # Per-class and averaged entries: a dict of precision/recall/f1-score/support.
        logger.log_metrics(value, prefix=key)
    else:
        # With output_dict=True the report may also contain a scalar entry
        # ("accuracy"); wrap it in a dict, the form log_metrics is given
        # everywhere else in this notebook.
        logger.log_metrics({key: value})

In [None]:
# Overall share of held-out rows whose predicted label equals the true label,
# logged to Comet under the key "accuracy".
accuracy = accuracy_score(
    y_true=pandas_df['sentiment'],
    y_pred=pandas_df['predicted_sentiment'],
)
logger.log_metrics({"accuracy": accuracy})

In [None]:
# Close the 'CustomModel' logger now that its metrics have been logged.
logger.end()