In [1]:
from pyspark.ml.feature import RegexTokenizer, HashingTF, IDF, CountVectorizer, Normalizer, StringIndexer, Tokenizer
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, DoubleType, StringType, StructType, StructField
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from nltk.sentiment import SentimentIntensityAnalyzer
import pyspark

Note that to run this notebook up you need to wget your own dataset using:

wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

and unzip the file into ./sentiment140 subfolder.

This dataset is too large to push to the github repository.

In [2]:
data_file = r"file:///home/jovyan/repos/distributed-sentiment-analysis-on-twitter-data/sentiment140/training.1600000.processed.noemoticon.csv"

In [3]:
conf = pyspark.SparkConf().setAll([('spark.executor.memory', '3g'),
                                   ('spark.executor.cores', '2'), # 4
                                   ('spark.cores.max', '2'), # 4
                                   ('spark.driver.memory','4g')])

# Initialize a Spark session
spark = SparkSession \
    .builder \
    .appName("TrainSentimentModel") \
    .config(conf=conf) \
    .getOrCreate()

sc = spark.sparkContext

In [4]:
sc.getConf().getAll()

[('spark.executor.memory', '3g'),
 ('spark.app.name', 'TrainSentimentModel'),
 ('spark.driver.port', '43089'),
 ('spark.driver.memory', '4g'),
 ('spark.app.id', 'local-1523988106481'),
 ('spark.executor.id', 'driver'),
 ('spark.executor.cores', '2'),
 ('spark.driver.host', 'de4f1c03e850'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.cores.max', '2'),
 ('spark.master', 'local[*]'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

In [5]:
# define the data schema(format/structure) for our twitter data in the csv file
training_data_schema = StructType([StructField("target", StringType(), True),
                                  StructField("id", StringType(), True),
                                  StructField("date", StringType(), True),
                                  StructField("query", StringType(), True),
                                  StructField("user", StringType(), True),
                                  StructField("text_raw", StringType(), True)])

In [6]:
df_raw = spark.read.csv(
    data_file, schema=training_data_schema
)

In [7]:
df_raw.show(truncate=False)

+------+----------+----------------------------+--------+---------------+---------------------------------------------------------------------------------------------------------------------+
|target|id        |date                        |query   |user           |text_raw                                                                                                             |
+------+----------+----------------------------+--------+---------------+---------------------------------------------------------------------------------------------------------------------+
|0     |1467810369|Mon Apr 06 22:19:45 PDT 2009|NO_QUERY|_TheSpecialOne_|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  |
|0     |1467810672|Mon Apr 06 22:19:49 PDT 2009|NO_QUERY|scotthamilton  |is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!      |
|0     |1467810917|Mon Apr 06 22:19:53 P

In [8]:
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()

at_user_pat = r'@[A-Za-z0-9_]+'  # r'@[\w]+'
url_pat = r'https?://[^ ]+'  # r'https?:\/\/[^\s]+'
www_pat = r'www.[^ ]+'
repeating_chars_pat = r'([A-Za-z])\1+'
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        bom_removed = souped
    stripped = re.sub(at_user_pat, 'USERNAME', bom_removed)
    stripped = re.sub(url_pat, 'URL', stripped)
    stripped = re.sub(www_pat, 'URL', stripped)
    stripped = re.sub(repeating_chars_pat, r'\1\1', stripped)
    
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = [x for x  in tok.tokenize(letters_only) if len(x) > 1]
    return (" ".join(words)).strip()

udf_tweet_cleaner = udf(tweet_cleaner)

In [9]:
text_preprocessed = df_raw.withColumn("text", udf_tweet_cleaner(col("text_raw")))

In [10]:
text_preprocessed.select("target", "text_raw").show(truncate=False)

+------+---------------------------------------------------------------------------------------------------------------------+
|target|text_raw                                                                                                             |
+------+---------------------------------------------------------------------------------------------------------------------+
|0     |@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  |
|0     |is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!      |
|0     |@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds                            |
|0     |my whole body feels itchy and like its on fire                                                                       |
|0     |@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over

In [11]:
text_preprocessed.select("target", "text").show(truncate=False)

+------+---------------------------------------------------------------------------------------------------------+
|target|text                                                                                                     |
+------+---------------------------------------------------------------------------------------------------------+
|0     |username url aww that bummer you shoulda got david carr of third day to do it                            |
|0     |is upset that he can not update his facebook by texting it and might cry as result school today also blah|
|0     |username dived many times for the ball managed to save the rest go out of bounds                         |
|0     |my whole body feels itchy and like its on fire                                                           |
|0     |username no it not behaving at all mad why am here because can not see you all over there                |
|0     |username not the whole crew                                             

In [12]:
type(text_preprocessed)

pyspark.sql.dataframe.DataFrame

In [13]:
# text_preprocessed = text_preprocessed.dropna()

In [14]:
(train_set, val_set, test_set) = text_preprocessed.randomSplit([0.98, 0.01, 0.01], seed = 2018)

In [15]:
%%time
train_set.cache()
val_set.cache()
test_set.cache()

train_set.count()
val_set.count()
test_set.count()

CPU times: user 60 ms, sys: 10 ms, total: 70 ms
Wall time: 5min 4s


# HashingTF + IDF + Logistic Regression

In [16]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [17]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

In [20]:
%%time
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "target", outputCol = "label")
lr = LogisticRegression(maxIter=100)
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx, lr])

pipelineFit = pipeline.fit(train_set)
predictions = pipelineFit.transform(val_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7904
ROC-AUC: 0.8615
CPU times: user 160 ms, sys: 20 ms, total: 180 ms
Wall time: 32.9 s


In [22]:
pipelineFit.save("hashtf_idf_lr")

In [18]:
%%time
pipelineFit_loaded = PipelineModel.load("hashtf_idf_lr")

predictions = pipelineFit_loaded.transform(val_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7904
ROC-AUC: 0.8615
CPU times: user 70 ms, sys: 10 ms, total: 80 ms
Wall time: 3.62 s


# CountVectorizer + IDF + Logistic Regression

In [21]:
%%time
from pyspark.ml.feature import CountVectorizer

tokenizer = Tokenizer(inputCol="text", outputCol="words")
cv = CountVectorizer(vocabSize=2**16, inputCol="words", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "target", outputCol = "label")
lr = LogisticRegression(maxIter=100)
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx, lr])

pipelineFit = pipeline.fit(train_set)
predictions = pipelineFit.transform(val_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7971
ROC-AUC: 0.8662
CPU times: user 180 ms, sys: 20 ms, total: 200 ms
Wall time: 39.9 s


In [24]:
pipelineFit.save("cv_idf_lr")

In [19]:
%%time
pipelineFit_loaded = PipelineModel.load("cv_idf_lr")

predictions = pipelineFit_loaded.transform(val_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7971
ROC-AUC: 0.8662
CPU times: user 70 ms, sys: 0 ns, total: 70 ms
Wall time: 2.58 s


# Using N-Gram

In [21]:
from pyspark.ml.feature import NGram, VectorAssembler
from pyspark.ml.feature import ChiSqSelector

def build_trigrams(inputCol=["text","target"], n=3):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=2**14,inputCol="{0}_grams".format(i),
            outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5) for i in range(1, n + 1)]

    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
        outputCol="rawFeatures"
    )]
    label_stringIdx = [StringIndexer(inputCol = "target", outputCol = "label")]
    selector = [ChiSqSelector(numTopFeatures=2**14,featuresCol='rawFeatures', outputCol="features")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf+ assembler + label_stringIdx+selector+lr)

In [19]:
def build_ngrams_wocs(inputCol=["text","target"], n=3):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=5460,inputCol="{0}_grams".format(i),
            outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5) for i in range(1, n + 1)]

    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
        outputCol="features"
    )]
    label_stringIdx = [StringIndexer(inputCol = "target", outputCol = "label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf+ assembler + label_stringIdx+lr)

In [24]:
%%time
trigramwocs_pipelineFit = build_ngrams_wocs().fit(train_set)
predictions_wocs = trigramwocs_pipelineFit.transform(val_set)
accuracy_wocs = predictions_wocs.filter(predictions_wocs.label == predictions_wocs.prediction).count() / float(val_set.count())
roc_auc_wocs = evaluator.evaluate(predictions_wocs)

print("Accuracy Score: {0:.4f}".format(accuracy_wocs))
print("ROC-AUC: {0:.4f}".format(roc_auc_wocs))

Accuracy Score: 0.8115
ROC-AUC: 0.8879
CPU times: user 220 ms, sys: 100 ms, total: 320 ms
Wall time: 3min 11s


In [25]:
trigramwocs_pipelineFit.save("ngram_cv_idf_lr")

In [27]:
%%time
trigramwocs_pipelineFit_loaded = PipelineModel.load("ngram_cv_idf_lr")

predictions = trigramwocs_pipelineFit_loaded.transform(val_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.8115
ROC-AUC: 0.8879
CPU times: user 130 ms, sys: 20 ms, total: 150 ms
Wall time: 2.94 s


In [28]:
%%time
test_predictions = trigramwocs_pipelineFit.transform(test_set)
test_accuracy = test_predictions.filter(test_predictions.label == test_predictions.prediction).count() / float(test_set.count())
test_roc_auc = evaluator.evaluate(test_predictions)

print("Accuracy Score: {0:.4f}".format(test_accuracy))
print("ROC-AUC: {0:.4f}".format(test_roc_auc))

Accuracy Score: 0.8092
ROC-AUC: 0.8858
CPU times: user 40 ms, sys: 30 ms, total: 70 ms
Wall time: 1.41 s


# Using Naive Bayes with ngram_cv_idf_nb

In [89]:
from pyspark.ml.classification import NaiveBayes

In [106]:
def build_ngrams_nb(inputCol=["text","target"], n=3):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=5460,inputCol="{0}_grams".format(i),
            outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5) for i in range(1, n + 1)]

    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
        outputCol="features"
    )]
    label_stringIdx = [StringIndexer(inputCol = "target", outputCol = "label")]
    nb = [NaiveBayes(smoothing=1.0, modelType="multinomial")]
    return Pipeline(stages=tokenizer + ngrams + cv + idf+ assembler + label_stringIdx+nb)

In [96]:
%%time
nb_pipelineFit = build_ngrams_nb().fit(train_set)
predictions = nb_pipelineFit.transform(val_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7846
CPU times: user 270 ms, sys: 70 ms, total: 340 ms
Wall time: 2min 41s


In [98]:
nb_pipelineFit.save("ngram_cv_idf_nb")

In [105]:
%%time
nb_pipelineFit_loaded = PipelineModel.load("ngram_cv_idf_nb")

predictions = nb_pipelineFit_loaded.transform(val_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7846
ROC-AUC: 0.5529
CPU times: user 120 ms, sys: 20 ms, total: 140 ms
Wall time: 2.74 s


# Using Gradient Boost Tree with ngram_cv_idf_gbt

In [18]:
from pyspark.ml.classification import GBTClassifier

In [19]:
def build_ngrams_gbt(inputCol=["text","target"], n=3):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=5460,inputCol="{0}_grams".format(i),
            outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5) for i in range(1, n + 1)]

    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
        outputCol="features"
    )]
    label_stringIdx = [StringIndexer(inputCol = "target", outputCol = "label")]
    gbt = [GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf+ assembler + label_stringIdx+gbt)

In [None]:
%%time
gbt_pipelineFit = build_ngrams_gbt().fit(train_set)
predictions = gbt_pipelineFit.transform(val_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

# Linear SVM

In [20]:
from pyspark.ml.classification import LinearSVC

In [21]:
def build_ngrams_lsvc(inputCol=["text","target"], n=3):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=5460,inputCol="{0}_grams".format(i),
            outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5) for i in range(1, n + 1)]

    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
        outputCol="features"
    )]
    label_stringIdx = [StringIndexer(inputCol = "target", outputCol = "label")]
    lsvc = [LinearSVC(maxIter=10, regParam=0.1)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf+ assembler + label_stringIdx+lsvc)

In [22]:
%%time
lsvc_pipelineFit = build_ngrams_lsvc().fit(train_set)
predictions = lsvc_pipelineFit.transform(val_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.8080
ROC-AUC: 0.8840
CPU times: user 250 ms, sys: 110 ms, total: 360 ms
Wall time: 12min 23s


In [23]:
lsvc_pipelineFit.save("ngram_cv_idf_lsvc")

In [24]:
%%time
lsvc_pipelineFit_loaded = PipelineModel.load("ngram_cv_idf_lsvc")

predictions = lsvc_pipelineFit_loaded.transform(val_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.8080
ROC-AUC: 0.8840
CPU times: user 90 ms, sys: 40 ms, total: 130 ms
Wall time: 3.69 s


# Comparing Performance of Models on Sentiment140 Test Set

In [51]:
test_data_file = r"file:///home/jovyan/repos/distributed-sentiment-analysis-on-twitter-data/sentiment140/testdata.manual.2009.06.14.csv"

In [52]:
test_data_schema = StructType([StructField("target", StringType(), True),
                               StructField("id", StringType(), True),
                               StructField("date", StringType(), True),
                               StructField("query", StringType(), True),
                               StructField("user", StringType(), True),
                               StructField("text_raw", StringType(), True)])

In [53]:
df_test = spark.read.csv(
    test_data_file, schema=test_data_schema
)

In [54]:
df_test.show(5)

+------+---+--------------------+-------+--------+--------------------+
|target| id|                date|  query|    user|            text_raw|
+------+---+--------------------+-------+--------+--------------------+
|     4|  3|Mon May 11 03:17:...|kindle2|  tpryan|@stellargirl I lo...|
|     4|  4|Mon May 11 03:18:...|kindle2|  vcu451|Reading my kindle...|
|     4|  5|Mon May 11 03:18:...|kindle2|  chadfu|Ok, first assesme...|
|     4|  6|Mon May 11 03:19:...|kindle2|   SIX15|@kenburbary You'l...|
|     4|  7|Mon May 11 03:21:...|kindle2|yamarama|@mikefish  Fair e...|
+------+---+--------------------+-------+--------+--------------------+
only showing top 5 rows



In [55]:
benchmark_set = df_test.withColumn("text", udf_tweet_cleaner(col("text_raw")))

In [61]:
benchmark_set = benchmark_set.filter((col("target") != "2"))

In [66]:
benchmark_set.cache()

benchmark_set.count()

359

### Testing hashtf_idf_lr

In [87]:
pipelineFit_loaded = PipelineModel.load("hashtf_idf_lr")

In [88]:
%%time

predictions = pipelineFit_loaded.transform(benchmark_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(benchmark_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7827
ROC-AUC: 0.8436
CPU times: user 10 ms, sys: 20 ms, total: 30 ms
Wall time: 288 ms


### Testing cv_idf_lr

In [85]:
pipelineFit_loaded = PipelineModel.load("cv_idf_lr")

In [86]:
%%time

predictions = pipelineFit_loaded.transform(benchmark_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(benchmark_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.7827
ROC-AUC: 0.8446
CPU times: user 30 ms, sys: 0 ns, total: 30 ms
Wall time: 851 ms


### Testing ngram_cv_idf_lr

In [69]:
trigramwocs_pipelineFit_loaded = PipelineModel.load("ngram_cv_idf_lr")

In [83]:
%%time

test_predictions = trigramwocs_pipelineFit_loaded.transform(benchmark_set)
test_accuracy = test_predictions.filter(test_predictions.label == test_predictions.prediction).count() / float(benchmark_set.count())
test_roc_auc = evaluator.evaluate(test_predictions)

print("Accuracy Score: {0:.4f}".format(test_accuracy))
print("ROC-AUC: {0:.4f}".format(test_roc_auc))

Accuracy Score: 0.8106
ROC-AUC: 0.9019
CPU times: user 40 ms, sys: 30 ms, total: 70 ms
Wall time: 634 ms


### Testing NLTK Vader Analyser

In [79]:
from nltk.sentiment import SentimentIntensityAnalyzer

vader_analyzer = SentimentIntensityAnalyzer()

In [80]:
def nltk_sentiment_analysis(text):
    result = vader_analyzer.polarity_scores(str(text))
    prediction = "4" if result['compound'] >= 0 else "0"
    # prediction = "4" if result['pos'] >= result['neg'] else "0"

    return prediction

udf_nltk_sentiment_analysis = udf(nltk_sentiment_analysis)

In [84]:
%%time

test_predictions = benchmark_set.withColumn("nltk_prediction", udf_nltk_sentiment_analysis(col("text")))
test_accuracy = test_predictions.filter(test_predictions.target == test_predictions.nltk_prediction).count() / float(benchmark_set.count())

print("Accuracy Score: {0:.4f}".format(test_accuracy))

Accuracy Score: 0.7855
CPU times: user 10 ms, sys: 0 ns, total: 10 ms
Wall time: 220 ms
