In [42]:
# https://github.com/tthustla/setiment_analysis_pyspark/blob/master/Sentiment%20Analysis%20with%20PySpark.ipynb

In [43]:
import pyspark as ps
import warnings
from pyspark.sql import SQLContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import NGram, VectorAssembler
from pyspark.ml.feature import ChiSqSelector

In [2]:
# Start (or reuse) the Spark context and session used by the rest of the notebook.
# The classes are reached through the `ps` alias because the imports cell does not
# import `SparkContext` / `SparkSession` by name; the bare names raise NameError on
# a fresh kernel unless something else (e.g. a pyspark shell bootstrap) injects them.
sc = ps.SparkContext.getOrCreate()

# The context above already exists at this point, so the builder attaches to it.
spark = ps.sql.SparkSession.builder.appName("TweetSentiApp").getOrCreate()

In [3]:
# Load the labelled tweets with a header row and inferred column types.
# `sqlContext` is never defined in this notebook, so read through the `spark`
# session created in the setup cell instead; Spark's built-in CSV reader replaces
# the legacy 'com.databricks.spark.csv' format name.
# NOTE(review): the df.show(5) preview recorded in this notebook contains a row
# whose tweet_id is a URL fragment, which suggests tweets with embedded newlines
# are being split across rows -- consider multiLine=True (with matching
# quote/escape options) and confirm against the raw file.
df = spark.read.csv('datasetReviewed.csv', header=True, inferSchema=True)
type(df)

pyspark.sql.dataframe.DataFrame

In [4]:
# Preview the first five rows to sanity-check the parsed columns.
df.show(n=5)

+--------------------+---------------+-------------+-------------------+--------------+-------------+-------------+--------------------+----------+
|            tweet_id|        company|company_count|         created_at|favorite_count|retweet_count|  screen_name|               tweet|polaridade|
+--------------------+---------------+-------------+-------------------+--------------+-------------+-------------+--------------------+----------+
| 1128984581330407424|BANCO DO BRASIL|            1|2019-05-16 11:24:34|             0|            3|AgenciaEstado|RT @colunadobroad...|         0|
| 1128600913092857856|       IGUATEMI|            1|2019-05-15 10:00:00|             4|            1|AgenciaEstado|Iguatemi vai repe...|      null|
|https://t.co/ZTml...|              1|         null|               null|          null|         null|         null|                null|      null|
| 1128245224294293504|            IRB|            1|2019-05-14 10:26:37|             0|            1|AgenciaEsta

In [5]:
# Keep only rows where every column is populated (this also removes rows with a
# null `polaridade`, i.e. rows without a label), then report how many remain.
df = df.na.drop()
df.count()

2667

In [6]:
# Seeded 98% / 1% / 1% split into training, validation and test sets.
# NOTE(review): with the 2667 rows counted above, the validation and test sets
# hold only about 1% of the data each, so the metrics computed on them will be
# coarse -- consider a larger hold-out fraction.
train_set, val_set, test_set = df.randomSplit(weights=[0.98, 0.01, 0.01], seed=2000)

In [12]:
# Baseline feature pipeline: tokens -> hashed term frequencies -> TF-IDF, plus an
# indexed label column. It contains no classifier stage; the model is trained on
# the transformed frame in a later cell.
tokenizer = Tokenizer(outputCol="words", inputCol="tweet")
hashtf = HashingTF(inputCol="words", outputCol="tf", numFeatures=1 << 16)
# minDocFreq filters out terms that appear in fewer than 5 documents.
idf = IDF(minDocFreq=5, inputCol="tf", outputCol="features")
label_stringIdx = StringIndexer(outputCol="label", inputCol="polaridade")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

In [46]:
# Fit the pipeline on the training split only, then apply the fitted stages to
# both the training and validation splits.
# NOTE(review): the output recorded under this cell lists `cv`, `rawPrediction`,
# `probability` and `prediction` columns, which the HashingTF pipeline defined
# just above cannot produce, and the cell's execution count is out of sequence.
# The output presumably comes from a run made after `pipeline` was redefined
# further down -- re-run the notebook top to bottom to refresh it.
pipelineFit = pipeline.fit(train_set)
train_df, val_df = map(pipelineFit.transform, (train_set, val_set))
train_df.show(1)

+-----------+-------+-------------+-------------------+--------------+-------------+-------------+--------------------+----------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|   tweet_id|company|company_count|         created_at|favorite_count|retweet_count|  screen_name|               tweet|polaridade|               words|                  cv|            features|label|       rawPrediction|         probability|prediction|
+-----------+-------+-------------+-------------------+--------------+-------------+-------------+--------------------+----------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|1,10735E+18|  CIELO|            1|2019-03-17 18:35:36|             5|            1|colunadobroad|Visa e Cielo lanç...|         0|[visa, e, cielo, ...|(7804,[0,5,6,11,2...|(7804,[0,5,6,11,2...|  0.0|[148.667419061282...|[1.0,2.1485922172...|

In [16]:
# Train logistic regression on the TF-IDF training frame and score the
# validation frame. The column names are the estimator defaults, spelled out.
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)

In [17]:
# Area-under-ROC evaluator, reused by the later cells. The label column and
# metric are the evaluator defaults, spelled out for readability.
# NOTE(review): this metric assumes `polaridade` has exactly two classes; the
# notebook does not show the label cardinality -- confirm.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
evaluator.evaluate(predictions)

0.9393939393939393

In [18]:
# Accuracy = rows whose predicted class equals the indexed label, divided by
# the number of rows in the validation split.
accuracy = (
    predictions.where(predictions["label"] == predictions["prediction"]).count()
    / float(val_set.count())
)
accuracy

0.7

In [21]:
# Second variant: replace feature hashing with a learned vocabulary
# (CountVectorizer) and make the classifier the final pipeline stage, so a single
# fit/transform yields predictions. Note this rebinds `pipeline`, `idf` and `lr`.
tokenizer = Tokenizer(outputCol="words", inputCol="tweet")
cv = CountVectorizer(inputCol="words", outputCol="cv", vocabSize=1 << 16)
# minDocFreq filters out terms that appear in fewer than 5 documents.
idf = IDF(minDocFreq=5, inputCol="cv", outputCol="features")
label_stringIdx = StringIndexer(outputCol="label", inputCol="polaridade")
lr = LogisticRegression(maxIter=100)
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx, lr])

In [22]:
# Fit the end-to-end pipeline on the training split and score the validation
# split: accuracy from label/prediction agreement, ROC-AUC from `evaluator`.
pipelineFit = pipeline.fit(train_set)
predictions = pipelineFit.transform(val_set)
accuracy = (
    predictions.where(predictions["label"] == predictions["prediction"]).count()
    / float(val_set.count())
)
roc_auc = evaluator.evaluate(predictions)

In [24]:
# Print the current `accuracy` and `roc_auc` values, formatted to four decimals.
print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.8000
ROC-AUC: 0.9899


In [26]:
def build_trigrams(inputCol=["tweet","polaridade"], n=3):
    """Build an unfitted pipeline: 1..n-gram TF-IDF, chi-squared selection, logistic regression.

    Stage order: Tokenizer -> NGram (one per order) -> CountVectorizer (one per
    order) -> IDF (one per order) -> VectorAssembler -> StringIndexer ->
    ChiSqSelector -> LogisticRegression.

    Parameters
    ----------
    inputCol : sequence of two str
        ``[text_column, label_column]``. Previously this argument was accepted
        but ignored (the column names were hard-coded); it is now honoured, and
        the default reproduces the old hard-coded names.
    n : int
        Highest n-gram order; features are built for every order from 1 to ``n``.

    Returns
    -------
    pyspark.ml.Pipeline
        The assembled, not yet fitted, pipeline.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, or ``inputCol`` does not hold exactly two names.
    """
    if n < 1:
        # Without at least one n-gram order there would be nothing to assemble.
        raise ValueError("n must be >= 1, got {0!r}".format(n))

    # The default list is only read, never mutated, so sharing it across calls is safe.
    text_col, label_col = inputCol
    orders = range(1, n + 1)

    stages = [Tokenizer(inputCol=text_col, outputCol="words")]
    stages += [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in orders
    ]
    stages += [
        CountVectorizer(vocabSize=2**14, inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i))
        for i in orders
    ]
    # minDocFreq filters out n-grams that appear in fewer than 5 documents.
    stages += [
        IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5)
        for i in orders
    ]
    # Concatenate the per-order TF-IDF vectors into one raw feature vector.
    stages.append(VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in orders],
        outputCol="rawFeatures"
    ))
    # The label must be indexed before the selector, which needs the "label" column.
    stages.append(StringIndexer(inputCol=label_col, outputCol="label"))
    stages.append(ChiSqSelector(numTopFeatures=2**14, featuresCol='rawFeatures', outputCol="features"))
    stages.append(LogisticRegression(maxIter=100))
    return Pipeline(stages=stages)

In [35]:
# Fit the n-gram + chi-squared-selection pipeline (n-gram orders 1..3) on the
# training split and compute validation accuracy and ROC-AUC.
trigram_pipelineFit = build_trigrams(n=3).fit(train_set)
predictions = trigram_pipelineFit.transform(val_set)
accuracy = (
    predictions.where(predictions["label"] == predictions["prediction"]).count()
    / float(val_set.count())
)
roc_auc = evaluator.evaluate(predictions)

In [37]:
# Print the current `accuracy` and `roc_auc` values, formatted to four decimals.
print ("Accuracy Score: {0:.4f}".format(accuracy))
print ("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.8000
ROC-AUC: 0.9798


In [30]:
def build_ngrams_wocs(inputCol=["tweet","polaridade"], n=3):
    """Build an unfitted pipeline: 1..n-gram TF-IDF and logistic regression, with no chi-squared selection.

    Stage order: Tokenizer -> NGram (one per order) -> CountVectorizer (one per
    order, vocabulary capped at 5460 terms) -> IDF (one per order) ->
    VectorAssembler -> StringIndexer -> LogisticRegression.

    Parameters
    ----------
    inputCol : sequence of two str
        ``[text_column, label_column]``. Previously this argument was accepted
        but ignored (the column names were hard-coded); it is now honoured, and
        the default reproduces the old hard-coded names.
    n : int
        Highest n-gram order; features are built for every order from 1 to ``n``.

    Returns
    -------
    pyspark.ml.Pipeline
        The assembled, not yet fitted, pipeline.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, or ``inputCol`` does not hold exactly two names.
    """
    if n < 1:
        # Without at least one n-gram order there would be nothing to assemble.
        raise ValueError("n must be >= 1, got {0!r}".format(n))

    # The default list is only read, never mutated, so sharing it across calls is safe.
    text_col, label_col = inputCol
    orders = range(1, n + 1)

    stages = [Tokenizer(inputCol=text_col, outputCol="words")]
    stages += [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in orders
    ]
    stages += [
        CountVectorizer(vocabSize=5460, inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i))
        for i in orders
    ]
    # minDocFreq filters out n-grams that appear in fewer than 5 documents.
    stages += [
        IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5)
        for i in orders
    ]
    # The assembled vector is written straight to "features": no selector stage here.
    stages.append(VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in orders],
        outputCol="features"
    ))
    stages.append(StringIndexer(inputCol=label_col, outputCol="label"))
    stages.append(LogisticRegression(maxIter=100))
    return Pipeline(stages=stages)

In [31]:
# Fit the n-gram pipeline without chi-squared selection (n-gram orders 1..3) on
# the training split and compute validation accuracy and ROC-AUC.
trigramwocs_pipelineFit = build_ngrams_wocs(n=3).fit(train_set)
predictions_wocs = trigramwocs_pipelineFit.transform(val_set)
accuracy_wocs = (
    predictions_wocs.where(
        predictions_wocs["label"] == predictions_wocs["prediction"]
    ).count()
    / float(val_set.count())
)
roc_auc_wocs = evaluator.evaluate(predictions_wocs)

In [32]:
# Print the validation accuracy and ROC-AUC of the pipeline without chi-squared
# selection (`accuracy_wocs`, `roc_auc_wocs`), formatted to four decimals.
print("Accuracy Score: {0:.4f}".format(accuracy_wocs))
print("ROC-AUC: {0:.4f}".format(roc_auc_wocs))

Accuracy Score: 0.8000
ROC-AUC: 0.9798


In [36]:
# Score the held-out test split with the fitted pipeline that has no chi-squared
# selection: accuracy from label/prediction agreement, ROC-AUC from `evaluator`.
test_predictions = trigramwocs_pipelineFit.transform(test_set)
test_accuracy = (
    test_predictions.where(
        test_predictions["label"] == test_predictions["prediction"]
    ).count()
    / float(test_set.count())
)
test_roc_auc = evaluator.evaluate(test_predictions)

In [38]:
# Print the test-set accuracy and ROC-AUC, formatted to four decimals.
print("Accuracy Score: {0:.4f}".format(test_accuracy))
print("ROC-AUC: {0:.4f}".format(test_roc_auc))

Accuracy Score: 0.7500
ROC-AUC: 0.8021
