In [4]:
import re
import pyspark 
from pyspark.streaming import StreamingContext
from pyspark.sql.session import SparkSession
from pyspark.sql import SQLContext, Row, Column
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [5]:
# define the function to get the predicted sentiment on the data received
def get_prediction(tweet_text):
    try:
        # remove the blank tweets
        tweet_text = tweet_text.filter(lambda x: len(x) > 0)
        # create the dataframe with each row contains a tweet text
        rowRdd = tweet_text.map(lambda w: Row(tweet=w))
        wordsDataFrame = spark.createDataFrame(rowRdd)
        # get the sentiments for each row
        test_df = pipelineFit.transform(wordsDataFrame)
        model.transform(test_df).select('tweet','prediction').show()
        with open('test.csv', 'a') as tf:
            tf.write(model)
    except : 
        print('No data')

In [6]:
conf = SparkConf().setAppName("Test").setMaster("local")
sc = SparkContext.getOrCreate(conf)
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc,10)

In [7]:
# reading the data set
print('\n\nReading the dataset...........................\n')
data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('datasetReviewed.csv')
data = data.dropna()
data.show(3, truncate=15)



Reading the dataset...........................

+---------------+---------------+-------------+---------------+--------------+-------------+-------------+---------------+----------+
|       tweet_id|        company|company_count|     created_at|favorite_count|retweet_count|  screen_name|          tweet|polaridade|
+---------------+---------------+-------------+---------------+--------------+-------------+-------------+---------------+----------+
|112898458133...|BANCO DO BRASIL|            1|2019-05-16 1...|             0|            3|AgenciaEstado|RT @colunado...|         0|
|112824522429...|            IRB|            1|2019-05-14 1...|             0|            1|AgenciaEstado|RT @colunado...|         1|
|112754225315...|           ITAÚ|            1|2019-05-12 1...|             0|            4|AgenciaEstado|RT @colunado...|         1|
+---------------+---------------+-------------+---------------+--------------+-------------+-------------+---------------+----------+
only showing

In [8]:
udf = UserDefinedFunction(lambda x: re.sub(r"http\S+", "", x).lower().replace('.','').replace(';','').replace('-','').replace(':','').replace(')','').replace('"','').replace('rt',''), StringType())

data = data.select(*[udf(column).alias(column) for column in data.columns])

In [9]:
print('\n\nDefining the pipeline stages.................\n')

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")

remover = StopWordsRemover(inputCol="words", outputCol="filtered")

hashtf = HashingTF(numFeatures=2**16, inputCol="filtered", outputCol='tf')

idf = IDF(inputCol='tf', outputCol="features", minDocFreq=3)

label_stringIdx = StringIndexer(inputCol = "polaridade", outputCol = "label")

print('\n\nStages Defined................................\n')
pipeline = Pipeline(stages=[tokenizer, remover, hashtf, idf, label_stringIdx])



Defining the pipeline stages.................



Stages Defined................................



In [10]:
print('\n\nFit the pipeline with the training data.......\n')
pipelineFit = pipeline.fit(data)
train_df = pipelineFit.transform(data)



Fit the pipeline with the training data.......



In [11]:
# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

print('\n\nModel Trained....Waiting for the Data!!!!!!!!\n')
# train the model
model = nb.fit(train_df)



Model Trained....Waiting for the Data!!!!!!!!



In [12]:
ssc = StreamingContext(sc, batchDuration= 3)
lines = ssc.socketTextStream("127.0.0.1", 8181)
words = lines.flatMap(lambda line : line.split('TWEET_APP'))

words.foreachRDD(get_prediction)

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate

No data
No data
No data
+-----------+----------+
|      tweet|prediction|
+-----------+----------+
|AAAAAAAAAAA|       0.0|
+-----------+----------+

+-----+----------+
|tweet|prediction|
+-----+----------+
| tete|       0.0|
+-----+----------+

No data
No data
+----------+----------+
|     tweet|prediction|
+----------+----------+
|odeio isso|       0.0|
+----------+----------+

No data
+--------+----------+
|   tweet|prediction|
+--------+----------+
|amo isso|       0.0|
+--------+----------+

No data
No data
+------+----------+
| tweet|prediction|
+------+----------+
|gostei|       0.0|
+------+----------+

No data
No data
+-----+----------+
|tweet|prediction|
+-----+----------+
|odiei|       0.0|
+-----+----------+

No data
No data
No data
No data
+-----------+----------+
|      tweet|prediction|
+-----------+----------+
|gosot muito|       0.0|
+-----------+----------+

No data
No data
No data


KeyboardInterrupt: 