### Inicializando SparkContext

In [1]:
import pyspark 
from pyspark.sql import SQLContext
# to start a spark context

sc=SparkContext.getOrCreate()   
    
spark = SparkSession.builder.appName("TweetSentiApp").getOrCreate()

### Importando o arquivo e retirando NaN encontrados

In [2]:
data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('datasetReviewed.csv')
#data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('hdfs://localhost:9000/user/input/datasetReviewed.csv')
data = data.dropna()

### Separando dataframe em 80% treino e 20% teste

In [3]:
(train_set, test_set) = data.randomSplit([0.8, 0.2], seed = 0)

### Limpando o dataframe

In [4]:
import re
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import *

udf = UserDefinedFunction(lambda x: re.sub(r"http\S+", "", x).lower().replace('.','').replace(';','').replace('-','').replace(':','').replace(')','').replace('"','').replace('rt',''), StringType())

data = data.select(*[udf(column).alias(column) for column in data.columns])

### Tokenização

In [5]:
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
df_token = tokenizer.transform(data)

### Stop Words

In [6]:
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
df_remover = remover.transform(df_token)

### TDIDF

In [7]:
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import StringIndexer

hashtf = HashingTF(numFeatures=2**16, inputCol="filtered", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=3)
label_stringIdx = StringIndexer(inputCol = "polaridade", outputCol = "label")

### Pipeline

In [8]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[tokenizer, remover, hashtf, idf, label_stringIdx])
pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
test_df = pipelineFit.transform(test_set)

### Naive Bayes classification

In [9]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# train the model
model = nb.fit(train_df)

# select example rows to display.
predictions = model.transform(test_df)

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
evaluator.evaluate(predictions)

0.768796992481203

### Selecionando do dataframe predictions igual a 1 ou 0

In [10]:
predictions.createOrReplaceTempView("temp")
predictions = spark.sql('SELECT * FROM temp WHERE prediction == 1 OR prediction == 0')

predictions.groupBy('polaridade','prediction').count().show()

+----------+----------+-----+
|polaridade|prediction|count|
+----------+----------+-----+
|         1|       0.0|   34|
|        -1|       1.0|   26|
|         0|       0.0|  238|
|         0|       1.0|   38|
|         1|       1.0|  148|
|        -1|       0.0|   20|
+----------+----------+-----+



### Droppando colunas temporárias

In [11]:
drop_list = ['words','filtered','tf','features','label','rawPrediction','probability','column_as_str']
predictions = predictions.select([column for column in predictions.columns if column not in drop_list])

### Salvando arquivo resultado no HDFS

In [12]:
predictions.write.format('com.databricks.spark.csv').mode("overwrite").options(header='true', inferschema='true').save('hdfs://localhost:9000/user/input/result/')

### Verificando arquivos no HDFS

In [1]:
! /home/vineasouza/hadoop/bin/hadoop fs -ls /user/input/result/

Found 2 items
-rw-r--r--   3 vineasouza supergroup          0 2020-07-08 17:49 /user/input/result/_SUCCESS
-rw-r--r--   3 vineasouza supergroup      91585 2020-07-08 17:49 /user/input/result/part-00000-a7fb3af3-6a94-4bd2-86d3-0f97dd44e095-c000.csv
