In [1]:
from pyspark.ml.feature import RegexTokenizer, HashingTF, IDF, CountVectorizer, Normalizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, DoubleType, StringType, StructType, StructField
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
data_file = r"file:///home/jovyan/repos/distributed-sentiment-analysis-on-twitter-data/twitter_scraper/twitter_data_final.csv"
vader_analyzer = SentimentIntensityAnalyzer()

In [3]:
# Initialize a Spark session
spark = SparkSession \
    .builder \
    .appName("SentimentAnalysis") \
    .getOrCreate()

In [4]:
# define the data schema(format/structure) for our twitter data in the csv file
twitter_data_schema = StructType([StructField("date_str", StringType(), True),
                                  StructField("text", StringType(), True),
                                  StructField("user_id", StringType(), True),
                                  StructField("location", StringType(), True)])

In [5]:
df_raw = spark.read.csv(
    data_file, schema=twitter_data_schema
)

In [6]:
df_raw.show(truncate=False)

+-----------------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|date_str                                             |text              |user_id                                                                                                                                                                                           |location          |
+-----------------------------------------------------+------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|Mon Apr 16 04:56:52 +0000 2018                       |985743805822521345|URL                                                        

In [7]:
def sentiment_analysis(text):
    result = vader_analyzer.polarity_scores(str(text))
    return result

udf_sentiment_analysis = udf(sentiment_analysis)

In [8]:
sentiment_analysed = df_raw.withColumn("polarity_score", udf_sentiment_analysis(col("text")))

In [9]:
sentiment_analysed.select('text', 'polarity_score').show(n=100,truncate=False)

+-------------------------------------------------------+------------------------------------------------+
|text                                                   |polarity_score                                  |
+-------------------------------------------------------+------------------------------------------------+
|985743805822521345                                     |{neg=0.0, pos=0.0, compound=0.0, neu=1.0}       |
|985743807391019008                                     |{neg=0.0, pos=0.0, compound=0.0, neu=1.0}       |
|985743807927894016                                     |{neg=0.0, pos=0.0, compound=0.0, neu=1.0}       |
|null                                                   |{neg=0.0, pos=0.0, compound=0.0, neu=1.0}       |
|a0b47ebc53eb0e63                                       |{neg=0.0, pos=0.0, compound=0.0, neu=1.0}       |
|985743808137592834                                     |{neg=0.0, pos=0.0, compound=0.0, neu=1.0}       |
|985743807848288256                  

# Stop Spark

In [None]:
spark.stop()