In [10]:
from pyspark.ml.feature import RegexTokenizer, HashingTF, IDF, CountVectorizer, Normalizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, DoubleType, StringType, StructType, StructField
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from nltk.sentiment import SentimentIntensityAnalyzer

In [11]:
# VADER analyzer instance used later to score tweet text
vader_analyzer = SentimentIntensityAnalyzer()

# Local-filesystem URI of the scraped Twitter CSV that Spark will read
data_file = r"file:///home/jovyan/repos/distributed-sentiment-analysis-on-twitter-data/twitter_scraper/data/twitter_data.csv"

In [12]:
# Create the Spark session for this notebook (or reuse one that already exists)
spark = (
    SparkSession.builder
    .appName("SentimentAnalysis")
    .getOrCreate()
)

In [13]:
# Schema of the Twitter CSV: four nullable string columns, listed in file order
twitter_data_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("date_str", "text", "user_id", "location")
    ]
)

In [14]:
# Load the CSV with the explicit schema instead of letting Spark infer types
df_raw = spark.read.schema(twitter_data_schema).csv(data_file)

In [15]:
# Preview the first 20 rows without truncating long cell values
df_raw.show(n=20, truncate=False)

+------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+----------------------+
|date_str                      |text                                                                                                                                                                      |user_id           |location              |
+------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+----------------------+
|Sat Apr 14 07:27:36 +0000 2018|Y'all I'm so in love                                                                                                                                                      |3466453992        |Brookings, SD         |
|Sat Apr 14 07:2

In [55]:
# Names of the scores VADER reports for a piece of text.
_POLARITY_FIELDS = ("neg", "neu", "pos", "compound")

# Typed return schema for the UDF. Without an explicit return type Spark
# falls back to StringType and the score dict ends up as an opaque string.
polarity_score_schema = StructType(
    [StructField(field_name, DoubleType(), True) for field_name in _POLARITY_FIELDS]
)


def sentiment_analysis(text):
    """Score a single tweet with the VADER sentiment analyzer.

    Args:
        text: The tweet text; may be None for rows with a null `text` column.

    Returns:
        A dict with float values for "neg", "neu", "pos" and "compound",
        or None when `text` is None (so a null input stays null rather than
        being scored as the literal word "None").
    """
    if text is None:
        return None
    scores = vader_analyzer.polarity_scores(str(text))
    # Coerce to plain floats so every value matches the DoubleType fields.
    return {field_name: float(scores[field_name]) for field_name in _POLARITY_FIELDS}


# Spark UDF that yields a struct column with one double per polarity score.
udf_sentiment_analysis = udf(sentiment_analysis, polarity_score_schema)

In [52]:
def dummy(text):
    """Placeholder UDF body: ignores `text` and always returns 111."""
    constant_result = 111
    return constant_result


# Spark UDF wrapper around `dummy`; no explicit return type is supplied.
udf_dummy = udf(dummy)

In [56]:
# Add a `polarity_score` column by applying the sentiment UDF to each tweet's text
sentiment_analysed = df_raw.withColumn(
    "polarity_score",
    udf_sentiment_analysis(col("text")),
)

In [58]:
# Show the first 100 tweets next to their polarity scores, untruncated
(
    sentiment_analysed
    .select("text", "polarity_score")
    .show(n=100, truncate=False)
)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------+
|text                                                                                                                                                                                                                                                                              |polarity_score                                     |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------+
|Y'all I'm so

# Stop Spark

In [None]:
# Shut down the Spark session held in `spark` once the analysis is finished
spark.stop()