In [1]:
%%init_spark
# Spark launcher settings for this notebook session
launcher.num_executors = 4
launcher.executor_cores = 2
launcher.driver_memory = "10g"
# Extra Maven coordinates to load into the session
launcher.packages = [
    "org.apache.bahir:spark-streaming-twitter_2.12:2.4.0",
    "org.jfree:jfreechart:1.5.3",
]

In [3]:
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.rdd.PairRDDFunctions._
import scala.io.Source
import scala.collection.mutable.ArrayBuffer
import org.jfree.data.xy.{XYSeries, XYSeriesCollection} 
import org.jfree.chart.{ChartFactory, ChartFrame, JFreeChart} 
import org.jfree.chart.plot.{PlotOrientation, XYPlot} 
import org.jfree.chart.util.PaintUtils
import java.awt.Paint
import java.awt.Color._
sc.setLogLevel("ERROR")

// Keywords used to keep only the relevant tweets, one per line of the file.
// The file handle is closed as soon as the lines are materialised, and blank lines
// are dropped: an empty keyword would match every tweet, since contains("") is always true.
val filters = {
    val keywordSource = Source.fromFile("fifa2022_words.txt")
    try keywordSource.getLines().filter(_.trim.nonEmpty).toArray
    finally keywordSource.close()
}

val sentimentFilePath = "AFINN-111.txt"

// (word, score) pairs parsed from the tab-separated sentiment file.
// Cached because the same RDD is joined against every streaming batch.
val wordSentiments = sc.textFile(sentimentFilePath).map { line =>
    val fields = line.split("\t")
    (fields(0), fields(1).toDouble)
}.cache()

/**
 * Splits a piece of text into normalised words.
 *
 * Tokens are separated on any run of whitespace, stripped of every non-letter
 * character and lowercased (the AFINN-111 lexicon the words are joined against is
 * lowercase). Tokens left empty after stripping (numbers, punctuation, the
 * fragments produced by repeated spaces) are dropped so that they are not counted
 * as words downstream.
 *
 * @param text raw text, e.g. one tweet
 * @return the lowercase, letters-only words of `text`, in order; empty if there are none
 */
def getWords(text: String): Array[String] = {
    text.split("\\s+")
        .map(token => token.filter(_.isLetter).toLowerCase)
        .filter(_.nonEmpty)
}

// Streaming context on top of the existing SparkContext, with 10-second batches
val ssc = new StreamingContext(sc, Seconds(10L))
// Directory used for streaming checkpoints
ssc.checkpoint("checkpoint")
// Text stream read from a socket on localhost, port 4444
val text = ssc.socketTextStream("localhost", 4444)

//filter tweets only keeping world cup related ones
//Use transform to work on each rdd (text is a DStream object, not an RDD)
//filter using exists and contains
//Also, convert the text to lowercase (all the keywords are in lowercase)
val filteredText = text.transform { rdd =>
    rdd.filter { tweet =>
        // Lowercase once per tweet instead of once per keyword
        val lowered = tweet.toLowerCase
        filters.exists(keyword => lowered.contains(keyword))
    }
}

//convert all tweets in filteredText into a single stream of words (think flatMap)
val words = filteredText.flatMap(getWords(_))

//get sentiments
//we need to convert each word into a pair (word, 1) to count the number of words
//apply transform to join each rdd with the wordSentiment rdd using fullOuterJoin
//Use match to convert Option to a new paired rdd (count, 1*sentiment)



// One (count, sentiment) record per word occurrence in the batch:
//   word without a sentiment entry  -> (0.0, 0.0)
//   sentiment entry without a word  -> dropped
//   word with a sentiment entry     -> (count, sentiment)
val sentiment = words
    .map(token => (token, 1.0))
    .transform { batch =>
        batch.fullOuterJoin(wordSentiments).flatMap {
            case (_, (Some(_), None))                  => Some((0.0, 0.0))
            case (_, (None, _))                        => None
            case (_, (Some(occurrences), Some(score))) => Some((occurrences, score))
        }
    }

//Sliding window over the sentiment stream: 20 seconds long, advancing every 10 seconds
val sentiment_window = sentiment.window(windowDuration = Seconds(20), slideDuration = Seconds(10))

//Create an empty ArrayBuffer all_sentiments that contains sentiments
//And a second array buffer that contains the (timestamp,moving average)
//Because we'll modify them, these need to be var, not val

// Number of most recent window sentiments that go into each moving-average point
val MOVING_AVERAGE_LENGTH = 3

// Sentiment score of each processed window, in arrival order
var all_sentiments = ArrayBuffer.empty[Double]
// (cleaned timestamp, moving average) pairs, in arrival order
var all_averages = ArrayBuffer.empty[(String, Double)]

/*
1. apply foreachRDD to each sentiment window

2. Update all_sentiments by the sentiment of the rdd (divide total sentiment by the count
of all words and multiply by 100.0)

3. Calculate the total for count and sentiment (sentiment should be (Double,Double) pairs)
Example of all_sentiments:
res6: scala.collection.mutable.ArrayBuffer[Double] = ArrayBuffer(0.17157852240613647, 0.10092344956350609, 
0.10092175200161475, 0.07737334320123797, 
0.19847944560317568, 0.20185029436501262, 
0.1883936080740118, 0.10765711209796798, 
0.04709998654286099, 0.026914278024491995)

4. Compute the moving average. If the number of elements in all_sentiments is less 
than MOVING_AVERAGE_LENGTH, then a simple average works. If greater, then compute
the average of the last MOVING_AVERAGE_LENGTH elements (the scala function slice may help)

5. Update all_averages with the timestamp (cleaned) and the moving average. Example:
ArrayBuffer((5820000,0.17157852240613647), 
(5860000,0.13625098598482127), 
(5900000,0.12447457465708577), 
(5940000,0.09307284825545294), 
(5980000,0.12559151360200946), 
(6020000,0.15923436105647543), 
(6060000,0.19624111601406669), 
(6100000,0.16596700484566412), 
(6140000,0.1143835689049469), 
(6180000,0.060557125555107))

6. You also need to clean the timestamp. Convert it into a string, 
drop the "ms" from the end, and then drop everything other than last 7 digits
You might find the function takeRight useful

*/



// For every window: compute its sentiment score, record it, and append the moving
// average of the most recent MOVING_AVERAGE_LENGTH scores to all_averages.
sentiment_window.foreachRDD((rdd, time) => {
    // Sum of the sentiment values and number of records, gathered in a single pass
    // over the window rather than a separate fold and count.
    val (total, records) = rdd.aggregate((0.0, 0L))(
        (acc, record) => (acc._1 + record._2, acc._2 + 1L),
        (left, right) => (left._1 + right._1, left._2 + right._2)
    )
    // Drop the trailing " ms" from the batch time and keep its last 9 digits
    val clean_timestamp = time.toString.dropRight(3).takeRight(9)
    val windowSentiment = (total / records) * 100.0
    // An empty window gives 0.0 / 0 = NaN; such a score is never recorded, so it
    // cannot poison the averages that follow.
    if (!windowSentiment.isNaN) all_sentiments += windowSentiment
    if (all_sentiments.nonEmpty) {
        // takeRight returns the whole buffer while it is still shorter than
        // MOVING_AVERAGE_LENGTH, so one expression covers both the warm-up phase
        // and the steady state, and it always honours the constant.
        val recent = all_sentiments.takeRight(MOVING_AVERAGE_LENGTH)
        all_averages += ((clean_timestamp, recent.sum / recent.length))

        //Print new values
        println((all_sentiments.last, all_averages.last))
    }
})

//Configure and show the (initially empty) chart
//I've done all the chart work for you


//Create a new XYSeries object that holds the data for the graph 
//And a dataset that contains this XYSeries object
//The goal is to update xy whenever there is a new average in all_averages

// Series that holds the plotted points, wrapped in the dataset handed to the chart
val xy = new XYSeries("")
val dataset = new XYSeriesCollection(xy)

// Line chart built on top of the dataset
val chart = {
  val title = "2022 World Cup Sentiment Chart"
  val xAxisLabel = "Time"
  val yAxisLabel = "Sentiment"
  val includeLegend = false
  val withTooltips = true
  val withUrls = false
  ChartFactory.createXYLineChart(
    title,
    xAxisLabel,
    yAxisLabel,
    dataset,
    PlotOrientation.VERTICAL,
    includeLegend,
    withTooltips,
    withUrls
  )
}

// The chart's XY plot is the object that carries the formatting settings
val plot = chart.getXYPlot()

/**
 * Applies the chart formatting: white background, black grid lines on both
 * axes and no outline.
 *
 * @param plot the plot to format
 */
def configurePlot(plot: XYPlot): Unit = {
  val gridColor = BLACK
  plot.setOutlineVisible(false)
  plot.setBackgroundPaint(WHITE)
  plot.setDomainGridlinePaint(gridColor)
  plot.setRangeGridlinePaint(gridColor)
}

/**
 * Shows the chart in a frame titled "plot".
 *
 * @param chart the chart to display
 */
def show(chart: JFreeChart): Unit = {
  val frame = new ChartFrame("plot", chart)
  frame.pack()
  frame.setVisible(true)
}

//Format the plot first, then open the chart (this is the point where it pops up).
//The chart lives in a separate window, so you might need to look for it.

configurePlot(plot)
show(chart)

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.rdd.PairRDDFunctions._
import scala.io.Source
import scala.collection.mutable.ArrayBuffer
import org.jfree.data.xy.{XYSeries, XYSeriesCollection}
import org.jfree.chart.{ChartFactory, ChartFrame, JFreeChart}
import org.jfree.chart.plot.{PlotOrientation, XYPlot}
import org.jfree.chart.util.PaintUtils
import java.awt.Paint
import java.awt.Color._
filters: Array[String] = Array(fifa, qatar, soccer, football, world cup, ronaldo, cristiano, messi, usa, brazil, france, ecuador, senegal, netherlands, iran, england, wales, argentina, saudi arabia, mexico, poland, australia, denmark, tunisia, spain, costa rica, germany, japan, belgium, canada, morocco, croatia, serbia, switzerland, cameroon, portugal, ghana, ur...


In [6]:
/*
1. Start the stream
2. Inside a while loop, sleep for a bit (Thread.sleep(10000) for 10 seconds)
3. then check if there are new elements in all_averages

4. To check if there are new elements, initialize a variable index to 0 and,
at each interval (after sleep), check if the array length of all_averages is
greater than index. If it is, there are length-index new elements

5. If there are new elements, add them to xy using addOrUpdate (see documentation linked above):
 add the all_averages.length - index newest elements of all_averages to xy.
Use addOrUpdate (not add) so that the graph updates

6. The while should run as long as the length of all_averages is less than NUM_BATCHES

7. Call ssc.stop(false) after the while loop

8. Note that once the stream stops, DStream elements are no longer accessible but
RDDs are (all_sentiments and all_averages)

9. Enjoy! Do note that for this to make sense, we should run this for a long time and 
take a moving average of a longer period (e.g., several hours). Treat this as a
learning exercise, not a diagnostic one

*/

val NUM_BATCHES = 10 //So that you don't get banned from twitter
// Number of entries of all_averages that have already been pushed to the chart
var index = 0
ssc.start()
while (all_averages.length < NUM_BATCHES) {
    Thread.sleep(10000)
    // Snapshot the length once so the range below is stable for this iteration
    val len = all_averages.length
    // Plot every average that arrived since the last poll, not only the newest one;
    // several can accumulate while this thread sleeps.
    for (i <- index until len) {
        val (timestamp, average) = all_averages(i)
        xy.addOrUpdate(timestamp.toDouble, average)
    }
    index = len
}
// false: stop only the streaming context, not the underlying SparkContext
ssc.stop(false)

(-2.272727272727273,(383060000,-2.272727272727273))
(-2.272727272727273,(383070000,-2.272727272727273))
(-2.272727272727273,(383080000,-2.272727272727273))
(0.0,(383090000,-1.5151515151515154))
(0.0,(383100000,-0.7575757575757577))
(-1.4492753623188406,(383110000,-0.4830917874396135))
(-3.8461538461538463,(383120000,-1.7651430694908956))
(0.0,(383130000,-1.7651430694908956))
(0.0,(383140000,-1.2820512820512822))
(0.0,(383150000,0.0))
22/12/06 22:19:16 ERROR ReceiverTracker: Deregistered receiver for stream 0: Stopped by driver


NUM_BATCHES: Int = 10
index: Int = 10
