In [18]:
// Print the running Spark version and the application name, one per line
// (no trailing newline, as with a single print call).
print(Seq(
    s"Spark Version: ${spark.version}",
    s"Spark App Name: ${spark.sparkContext.appName}"
).mkString("\n"))

Spark Version: 2.1.0
Spark App Name: Apache Toree

In [19]:
import org.apache.spark.sql.hive.HiveContext

// Reuse the SQLContext that belongs to the kernel-provided SparkSession (`spark`)
// instead of constructing a HiveContext, which is deprecated since Spark 2.0.
// Only sqlContext.sql(...) and the implicits below are used by the later cells.
val sqlContext = spark.sqlContext
// Brings toDF(...) on RDDs and the $"column" syntax into scope.
import sqlContext.implicits._

In [20]:
// Load the country list as raw text lines; each line is split on "," below.
val countryListRDD = sc.textFile("file:///home/ubuntu/UCSD/big-data-3/final-project/country-list.csv")

// Keep the first two comma-separated fields, trimmed and lower-cased.
// Lines with fewer than two fields (for example a blank trailing line) are
// skipped rather than failing the job with an ArrayIndexOutOfBoundsException.
// Note: collect with a partial function is an RDD transformation, not the action.
val countryTupleRDD = countryListRDD.map(_.split(",")).collect {
    case Array(country, code, _*) => (country.trim.toLowerCase, code.trim.toLowerCase)
}

// Name the columns, then keep only "country" for the later join.
val countryTupleDF = countryTupleRDD.toDF("country", "cc").select("country")
countryTupleDF.take(5)

Array([afghanistan], [albania], [algeria], [american samoa], [andorra])

In [21]:
// Load the tweets as raw text lines.
val tweetsRDD = sc.textFile("file:///home/ubuntu/UCSD/big-data-3/final-project/tweets.csv")

// Word count over space-separated tokens.
// Each token has all punctuation except '.' removed and is lower-cased.
// Empty tokens are dropped AFTER that normalisation, so tokens that consisted
// only of punctuation are not counted under the empty-string key.
val wordCountRDD = tweetsRDD.flatMap(_.split(" ")).
    map(_.replaceAll("""[\p{Punct}&&[^.]]""", "").toLowerCase()).
    filter(_.nonEmpty).
    map(word => (word, 1)).
    reduceByKey(_ + _)

// Expose the counts as a DataFrame with columns (word, cnt).
val wordCountDF = wordCountRDD.toDF("word", "cnt")

In [22]:
import org.apache.spark.SparkContext._
// Inner-join countries with word counts where the country name equals the word.
// The result is cached, registered as the temp view "joined" for SQL queries,
// and counted (the count is the cell's result).
val joinedDF = countryTupleDF.
    join(wordCountDF, countryTupleDF("country") === wordCountDF("word")).
    cache()
joinedDF.createOrReplaceTempView("joined")
joinedDF.count()

54

In [23]:
// Number of distinct countries in the "joined" view, via SQL.
val sqlDF1 = sqlContext.sql(
    "SELECT count(distinct country) AS tot_cnt FROM joined"
)
sqlDF1.show()

                                                                                +-------+
|tot_cnt|
+-------+
|     54|
+-------+



In [24]:
// Number of distinct countries in joinedDF, via the DataFrame API.
// dropDuplicates() with no arguments is what distinct() delegates to.
joinedDF.select($"country").dropDuplicates().count()

54

In [25]:
// Countries ordered by descending cnt, via SQL; only the first five rows are shown.
val sqlDF2 = sqlContext.sql(
    "SELECT country, cnt FROM joined order by cnt desc"
)
sqlDF2.show(5)

                                                                                +--------+---+
| country|cnt|
+--------+---+
|  france| 79|
| nigeria| 67|
|  norway| 53|
| england| 37|
|slovakia| 30|
+--------+---+
only showing top 5 rows



In [26]:
// Countries ordered by descending cnt, via the DataFrame API; first five rows shown.
// orderBy is used here; sort($"cnt".desc) is the equivalent spelling.
joinedDF.select("country", "cnt").
    orderBy($"cnt".desc).
    show(5)

                                                                                +--------+---+
| country|cnt|
+--------+---+
|  france| 79|
| nigeria| 67|
|  norway| 53|
| england| 37|
|slovakia| 30|
+--------+---+
only showing top 5 rows



In [27]:
// cnt for a fixed set of five countries, ordered by descending cnt, via SQL.
val sqlDF3 = sqlContext.sql(
    s"SELECT country, cnt FROM joined where country in (\'kenya\', \'wales\', \'netherlands\', \'iceland\', \'japan\') order by cnt desc"
)
sqlDF3.show()

                                                                                +-----------+---+
|    country|cnt|
+-----------+---+
|      wales| 24|
|netherlands| 13|
|      japan|  8|
|      kenya|  3|
|    iceland|  2|
+-----------+---+



In [28]:
// Average of cnt over all rows of the "joined" view, via SQL.
val sqlDF4 = sqlContext.sql(
    s"SELECT avg(cnt) as cnt1 FROM joined order by cnt1 desc"
)
sqlDF4.show()

                                                                                +-----------------+
|             cnt1|
+-----------------+
|9.685185185185185|
+-----------------+



In [29]:
// Average of cnt over all rows of joinedDF, via the DataFrame API.
joinedDF.agg(avg($"cnt").alias("avg_cnt")).
    orderBy($"avg_cnt".desc).
    show()

                                                                                +-----------------+
|          avg_cnt|
+-----------------+
|9.685185185185185|
+-----------------+

