In [104]:
// Report which Spark build and which application this notebook session is attached to.
val sessionBanner = s"Spark Version: ${spark.version}\nSpark App Name: ${spark.sparkContext.appName}"
print(sessionBanner)

Spark Version: 2.1.0
Spark App Name: Apache Toree

In [105]:
// Bind the session to a val so its implicits can be imported.
// NOTE(review): presumably needed because `spark` is not a stable identifier in this kernel — confirm.
val sparkDummy = spark
import sparkDummy.implicits._
// `lower` is required by the normalisation step below; the original import list omitted it,
// so this cell only compiled when an earlier cell had already brought it into scope.
import org.apache.spark.sql.functions.{avg, col, lower, upper}

// Load the country list and keep only the first CSV column, renamed to "country".
// The intermediate `.as[(String, String)]` views the rows as pairs of strings before the projection.
val countriesFileDF = spark.read.
    format("csv").
    load("file:///home/ubuntu/UCSD/big-data-3/final-project/country-list.csv").
        as[(String, String)].
        select('_c0 as 'country)

// Lower-case every column (here just "country") so names can be compared with the
// lower-cased tweet words produced later in the notebook.
val countriesDF = countriesFileDF.select(countriesFileDF.columns.map(c => lower(col(c)).alias(c)): _*)
countriesDF.show(5)

+--------------+
|       country|
+--------------+
|   afghanistan|
|       albania|
|       algeria|
|american samoa|
|       andorra|
+--------------+
only showing top 5 rows



In [106]:
// Load the tweets file; the single resulting column is named "tweets".
// NOTE(review): the CSV reader splits lines on commas, so a tweet containing an unquoted
// comma may not arrive here intact — confirm the file layout.
val tweetsDF = spark.read.
    format("csv").
    load("file:///home/ubuntu/UCSD/big-data-3/final-project/tweets.csv").toDF("tweets")

// Tokenise each tweet and count occurrences of every token.
// Uses a typed flatMap instead of Dataset.explode, which is deprecated since Spark 2.0.
// Relies on the session implicits (String encoder) imported in an earlier cell.
// Per tweet: trim, strip all punctuation except '.', lower-case, split on single spaces.
// Null tweet values are skipped instead of raising a NullPointerException inside the lambda.
val wordCountDF = tweetsDF.
    as[String].
    flatMap { line =>
        Option(line).toSeq.flatMap { text =>
            text.trim().replaceAll("""[\p{Punct}&&[^.]]""", "").toLowerCase().split(" ")
        }
    }.
    toDF("word").
    groupBy("word").
    count()
wordCountDF.show(5)

+------+-----+
|  word|count|
+------+-----+
|  some|  123|
| still|  104|
|   ...|   68|
| those|   31|
|doubts|    1|
+------+-----+
only showing top 5 rows



In [107]:
import org.apache.spark.SparkContext._
// Keep only the words that are country names: join the country list to the word counts
// on country == word and project the country together with its count.
val joinedDF = countriesDF.
    join(wordCountDF, countriesDF("country") === wordCountDF("word")).
    select(col("country"), col("count"))

// Expose the result to Spark SQL under the name "joined".
joinedDF.createOrReplaceTempView("joined")

// Mark the result for caching and run an action so the cache is populated
// before the queries that follow.
joinedDF.cache()
joinedDF.count()

joinedDF.show(5)

+---------+-----+
|  country|count|
+---------+-----+
|  albania|    1|
|argentina|    3|
|australia|    2|
|  austria|    5|
|  bahamas|    1|
+---------+-----+
only showing top 5 rows



In [108]:
// SQL: how many distinct countries appear in the "joined" view.
val sqlDF1 = {
    val query = "SELECT count(distinct country) AS tot_cnt FROM joined"
    spark.sql(query)
}
sqlDF1.show()

+-------+
|tot_cnt|
+-------+
|     54|
+-------+



In [109]:
// DataFrame-API counterpart of the distinct-country SQL query.
joinedDF.select(col("country")).distinct().count()

54

In [110]:
// SQL: countries ordered by count, highest first; only the top five are displayed.
val sqlDF2 = {
    val query = "SELECT country, count FROM joined order by count desc"
    spark.sql(query)
}
sqlDF2.show(5)

+--------+-----+
| country|count|
+--------+-----+
|  france|   79|
| nigeria|   67|
|  norway|   53|
| england|   37|
|slovakia|   30|
+--------+-----+
only showing top 5 rows



In [111]:
// DataFrame-API counterpart of the "order by count desc" SQL query: top five countries by count.
joinedDF.select("country", "count").orderBy(col("count").desc).show(5)

+--------+-----+
| country|count|
+--------+-----+
|  france|   79|
| nigeria|   67|
|  norway|   53|
| england|   37|
|slovakia|   30|
+--------+-----+
only showing top 5 rows



In [112]:
// SQL: counts for a fixed set of countries, highest first.
// A plain string literal is enough here: nothing is interpolated, and single quotes
// need no escaping inside a double-quoted Scala string.
val sqlDF3 = spark.sql("SELECT country, count FROM joined where country in ('kenya', 'wales', 'netherlands', 'iceland', 'japan') order by count desc")
sqlDF3.show()

+-----------+-----+
|    country|count|
+-----------+-----+
|      wales|   24|
|netherlands|   13|
|      japan|    8|
|      kenya|    3|
|    iceland|    2|
+-----------+-----+



In [113]:
// SQL: average count across all rows of the "joined" view.
// An aggregate without GROUP BY always yields exactly one row, so the former
// "order by count1 desc" added a sort without affecting the result and has been dropped.
val sqlDF4 = spark.sql("SELECT avg(count) as count1 FROM joined")
sqlDF4.show()

+-----------------+
|           count1|
+-----------------+
|9.666666666666666|
+-----------------+



In [114]:
// DataFrame-API counterpart of the average-count SQL query.
// `agg` without a preceding groupBy produces a single row, so the former
// `sort($"avg_cnt".desc)` had nothing to order and has been removed.
joinedDF.agg(avg(col("count")).alias("avg_cnt")).
    show()

+-----------------+
|          avg_cnt|
+-----------------+
|9.666666666666666|
+-----------------+

