### Final project of Big Data Integration and Processing with Spark (notebook run on Spark 2.1.0)

In [1]:
// Report the Spark version and application name of the SparkContext `sc`.
// NOTE(review): `sc` is not defined anywhere in this notebook; presumably it is
// injected by the notebook kernel — confirm when running elsewhere.
print(s"Spark Version: ${sc.version}\nSpark App Name: ${sc.appName}")

Spark Version: 2.1.0
Spark App Name: Apache Toree

In [2]:
import org.apache.spark.SparkContext._
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.functions.{avg, col, lower}

In [3]:
// SQL entry point built on the existing SparkContext `sc`; the later cells run
// their queries through `sqlContext.sql(...)`.
// NOTE(review): HiveContext is deprecated since Spark 2.0 in favour of
// SparkSession.builder.enableHiveSupport(); the run recorded in this notebook
// reports Spark 2.1.0 — consider migrating.
val sqlContext = new HiveContext(sc)
// Implicit conversions used by the later cells: `.toDF(...)` on RDDs and the
// 'symbol column syntax.
import sqlContext.implicits._

In [4]:
// Raw lines of the country list CSV.
val countryListRDD = sc.textFile("file:///home/ubuntu/UCSD/big-data-3/final-project/country-list.csv")

// Keep only the first comma-separated field of every line, trimmed and
// lower-cased, and expose it as a one-column DataFrame named "country".
val countriesDF = countryListRDD.
    map(line => line.split(",")(0).trim().toLowerCase()).
    toDF("country")

// Preview the first five rows.
countriesDF.show(5)

+--------------+
|       country|
+--------------+
|   afghanistan|
|       albania|
|       algeria|
|american samoa|
|       andorra|
+--------------+
only showing top 5 rows



In [5]:
// Raw lines of the tweets file.
val tweetsRDD = sc.textFile("file:///home/ubuntu/UCSD/big-data-3/final-project/tweets.csv")

// Classic word count: lower-case each line, split it on single spaces, strip
// every punctuation character except '.', then sum the occurrences per token.
val wordCountDF = tweetsRDD.
    flatMap(line => line.trim().toLowerCase().split(" ")).
    map(token => (token.replaceAll("""[\p{Punct}&&[^.]]""", ""), 1)).
    reduceByKey((left, right) => left + right).
    toDF("word", "count")

// Preview the first five rows.
wordCountDF.show(5)

+-------------------+-----+
|               word|count|
+-------------------+-----+
|            timoutd|    1|
|          batshuayi|    8|
|            stedman|    1|
|httpst.cobukkztk4il|    1|
|httpst.coatjgqfo4od|    1|
+-------------------+-----+
only showing top 5 rows



In [6]:
// Inner join: keep the tweet words that are exactly equal to a country name.
// The result carries the columns country, word and count.
val joinedDF = countriesDF.join(wordCountDF, countriesDF("country") === wordCountDF("word"))

// Mark the join result for caching and register it as the temp view "joined"
// so the SQL cells can query it by name.
joinedDF.cache()
joinedDF.createOrReplaceTempView("joined")

// Preview the first five rows.
joinedDF.show(5)

+---------+---------+-----+
|  country|     word|count|
+---------+---------+-----+
|  finland|  finland|    1|
|australia|australia|    2|
| portugal| portugal|   12|
|   israel|   israel|    3|
|  nigeria|  nigeria|   67|
+---------+---------+-----+
only showing top 5 rows



In [7]:
// Row count of the country/word join result.
joinedDF.count()

54

In [8]:
// Count the distinct country names in the "joined" temp view via SQL.
val sqlQn1 = sqlContext.sql("SELECT count(distinct country) AS count FROM joined")
sqlQn1.show()

                                                                                +-----+
|count|
+-----+
|   54|
+-----+



In [9]:
// DataFrame-API count of the distinct values in the country column.
joinedDF.select("country").distinct().count()

54

In [10]:
// Country names with their counts from the "joined" temp view, highest count
// first; only the first five rows are displayed.
val sqlQn2 = sqlContext.sql("SELECT country, count FROM joined order by count desc")
sqlQn2.show(5)

                                                                                +--------+-----+
| country|count|
+--------+-----+
|  france|   79|
| nigeria|   67|
|  norway|   53|
| england|   37|
|slovakia|   30|
+--------+-----+
only showing top 5 rows



In [11]:
// DataFrame-API ranking: country/count pairs ordered by descending count,
// displaying the first five rows.
joinedDF.
    select(col("country"), col("count")).
    orderBy(col("count").desc).
    show(5)

                                                                                +--------+-----+
| country|count|
+--------+-----+
|  france|   79|
| nigeria|   67|
|  norway|   53|
| england|   37|
|slovakia|   30|
+--------+-----+
only showing top 5 rows



In [12]:
// Counts for a fixed set of five countries, highest count first.
// Queried through `sqlContext` like the other SQL cells, so the lookup of the
// "joined" temp view goes through the same entry point that created joinedDF.
// A plain triple-quoted string needs neither the `s` interpolator nor escaped
// single quotes; the SQL text sent to Spark is unchanged.
val sqlQn5 = sqlContext.sql("""SELECT country, count FROM joined
    | where country in ('kenya', 'wales', 'netherlands', 'iceland', 'japan')
    | order by count desc""".stripMargin)
sqlQn5.show()

                                                                                +-----------+-----+
|    country|count|
+-----------+-----+
|      wales|   24|
|netherlands|   13|
|      japan|    8|
|      kenya|    3|
|    iceland|    2|
+-----------+-----+



In [13]:
// Average of the count column over every row of the "joined" temp view.
// The aggregate yields a single row, so the query carries no ORDER BY clause
// (a dangling `order` keyword after the table name would only parse as a
// table alias, and only where ORDER is a non-reserved word).
val sqlQn6 = sqlContext.sql("SELECT avg(count) as avg_count FROM joined")
sqlQn6.show()

+-----------------+
|        avg_count|
+-----------------+
|9.685185185185185|
+-----------------+



In [14]:
// DataFrame-API mean of the count column, reported as "avg_count".
joinedDF.
    agg(avg(col("count")).as("avg_count")).
    show()

                                                                                +-----------------+
|        avg_count|
+-----------------+
|9.685185185185185|
+-----------------+

