# Big Data Integration and Processing

### Final Project

In [1]:
// Report the Spark version and application name of the session supplied by the kernel.
print(Seq(
    s"Spark Version: ${spark.version}",
    s"Spark App Name: ${spark.sparkContext.appName}"
).mkString("\n"))

Spark Version: 2.1.0
Spark App Name: Apache Toree

In [2]:
import org.apache.spark.SparkContext._
import org.apache.spark.sql.functions.{avg, col, lower}

// Workaround for Toree: bind the kernel-provided `spark` session to a local val so that
// its implicits can be imported on the next line.
// NOTE(review): presumably needed because `import x.implicits._` requires a stable
// identifier and the kernel's `spark` binding is not one — confirm for this kernel version.
val sparkDummy = spark
import sparkDummy.implicits._

In [3]:
// Load the country list CSV (no header, so Spark names the columns _c0, _c1, ...),
// and keep only the first column, renamed to `country`.
val countriesFileDS = spark.read.
    format("csv").
    load("file:///home/ubuntu/UCSD/big-data-3/final-project/country-list.csv").
    as[(String, String)].
    select(col("_c0").alias("country"))

// Lower-case every column (keeping its name) so that country names can later be
// compared against lower-cased words.
val countriesDS = {
    val loweredColumns = countriesFileDS.columns.map { name =>
        lower(col(name)).alias(name)
    }
    countriesFileDS.select(loweredColumns: _*)
}
countriesDS.show(5)

+--------------+
|       country|
+--------------+
|   afghanistan|
|       albania|
|       algeria|
|american samoa|
|       andorra|
+--------------+
only showing top 5 rows



In [4]:
// Load the tweets CSV (no header) and keep only the first column, renamed to `tweets`.
val tweetsDS = spark.read.
    format("csv").
    load("file:///home/ubuntu/UCSD/big-data-3/final-project/tweets.csv").
    as[String].
    select('_c0 as 'tweets)

// Split every tweet into lower-cased words and count the occurrences of each word.
//  - Uses the typed `flatMap` instead of `Dataset.explode`, which is deprecated since Spark 2.0.
//  - `Option(line)` skips null tweets instead of failing with a NullPointerException.
//  - The regex removes all punctuation except '.', matching the original tokenisation.
//  - Blank tokens are filtered on the word itself; filtering on the whole Row (as before)
//    never removed anything because a Row's string form is never empty.
val wordCountDS = tweetsDS.
    as[String].
    flatMap { (line: String) =>
        Option(line).toSeq.flatMap { text =>
            text.trim().
                toLowerCase().
                replaceAll("""[\p{Punct}&&[^.]]""", "").
                split(" ").
                toSeq
        }
    }.
    filter((word: String) => !word.trim().isEmpty).
    toDF("word").
    groupBy("word").
    count()

wordCountDS.show(5)

+------+-----+
|  word|count|
+------+-----+
|  some|  123|
| still|  104|
|   ...|   68|
| those|   31|
|doubts|    1|
+------+-----+
only showing top 5 rows



In [5]:
// Keep only the words that are country names: inner-join the country list with the
// word counts and retain the country together with its number of mentions.
val joinedDS = countriesDS.
    join(wordCountDS, countriesDS("country") === wordCountDS("word")).
    select(col("country"), col("count"))

// Expose the result to Spark SQL as the view `joined` and mark it for caching,
// since the queries that follow all read from it.
joinedDS.createOrReplaceTempView("joined")
joinedDS.cache()

joinedDS.show(5)

+---------+-----+
|  country|count|
+---------+-----+
|  albania|    1|
|argentina|    3|
|australia|    2|
|  austria|    5|
|  bahamas|    1|
+---------+-----+
only showing top 5 rows



In [6]:
// Number of rows in the joined result, i.e. how many country names matched a word.
joinedDS.count()

54

In [7]:
// Count the distinct countries present in the `joined` view, using Spark SQL.
val sqlQn1 = spark.sql(
    "SELECT count(distinct country) AS tot_cnt FROM joined"
)
sqlQn1.show()

+-------+
|tot_cnt|
+-------+
|     54|
+-------+



In [8]:
// Same distinct-country count as the SQL query, expressed with the Dataset API.
joinedDS.
    select(col("country")).
    distinct().
    count()

54

In [9]:
// Countries ordered by number of mentions, most mentioned first (Spark SQL); show the top five.
val sqlQn2 = spark.sql(
    "SELECT country, count FROM joined order by count desc"
)
sqlQn2.show(5)

+--------+-----+
| country|count|
+--------+-----+
|  france|   79|
| nigeria|   67|
|  norway|   53|
| england|   37|
|slovakia|   30|
+--------+-----+
only showing top 5 rows



In [10]:
// Top five countries by mention count via the Dataset API.
// `sort` is used here; `orderBy` is an equivalent spelling of the same operation.
joinedDS.
    select(col("country"), col("count")).
    sort(col("count").desc).
    show(5)

+--------+-----+
| country|count|
+--------+-----+
|  france|   79|
| nigeria|   67|
|  norway|   53|
| england|   37|
|slovakia|   30|
+--------+-----+
only showing top 5 rows



In [11]:
// Mention counts for a fixed set of five countries, highest first.
// A plain triple-quoted string is used (nothing is interpolated), so the SQL
// single quotes can be written without backslash escapes.
val sqlQn5 = spark.sql(
    """SELECT country, count FROM joined
      | where country in ('kenya', 'wales', 'netherlands', 'iceland', 'japan')
      | order by count desc""".stripMargin)
sqlQn5.show()

+-----------+-----+
|    country|count|
+-----------+-----+
|      wales|   24|
|netherlands|   13|
|      japan|    8|
|      kenya|    3|
|    iceland|    2|
+-----------+-----+



In [12]:
// Average number of mentions per matched country, using Spark SQL.
// The query contains nothing to interpolate, so a plain string literal is used.
val sqlQn6 = spark.sql("SELECT avg(count) as avg_count FROM joined")
sqlQn6.show()

+-----------------+
|        avg_count|
+-----------------+
|9.666666666666666|
+-----------------+



In [13]:
// Same average as the SQL query, computed with the Dataset API.
joinedDS.
    agg(avg(col("count")).as("avg_count")).
    show()

+-----------------+
|        avg_count|
+-----------------+
|9.666666666666666|
+-----------------+

