In [None]:
# Build the SQL entry point on top of the running SparkContext.
# `sc` is not created in this notebook; it is expected to be supplied by the kernel.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sparkContext=sc)

In [None]:
# Load the country list from the local filesystem as an RDD with one element per text line.
country_csv_uri = 'file:///home/ubuntu/work/notebooks/UCSD/big-data-3/final-project/country-list.csv'
country_lines = sc.textFile(country_csv_uri)

In [None]:
# Bring every line of the country file back to the driver so the raw contents can be inspected.
all_country_lines = country_lines.collect()
all_country_lines

In [None]:
# Preview: break every line on commas, giving one list of fields per line.
split_country_lines = country_lines.map(lambda line: line.split(","))
split_country_lines.collect()

In [None]:
# Turn each CSV line into a (lower-cased first field, second field) tuple.
def _to_country_tuple(line):
    """Split one comma-separated line and return (first field lower-cased, second field)."""
    parts = line.split(",")
    return (parts[0].lower(), parts[1])

country_tuples = country_lines.map(_to_country_tuple)

In [None]:
# Build the country DataFrame from the tuples, then inspect its schema and first rows.
country_columns = ["country", "code"]
countryDF = sqlContext.createDataFrame(country_tuples, schema=country_columns)
countryDF.printSchema()
countryDF.take(3)

In [7]:
# Load the tweets file as an RDD of text lines and report how many lines it has.
tweets_csv_uri = 'file:///home/ubuntu/work/notebooks/UCSD/big-data-3/final-project/tweets.csv'
tweets = sc.textFile(tweets_csv_uri)
tweets.count()

13994

In [8]:
# Data cleaning: drop zero-length lines, then count what is left.
def _has_text(line):
    """Return True when the line contains at least one character."""
    return len(line) != 0

filtered_tweets = tweets.filter(_has_text)
filtered_tweets.count()

13390

In [9]:
# Word count over the cleaned tweets, built in three named stages:
#   1. split each tweet on single spaces into tokens,
#   2. pair every lower-cased token with a 1,
#   3. add up the 1s per token.
tweet_tokens = filtered_tweets.flatMap(lambda text: text.split(" "))
token_ones = tweet_tokens.map(lambda token: (token.lower(), 1))
word_counts = token_ones.reduceByKey(lambda left, right: left + right)

In [None]:
from pyspark.sql import HiveContext
from pyspark.sql.types import *

# sc is an existing SparkContext. Rebinding sqlContext here replaces the
# SQLContext created in the first cell with a HiveContext.
sqlContext = HiveContext(sc)

schemaString = "word count"
word_field, count_field = schemaString.split()

# word_counts holds (word, count) pairs: the word is a string, the count is a
# Python int produced by summing 1s in reduceByKey. createDataFrame verifies
# rows against the supplied schema, and a StringType field does not accept an
# int, so the count column must be declared numeric.
fields = [
    StructField(word_field, StringType(), True),
    StructField(count_field, LongType(), True),
]
schema = StructType(fields)

# Create the DataFrame of tweet word counts
tweetsDF = sqlContext.createDataFrame(word_counts, schema)
tweetsDF.printSchema()
tweetsDF.count()

In [None]:
# Join the country and tweet DataFrames: keep rows where a country name equals a tweet word.
# Tweet words come from splitting on single spaces, so only country names that
# appear as one space-free token can match.
joined = countryDF.join(tweetsDF, countryDF.country == tweetsDF.word)
# A take(5) placed before show() was dropped: its result was never displayed
# (only a cell's last expression is), so it only cost an extra Spark job.
joined.show()

In [None]:
# Question 1: number of distinct countries mentioned
distinct_countries = joined.select("country").distinct()
distinct_countries.show(100)
# show() only prints rows (at most 100 here); the question asks for a number,
# so finish the cell with the actual count.
distinct_countries.count()

In [None]:
# Question 2: how often countries are mentioned in tweets (per country and in total).
# NOTE: this import shadows Python's builtin sum() for the rest of the notebook.
from pyspark.sql.functions import sum
from pyspark.sql import SparkSession
from pyspark.sql import Row

countries_count = joined.groupBy("country")
joined.createOrReplaceTempView("records")

# The joined table has one row per matched word, and the word counts were
# already reduced to one row per distinct word. count(*) per country would
# therefore just count rows (1 each, assuming country names are unique in the
# country list) instead of mentions. Sum the per-word `count` column instead.
# The CAST keeps the sum numeric even if `count` was loaded as a string.
spark.sql("SELECT country, SUM(CAST(`count` AS BIGINT)) AS count1 FROM records GROUP BY country ORDER BY count1 DESC, country ASC").show(100)

# Total number of country mentions across all tweets.
# joined["count"] is used because joined.count is the DataFrame method.
joined.agg(sum(joined["count"].cast("long")).alias("total_mentions")).show()

In [None]:
# Table 1: top three countries and their counts.
from pyspark.sql.functions import desc
from pyspark.sql.functions import col

# Sort on the numeric value of `count`: if the column is string-typed a plain
# sort is lexicographic ("9" ranks above "10").
countries_by_mentions = joined.sort(col("count").cast("long").desc())
# Keep only the three most-mentioned countries, as the table title requires.
top_3 = countries_by_mentions.limit(3)
top_3.show()

In [None]:
# Table 2: counts for Wales, Iceland, and Japan.
