In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark entry point: getOrCreate() returns the already
# active session when there is one, otherwise it starts a new session with the
# application name "spark-sql".
spark = (
    SparkSession.builder
    .appName("spark-sql")
    .getOrCreate()
)



#### Read `challenge.csv` and add a `Mexico` column that is 'Yes' when the country is Mexico and 'No' otherwise

In [14]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Namespaced handle for Spark SQL functions; imported after the wildcard
# imports so nothing they export can rebind F.
from pyspark.sql import functions as F

# Explicit schema for challenge.csv, handed to the CSV reader so column types
# are declared rather than inferred. Fields are added in column order and are
# nullable (the default for StructType.add, as for StructField).
challenge_schema = (
    StructType()
    .add("ip_address", StringType())
    .add("country", StringType())
    .add("domain_name", StringType())
    .add("bytes_used", IntegerType())
)

In [15]:
# Load the challenge data set with the predefined schema; the first line of
# the file is treated as a header row rather than as data.
df = (
    spark.read
    .schema(challenge_schema)
    .option("header", True)
    .csv("../data/challenge.csv")
)

# Preview the first five rows without truncating long cell values.
df.show(5, truncate=False)


+--------------+---------+----------------+----------+
|ip_address    |country  |domain_name     |bytes_used|
+--------------+---------+----------------+----------+
|52.81.192.172 |China    |odnoklassniki.ru|463       |
|119.239.207.13|China    |youtu.be        |51        |
|68.69.217.210 |China    |adobe.com       |10        |
|7.191.21.223  |Bulgaria |linkedin.com    |853       |
|211.13.10.68  |Indonesia|hud.gov         |29        |
+--------------+---------+----------------+----------+
only showing top 5 rows



In [16]:
# Add a "Mexico" flag column: 'Yes' when the country equals "Mexico",
# 'No' for every other row. The result is rebound to `df` so that later
# cells can group on the new column.
df = df.withColumn(
    "Mexico",
    when(df["country"] == "Mexico", lit("Yes")).otherwise(lit("No")),
)
df.show()

+---------------+--------------+-----------------+----------+------+
|     ip_address|       country|      domain_name|bytes_used|Mexico|
+---------------+--------------+-----------------+----------+------+
|  52.81.192.172|         China| odnoklassniki.ru|       463|    No|
| 119.239.207.13|         China|         youtu.be|        51|    No|
|  68.69.217.210|         China|        adobe.com|        10|    No|
|   7.191.21.223|      Bulgaria|     linkedin.com|       853|    No|
|   211.13.10.68|     Indonesia|          hud.gov|        29|    No|
|   239.80.21.97|      Suriname|       smh.com.au|       218|    No|
|106.214.106.233|       Jamaica|    amazonaws.com|        95|    No|
| 127.242.24.138|         China| surveymonkey.com|       123|    No|
|     99.2.6.139|Czech Republic|     geocities.jp|       322|    No|
|   237.54.11.63|         China|       amazon.com|        83|    No|
| 252.141.157.25|         Japan|      cornell.edu|       374|    No|
|185.220.128.248|       Belgium|  

#### Group by the new `Mexico` column and sum `bytes_used`

In [17]:
# Total bytes_used for Mexico vs. non-Mexico rows.
# The aggregate is referenced as F.sum so this cell does not depend on the
# wildcard import having replaced Python's built-in sum(), which cannot
# aggregate a column name.
group_df = (
    df.groupBy("Mexico")
    .agg(F.sum("bytes_used").alias("total_bytes"))
)
group_df.show()

+------+-----------+
|Mexico|total_bytes|
+------+-----------+
|    No|     508076|
|   Yes|       6293|
+------+-----------+



#### Group by country and use the `countDistinct` function to count the distinct IP addresses seen in each country

In [24]:
# Count the distinct IP addresses observed per country.
df_country = (
    df.groupBy("country")
    .agg(F.countDistinct("ip_address").alias("distinct_ip_count"))
)

# Show the five countries with the most distinct IPs. The country name is a
# secondary sort key so that ties in the count are ordered deterministically
# and the displayed top 5 is reproducible across runs.
(
    df_country
    .orderBy(F.desc("distinct_ip_count"), F.asc("country"))
    .show(5, truncate=False)
)

+-----------+-----------------+
|country    |distinct_ip_count|
+-----------+-----------------+
|China      |172              |
|Indonesia  |114              |
|Philippines|65               |
|Russia     |56               |
|Brazil     |35               |
+-----------+-----------------+
only showing top 5 rows

