<center><h1>PySpark - Web Crawler Analysis</h1></center>


## Analyzing Common Crawl Data with RDDs

First, we start a SparkSession, get its underlying SparkContext, and use that context to read the Common Crawl domain graph in as an RDD.

In [2]:
# Import required modules
from pyspark.sql import SparkSession

# Start a SparkSession (getOrCreate() reuses an active session if one exists)
spark = SparkSession.builder.getOrCreate()

# The RDD API is reached through the session's SparkContext
sc = spark.sparkContext

In [3]:
# Load the domains file as an RDD of raw text lines (one string per row)
common_crawl_domain_counts = sc.textFile(
    './crawl/cc-main-limited-domains.csv'
)

# Peek at the first ten raw lines
common_crawl_domain_counts.take(10)

['367855\t172-in-addr\tarpa\t1',
 '367856\taddr\tarpa\t1',
 '367857\tamphic\tarpa\t1',
 '367858\tbeta\tarpa\t1',
 '367859\tcallic\tarpa\t1',
 '367860\tch\tarpa\t1',
 '367861\td\tarpa\t1',
 '367862\thome\tarpa\t7',
 '367863\tiana\tarpa\t1',
 '367907\tlocal\tarpa\t1']

Next, we map **fmt_domain_graph_entry** over **common_crawl_domain_counts** and save the result as a new RDD named **formatted_host_counts**.

In [4]:
def fmt_domain_graph_entry(entry):
    """
    Parse one tab-delimited Common Crawl domain graph line.

    Returns the tuple (site_id, domain, tld, num_subdomains), with site_id
    and num_subdomains converted to int and the other two kept as strings.
    Raises ValueError if the line does not hold exactly four tab-separated
    fields or if either numeric field is not an integer.
    """

    # Split first, then unpack: a wrong field count fails on the unpacking line
    fields = entry.split('\t')
    site_id, domain, tld, num_subdomains = fields

    return int(site_id), domain, tld, int(num_subdomains)

In [5]:
# Parse every raw line into a (site_id, domain, tld, num_subdomains) tuple
formatted_host_counts = common_crawl_domain_counts.map(fmt_domain_graph_entry)

# Preview the first ten parsed records
formatted_host_counts.take(10)

[(367855, '172-in-addr', 'arpa', 1),
 (367856, 'addr', 'arpa', 1),
 (367857, 'amphic', 'arpa', 1),
 (367858, 'beta', 'arpa', 1),
 (367859, 'callic', 'arpa', 1),
 (367860, 'ch', 'arpa', 1),
 (367861, 'd', 'arpa', 1),
 (367862, 'home', 'arpa', 7),
 (367863, 'iana', 'arpa', 1),
 (367907, 'local', 'arpa', 1)]

In [6]:
def extract_subdomain_counts(entry):
    """
    Return the subdomain count of one tab-delimited domain graph line as an int.

    The line must hold exactly four tab-separated fields
    (site_id, domain, tld, num_subdomains); only the last one is converted
    and returned. Raises ValueError on a wrong field count or a non-integer
    count.
    """

    # Unpack all four fields so a malformed line still fails loudly;
    # the underscore-prefixed names are deliberately unused
    _site_id, _domain, _tld, subdomain_field = entry.split('\t')

    return int(subdomain_field)


# Map every raw line to its subdomain count
host_counts = common_crawl_domain_counts.map(extract_subdomain_counts)

# Preview the first ten counts
host_counts.take(10)

[1, 1, 1, 1, 1, 1, 1, 7, 1, 1]

In [7]:
# Collapse the RDD of counts into one grand total; reduce() applies the
# lambda pairwise until a single value remains
total_host_counts = host_counts.reduce(
    lambda running_total, count: running_total + count
)

total_host_counts

595466

In [8]:
# Stops the SparkSession together with its underlying SparkContext,
# closing out the RDD part of the notebook
spark.stop()

## Exploring Domain Counts with PySpark DataFrames and SQL

Let's create a new SparkSession, get its SparkContext, and read the domain file into a new Spark DataFrame named `common_crawl`.

In [10]:
from pyspark.sql import SparkSession

# Start the SparkSession used by the DataFrame part of the notebook
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
sc = spark.sparkContext

# Load the tab-delimited domain file, letting Spark infer each column's type
common_crawl = (
    spark.read
    .option('delimiter', '\t')
    .option('inferSchema', True)
    .csv('./crawl/cc-main-limited-domains.csv')
)

# Preview the first five rows without truncating cell values
common_crawl.show(5, truncate=False)

+------+-----------+----+---+
|_c0   |_c1        |_c2 |_c3|
+------+-----------+----+---+
|367855|172-in-addr|arpa|1  |
|367856|addr       |arpa|1  |
|367857|amphic     |arpa|1  |
|367858|beta       |arpa|1  |
|367859|callic     |arpa|1  |
+------+-----------+----+---+
only showing top 5 rows



Let's rename the DataFrame's columns to more descriptive names.

In [11]:
# Rename the auto-generated `_c0`..`_c3` columns with `withColumnRenamed()`.
# The chain is wrapped in parentheses instead of being joined with backslashes,
# so no dangling line continuation is left after the final call.
common_crawl = (
    common_crawl
    .withColumnRenamed('_c0', 'site_id')
    .withColumnRenamed('_c1', 'domain')
    .withColumnRenamed('_c2', 'top_level_domain')
    .withColumnRenamed('_c3', 'num_subdomains')
)

common_crawl.show(5, truncate=False)

+-------+-----------+----------------+--------------+
|site_id|domain     |top_level_domain|num_subdomains|
+-------+-----------+----------------+--------------+
|367855 |172-in-addr|arpa            |1             |
|367856 |addr       |arpa            |1             |
|367857 |amphic     |arpa            |1             |
|367858 |beta       |arpa            |1             |
|367859 |callic     |arpa            |1             |
+-------+-----------+----------------+--------------+
only showing top 5 rows



## Reading and Writing Datasets to Disk

In [12]:
# Persist `common_crawl` as a directory of parquet files, replacing any
# output already present at that path
(
    common_crawl.write
    .mode('overwrite')
    .parquet('./results/common_crawl/')
)

Let's read the parquet files back into a new DataFrame to confirm the DataFrame was saved properly.

In [13]:
# Load the parquet directory back into a DataFrame
common_crawl_domains = spark.read.parquet('./results/common_crawl/')

# Preview the first five rows without truncating cell values
common_crawl_domains.show(5, truncate=False)

+-------+-----------+----------------+--------------+
|site_id|domain     |top_level_domain|num_subdomains|
+-------+-----------+----------------+--------------+
|367855 |172-in-addr|arpa            |1             |
|367856 |addr       |arpa            |1             |
|367857 |amphic     |arpa            |1             |
|367858 |beta       |arpa            |1             |
|367859 |callic     |arpa            |1             |
+-------+-----------+----------------+--------------+
only showing top 5 rows



## Querying Domain Counts with PySpark DataFrames and SQL

Let's create a local temporary view from common_crawl_domains.

In [14]:
# Registers `common_crawl_domains` under the temporary view name `crawl`
# (replacing any existing view with that name) so that spark.sql() queries
# can refer to it as a table
common_crawl_domains.createOrReplaceTempView('crawl')

Now let's calculate the total number of domains for each top-level domain in the dataset.

In [15]:
# Count the domains under each top-level domain, most populous first.
# GROUP BY 1 / ORDER BY 2 refer to the select-list columns by position.
query = """
SELECT 
    top_level_domain, 
    COUNT(top_level_domain) AS total_top_domains
FROM crawl
GROUP BY 1
ORDER BY 2 DESC
"""

domains_per_tld = spark.sql(query)
domains_per_tld.show(truncate=False)

+----------------+-----------------+
|top_level_domain|total_top_domains|
+----------------+-----------------+
|edu             |18547            |
|gov             |15007            |
|travel          |6313             |
|coop            |5319             |
|jobs            |3893             |
|post            |117              |
|map             |34               |
|arpa            |11               |
+----------------+-----------------+



In [16]:
# Sum the subdomain counts under each top-level domain, largest total first.
# GROUP BY 1 / ORDER BY 2 refer to the select-list columns by position.
query = """
SELECT 
    top_level_domain, 
    SUM(num_subdomains) AS total_subdomains
FROM crawl
GROUP BY 1
ORDER BY 2 DESC
"""

subdomains_per_tld = spark.sql(query)
subdomains_per_tld.show(truncate=False)

+----------------+----------------+
|top_level_domain|total_subdomains|
+----------------+----------------+
|edu             |484438          |
|gov             |85354           |
|travel          |10768           |
|coop            |8683            |
|jobs            |6023            |
|post            |143             |
|map             |40              |
|arpa            |17              |
+----------------+----------------+



Let's see how many subdomains nps.gov has. We'll filter the dataset to that website's entry and display the columns "domain", "top_level_domain", and "num_subdomains".

In [17]:
# Look up the single row for nps.gov by matching on both the domain
# and its top-level domain
query = """
SELECT
    domain,
    top_level_domain,
    num_subdomains
FROM crawl
WHERE top_level_domain == 'gov'
AND domain == 'nps'
"""

nps_gov_entry = spark.sql(query)
nps_gov_entry.show(truncate=False)

+------+----------------+--------------+
|domain|top_level_domain|num_subdomains|
+------+----------------+--------------+
|nps   |gov             |178           |
+------+----------------+--------------+



Finally, let's close the SparkSession and its underlying SparkContext.

In [18]:
# Stop the notebook's `SparkSession` and its underlying `SparkContext`
spark.stop()