# Chapter 3

In the chapter 3 of "Web Scraping with Python" you were able to learn more about the following subjects:
* Tracking internal links of website
  * According to pattern
  * All internal links
* Website mapping
* Collecting data
* Tracking through internet

The following cells aim to practice the contents listed above. For any sugestions, contact *gabriel.vasconcelos@usp.br*

Use the website https://scraping-cap3.netlify.app/ to answer this notebook.

In [1]:
# Import BeautifulSoup and other libraries you find useful

import networkx as nx
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
import re
import random

In [2]:
# Get the website https://scraping-cap3.netlify.app/ and pass it to a BeautifulSoup object 
# with proper error handling

def getBS(site):
    try:
        html = urlopen(site)
    except HTTPError as e:
        return None
    except URLError as e:
        return None
    bs = BeautifulSoup(html, 'html.parser')
    return bs

In [24]:
def makeInternalURL(link: str):
    url = 'https://scraping-cap3.netlify.app/'
    if link.startswith('./'):
        url += link[2:]
    elif link.startswith('../'):
        url += link[3:]
    elif link.startswith('/'):
        url += link[1:]
    else:
        url = link
    
    return url

def getInternalLinks(site):
    site = makeInternalURL(site)
    
    bs = getBS(site)
    urls = [a['href'] for a in bs.find_all('a', {'href':re.compile('^(\.)*\/.+')})]
    return set(urls)

def getExternalLinks(site):
    site = makeInternalURL(site)
    print(site)
    bs = getBS(site)
    print(bs)
    all_urls = set([a['href'] for a in bs.find_all('a', {'href':re.compile('.+')})])
    in_urls  = getInternalLinks(site)

    all_urls.difference_update(in_urls)
    return (in_urls, all_urls)

### a.
Get all internal links from the domain.

In [8]:
# Code below
# Tip: use BFS algorithm to do a full mapping of the domain

def getAllInternalLinks():
    sites = ['https://scraping-cap3.netlify.app/index.html']
    visited = set()

    while(len(sites) > 0):
        site = makeInternalURL(sites.pop(0))
        visited.add(site)
        bs = getBS(site)
        urls = getInternalLinks(site)
        
        for url in urls:
            url = makeInternalURL(url)
            if(not url in visited and not url in sites):
                sites.append(url)
    
    return visited
    #print(sites)

In [9]:
internalLink =  getAllInternalLinks()
internalLink

### b.
Get all external links from the domain.

In [22]:
# Code below
def getAllExternalLinks():
    sites = ['https://scraping-cap3.netlify.app/index.html']
    visited = set()

    while(len(sites) > 0):
        site = makeInternalURL(sites.pop(0))
        visited.add(site)
        
        ex_urls, in_urls = getExternalLinks(site)
        for url in in_urls:
            url = makeInternalURL(url)
            if(not url in visited and not url in sites):
                sites.append(url)
    
    return visited
    #print(sites)

In [25]:
external_links = getAllExternalLinks()
external_links

https://scraping-cap3.netlify.app/index.html
<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Página Inicial</title>
<link href="./styles/index.css" rel="stylesheet"/>
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet"/>
</head>
<body>
<header>
<nav>
<ul>
<li><a href="./bandas.html">Bandas</a></li>
<li><a href="./vocalistas.html">Vocalistas</a></li>
<li><a href="./guitarristas.html">Guitarristas</a></li>
<li><a href="./bateristas.html">Bateristas</a></li>
<li><a href="./baixistas.html">Baixistas</a></li>
<li><a href="./tecladistas.html">Tecladistas</a></li>
</ul>
</nav>
<div>
<a href="./index.html">
<img alt="Rock logo" src="./assets/hand.png"/>
<span>Rock Encyclopedia</span>
</a>
</div>
</header>
<main>
<h1>Rock Encyclopedia</h1>
<p>Site que  busca juntar a informação sobre as

AttributeError: 'NoneType' object has no attribute 'find_all'

### c.
Get the title of each page in the website, its url and the first paragraph.

In [None]:
# Code below

### d.
Create an adjacency list of the domain (directed graph).

In [None]:
# Code below

In [1]:
# Use networkx to generate a visualization of the graph
'''
Create an adjacency list in the following format:

adjacencyList = {
    'node A': ['nodeB', 'nodeC'],
    'node B': ['node C']
}
'''

G = nx.DiGraph()

for fromSite in adjacencyList:
    for toSite in adjacencyList[fromSite]:
        G.add_edge(fromSite, toSite)
        
nodes = list(G.nodes)
for node in nodes:
    if len(node) == 1:
        print(node)
        G.remove_node(node)
        
plt.figure(3,figsize=(12,12)) 
nx.draw(G, pos=nx.shell_layout(G), node_size=100, width=0.5)
plt.show()

### e.
From the website external links, choose one randomly and create a internet crawler.

In [None]:
# Code below