# Identifying information from a Website
<br>
Identifying the basic technologies used by a website with the `builtwith` library.

In [3]:
import builtwith


# Query builtwith for the target page; the loops below treat the result as a
# mapping whose values are iterables of detected technology names.
tecs = builtwith.parse('https://www.facebook.com')

# Print one "- <category> : <technology>" line per detected technology.
for category in tecs:
    for technology in tecs[category]:
        print("-", category, ":", technology)

- advertising-networks : AppNexus
- javascript-graphics : Javascript Infovis Toolkit
- javascript-frameworks : React
- javascript-frameworks : RequireJS


# Identifying the owner of a website. 
<br>
If a company is known for blocking web crawlers, it would be good to be more conservative with the download rate.

In [4]:
import whois


# Look up the WHOIS record for the domain.
owner_information = whois.whois('facebook.com')

# Print every field of the record as "- <field> : <value>", followed by a
# blank line (the trailing "\n" argument) to separate the long values.
for field, field_value in owner_information.items():
    print("-", field, ":", field_value, "\n")

- domain_name : FACEBOOK.COM 

- registrar : RegistrarSafe, LLC 

- whois_server : whois.registrarsafe.com 

- referral_url : None 

- updated_date : 2022-01-26 16:45:06 

- creation_date : 1997-03-29 05:00:00 

- expiration_date : 2031-03-30 04:00:00 

- name_servers : ['A.NS.FACEBOOK.COM', 'B.NS.FACEBOOK.COM', 'C.NS.FACEBOOK.COM', 'D.NS.FACEBOOK.COM'] 

- status : ['clientDeleteProhibited https://icann.org/epp#clientDeleteProhibited', 'clientTransferProhibited https://icann.org/epp#clientTransferProhibited', 'clientUpdateProhibited https://icann.org/epp#clientUpdateProhibited', 'serverDeleteProhibited https://icann.org/epp#serverDeleteProhibited', 'serverTransferProhibited https://icann.org/epp#serverTransferProhibited', 'serverUpdateProhibited https://icann.org/epp#serverUpdateProhibited', 'serverUpdateProhibited https://www.icann.org/epp#serverUpdateProhibited', 'clientDeleteProhibited https://www.icann.org/epp#clientDeleteProhibited', 'clientTransferProhibited https://www.icann.or

### Isolating content data read from a page's HTML output

We will use `BeautifulSoup` to perform the analysis of the information.

In [193]:
from urllib.request import urlopen
from bs4 import BeautifulSoup


# Download the page and parse it. The response is used as a context manager
# so the underlying HTTP connection is closed as soon as the body is read.
with urlopen("http://google.com") as html:
    bsObj = BeautifulSoup(html.read(), "html.parser")

# Collect every anchor tag in the document.
page_links = bsObj.find_all('a')

# Print each anchor's target; `get` is used instead of indexing so an anchor
# without an href attribute does not raise.
for link in page_links:
    print("-", link.get("href"))

- http://www.google.com.br/imghp?hl=pt-BR&tab=wi
- http://maps.google.com.br/maps?hl=pt-BR&tab=wl
- https://play.google.com/?hl=pt-BR&tab=w8
- http://www.youtube.com/?gl=BR&tab=w1
- https://news.google.com/?tab=wn
- https://mail.google.com/mail/?tab=wm
- https://drive.google.com/?tab=wo
- https://www.google.com.br/intl/pt-BR/about/products?tab=wh
- http://www.google.com.br/history/optout?hl=pt-BR
- /preferences?hl=pt-BR
- https://accounts.google.com/ServiceLogin?hl=pt-BR&passive=true&continue=http://www.google.com/&ec=GAZAAQ
- /search?ie=UTF-8&q=Manfredo+Fest&oi=ddle&ct=220025307&hl=pt-BR&sa=X&ved=0ahUKEwiCr9qLpd33AhXQBLkGHfgGCRAQPQgD
- /advanced_search?hl=pt-BR&authuser=0
- /intl/pt-BR/ads/
- /services/
- /intl/pt-BR/about.html
- http://www.google.com/setprefdomain?prefdom=BR&prev=http://www.google.com.br/&sig=K_BPZiFpRz2dVecot0dDVASz5xHRg%3D
- /intl/pt-BR/policies/privacy/
- /intl/pt-BR/policies/terms/


### A more complete program that follows links on a website: scraping news by keyword

In [None]:
import re
from urllib.request import urlopen

from bs4 import BeautifulSoup


# Shared crawl state used by open_page:
#   pages         - hrefs that have already been discovered and printed
#   invalid_pages - URLs recorded as failed, so they are not retried
pages, invalid_pages = set(), set()
# Module-level placeholder for the most recently discovered link.
new_page = str()

# Compiled once instead of on every page. The dots are regex wildcards, so a
# keyword only matches when it has at least one character on each side.
_KEYWORD_LINK_PATTERN = re.compile('.bolsonaro.|.russia.|.ucrania.')


def _matching_links(url_page):
    """Return the hrefs on ``url_page`` that match the keyword pattern.

    Returns an empty list when ``url_page`` is already in ``invalid_pages``
    or when it cannot be downloaded or parsed; in the latter case the URL is
    added to ``invalid_pages`` so it is not attempted again.
    """
    if url_page in invalid_pages:
        return []
    try:
        # Context manager: close the HTTP response even if parsing fails.
        with urlopen(url_page) as response:
            bsObj = BeautifulSoup(response, "html.parser")
    except Exception:
        # Best-effort crawl: any fetch/parse failure marks the page that
        # actually failed. `Exception` (not a bare except) lets
        # KeyboardInterrupt through so the cell can still be stopped.
        invalid_pages.add(url_page)
        return []
    # Filtering on href means every returned tag carries that attribute.
    return [anchor["href"] for anchor in bsObj.find_all("a", href=_KEYWORD_LINK_PATTERN)]


def open_page(url_page):
    """Crawl from ``url_page``, following links whose href matches the keywords.

    Every newly discovered link is printed, added to the module-level
    ``pages`` set and then crawled in turn, depth-first, in the order the
    links appear on each page. Links already in ``pages`` or
    ``invalid_pages`` are skipped. Pages that fail to load are recorded in
    ``invalid_pages`` and do not interrupt the rest of the crawl.

    Args:
        url_page: URL of the page where the crawl starts.

    Returns:
        None. Results are accumulated in ``pages`` and ``invalid_pages``.
    """
    # An explicit stack of link iterators replaces recursion, so a long chain
    # of pages cannot exhaust the interpreter's recursion limit.
    link_stack = [iter(_matching_links(url_page))]
    while link_stack:
        href = next(link_stack[-1], None)
        if href is None:
            # Current page is exhausted; resume the page that linked to it.
            link_stack.pop()
            continue
        if href in pages or href in invalid_pages:
            continue
        print(href)
        pages.add(href)
        # Descend into the new page before continuing with its siblings.
        link_stack.append(iter(_matching_links(href)))

# Seed URL for the crawl; change this to start from a different site.
START_URL = "http://g1.globo.com"
open_page(START_URL)

https://g1.globo.com/politica/noticia/2022/05/13/empresario-diz-a-pf-que-pagou-r-95-mil-para-reforma-de-escritorio-de-jair-renan-bolsonaro.ghtml
https://g1.globo.com/politica/politico/jair-bolsonaro/
https://g1.globo.com/politica/noticia/2022/05/13/deputados-conselho-etica-deboche-eduardo-bolsonaro-tortura-miriam-leitao.ghtml
https://g1.globo.com/politica/politico/eduardo-bolsonaro/
https://g1.globo.com/jornal-nacional/noticia/2022/05/04/conselho-de-etica-da-camara-abre-processo-para-apurar-conduta-de-eduardo-bolsonaro.ghtml
 https://g1.globo.com/mundo/ucrania-russia/
https://g1.globo.com/mundo/ucrania-russia/noticia/2022/05/13/forcas-ucranianas-frustram-avanco-de-coluna-blindada-russa-na-regiao-de-donbass-indica-video.ghtml
https://g1.globo.com/tudo-sobre/ucrania/
https://g1.globo.com/globonews/estudio-i/video/ue-promete-mais-500-milhoes-de-euros-em-ajuda-a-ucrania-10573594.ghtml
https://g1.globo.com/mundo/ucrania-russia/noticia/2022/05/13/zelensky-diz-que-esta-preparado-para-conversa

https://g1.globo.com/mundo/noticia/2022/02/17/separatistas-da-ucrania-afirmam-que-foram-atacados-militares-negam.ghtml
https://g1.globo.com/mundo/noticia/2022/02/16/russia-anuncia-fim-das-manobras-militares-na-crimeia-e-retirada-de-tropas.ghtml
https://g1.globo.com/mundo/noticia/2022/02/15/biden-sauda-anuncio-de-retirada-de-tropas-russas-mas-alerta-que-invasao-da-ucrania-ainda-e-forte-possibilidade.ghtml
https://g1.globo.com/mundo/noticia/2022/02/15/algumas-tropas-russas-mobilizadas-na-perto-da-fronteira-com-a-ucrania-retornam-aos-quarteis.ghtml
https://g1.globo.com/mundo/noticia/2022/02/14/ucrania-persistira-com-objetivo-de-ingressar-na-otan-diz-presidente-russia-ve-possibilidade-de-acordo-com-paises-ocidentais.ghtml
https://g1.globo.com/mundo/noticia/2022/02/14/alemanha-pede-a-russia-sinais-imediatos-de-desescalada-na-ucrania.ghtml
https://g1.globo.com/mundo/noticia/2022/02/13/embaixador-brasileiro-na-ucrania-diz-que-situacao-e-tranquila-e-normal-no-pais.ghtmlhttps://g1.globo.com/mun

https://g1.globo.com/mundo/ucrania-russia/noticia/2022/05/12/soldados-russos-sao-flagrados-matando-civis-ucranianos-com-tiros-nas-costas.ghtml
https://g1.globo.com/mundo/ucrania-russia/noticia/2022/05/01/sobreviventes-da-ocupacao-de-bucha-contam-o-que-passaram-na-mao-de-soldados-russos-estamos-vivos-por-acaso.ghtml
https://g1.globo.com/mundo/ucrania-russia/noticia/2022/04/04/massacre-em-bucha-e-crime-de-guerra-ou-genocidio-advogado-brasileiro-que-atua-no-tpi-explica.ghtml
https://api.whatsapp.com/send?text=https://g1.globo.com/mundo/ucrania-russia/noticia/2022/04/04/massacre-em-bucha-e-crime-de-guerra-ou-genocidio-advogado-brasileiro-que-atua-no-tpi-explica.ghtml?utm_source%3Dwhatsapp%26utm_medium%3Dshare-engagement%26utm_campaign%3Dte-materias
https://web.whatsapp.com/send?text=https%3A%2F%2Fg1.globo.com%2Fmundo%2Fucrania-russia%2Fnoticia%2F2022%2F04%2F04%2Fmassacre-em-bucha-e-crime-de-guerra-ou-genocidio-advogado-brasileiro-que-atua-no-tpi-explica.ghtml%3Futm_source%3Dwhatsapp%26utm_

https://g1.globo.com/mundo/blog/sandra-cohen/post/2022/01/26/por-que-eua-e-europa-divergem-sobre-acao-na-ucrania-e-como-putin-se-aproveita-disso.ghtml
https://api.whatsapp.com/send?text=https://g1.globo.com/mundo/blog/sandra-cohen/post/2022/01/26/por-que-eua-e-europa-divergem-sobre-acao-na-ucrania-e-como-putin-se-aproveita-disso.ghtml?utm_source%3Dwhatsapp%26utm_medium%3Dshare-engagement%26utm_campaign%3Dte-materias
https://web.whatsapp.com/send?text=https%3A%2F%2Fg1.globo.com%2Fmundo%2Fblog%2Fsandra-cohen%2Fpost%2F2022%2F01%2F26%2Fpor-que-eua-e-europa-divergem-sobre-acao-na-ucrania-e-como-putin-se-aproveita-disso.ghtml%3Futm_source%3Dwhatsapp%26utm_medium%3Dshare-engagement%26utm_campaign%3Dte-materias
https://telegram.me/share/url?url=https://g1.globo.com/mundo/blog/sandra-cohen/post/2022/01/26/por-que-eua-e-europa-divergem-sobre-acao-na-ucrania-e-como-putin-se-aproveita-disso.ghtml?utm_source%3Dtelegram%26utm_medium%3Dshare-engagement%26utm_campaign%3Dte-materias
tg://msg_url?url=ht