# Identifying information from a Website
<br><br>
Identifying the basic technologies used by a website with the `builtwith` library.

In [None]:
import builtwith


# Ask builtwith which technologies the site uses. The result is iterated as a
# mapping from a technology category to the entries detected for it.
tecs = builtwith.parse('https://www.facebook.com')

# Flatten the mapping into (category, technology) pairs and print one per line.
detected = (
    (category, technology)
    for category, technologies in tecs.items()
    for technology in technologies
)
for category, technology in detected:
    print("-", category, ":", technology)

## Identifying the owner of a website. 
<br>
Knowing who owns a site helps you plan the crawl: if the owning company is known for blocking web crawlers, it is wise to be more conservative with the download rate.

In [None]:
import whois


# Fetch the WHOIS record for the domain; it is consumed below as a mapping
# from field name to field value.
owner_information = whois.whois('facebook.com')

# Print every field on its own line; the trailing "\n" argument leaves a
# blank line between consecutive entries.
for field_name, field_value in owner_information.items():
    print("-", field_name, ":", field_value, "\n")

### Isolating content data read from a page's HTML output

We will use `BeautifulSoup` to perform the analysis of the information.

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup


# Fetch the page inside a context manager so the HTTP response (and its
# underlying connection) is closed as soon as the body has been read.
with urlopen("http://google.com") as html:
    bsObj = BeautifulSoup(html.read(), "html.parser")


# Collect every anchor tag on the page and print its target.
# Anchors without an href attribute are printed as None.
page_links = bsObj.find_all('a')

for link in page_links:
    print("-", link.get("href"))

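### A more complete program that searches for links on any website

Scraping news by name: starting from one page, the crawler follows only the links whose URL contains one of the given names, and keeps crawling from every page it finds.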

In [None]:
import re
from urllib.request import urlopen

from bs4 import BeautifulSoup


pages = set()          # every matching link discovered so far
invalid_pages = set()  # URLs that could not be fetched or parsed
new_page = ""          # unused by open_page; kept so the module-level name still exists

# A link is followed only when its href matches one of these alternatives.
# Each "." is a regex wildcard, so one character is required on either side
# of the keyword.
_LINK_PATTERN = re.compile('.bolsonaro.|.russia.|.ucrania.')


def _matching_links(url):
    """Return the hrefs found on *url* that match ``_LINK_PATTERN``.

    If the page cannot be fetched or parsed, *url* is added to
    ``invalid_pages`` and an empty list is returned.
    """
    try:
        # The context manager closes the HTTP response once it is parsed.
        with urlopen(url) as response:
            soup = BeautifulSoup(response, "html.parser")
        return [
            anchor.attrs['href']
            for anchor in soup.find_all("a", href=_LINK_PATTERN)
        ]
    except Exception:
        # Per-page best-effort boundary: a bad URL, a network failure or a
        # parsing problem must not stop the crawl. Unlike a bare ``except``
        # this still lets KeyboardInterrupt / SystemExit through.
        invalid_pages.add(url)
        return []


def open_page(url_page):
    """Crawl depth-first from *url_page*, following keyword-matching links.

    Every newly discovered link is printed, added to ``pages`` and then
    crawled in turn. Pages that fail to load are recorded in
    ``invalid_pages`` and are never requested again.

    The traversal uses an explicit stack instead of recursion, so a long
    chain of linked pages cannot exceed the interpreter's recursion limit.

    Args:
        url_page: URL of the page where the crawl starts.
    """
    if url_page in invalid_pages:
        return

    # Each stack entry is an iterator over the not-yet-visited links of one
    # page; the top of the stack is the page currently being explored.
    pending = [iter(_matching_links(url_page))]
    while pending:
        for href in pending[-1]:
            if href in pages or href in invalid_pages:
                continue
            print(href)
            pages.add(href)
            # Descend into the new page first; the partially consumed
            # iterator below it resumes once this branch is exhausted.
            pending.append(iter(_matching_links(href)))
            break
        else:
            # No unvisited links remain on this page: go back to its parent.
            pending.pop()

# Start the crawl from the G1 news portal's front page.
open_page(url_page="http://g1.globo.com")

## Information collection


### Target verification:  scan script 
<br>

The `socket` module offers two functions for resolving a host name: `gethostbyname` and `gethostbyname_ex`. Either one works for our purpose. `gethostbyname_ex` returns more information, which is useful when, for example, the target has more than one server answering for the same name.

`gethostbyname` is the simpler of the two, as it returns a single address for the given host name:

In [None]:
import socket 


# Base domain whose sub-domains will be guessed.
domain = input("Target: ").strip()

# One candidate sub-domain label per line. Surrounding whitespace (including
# the "\r" of Windows line endings) is removed and blank lines are dropped,
# because an empty label would produce the malformed host name ".<domain>".
with open('notes/wordlist.txt', 'r') as wordlist:
    brute_force = [word for word in map(str.strip, wordlist) if word]

for name in brute_force:
    hostname = name + "." + domain
    try:
        address = socket.gethostbyname(hostname)  # resolve the host name to an address
    except (socket.gaierror, UnicodeError):
        # gaierror: the name does not resolve, so the sub-domain does not exist.
        # UnicodeError: the name could not be encoded for lookup (for example
        # an over-long label); treat it as a miss instead of aborting the scan.
        continue
    print("-" + hostname + ": " + address)

#### Using the DNS resolver
<br>
We will use `dns.resolver.resolve` to query the domain for each record type and determine whether records of that type exist:

In [None]:
import dns.resolver


# Domain to inspect and the DNS record types to look up for it.
domain = input("Target: ")
registers = ["AAAA", "A", "MX", "NS"]

for record_type in registers:
    # raise_on_no_answer=False makes a missing record type come back as an
    # answer whose rrset is None instead of raising.
    answer = dns.resolver.resolve(domain, record_type, raise_on_no_answer=False)
    if answer.rrset is None:
        continue
    print("-", answer.rrset)
        