# FCC Report Scraping Tool

The goal is to build the following two tables (columns are listed left to right, separated by dashes):

```
Businesses

ID - Name - File Count - First Document Last Mod - Last Document Last Mod
```

```
Files

ID - File Number - Date Filed Last Mod - Applicant - Callsign - Subject Type - File Type - Business ID
```

In [71]:
import sqlite3
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm

In [88]:
# Open the SQLite database file that stores the scraped tables
# (sqlite3 creates fccreport.db if it does not exist yet).
con = sqlite3.connect("fccreport.db")

In [89]:
# Rebuild both tables from scratch on every run.
# IF EXISTS matters on the very first run: a fresh database has no tables,
# and a plain DROP TABLE would raise sqlite3.OperationalError.
cur = con.cursor()
cur.execute("DROP TABLE IF EXISTS businesses")
cur.execute("DROP TABLE IF EXISTS files")
# One row per business listed on the business-list page.
cur.execute("CREATE TABLE businesses(id, name, file_count, first_document_last_mod, last_document_last_mod)")
# One row per filing; applicant_id holds the id of the owning business.
cur.execute("CREATE TABLE files(file_id, file_url, document_last_mod, applicant_id, callsign, subject_type, file_type)")

<sqlite3.Cursor at 0x7f0846f1cfc0>

In [125]:
def get_file_page(idx, suffix, cur):
    """Scrape one business's filing list and insert every filing into ``files``.

    Args:
        idx: Identifier of the business whose page is being scraped; written
            to the ``applicant_id`` column of every inserted row.
        suffix: Path of the business page, appended to ``https://fcc.report``.
        cur: sqlite3 cursor used for the INSERTs. Nothing is committed here;
            that is left to the caller.

    Returns:
        A ``(first_date, last_date)`` tuple holding the date-cell text of the
        first and last table rows that were stored. Both are ``None`` when the
        page has no usable rows, and both are ``"-"`` when the page could not
        be fetched.
    """
    base_url = "https://fcc.report"
    url = base_url + suffix

    # A timeout keeps one stalled connection from hanging the whole scrape,
    # and a network error is treated like any other unreachable page.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        response = None
    if response is None or response.status_code != 200:
        print("Error, Skipping!")
        return "-", "-"
    soup = BeautifulSoup(response.text, "html.parser")

    first_date = None
    last_date = None

    # The first <tr> is skipped (treated as the table header).
    for row in soup.find_all("tr")[1:]:
        cols = row.find_all("td")
        # Cells 0, 1, 3 and 4 are read below; skip rows that are too short
        # instead of failing with IndexError.
        if len(cols) < 5:
            continue

        # Date
        dflm = cols[0].text
        if first_date is None:
            first_date = dflm
        last_date = dflm

        # ID: when the cell mentions "IBFS", the identifier is the second
        # space-separated token; otherwise the whole cell text is used.
        id_text = cols[1].text
        id_parts = id_text.split(" ")
        if "IBFS" in id_text and len(id_parts) > 1:
            file_id = id_parts[1].strip()
        else:
            file_id = id_text
        link = cols[1].find("a")
        href = link.get("href") if link is not None else None
        # Stored as NULL when the cell carries no link.
        file_url = base_url + href if href else None

        # Applicant: kept as text, which is how the string-formatted INSERT
        # stored it before, so existing comparisons against businesses.id
        # keep matching.
        applicant_id = str(idx)

        # Callsign
        callsign = cols[3].text

        # Subject / file type: the text before the first ":" and the text
        # after the last ":" of the same cell.
        type_name = cols[4].text
        subject_type = type_name.split(":")[0].strip()
        file_type = type_name.split(":")[-1].strip()

        # Placeholders let sqlite3 do the quoting, so scraped text containing
        # quotes can neither break nor alter the statement.
        cur.execute(
            "INSERT INTO files VALUES (?, ?, ?, ?, ?, ?, ?)",
            (file_id, file_url, dflm, applicant_id, callsign, subject_type, file_type),
        )

    return first_date, last_date

def get_business_page(cur):
    """Scrape the business list and fill the ``businesses`` table.

    For every row of the list page, the business's own page is scraped with
    ``get_file_page`` (which inserts its filings), and then one row describing
    the business is inserted.

    Args:
        cur: sqlite3 cursor used for all INSERTs. Nothing is committed here;
            that is left to the caller.

    Returns:
        The parsed ``BeautifulSoup`` document of the business-list page.

    Raises:
        requests.RequestException: If the business-list page cannot be
            fetched or answers with an HTTP error status.
    """
    base_url = "https://fcc.report"
    list_suffix = "/IBFS/Business-List/"
    url = base_url + list_suffix
    response = requests.get(url, timeout=30)
    # Without the list page there is nothing to scrape; fail loudly rather
    # than parse an error page and report zero businesses.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # The first <tr> is skipped (treated as the table header).
    rows = soup.find_all("tr")[1:]
    total_len = len(rows)
    print("Going through", len(rows), "...")

    for idx, row in tqdm(enumerate(rows), total=total_len):
        cols = row.find_all("td")
        link = row.find("a")
        # Skip rows lacking the name/count cells or the link to the business
        # page, instead of aborting the whole run on them.
        if len(cols) < 2 or link is None or not link.get("href"):
            continue

        company_name = cols[0].text
        company_suffix = link["href"]
        file_counts = cols[1].text

        first_date, last_date = get_file_page(idx, company_suffix, cur)

        # Placeholders let sqlite3 do the quoting, so names containing an
        # apostrophe no longer break the statement. The id is kept as text
        # (as the string-formatted INSERT stored it) so it keeps matching
        # files.applicant_id; dates that are None are stored as NULL.
        cur.execute(
            "INSERT INTO businesses VALUES (?, ?, ?, ?, ?)",
            (str(idx), company_name, file_counts, first_date, last_date),
        )
    print("DONE!")
    return soup

In [128]:
# Run the full scrape; `s` receives the parsed business-list page that
# get_business_page returns.
s = get_business_page(cur)

Going through 5000 ...


100%|█████████████████████████████████████████████████████████████████| 5000/5000 [36:38<00:00,  2.27it/s]

DONE!





In [129]:
# Commit the open transaction so the inserted rows are saved to the database.
con.commit()