# In [None]:
from bs4 import BeautifulSoup
import urllib.request
import csv
import sys
import os
import zipfile
import logging
import time
import datetime
import re

def generate_link(cik, accession):
    """Build the EDGAR filing-index URL for a filing and start processing it.

    Args:
        cik: Company CIK; converted to ``str`` and stripped of leading zeros
            before it is placed in the URL path.
        accession: Accession number in dashed form
            (e.g. ``0000051143-13-000007``).

    Returns:
        None. When the index page cannot be opened an "Invalid URL" message
        is printed; otherwise the URL is handed to ``generate_10q_link``.
    """
    cik = str(cik).lstrip('0')
    accession = str(accession)
    acc = accession.replace('-', '')
    # The index page is named "<accession>-index.htm" and lives inside the
    # dash-less accession directory, so no "/" precedes "-index.htm".
    url = ('https://www.sec.gov/Archives/edgar/data/' + cik + '/' + acc
           + '/' + accession + '-index.htm')
    try:
        # Probe the page and release the connection immediately.
        with urllib.request.urlopen(url):
            pass
    except (OSError, ValueError):
        # URLError/HTTPError derive from OSError; ValueError means the URL
        # itself is malformed.
        print("Invalid URL {}: ".format(url))
        return
    try:
        generate_10q_link(url)
    except Exception:
        # Stay best-effort, but do not mislabel a parsing failure as a
        # bad URL: record the real traceback instead.
        logging.exception("Failed to process filing index %s", url)


def generate_10q_link(url):
    """Locate the first document link on a filing index page and process it.

    The index page is downloaded, its ``tableFile`` table is located, and the
    first ``<a>`` that follows the table's first row is taken as the document
    link. That link is made absolute and passed to ``get_next_page``.

    Args:
        url: URL of the filing index page.

    Returns:
        The absolute document URL, or None when the page has no
        ``tableFile`` table or no usable link after its first row.
    """
    # BeautifulSoup reads the whole response while parsing, so the
    # connection can be closed as soon as the soup is built.
    with urllib.request.urlopen(url) as html:
        soup = BeautifulSoup(html, "html.parser")
    file_table = soup.find('table', class_='tableFile')
    first_row = file_table.find('tr') if file_table is not None else None
    # find_next searches forward in the document from the row, so it also
    # resolves when the first row itself carries no link.
    anchor = first_row.find_next('a') if first_row is not None else None
    href = anchor.get('href') if anchor is not None else None
    if not href:
        logging.warning("No document link found on index page %s", url)
        return None
    final_link = "https://www.sec.gov" + href
    get_next_page(final_link)
    return final_link


def get_next_page(url):
    """Download a document page and extract its data tables.

    This step is best-effort: any failure while downloading, parsing or
    extracting is logged with its traceback and then suppressed.

    Args:
        url: Absolute URL of the document to process.

    Returns:
        None, whether or not processing succeeded.
    """
    try:
        # The parser consumes the response during construction, so the
        # connection is released before the tables are processed.
        with urllib.request.urlopen(url) as p:
            g = BeautifulSoup(p, "html.parser")
        get_all_tables(g)
    except Exception:
        logging.exception("Could not process document %s", url)
    return None

def get_all_tables(g):
    """Collect every ``<table>`` in the parsed document and export them.

    Args:
        g: Parsed document; must provide ``find_all``.

    Returns:
        Always 0.
    """
    all_datatables(g, g.find_all('table'))
    return 0

	
def get_folder_name(g):
    """Derive the output folder name from the document's ``<filename>`` tag.

    Args:
        g: Parsed document; must provide ``find``.

    Returns:
        The text of the first child of ``<filename>`` up to the first
        ".htm". When the text contains no ".htm", the stripped text with its
        file extension removed is returned instead, so callers always get a
        string.

    Raises:
        ValueError: If the document has no ``<filename>`` tag or the tag is
            empty.
    """
    tag = g.find('filename')
    if tag is None or not tag.contents:
        raise ValueError("document has no <filename> entry to name the output folder")
    title = tag.contents[0]
    if ".htm" in title:
        return title.split(".htm")[0]
    # Fall back to the bare stem for other extensions rather than returning
    # None, which would break path building in the caller.
    return os.path.splitext(title.strip())[0]

		
def zip_dir(path_dir, path_file_zip=''):
    """Write the contents of ``path_dir`` into a deflate-compressed zip file.

    Args:
        path_dir: Directory whose files and sub-directories are archived.
        path_file_zip: Destination archive path. When empty, the archive is
            created next to ``path_dir`` and named ``<basename>.zip``.

    Entry names inside the archive are relative to the parent of
    ``path_dir``, so they start with the directory's own name.
    """
    if not path_file_zip:
        parent, leaf = os.path.split(path_dir)
        path_file_zip = os.path.join(parent, leaf + '.zip')
    # Every archive name is computed relative to the directory's parent.
    anchor = os.path.join(path_dir, os.path.pardir)
    with zipfile.ZipFile(path_file_zip, 'w', zipfile.ZIP_DEFLATED) as archive:
        for current, subdirs, filenames in os.walk(path_dir):
            for entry in filenames + subdirs:
                full_path = os.path.join(current, entry)
                archive.write(full_path, os.path.relpath(full_path, anchor))

def assure_path_exists(path):
    """Create directory ``path`` (including parents) unless it already exists.

    Nothing is done when anything already exists at ``path``.
    """
    if os.path.exists(path):
        return
    os.makedirs(path)



def checkheadertag(param):
    """Report whether a style value marks a header paragraph.

    Args:
        param: Style text to inspect (typically a tag's ``style`` attribute).
            ``None`` is accepted, since tags without the attribute yield it.

    Returns:
        The string "true" when ``param`` contains "center" or "bold",
        otherwise the string "false". Strings (not booleans) are returned
        because callers compare against these literals.
    """
    if param is None:
        return "false"
    if any(marker in param for marker in ("center", "bold")):
        return "true"
    return "false"

	
def get_table_name(table):
    """Extract the cleaned cell text of every row in ``table``.

    Despite its name, this returns the table's contents rather than a title.

    Args:
        table: Table element; must provide ``find_all``.

    Returns:
        A list with one entry per ``<tr>``; each entry is the list of that
        row's kept ``<td>`` texts. Apostrophes and parentheses are removed,
        "$" becomes a space, em dashes are removed and the result is
        stripped of surrounding whitespace.
    """
    # One pass: delete quotes/parentheses and turn "$" into a space.
    cleanup = str.maketrans({"'": None, "(": None, ")": None, "$": " "})
    rows = []
    for tr in table.find_all('tr'):
        cells = []
        for td in tr.find_all('td'):
            text = str(td.text).translate(cleanup)
            # Cells of at most one character after the first cleanup are
            # skipped; the length is checked before dashes are removed and
            # whitespace is stripped.
            # NOTE(review): this also drops genuine one-character values
            # such as "5" - confirm that is intended.
            if len(text) > 1:
                cells.append(text.replace("—", "").strip())
        rows.append(cells)
    return rows


def checktag(param):
    """Report whether markup or a style value carries background shading.

    Args:
        param: Text to inspect, e.g. a ``style`` attribute or serialized tag
            markup. ``None`` is accepted and treated as "no shading".

    Returns:
        The string "true" when ``param`` contains "background", "bgcolor" or
        "background-color", otherwise the string "false". Strings (not
        booleans) are returned because callers compare against these
        literals.
    """
    if param is None:
        return "false"
    # "background-color" is already covered by "background"; it is listed
    # to keep the recognised markers explicit.
    markers = ("background", "bgcolor", "background-color")
    if any(marker in param for marker in markers):
        return "true"
    return "false"

def _first_data_rows(table):
    """Return the extracted rows if ``table`` has a shaded row, else []."""
    for tr in table.find_all('tr'):
        # str(tr) serializes the row together with its attributes and cells,
        # so one check covers row-level and cell-level background styling.
        if checktag(str(tr)) == "true":
            rows = get_table_name(tr.find_parent('table'))
            if rows:
                return rows
    return []


def _find_header_name(table):
    """Return the sanitized header text found before ``table``, or None."""
    ptag = table.find_previous('p')
    # Walk back over blank, non-header paragraphs. The paragraph reached
    # after each step is accepted when it is header-styled and has text.
    # NOTE(review): a header paragraph sitting directly before the table
    # (no blank paragraph in between) is not picked up - confirm intent.
    while ptag is not None:
        # str() lets paragraphs without a style attribute count as
        # non-headers instead of failing on None.
        if checkheadertag(str(ptag.get('style'))) == "true" or len(ptag.text) > 1:
            return None
        ptag = ptag.find_previous('p')
        if ptag is None:
            return None
        if checkheadertag(str(ptag.get('style'))) == "true" and len(ptag.text) >= 2:
            # An all-punctuation header sanitizes to "" and is unusable.
            return re.sub(r"[^A-Za-z0-9]+", "", ptag.text) or None
    return None


def all_datatables(g, a):
    """Write every shaded table in ``a`` to a CSV file and zip the folder.

    A table qualifies when one of its rows contains background styling (as
    judged by ``checktag``). Each qualifying table is written to
    ``<cwd>/<folder>/<name>.csv``, where ``<folder>`` comes from
    ``get_folder_name(g)`` and ``<name>`` is the sanitized header paragraph
    found before the table, or ``<folder>-<n>`` when none is found. Repeated
    header names get a ``_<k>`` suffix so earlier files are not overwritten.
    The folder is zipped once after the last table.

    Args:
        g: Parsed document, used only to derive the folder name.
        a: Iterable of table elements to inspect.

    Returns:
        None.
    """
    count = 0
    used_names = {}  # header name -> number of times already used
    folder_name = None
    path = None
    try:
        for table in a:
            rows = _first_data_rows(table)
            if not rows:
                continue
            count += 1
            header = _find_header_name(table)
            if path is None:
                # Resolve and create the output folder on the first hit only.
                folder_name = get_folder_name(g)
                target = os.path.join(os.getcwd(), folder_name)
                assure_path_exists(target)
                path = target
            if header is None:
                filename = folder_name + "-" + str(count)
            else:
                repeats = used_names.get(header, 0)
                used_names[header] = repeats + 1
                filename = header if repeats == 0 else header + "_" + str(repeats)
            csvpath = os.path.join(path, filename + ".csv")
            with open(csvpath, 'w', encoding='utf-8-sig', newline='') as f:
                csv.writer(f).writerows(rows)
    finally:
        # Zip once at the end rather than after every table; doing it in
        # "finally" still archives what was written if a later table fails.
        if path is not None:
            zip_dir(path)



def main_function(cik='0000051143', accession='0000051143-13-000007'):
    """Configure file logging and process a single filing.

    Args:
        cik: Company CIK. Defaults to the previously hard-coded sample value.
        accession: Dashed accession number of the filing. Defaults to the
            previously hard-coded sample value.

    Logging goes to ``logfile_<cik>.txt`` in the current working directory
    at DEBUG level.
    """
    cik = str(cik)
    logfilename = 'logfile_' + cik + '.txt'
    logging.basicConfig(filename=logfilename, level=logging.DEBUG,
                        format='%(asctime)s - %(levelname)s - %(message)s')
    generate_link(cik, accession)


# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main_function()