# Functions

In [3]:
import requests
import json
import os
from xml.etree import ElementTree
from urllib.parse import urljoin

# Function to read CIK numbers from a JSON file
def load_cik_numbers_from_json(json_file_path):
    with open(json_file_path, 'r') as file:
        data = json.load(file)
    return data

# Function to get the CIK number and company name (modified to take directly from JSON)
def get_cik_from_json_entry(entry):
    cik_number = entry.get('cik')
    company_name = entry.get('name')
    if not company_name or not cik_number:
        return None, None
    return cik_number, company_name


# # Example usage
json_file_path = "/content/sample_data/parsed_companies1.json"
cik_number, company_name = get_cik_from_json_entry(json_file_path)
print(f"CIK Number for {company_name}: {cik_number}")
print(f"Ticker Symbol for {company_name}: {ticker_symbol}")


AttributeError: 'str' object has no attribute 'get'

In [2]:
import requests
from bs4 import BeautifulSoup

# Function to load 10-K XBRL URLs using the CIK number
def load_10k_xbrl(cik_num=None, years=None):
    if cik_num is None:
        print("CIK number is required.")
        return []

    url_to_all_10k = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik_num}&type=10-K&dateb=&owner=include&count=40&search_text="

    if years is None:
        print("No specific years are given, so all xbrl urls will be returned.")
        return []
    elif isinstance(years, str):
        years = [years]
    elif not isinstance(years, set):
        years = set(years)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    }

    try:
        response = requests.get(url_to_all_10k, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', class_='tableFile2')

        if table:
            full_links = []
            for row in table.find_all('tr')[1:]:  # Skipping the header row
                cols = row.find_all('td')
                if len(cols) > 3:
                    filing_type = cols[0].text.strip()
                    filing_date = cols[3].text.strip()

                    if filing_type == '10-K' and any(target_year in filing_date for target_year in years):
                        doc_link = cols[1].find('a', href=True)['href']
                        full_links.append(f"https://www.sec.gov{doc_link}")
            if full_links:
                return full_links
            else:
                print("No 10-K filings found for the specified years.")
                return []
        else:
            print("No table found on the SEC page.")
            return []
    except requests.RequestException as e:
        print(f"Network error: {e}")
        return []

# # Example usage
# urls = load_10k_xbrl(cik_number, ["2019","2020","2021","2022", "2023"])
# print(urls)


In [3]:
import requests
from bs4 import BeautifulSoup
import os
import re
from urllib.parse import urljoin

# Function to get XBRL file links from the given SEC page link
def get_xbrl_links(link):
    session = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    }
    session.headers.update(headers)

    response = session.get(link)
    print(f"Accessing URL: {link}")
    print(f"Status Code: {response.status_code}")

    file_links = []
    folder_name = None

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        for a_tag in soup.find_all('a', href=True):
            file_link = urljoin(link, a_tag['href'])

            if file_link.endswith(('.xml', '.xsd')):
                print(f"Found file link: {file_link}")
                folder_name_new = file_link.split('/')[-1]
                folder_name_new = folder_name_new.split('.')[0]

                if folder_name is None:
                    folder_name = folder_name_new

                file_links.append(file_link)
    else:
        print("Failed to retrieve the webpage")

    return file_links, folder_name

# Function to download a file given its link
def download_file(session, file_link, folder_name):
    for attempt in range(3):
        file_response = session.get(file_link)
        if file_response.status_code == 200:
            file_name = file_link.split('/')[-1]
            file_path = os.path.join(folder_name, file_name)
            with open(file_path, 'wb') as file:
                file.write(file_response.content)
            print(f"Downloaded: {file_path}")
            return file_path
        else:
            print(f"Failed to download file: {file_link} (Attempt {attempt + 1})")
    return None

# Main function to handle the downloading of SEC files
def download_sec_files(link, download_dir):
    file_links, folder_name = get_xbrl_links(link)

    if not file_links:
        print("No files to download")
        return folder_name, None

    session = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    }
    session.headers.update(headers)

    if not os.path.exists(os.path.join(download_dir, folder_name)):
        os.makedirs(os.path.join(download_dir, folder_name))

    xbrl_name = None

    for file_link in file_links:
        downloaded_file_path = download_file(session, file_link, os.path.join(download_dir, folder_name))
        if downloaded_file_path and '_htm.xml' in downloaded_file_path:
            xbrl_name = downloaded_file_path

    return folder_name, xbrl_name


# Example usage
# download_dir = os.path.abspath("downloads")  # Define your download directory
# sec_link = 'https://www.sec.gov/Archives/edgar/data/1326801/000132680124000012/0001326801-24-000012-index.htm'

# folder, xbrl_file = download_sec_files(urls[1], download_dir)
# print(f"Folder: {folder}, XBRL File: {xbrl_file}")


# Example Usage

In [4]:
# Example usage of the modified code
if __name__ == "__main__":
    json_file_path = "cik_numbers.json"  # Path to your JSON file with CIK numbers
    download_dir = '.'  # Directory to save downloaded files
    years = set(["2023", "2024"])  # Set of filing years

    cik_data = load_cik_numbers_from_json(json_file_path)

    for entry in cik_data:
        cik_number, company_name = get_cik_from_json_entry(entry)
        if cik_number:
            print(f"\n\nProcessing company: {company_name} (CIK: {cik_number})")

            # Load 10-K URLs for the specified years
            urls = load_10k_xbrl(cik_number, years)
            print(f"Found URLs for {company_name}: {urls}")

            # Download the XBRL files
            for url in urls:
                folder, xbrl_file = download_sec_files(url, download_dir)
                print(f"Folder: {folder}, XBRL File: {xbrl_file}")
        else:
            print(f"Invalid entry in JSON: {entry}")



Processing ticker: AMGN
URL used for request: https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=AMGN&output=xml
Response Status Code: 200
CIK Number for AMGN: 0000318154
URLs for AMGN: ['https://www.sec.gov/Archives/edgar/data/318154/000031815424000011/0000318154-24-000011-index.htm', 'https://www.sec.gov/Archives/edgar/data/318154/000031815423000017/0000318154-23-000017-index.htm']

Processing year: 2024
Accessing URL: https://www.sec.gov/Archives/edgar/data/318154/000031815424000011/0000318154-24-000011-index.htm
Status Code: 200
Found file link: https://www.sec.gov/Archives/edgar/data/318154/000031815424000011/amgn-20231231.xsd
Found file link: https://www.sec.gov/Archives/edgar/data/318154/000031815424000011/amgn-20231231_cal.xml
Found file link: https://www.sec.gov/Archives/edgar/data/318154/000031815424000011/amgn-20231231_def.xml
Found file link: https://www.sec.gov/Archives/edgar/data/318154/000031815424000011/amgn-20231231_lab.xml
Found file link: https://www.se