# Download EDGAR index forms for the list of regulatory filings


# 1. Import required libraries

In [2]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import zipfile
import os
from io import BytesIO
import time
import glob
import shutil  # Import shutil to remove directories


- requests: Used to make HTTP requests to download files.
- HTTPAdapter: Used to configure how HTTP requests are retried if they fail.
- Retry: Provides retry logic for failed requests (e.g., 5xx errors).
- zipfile: Helps in handling ZIP files.
- os: Provides utilities for interacting with the operating system (e.g., creating directories).
- BytesIO: Allows treating byte data (from the downloaded file) as a file object.
- time: Used for rate-limiting to avoid making too many requests in a short time.
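- glob: Provides a way to find files matching a specific pattern (used to locate the extracted .idx files).
- shutil: Used to remove the temporary extraction directory once the .idx file has been moved.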

# 2. Function to set up a session with retry logic

In [3]:
def setup_session(user_agent=None, total_retries=5, backoff_factor=1):
    """Create a requests session that retries transient failures.

    Parameters
    ----------
    user_agent : str, optional
        Value for the ``User-Agent`` header. When omitted, the notebook's
        placeholder string is used. NOTE(review): SEC's EDGAR access guidance
        appears to ask automated clients to declare a real contact in the
        User-Agent -- replace the placeholder before heavy use and confirm.
    total_retries : int, optional
        Maximum number of retries per request (default 5).
    backoff_factor : float, optional
        Backoff factor handed to urllib3's ``Retry`` (default 1).

    Returns
    -------
    requests.Session
        Session with the retry policy mounted for both ``https://`` and
        ``http://`` URLs and the User-Agent header set.
    """
    session = requests.Session()  # Persists headers and connections across requests.

    retries = Retry(
        total=total_retries,
        backoff_factor=backoff_factor,
        # 429 (Too Many Requests) is retried alongside the original codes.
        status_forcelist=[403, 429, 500, 502, 503, 504],
        # Hand back the last response once retries are exhausted instead of
        # raising RetryError, so the caller's raise_for_status() reports it as
        # an ordinary HTTPError.
        raise_on_status=False,
    )

    # One adapter serves both schemes so plain-http URLs get the same policy.
    adapter = HTTPAdapter(max_retries=retries)
    session.mount('https://', adapter)
    session.mount('http://', adapter)

    # Identify the client to the server.
    session.headers.update({
        'User-Agent': user_agent or 'Mozilla/5.0 (compatible; YourBot/0.1; +http://yourwebsite.com/bot.html)'
    })

    return session

+ setup_session: This function sets up a persistent HTTP session.
+ The retry logic is configured to retry failed requests (e.g., for specific status codes like 403, 500, etc.).
+ A custom User-Agent is added to the session headers to identify the bot when sending requests to servers.

In [4]:
# Function to download a file with retry logic
def download_file_with_retry(session, url, timeout=30):
    """Fetch ``url`` through ``session`` and return the response, or None on failure.

    Parameters
    ----------
    session : requests.Session
        Session used to issue the GET request (any retry policy mounted on it
        applies).
    url : str
        Address of the file to download.
    timeout : float or tuple, optional
        Timeout in seconds passed to ``session.get`` so a stalled connection
        cannot block the run indefinitely (default 30).

    Returns
    -------
    requests.Response or None
        The successful response, or None when the request failed for any
        requests-level reason (HTTP error status, connection error, timeout,
        exhausted retries).
    """
    try:
        response = session.get(url, timeout=timeout)  # Send a GET request to the URL.
        response.raise_for_status()  # Turn 4xx/5xx statuses into an exception.
    except requests.exceptions.RequestException as e:
        # RequestException is the base class of HTTPError, ConnectionError,
        # Timeout and RetryError, so one failed file never aborts the caller's loop.
        print(f"Failed to download {url}: {e}")
        return None
    return response


+ download_file_with_retry: This function downloads a file using the session and its retry logic. If the request fails with an HTTP error, it prints a message and returns None instead of raising.

# 3. Main function to download and unzip forms with rate limiting

In [5]:
def _extract_index_file(zip_bytes, zip_url, target_directory, period):
    """Unpack one archive and move its .idx file to ``target_directory/<period>.idx``.

    The archive is extracted into a scratch folder ``target_directory/<period>``
    which is always removed afterwards, even if extraction or the move fails.
    Raises ``zipfile.BadZipFile`` when ``zip_bytes`` is not a valid archive.
    """
    with zipfile.ZipFile(BytesIO(zip_bytes)) as zfile:
        # Skip archives that contain nothing worth extracting.
        if not any(name.endswith('.idx') for name in zfile.namelist()):
            print(f"No .idx files found in {zip_url}. Skipping extraction.")
            return

        extract_path = os.path.join(target_directory, period)
        os.makedirs(extract_path, exist_ok=True)
        try:
            zfile.extractall(path=extract_path)
            print(f"Extracted to {extract_path}")

            # Search recursively so an .idx stored inside a sub-folder of the
            # archive is found too; sorting makes the choice deterministic.
            extracted_idx_files = sorted(
                glob.glob(os.path.join(extract_path, '**', '*.idx'), recursive=True)
            )
            if extracted_idx_files:
                new_filename = os.path.join(target_directory, f"{period}.idx")
                # os.replace overwrites an existing destination on every
                # platform; os.rename raises FileExistsError on Windows when
                # the notebook is re-run.
                os.replace(extracted_idx_files[0], new_filename)
                print(f"Renamed extracted file to {new_filename}")
            else:
                print("No .idx file found to rename.")
        finally:
            # Never leave the scratch folder behind.
            shutil.rmtree(extract_path)
            print(f"Removed directory: {extract_path}")


def download_and_unzip_forms(session, start_year, end_year, base_url, target_directory, requests_per_minute=10):
    """Download the quarterly ``form.zip`` archives and keep one .idx file per quarter.

    For every year in ``start_year..end_year`` (inclusive) and every quarter
    QTR1-QTR4, ``{base_url}/{year}/{quarter}/form.zip`` is downloaded, and its
    .idx file is stored as ``{target_directory}/{year}{quarter}.idx``.
    Quarters whose download fails or whose archive is invalid are reported
    and skipped.

    Parameters
    ----------
    session : requests.Session
        Session passed to ``download_file_with_retry``.
    start_year, end_year : int
        First and last year to fetch (both inclusive).
    base_url : str
        URL prefix that the year/quarter path is appended to.
    target_directory : str
        Existing folder that receives the ``.idx`` files.
    requests_per_minute : float, optional
        Maximum number of downloads per minute; must be positive (default 10).

    Raises
    ------
    ValueError
        If ``requests_per_minute`` is not positive.
    """
    if requests_per_minute <= 0:
        raise ValueError("requests_per_minute must be a positive number")
    request_interval = 60 / requests_per_minute  # Seconds to wait between requests.

    for year in range(start_year, end_year + 1):
        for quarter in ['QTR1', 'QTR2', 'QTR3', 'QTR4']:
            zip_url = f"{base_url}/{year}/{quarter}/form.zip"
            response = download_file_with_retry(session, zip_url)

            if response:  # None means the download failed and was already reported.
                try:
                    _extract_index_file(response.content, zip_url, target_directory, f"{year}{quarter}")
                except zipfile.BadZipFile as e:
                    print(f"Failed to unzip {zip_url}: {e}")  # Corrupted or non-ZIP payload.

            time.sleep(request_interval)  # Rate limiting between consecutive requests.


+ download_and_unzip_forms: The main function that loops over years and quarters to download, unzip, and rename files.
+ request_interval: Used to control the time between requests to avoid overwhelming the server (rate limiting).
+ download_file_with_retry: Called to download the ZIP file.
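+ If the download is successful, the ZIP file is extracted, the first .idx file found is moved into the target directory and renamed to `<year><quarter>.idx`, and the temporary extraction folder is removed.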
+ If the ZIP file is invalid, it catches and prints the error.

# 4. Run the task

## 4.1 Setting up the session and defining parameters for the file download

In [6]:
# --- Download parameters (edit these to change what is fetched) ---
start_year = 2023  # First year to download (inclusive).
end_year = 2023  # Last year to download (inclusive).
base_url = "https://www.sec.gov/Archives/edgar/full-index"  # Prefix the year/quarter path is appended to.
target_directory = "D:/EDGAR/Forms"  # Local folder that receives the renamed .idx files.
requests_per_minute = 10  # Throttle: maximum number of downloads issued per minute.

# Create the output folder up front; exist_ok keeps re-runs from failing.
os.makedirs(target_directory, exist_ok=True)

# HTTP session shared by every download (retry policy comes from setup_session).
session = setup_session()


#### You can customize the code by changing `start_year`, `end_year`, and `target_directory`.
+ The parameters such as the start and end years, and target directory are defined here.
+ The session is created using the setup_session function.
+ The target directory is created if it does not already exist.

## 4.2 Start the download and unzip process

In [7]:
# Fetch every quarter in the configured range; progress is printed as each archive is processed.
download_and_unzip_forms(
    session=session,
    start_year=start_year,
    end_year=end_year,
    base_url=base_url,
    target_directory=target_directory,
    requests_per_minute=requests_per_minute,
)

Extracted to D:/EDGAR/Forms\2023QTR1
Renamed extracted file to D:/EDGAR/Forms\2023QTR1.idx
Removed directory: D:/EDGAR/Forms\2023QTR1
Extracted to D:/EDGAR/Forms\2023QTR2
Renamed extracted file to D:/EDGAR/Forms\2023QTR2.idx
Removed directory: D:/EDGAR/Forms\2023QTR2
Extracted to D:/EDGAR/Forms\2023QTR3
Renamed extracted file to D:/EDGAR/Forms\2023QTR3.idx
Removed directory: D:/EDGAR/Forms\2023QTR3
Extracted to D:/EDGAR/Forms\2023QTR4
Renamed extracted file to D:/EDGAR/Forms\2023QTR4.idx
Removed directory: D:/EDGAR/Forms\2023QTR4
