In [1]:
import json
import os
import random
import re
import time
import zipfile
from urllib.parse import quote

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm  # Progress bar
from webdriver_manager.chrome import ChromeDriverManager

# Pool of desktop Chrome User-Agent strings. One entry is picked with
# random.choice() for the Selenium browser, and another pick is made
# separately for the requests session headers.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
]

# --- Step 1: Launch Browser and Extract Cookies ---
def get_nse_cookies():
    """Open the NSE annual-reports page in Chrome and return its cookies.

    A Chrome instance is started through Selenium with a randomly chosen
    entry of USER_AGENTS and with the automation switches disabled. The page
    is loaded, the script pauses, sends one PAGE_DOWN key press to the page
    body, pauses again, and then reads the browser's cookies.

    Returns:
        dict: Mapping of cookie name to cookie value, built from
        ``driver.get_cookies()``.

    Raises:
        Any exception raised by Selenium or webdriver_manager while starting
        or driving the browser. The browser is always closed first.
    """
    options = Options()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--start-maximized")  # Open in full screen
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    options.add_argument(f"user-agent={random.choice(USER_AGENTS)}")

    # Start WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    # From here on the browser process exists, so make sure it is shut down
    # even if page loading or element lookup fails.
    try:
        nse_url = "https://www.nseindia.com/companies-listing/corporate-filings-annual-reports"
        driver.get(nse_url)

        # Wait for JavaScript elements to load
        time.sleep(random.randint(3, 6))

        # Scroll to simulate user activity
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN)
        time.sleep(random.uniform(2, 5))

        # Extract cookies
        return {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
    finally:
        driver.quit()  # Close browser

# --- Step 2: Load Stock List ---
# Companies to process; the CSV has to be present at this relative path.
df_stocks = pd.read_csv("../data/top_250_stocks.csv")

# Root output directory; per-company sub-folders are created beneath it.
BASE_FOLDER = "nse_annual_reports"
os.makedirs(BASE_FOLDER, exist_ok=True)

# --- Step 3: Create Session Using Extracted Cookies ---
cookies = get_nse_cookies()

# Copy every browser cookie into a fresh requests session.
session = requests.Session()
for cookie_name in cookies:
    session.cookies.set(cookie_name, cookies[cookie_name])

# Request headers passed on every call made through safe_request().
# NOTE(review): this User-Agent is drawn independently of the one chosen for
# the browser in get_nse_cookies(), so the two may differ.
headers = {
    "accept": "*/*",
    "user-agent": random.choice(USER_AGENTS),
    "referer": "https://www.nseindia.com/companies-listing/corporate-filings-annual-reports",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin"
}

print("✅ Cookies extracted and session initialized successfully.")

# --- Step 4: Functions for Downloading and Processing Reports ---
def clean_filename(name):
    """Turn a company name into a string usable as part of a file name.

    Every character that is not a word character, whitespace, or a hyphen is
    dropped; surrounding whitespace is trimmed; each remaining space becomes
    an underscore; and the result is cut to at most 50 characters.
    """
    without_specials = re.sub(r'[^\w\s-]', '', name)
    underscored = without_specials.strip().replace(" ", "_")
    max_length = 50  # keep names short to avoid issues
    return underscored[:max_length]

def safe_request(url, max_retries=5):
    """GET ``url`` through the shared session, retrying with backoff.

    Uses the module-level ``session`` and ``headers``. Each attempt is a
    streaming GET with a 15 second timeout.

    Args:
        url: Address to fetch.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The response object of the first attempt that returns HTTP 200
        (still open, body not yet read), or ``None`` when every attempt
        failed.

    Between attempts the function sleeps ``2 ** attempt_number`` seconds
    (2, 4, 8, ...). After an HTTP 403 or 429 it additionally waits
    ``30 + 10 * attempt_index`` seconds. No sleep happens after the final
    attempt, because nothing is retried afterwards.
    """
    for attempt in range(max_retries):
        delay = 2 ** (attempt + 1)  # Exponential backoff
        try:
            response = session.get(url, headers=headers, timeout=15, stream=True)
        except requests.exceptions.RequestException as e:
            print(f"❌ Request failed: {e}. Retrying...")
        else:
            status = response.status_code
            if status == 200:
                return response

            # With stream=True the connection stays checked out until the body
            # is consumed or the response is closed; a rejected response is
            # never read, so release it explicitly.
            response.close()

            if status in (403, 429):  # Forbidden or Too Many Requests
                wait_time = 30 + (10 * attempt)
                print(f"⚠️ Server blocking requests. Waiting {wait_time}s before retrying...")
                delay += wait_time
            else:
                print(f"❌ Unexpected HTTP {status}. Retrying...")

        # Only wait when another attempt is actually going to follow.
        if attempt < max_retries - 1:
            time.sleep(delay)

    print(f"❌ Failed after {max_retries} attempts.")
    return None

def download_and_process_report(company_name, report_url):
    """Download one report and store its PDF(s) in the company's folder.

    The target folder is ``BASE_FOLDER/<clean company name>_AR``. The file
    name is the last path segment of ``report_url``.

    * A ``.zip`` download is extracted into the folder, every PDF that came
      out of that archive is renamed to ``<clean company name>_<pdf name>``,
      and the archive is deleted. A corrupted archive is reported and left
      on disk.
    * A ``.pdf`` download is renamed to ``<clean company name>_<file name>``.
    * Any other extension is left as downloaded.

    Extension checks are case-insensitive, and renames overwrite an existing
    target so the notebook can be re-run on every platform.

    Args:
        company_name: Company name as given in the stock list.
        report_url: URL of the report file.

    Raises:
        OSError: If the folder or files cannot be created, written, renamed
            or removed. It is not handled here so the caller can decide to
            skip the company.
    """
    clean_company_name = clean_filename(company_name)
    folder_path = os.path.join(BASE_FOLDER, f"{clean_company_name}_AR")
    os.makedirs(folder_path, exist_ok=True)

    filename = report_url.split("/")[-1]
    file_path = os.path.join(folder_path, filename)

    response = safe_request(report_url)
    if response is None:
        print(f"❌ Failed to download: {report_url}")
        return

    # Release the streamed connection even when writing to disk fails.
    try:
        with open(file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
    finally:
        response.close()
    print(f"✅ Downloaded: {file_path}")

    lowered_name = filename.lower()
    if lowered_name.endswith(".zip"):
        try:
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                # extract() returns the path actually written, which lets us
                # rename only this archive's files instead of every PDF that
                # already sits in the folder from earlier reports.
                extracted_paths = [
                    zip_ref.extract(member, folder_path)
                    for member in zip_ref.infolist()
                ]
        except zipfile.BadZipFile:
            print(f"❌ Corrupted ZIP file: {file_path}")
            return
        print(f"📂 Extracted: {file_path} to {folder_path}")

        for extracted_path in extracted_paths:
            # Skip directory entries and paths already moved by a duplicate entry.
            if not os.path.isfile(extracted_path):
                continue
            if not extracted_path.lower().endswith(".pdf"):
                continue
            pdf_name = os.path.basename(extracted_path)
            target_path = os.path.join(
                os.path.dirname(extracted_path), f"{clean_company_name}_{pdf_name}"
            )
            # os.replace overwrites an existing target; os.rename would raise
            # FileExistsError on Windows when the notebook is re-run.
            os.replace(extracted_path, target_path)
            print(f"📄 Saved PDF: {clean_company_name}_{pdf_name}")

        os.remove(file_path)  # Remove ZIP after extraction

    elif lowered_name.endswith(".pdf"):
        os.replace(file_path, os.path.join(folder_path, f"{clean_company_name}_{filename}"))
        print(f"📄 Directly saved PDF: {clean_company_name}_{filename}")

def fetch_nse_reports(symbol, company_name):
    """Look up a company's annual reports on the NSE API and download each.

    The symbol and company name are percent-encoded before being placed in
    the query string, so values containing ``&`` or other reserved
    characters do not alter the query. Every entry of the response's
    ``"data"`` list that carries a ``"fileName"`` is handed to
    ``download_and_process_report``; entries without one are reported and
    skipped.

    Args:
        symbol: Stock symbol used for the ``symbol`` query parameter.
        company_name: Company name used for the ``issuer`` query parameter
            and passed on to the downloader.

    Raises:
        Whatever ``download_and_process_report`` raises (for example
        ``OSError``); it is not handled here so the caller can skip the
        company.
    """
    encoded_symbol = quote(symbol, safe="")
    encoded_company_name = quote(company_name, safe="")
    nse_api_url = (
        "https://www.nseindia.com/api/annual-reports?index=equities"
        f"&symbol={encoded_symbol}&issuer={encoded_company_name}"
    )

    response = safe_request(nse_api_url)
    if response is None:
        print(f"❌ Failed to fetch reports for {company_name} ({symbol})")
        return

    try:
        data = response.json()
    except ValueError:
        # ValueError is the common base of json.JSONDecodeError and of the
        # decode errors requests raises, whichever JSON backend is in use.
        print(f"❌ Error parsing JSON response for {company_name} ({symbol})")
        return
    finally:
        response.close()

    reports = data.get("data", []) if isinstance(data, dict) else []
    if not reports:
        print(f"⚠️ No reports found for {company_name} ({symbol})")
        return

    for report in reports:
        report_url = report.get("fileName") if isinstance(report, dict) else None
        if not report_url:
            print(f"⚠️ Skipping report entry without a file URL for {company_name} ({symbol})")
            continue
        download_and_process_report(company_name, report_url)

# --- Step 5: Loop Through Stocks and Fetch Reports ---
# Process every company in the stock list. A failure while handling one
# company is reported and recorded, and the loop moves on to the next one.
skipped_companies = []
for _, row in tqdm(df_stocks.iterrows(), total=len(df_stocks)):
    company_name = row["Company Name"]
    try:
        fetch_nse_reports(row["Symbol"], company_name)
    except OSError as e:
        # File-system and other I/O failures: give up on this company only.
        print(f"⚠️ Skipping {company_name} due to OS error: {e}")
        skipped_companies.append(company_name)
    except Exception as e:
        print(f"⚠️ Skipping {company_name} due to unexpected error: {e}")
        skipped_companies.append(company_name)

# Only claim full success when no company had to be skipped.
if skipped_companies:
    skipped_list = ", ".join(str(name) for name in skipped_companies)
    print(f"\n⚠️ Finished, but skipped {len(skipped_companies)} of {len(df_stocks)} companies: {skipped_list}")
else:
    print("\n✅ All reports downloaded, extracted, and organized successfully!")


✅ Cookies extracted and session initialized successfully.


  0%|                                                                                          | 0/179 [00:00<?, ?it/s]

✅ Downloaded: nse_annual_reports\360_ONE_WAM_Ltd_AR\AR_26248_360ONE_2023_2024_25092024193639.pdf
📄 Directly saved PDF: 360_ONE_WAM_Ltd_AR_26248_360ONE_2023_2024_25092024193639.pdf
✅ Downloaded: nse_annual_reports\360_ONE_WAM_Ltd_AR\AR_22548_360ONE_2022_2023_28072023201321.zip
❌ Corrupted ZIP file: nse_annual_reports\360_ONE_WAM_Ltd_AR\AR_22548_360ONE_2022_2023_28072023201321.zip


  0%|                                                                                          | 0/179 [00:25<?, ?it/s]


KeyboardInterrupt: 

In [None]:
Try to make it so that if an OSError arises while processing a company, that company is skipped and processing continues with the next company.