In [2]:
import os
import time
import random
import json
import requests
import pandas as pd
import zipfile
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm  # Progress bar

# Define User-Agent list at the beginning
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
]

# --- Step 1: Launch Browser and Extract Cookies ---
def get_nse_cookies():
    options = Options()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--start-maximized")  # Open in full screen
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    
    options.add_argument(f"user-agent={random.choice(USER_AGENTS)}")

    # Start WebDriver
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    nse_url = "https://www.nseindia.com/companies-listing/corporate-filings-annual-reports"
    driver.get(nse_url)
    
    # Wait for JavaScript elements to load
    time.sleep(random.randint(3, 6))

    # Scroll to simulate user activity
    driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN)
    time.sleep(random.uniform(2, 5))

    # Extract cookies
    cookies = {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
    
    driver.quit()  # Close browser
    return cookies

# --- Step 2: Load Stock List ---
df_stocks = pd.read_csv("../data/top_250_stocks.csv")  # Ensure this file exists

BASE_FOLDER = "nse_annual_reports"
os.makedirs(BASE_FOLDER, exist_ok=True)

# --- Step 3: Create Session Using Extracted Cookies ---
cookies = get_nse_cookies()

session = requests.Session()
for name, value in cookies.items():
    session.cookies.set(name, value)

# Define headers after USER_AGENTS is initialized
headers = {
    "accept": "*/*",
    "user-agent": random.choice(USER_AGENTS),
    "referer": "https://www.nseindia.com/companies-listing/corporate-filings-annual-reports",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin"
}

print("✅ Cookies extracted and session initialized successfully.")

# --- Step 4: Functions for Downloading and Processing Reports ---
def clean_filename(name,max_length=50):
    """Sanitize company names for filenames."""
    name = re.sub(r'[^\w\s-]', '', name)  # Remove special characters
    name = name.strip().replace(" ", "_")  # Replace spaces with underscores
    return name[:max_length]  # Limit name length to avoid issues

def safe_request(url, max_retries=5):
    """Retry requests with exponential backoff."""
    attempt = 0
    while attempt < max_retries:
        try:
            response = session.get(url, headers=headers, timeout=15, stream=True)
            if response.status_code == 200:
                return response
            elif response.status_code in [403, 429]:  # Forbidden or Too Many Requests
                wait_time = 30 + (10 * attempt)
                print(f"⚠️ Server blocking requests. Waiting {wait_time}s before retrying...")
                time.sleep(wait_time)
            else:
                print(f"❌ Unexpected HTTP {response.status_code}. Retrying...")

        except requests.exceptions.RequestException as e:
            print(f"❌ Request failed: {e}. Retrying...")

        attempt += 1
        time.sleep(2 ** attempt)  # Exponential backoff

    print(f"❌ Failed after {max_retries} attempts.")
    return None

# def download_and_process_report(company_name, report_url):
#     """Download, extract, and save PDFs."""
#     clean_company_name = clean_filename(company_name)
#     folder_path = os.path.join(BASE_FOLDER, f"{clean_company_name}_AR")
#     os.makedirs(folder_path, exist_ok=True)

#     filename = report_url.split("/")[-1]
#     base_filename=clean_filename(filename.split("_")[0],max_length=20)
#     file_path = os.path.join(folder_path, f"{base_filename}.pdf")

#     response = safe_request(report_url)
#     if response:
#         with open(file_path, "wb") as file:
#             for chunk in response.iter_content(chunk_size=1024):
#                 file.write(chunk)
#         print(f"✅ Downloaded: {file_path}")

#         if filename.endswith(".zip"):
#             try:
#                 with zipfile.ZipFile(file_path, 'r') as zip_ref:
#                     zip_ref.extractall(folder_path)
#                 print(f"📂 Extracted: {file_path} to {folder_path}")

#                 for file in os.listdir(folder_path):
#                     if file.endswith(".pdf"):
#                         os.rename(os.path.join(folder_path, file), os.path.join(folder_path, f"{clean_company_name}_{file}"))
#                         print(f"📄 Saved PDF: {clean_company_name}_{file}")

#                 os.remove(file_path)  # Remove ZIP after extraction

#             except zipfile.BadZipFile:
#                 print(f"❌ Corrupted ZIP file: {file_path}")

#         elif filename.endswith(".pdf"):
#             os.rename(file_path, os.path.join(folder_path, f"{clean_company_name}_{filename}"))
#             print(f"📄 Directly saved PDF: {clean_company_name}_{filename}")

#     else:
#         print(f"❌ Failed to download: {report_url}")
def extract_year_from_filename(filename):
    """Extract the year from the filename using regex."""
    match = re.search(r'(\d{4}_\d{4})', filename)  # Matches patterns like '2023_2024'
    return match.group(1) if match else "Unknown_Year"

def download_and_process_report(company_name, report_url):
    """Download, extract, and save PDFs with correct naming to avoid duplicates."""
    clean_company_name = clean_filename(company_name)
    folder_path = os.path.join(BASE_FOLDER, f"{clean_company_name}_AR")
    os.makedirs(folder_path, exist_ok=True)

    filename = report_url.split("/")[-1]
    financial_year = extract_year_from_filename(filename)
    base_filename = clean_filename(filename.split("_")[0], max_length=20)  # Shorten original filename

    unique_filename = f"{clean_company_name}_{financial_year}.pdf"
    file_path = os.path.join(folder_path, unique_filename)

    response = safe_request(report_url)
    if response:
        with open(file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
        print(f"✅ Downloaded: {file_path}")

        if filename.endswith(".zip"):
            try:
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    zip_ref.extractall(folder_path)
                print(f"📂 Extracted: {file_path} to {folder_path}")

                for file in os.listdir(folder_path):
                    if file.endswith(".pdf"):
                        extracted_pdf_path = os.path.join(folder_path, file)
                        new_pdf_path = os.path.join(folder_path, f"{clean_company_name}_{financial_year}.pdf")

                        # Ensure we don't overwrite an existing file
                        counter = 1
                        while os.path.exists(new_pdf_path):
                            new_pdf_path = os.path.join(folder_path, f"{clean_company_name}_{financial_year}_{counter}.pdf")
                            counter += 1

                        os.rename(extracted_pdf_path, new_pdf_path)
                        print(f"📄 Renamed PDF: {new_pdf_path}")

                os.remove(file_path)  # Remove ZIP after extraction

            except zipfile.BadZipFile:
                print(f"❌ Corrupted ZIP file: {file_path}")

    else:
        print(f"❌ Failed to download: {report_url}")
def fetch_nse_reports(symbol, company_name):
    """Fetch report links from NSE API."""
    encoded_company_name = company_name.replace(" ", "%20")
    nse_api_url = f"https://www.nseindia.com/api/annual-reports?index=equities&symbol={symbol}&issuer={encoded_company_name}"

    response = safe_request(nse_api_url)

    if response:
        try:
            data = response.json()
            reports = data.get("data", [])

            if reports:
                for report in reports:
                    report_url = report["fileName"]
                    download_and_process_report(company_name, report_url)
            else:
                print(f"⚠️ No reports found for {company_name} ({symbol})")

        except json.JSONDecodeError:
            print(f"❌ Error parsing JSON response for {company_name} ({symbol})")

    else:
        print(f"❌ Failed to fetch reports for {company_name} ({symbol})")
PROGRESS_FILE="./progress.txt"
if os.path.exists(PROGRESS_FILE):
    with open(PROGRESS_FILE,"r") as f:
        completed_companies=set(line.strip() for line in f.readlines())
else:
    completed_companies=set()
# --- Step 5: Loop Through Stocks and Fetch Reports ---
for _, row in tqdm(df_stocks.iterrows(), total=len(df_stocks)):
    company_name=row["Company Name"]
    symbol=row["Symbol"]
    if company_name in completed_companies:
        print(f" Skipping {company_name},already processed")
        continue
    try:
        fetch_nse_reports(row["Symbol"], row["Company Name"])
        with open(PROGRESS_FILE,"a") as f:
            f.write(company_name+"\n")
    except Exception as e:
        print(f"⚠️ Skipping {row['Company Name']} due to unexpected error: {e}")

print("\n✅ All reports downloaded, extracted, and organized successfully!")


✅ Cookies extracted and session initialized successfully.


  0%|                                                                                          | 0/179 [00:00<?, ?it/s]

 Skipping 360 ONE WAM Ltd.,already processed
 Skipping AU Small Finance Bank Ltd.,already processed
 Skipping Aadhar Housing Finance Ltd.,already processed
 Skipping Aavas Financiers Ltd.,already processed
 Skipping Aditya Birla Capital Ltd.,already processed
 Skipping Aditya Birla Sun Life AMC Ltd.,already processed
 Skipping Anand Rathi Wealth Ltd.,already processed
 Skipping Angel One Ltd.,already processed
 Skipping Aptus Value Housing Finance India Ltd.,already processed
 Skipping Axis Bank Ltd.,already processed
 Skipping BSE Ltd.,already processed
 Skipping Bajaj Finance Ltd.,already processed
 Skipping Bajaj Finserv Ltd.,already processed
 Skipping Bajaj Holdings & Investment Ltd.,already processed
 Skipping Bandhan Bank Ltd.,already processed
 Skipping Bank of Baroda,already processed
 Skipping Bank of India,already processed
 Skipping Bank of Maharashtra,already processed
 Skipping CRISIL Ltd.,already processed
 Skipping Can Fin Homes Ltd.,already processed
 Skipping Canara B

 31%|████████████████████████▉                                                        | 55/179 [00:29<01:07,  1.83it/s]

📂 Extracted: nse_annual_reports\Tech_Mahindra_Ltd_AR\AR.pdf to nse_annual_reports\Tech_Mahindra_Ltd_AR
📄 Saved PDF: Tech_Mahindra_Ltd_AR.pdf
⚠️ Skipping Tech Mahindra Ltd. due to unexpected error: [WinError 183] Cannot create a file when that file already exists: 'nse_annual_reports\\Tech_Mahindra_Ltd_AR\\AR_22069_TECHM_2022_2023_26062023151955.pdf' -> 'nse_annual_reports\\Tech_Mahindra_Ltd_AR\\Tech_Mahindra_Ltd_AR_22069_TECHM_2022_2023_26062023151955.pdf'
 Skipping Abbott India Ltd.,already processed


 32%|█████████████████████████▊                                                       | 57/179 [00:40<01:33,  1.30it/s]

✅ Downloaded: nse_annual_reports\Ajanta_Pharmaceuticals_Ltd_AR\AR.pdf
⚠️ Skipping Ajanta Pharmaceuticals Ltd. due to unexpected error: [WinError 183] Cannot create a file when that file already exists: 'nse_annual_reports\\Ajanta_Pharmaceuticals_Ltd_AR\\AR.pdf' -> 'nse_annual_reports\\Ajanta_Pharmaceuticals_Ltd_AR\\Ajanta_Pharmaceuticals_Ltd_AR_25980_AJANTPHARM_2023_2024_0609202416057.pdf'
 Skipping Akums Drugs and Pharmaceuticals Ltd.,already processed


 33%|██████████████████████████▋                                                      | 59/179 [00:45<01:47,  1.12it/s]

✅ Downloaded: nse_annual_reports\Alembic_Pharmaceuticals_Ltd_AR\AR.pdf
⚠️ Skipping Alembic Pharmaceuticals Ltd. due to unexpected error: [WinError 183] Cannot create a file when that file already exists: 'nse_annual_reports\\Alembic_Pharmaceuticals_Ltd_AR\\AR.pdf' -> 'nse_annual_reports\\Alembic_Pharmaceuticals_Ltd_AR\\Alembic_Pharmaceuticals_Ltd_AR_24161_APLLTD_2023_2024_21062024172948.pdf'


 34%|███████████████████████████▏                                                     | 60/179 [00:52<02:20,  1.18s/it]

✅ Downloaded: nse_annual_reports\Alkem_Laboratories_Ltd_AR\AR.pdf
⚠️ Skipping Alkem Laboratories Ltd. due to unexpected error: [WinError 183] Cannot create a file when that file already exists: 'nse_annual_reports\\Alkem_Laboratories_Ltd_AR\\AR.pdf' -> 'nse_annual_reports\\Alkem_Laboratories_Ltd_AR\\Alkem_Laboratories_Ltd_AR_24674_ALKEM_2023_2024_2607202416257.pdf'


 34%|███████████████████████████▏                                                     | 60/179 [00:58<01:56,  1.02it/s]


KeyboardInterrupt: 

In [None]:
try to make it such that if os error arises it ignores that company and goes to next company