In [None]:
import os
import time
import random
import json
import requests
import pandas as pd
import zipfile
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm  # Progress bar

# --- User-Agent List ---
# Desktop Chrome User-Agent strings (Windows, macOS and Linux variants).
# Callers pick one with random.choice() so that requests do not always
# advertise the same browser signature.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
]

# --- Step 1: Launch Browser and Extract Cookies ---
def get_nse_cookies():
    """Open the NSE annual-reports page in Chrome and return its cookies.

    A real browser session is started (with the usual automation markers
    disabled and a randomly chosen User-Agent), the page is loaded and
    scrolled once, and the cookies the site set are collected.

    Returns:
        dict: Mapping of cookie name to cookie value.

    The browser is always closed, even if loading the page or reading the
    cookies raises.
    """
    options = Options()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--start-maximized")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument(f"user-agent={random.choice(USER_AGENTS)}")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    nse_url = "https://www.nseindia.com/companies-listing/corporate-filings-annual-reports"
    try:
        driver.get(nse_url)

        # Randomised pauses and a page-down key press between load and
        # cookie collection, so the visit is not an instant load-and-leave.
        time.sleep(random.randint(3, 6))

        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN)
        time.sleep(random.uniform(2, 5))

        return {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
    finally:
        # Without this, any failure above would leave a Chrome/chromedriver
        # process running.
        driver.quit()

# --- Step 2: Load Stock List ---
# The CSV is read relative to the current working directory. The main loop
# below reads the "Company Name" and "Symbol" columns from it.
df_stocks = pd.read_csv("../data/top_250_stocks.csv")  

# Root directory under which one sub-folder per company is created.
BASE_FOLDER = "nse_annual_reports"
os.makedirs(BASE_FOLDER, exist_ok=True)

# --- Step 3: Create Session Using Extracted Cookies ---
# Launches a browser once (see get_nse_cookies) and copies the resulting
# cookies into a requests.Session that all later HTTP calls share.
cookies = get_nse_cookies()

session = requests.Session()
for name, value in cookies.items():
    session.cookies.set(name, value)

# Headers sent with every request made through safe_request().
# NOTE(review): this User-Agent is drawn independently of the one used by the
# browser that obtained the cookies, so the two may differ — confirm the site
# does not tie cookies to the User-Agent.
headers = {
    "accept": "*/*",
    "user-agent": random.choice(USER_AGENTS),
    "referer": "https://www.nseindia.com/companies-listing/corporate-filings-annual-reports",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin"
}

print("✅ Cookies extracted and session initialized successfully.")

# --- Utility Functions ---
def clean_filename(name, max_length=50):
    """Turn a company name into a filesystem-friendly token.

    Every character that is not a word character, whitespace or a hyphen is
    dropped, surrounding whitespace is trimmed, each space becomes an
    underscore, and the result is cut to at most ``max_length`` characters.
    """
    stripped = re.sub(r'[^\w\s-]', '', name).strip()
    # Splitting on a single space (not on whitespace runs) keeps one
    # underscore per original space, so double spaces give "__".
    underscored = "_".join(stripped.split(" "))
    return underscored[:max_length]

def extract_year_from_filename(filename):
    """Extract the year from the filename using regex."""
    match = re.search(r'(\d{4}_\d{4})', filename)  
    if match:
        return match.group(1)

    match = re.search(r'(\d{4})', filename)  
    return match.group(1) if match else "Unknown_Year"

def safe_request(url, max_retries=5):
    """GET ``url`` through the shared session, retrying on failure.

    Args:
        url: Address to fetch.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The streamed ``requests.Response`` on HTTP 200, otherwise ``None``
        once all attempts are used up. The caller is responsible for
        consuming or closing a returned response.

    Wait schedule: after the n-th failed attempt the function sleeps
    ``2 ** n`` seconds (2, 4, 8, ...) before the next one. HTTP 403/429
    (server blocking or rate limiting) additionally waits
    ``30 + 10 * (n - 1)`` seconds first. No back-off sleep is done after
    the final attempt, since nothing follows it.
    """
    for attempt in range(max_retries):
        try:
            response = session.get(url, headers=headers, timeout=15, stream=True)
        except requests.exceptions.RequestException as e:
            print(f"❌ Request failed: {e}. Retrying...")
        else:
            if response.status_code == 200:
                return response

            status = response.status_code
            # With stream=True the connection is only handed back to the
            # pool once the body is consumed or the response is closed.
            response.close()

            if status in (403, 429):
                wait_time = 30 + (10 * attempt)
                print(f"⚠️ Server blocking requests. Waiting {wait_time}s before retrying...")
                time.sleep(wait_time)
            else:
                print(f"❌ Unexpected HTTP {status}. Retrying...")

        # Exponential back-off, skipped when there is no further attempt.
        if attempt + 1 < max_retries:
            time.sleep(2 ** (attempt + 1))

    print(f"❌ Failed after {max_retries} attempts.")
    return None

# def download_and_process_report(company_name, report_url):
#     """Download, extract, and save each report before moving to the next."""
#     clean_company_name = clean_filename(company_name)
#     folder_path = os.path.join(BASE_FOLDER, f"{clean_company_name}_AR")
#     os.makedirs(folder_path, exist_ok=True)

#     filename = report_url.split("/")[-1]
    # financial_year = extract_year_from_filename(filename)
    # base_filename = clean_filename(filename.split("_")[0], max_length=20)

    # unique_filename = f"{clean_company_name}_{financial_year}.pdf"
    # file_path = os.path.join(folder_path, unique_filename)

    # response = safe_request(report_url)
    # if response:
    #     with open(file_path, "wb") as file:
    #         for chunk in response.iter_content(chunk_size=1024):
    #             file.write(chunk)
    #     print(f"✅ Downloaded and saved: {file_path}")

    #     if filename.endswith(".zip"):
    #         try:
    #             with zipfile.ZipFile(file_path, 'r') as zip_ref:
    #                 zip_ref.extractall(folder_path)
    #             print(f"📂 Extracted: {file_path} to {folder_path}")

    #             for file in os.listdir(folder_path):
    #                 if file.endswith(".pdf"):
    #                     extracted_pdf_path = os.path.join(folder_path, file)

    #                     extracted_year = extract_year_from_filename(file)
    #                     if extracted_year == "Unknown_Year":
    #                         extracted_year = financial_year  

    #                     new_pdf_path = os.path.join(folder_path, f"{clean_company_name}_{extracted_year}.pdf")

    #                     counter = 1
    #                     while os.path.exists(new_pdf_path):
    #                         new_pdf_path = os.path.join(folder_path, f"{clean_company_name}_{extracted_year}_{counter}.pdf")
    #                         counter += 1

    #                     os.rename(extracted_pdf_path, new_pdf_path)
    #                     print(f"📄 Renamed PDF: {new_pdf_path}")

    #             os.remove(file_path)

    #         except zipfile.BadZipFile:
    #             print(f"❌ Corrupted ZIP file: {file_path}")

    # else:
    #     print(f"❌ Failed to download: {report_url}")
def _next_free_path(folder, stem, suffix=".pdf"):
    """Return ``folder/stem+suffix``, appending ``_1``, ``_2``, ... until the path is unused."""
    candidate = os.path.join(folder, f"{stem}{suffix}")
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(folder, f"{stem}_{counter}{suffix}")
        counter += 1
    return candidate


def download_and_process_report(company_name, report_url):
    """Download one annual report and store it in the company's folder.

    Args:
        company_name: Display name of the company; sanitised for use in
            folder and file names.
        report_url: Direct URL of the report (a PDF, or a ZIP of PDFs).

    The file is saved as ``<BASE_FOLDER>/<company>_AR/<company>_<year>.<ext>``
    where the year is taken from the URL's file name. ZIP archives are
    unpacked into an ``extracted_reports`` sub-folder; every PDF found
    there (at any depth) is moved next to the other reports under a
    ``<company>_<year>[_n].pdf`` name, and the archive is then deleted.
    A corrupted archive is reported and left on disk.

    Returns:
        None. Failures are reported via ``print``.
    """
    clean_company_name = clean_filename(company_name)
    folder_path = os.path.join(BASE_FOLDER, f"{clean_company_name}_AR")
    os.makedirs(folder_path, exist_ok=True)

    filename = report_url.split("/")[-1]
    financial_year = extract_year_from_filename(filename)
    is_zip = filename.lower().endswith(".zip")

    # Archives keep a .zip name so the first extracted PDF does not collide
    # with the archive itself and get a needless "_1" suffix.
    extension = ".zip" if is_zip else ".pdf"
    file_path = os.path.join(folder_path, f"{clean_company_name}_{financial_year}{extension}")

    response = safe_request(report_url)
    if not response:
        print(f"❌ Failed to download: {report_url}")
        return

    try:
        with open(file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
    finally:
        # The response is streamed; closing releases its connection even if
        # writing fails part-way.
        response.close()
    print(f"✅ Downloaded and saved: {file_path}")

    if not is_zip:
        return

    # Handle ZIP extraction into a separate sub-folder.
    extracted_folder = os.path.join(folder_path, "extracted_reports")
    os.makedirs(extracted_folder, exist_ok=True)
    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_folder)
    except zipfile.BadZipFile:
        print(f"❌ Corrupted ZIP file: {file_path}")
        return
    print(f"📂 Extracted: {file_path} to {extracted_folder}")

    # Walk recursively: archives may place their PDFs inside sub-folders.
    for root, _dirs, files in os.walk(extracted_folder):
        for entry in files:
            if not entry.lower().endswith(".pdf"):
                continue

            # Prefer the year in the PDF's own name; fall back to the
            # year taken from the archive name.
            extracted_year = extract_year_from_filename(entry)
            if extracted_year == "Unknown_Year":
                extracted_year = financial_year

            new_pdf_path = _next_free_path(folder_path, f"{clean_company_name}_{extracted_year}")
            os.rename(os.path.join(root, entry), new_pdf_path)
            print(f"📄 Renamed and saved PDF: {new_pdf_path}")

    os.remove(file_path)  # Delete ZIP only after all extractions are done

def fetch_nse_reports(symbol, company_name):
    """Look up a company's annual reports on NSE and download each one.

    Args:
        symbol: NSE ticker symbol of the company.
        company_name: Issuer name, passed to the API and used for naming
            the downloaded files.

    The annual-reports API is queried, and every entry of the response's
    ``"data"`` list that carries a ``"fileName"`` URL is passed to
    ``download_and_process_report`` in turn. Problems (request failure,
    unparsable JSON, no reports) are reported via ``print``.

    Returns:
        None.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    # Percent-encode both values fully: names and symbols can contain "&"
    # (e.g. "Bajaj Holdings & Investment Ltd."), which would otherwise be
    # read as a query-parameter separator.
    encoded_symbol = quote(str(symbol), safe="")
    encoded_company_name = quote(str(company_name), safe="")
    nse_api_url = f"https://www.nseindia.com/api/annual-reports?index=equities&symbol={encoded_symbol}&issuer={encoded_company_name}"

    response = safe_request(nse_api_url)
    if not response:
        print(f"❌ Failed to fetch reports for {company_name} ({symbol})")
        return

    try:
        data = response.json()
    except ValueError:
        # json.JSONDecodeError and the decode errors requests may raise are
        # all ValueError subclasses.
        print(f"❌ Error parsing JSON response for {company_name} ({symbol})")
        return

    reports = data.get("data") if isinstance(data, dict) else None
    if not reports:
        print(f"⚠️ No reports found for {company_name} ({symbol})")
        return

    for report in reports:
        report_url = report.get("fileName") if isinstance(report, dict) else None
        if not report_url:
            # One malformed entry should not abort the remaining reports.
            print(f"⚠️ Report entry without a file URL for {company_name} ({symbol}); skipping it")
            continue

        download_and_process_report(company_name, report_url)

# Companies finished in earlier runs are listed one per line in this file,
# so an interrupted run can resume where it stopped.
PROGRESS_FILE = "./progress.txt"
completed_companies = set()
if os.path.exists(PROGRESS_FILE):
    with open(PROGRESS_FILE, "r") as f:
        completed_companies = {line.strip() for line in f}

for _, row in tqdm(df_stocks.iterrows(), total=len(df_stocks)):
    company_name = row["Company Name"]
    symbol = row["Symbol"]
    if company_name not in completed_companies:
        try:
            fetch_nse_reports(symbol, company_name)
            # Record the company as soon as its fetch call returns.
            with open(PROGRESS_FILE, "a") as f:
                f.write(company_name + "\n")
        except Exception as e:
            # Best effort: report the problem and carry on with the next company.
            print(f"⚠️ Skipping {company_name} due to error: {e}")
    else:
        print(f"Skipping {company_name}, already processed")

print("\n✅ All reports downloaded successfully!")


✅ Cookies extracted and session initialized successfully.


  0%|                                                                                          | 0/179 [00:00<?, ?it/s]

Skipping 360 ONE WAM Ltd., already processed
Skipping AU Small Finance Bank Ltd., already processed
Skipping Aadhar Housing Finance Ltd., already processed
Skipping Aavas Financiers Ltd., already processed
Skipping Aditya Birla Capital Ltd., already processed
Skipping Aditya Birla Sun Life AMC Ltd., already processed
Skipping Anand Rathi Wealth Ltd., already processed
Skipping Angel One Ltd., already processed
Skipping Aptus Value Housing Finance India Ltd., already processed
Skipping Axis Bank Ltd., already processed
Skipping BSE Ltd., already processed
Skipping Bajaj Finance Ltd., already processed
Skipping Bajaj Finserv Ltd., already processed
Skipping Bajaj Holdings & Investment Ltd., already processed
Skipping Bandhan Bank Ltd., already processed
Skipping Bank of Baroda, already processed
Skipping Bank of India, already processed
Skipping Bank of Maharashtra, already processed
Skipping CRISIL Ltd., already processed
Skipping Can Fin Homes Ltd., already processed
Skipping Canara Ba

 42%|██████████████████████████████████▍                                              | 76/179 [01:58<02:40,  1.56s/it]

✅ Downloaded and saved: nse_annual_reports\Glaxosmithkline_Pharmaceuticals_Ltd_AR\Glaxosmithkline_Pharmaceuticals_Ltd_2009_2009.pdf
📂 Extracted: nse_annual_reports\Glaxosmithkline_Pharmaceuticals_Ltd_AR\Glaxosmithkline_Pharmaceuticals_Ltd_2009_2009.pdf to nse_annual_reports\Glaxosmithkline_Pharmaceuticals_Ltd_AR\extracted_reports
📄 Renamed and saved PDF: nse_annual_reports\Glaxosmithkline_Pharmaceuticals_Ltd_AR\Glaxosmithkline_Pharmaceuticals_Ltd_2009_2009_1.pdf
✅ Downloaded and saved: nse_annual_reports\Glenmark_Pharmaceuticals_Ltd_AR\Glenmark_Pharmaceuticals_Ltd_2023_2024.pdf
✅ Downloaded and saved: nse_annual_reports\Glenmark_Pharmaceuticals_Ltd_AR\Glenmark_Pharmaceuticals_Ltd_2022_2023.pdf
📂 Extracted: nse_annual_reports\Glenmark_Pharmaceuticals_Ltd_AR\Glenmark_Pharmaceuticals_Ltd_2022_2023.pdf to nse_annual_reports\Glenmark_Pharmaceuticals_Ltd_AR\extracted_reports
📄 Renamed and saved PDF: nse_annual_reports\Glenmark_Pharmaceuticals_Ltd_AR\Glenmark_Pharmaceuticals_Ltd_2022_2023_1.p