# In [None]:
import pandas as pd
import requests
import time
import random
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import tweepy
import os

# Read the CSV that lists the selected 250 NSE stocks
df_selected_stocks = pd.read_csv(filepath_or_buffer="top_250_stocks.csv")

# Make sure the output folder exists before any data is written into it
os.makedirs(name="long_term_data", exist_ok=True)

# Function to introduce random delays
def wait(min_seconds: int = 10, max_seconds: int = 12) -> None:
    """Sleep for a random whole number of seconds and announce it on stdout.

    The defaults reproduce the 10-12 second pause used between scraping
    requests; callers may pass other bounds (e.g. shorter ones for a dry run).

    Args:
        min_seconds: Smallest delay that may be chosen (inclusive).
        max_seconds: Largest delay that may be chosen (inclusive).

    Raises:
        ValueError: If ``min_seconds`` is greater than ``max_seconds``
            (raised by ``random.randint``).
    """
    delay = random.randint(min_seconds, max_seconds)
    print(f"Sleeping for {delay} seconds to avoid detection...")
    time.sleep(delay)

# ---------------- STEP 1: Load Kaggle Financial Sentiment Dataset ---------------- #

def load_kaggle_dataset(kaggle_dataset_path="./long_term_data/data.csv"):
    """Load the Kaggle financial sentiment CSV into a DataFrame.

    Args:
        kaggle_dataset_path: Location of the CSV file. Defaults to the path
            this script has always read from.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the file is missing
        or cannot be read/parsed. A message is printed in every case.
    """
    try:
        df = pd.read_csv(kaggle_dataset_path)
    except FileNotFoundError:
        print("❌ Kaggle dataset not found. Please download it manually.")
        return None
    except (OSError, ValueError) as exc:
        # pandas' EmptyDataError / ParserError and UnicodeDecodeError are all
        # ValueError subclasses; OSError covers permission and other I/O
        # failures. Report them distinctly instead of claiming "not found".
        print(f"❌ Kaggle dataset could not be read: {exc}")
        return None

    print("✅ Kaggle dataset loaded successfully!")
    return df

# Run STEP 1 right away; the result is None when the dataset could not be loaded
df_kaggle = load_kaggle_dataset()

# ---------------- STEP 2: Scrape NSE Company Filings for Long-Term Sentiment ---------------- #

# A single User-Agent string is drawn here and reused for every request
user_agent = UserAgent()
headers = {}
headers["User-Agent"] = user_agent.random

def scrape_nse_filings(stock_name):
    """Scrape up to ten corporate-filing entries for one company from the NSE site.

    Args:
        stock_name: Company identifier that is inserted into the filings URL.

    Returns:
        A list of dicts with the keys "Company Name", "Date", "Title" and
        "Link". The list is empty when the request fails, the server answers
        with an HTTP error status, or no well-formed report items are found.
        Errors are printed, never raised.
    """
    # Function-scope import keeps this block self-contained (stdlib only).
    from urllib.parse import quote

    # Percent-encode the name so characters such as "/", "?" or "#" cannot
    # change the structure of the URL.
    encoded_name = quote(str(stock_name), safe="")
    url = f"https://www.nseindia.com/companies-listing/corporate-filings/{encoded_name}"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Report HTTP errors (403, 404, ...) instead of silently parsing an
        # error page as if it contained zero filings.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        filings = soup.find_all("div", class_="report-item")

        filings_data = []
        # First 10 report items in page order (presumably the most recent
        # ones; verify against the live page layout).
        for report in filings[:10]:
            date_tag = report.find("span", class_="date")
            title_tag = report.find("h3")
            link_tag = report.find("a")
            link = link_tag.get("href") if link_tag is not None else None
            if date_tag is None or title_tag is None or link is None:
                # Skip a malformed item rather than losing every filing
                # already collected for this company.
                continue
            filings_data.append({
                "Company Name": stock_name,
                "Date": date_tag.text.strip(),
                "Title": title_tag.text.strip(),
                "Link": link,
            })
        return filings_data
    except Exception as e:
        # Best-effort scraper: one failing company must not abort the run.
        print(f"Error scraping NSE filings for {stock_name}: {e}")
        return []

# Scrape every selected company, pausing between consecutive requests.
all_filings = []
for position, stock in enumerate(df_selected_stocks["Company Name"]):
    if position:
        # Delay only between requests; no request follows the last company,
        # so no pause is needed after it.
        wait()
    all_filings.extend(scrape_nse_filings(stock))

# Explicit columns guarantee a header row in the CSV even when nothing was
# scraped; for non-empty results they match the dict keys and their order.
df_filings = pd.DataFrame(
    all_filings,
    columns=["Company Name", "Date", "Title", "Link"],
)
df_filings.to_csv("long_term_data/nse_filings.csv", index=False)

print("✅ NSE Filings Data Saved Successfully!")

# ---------------- FINAL OUTPUT ---------------- #
print("\n✅ Long-Term Sentiment Data Collection Completed Successfully!")
