In [1]:
import requests
import pandas as pd
import time
import random
from bs4 import BeautifulSoup
from GoogleNews import GoogleNews
from selenium import webdriver
from fake_useragent import UserAgent
import tweepy

# Input: CSV holding the previously selected top 250 stocks
stocks_csv = "top_250_stocks.csv"
df_selected_stocks = pd.read_csv(stocks_csv)

# Output folder for every file saved below; no error if it already exists
import os
output_dir = "sentiment_data"
os.makedirs(output_dir, exist_ok=True)

# Function to introduce random delays
def wait(min_seconds=5, max_seconds=15):
    """Pause for a random whole number of seconds and announce the pause.

    Args:
        min_seconds: Shortest delay that may be picked (inclusive).
        max_seconds: Longest delay that may be picked (inclusive).

    Raises:
        ValueError: If ``min_seconds`` is greater than ``max_seconds``
            (raised by ``random.randint``).
    """
    delay = random.randint(min_seconds, max_seconds)
    print(f"Sleeping for {delay} seconds to avoid rate limits...")
    time.sleep(delay)

# ---------------- STEP 1: FETCH NEWS FROM NEWS API ---------------- #

# SECURITY NOTE(review): an API key is committed in this file. Prefer setting
# the NEWS_API_KEY environment variable; the literal below is kept only as a
# backward-compatible fallback and should be rotated and removed.
NEWS_API_KEY = os.environ.get("NEWS_API_KEY", "1dd6845e05bb4e86bc7f87d8a3544ba0")

def fetch_news_api(stock_name):
    """Fetch English news articles about one company from the NewsAPI endpoint.

    Args:
        stock_name: Company name used both as the search query and to look up
            the ISIN code in ``df_selected_stocks``.

    Returns:
        A list of dicts with the keys "Company Name", "ISIN Code", "Date",
        "News Title", "Source" and "Link". Empty when the request fails,
        the status code is not 200, or no articles are returned.

    Raises:
        IndexError: If articles were returned but ``stock_name`` has no row
            in ``df_selected_stocks``.
    """
    try:
        # Passing the query via ``params`` URL-encodes names containing
        # spaces, "&", etc.; the timeout keeps a stalled connection from
        # blocking the whole run.
        response = requests.get(
            "https://newsapi.org/v2/everything",
            params={
                "q": stock_name,
                "language": "en",
                "sortBy": "publishedAt",
                "apiKey": NEWS_API_KEY,
            },
            timeout=30,
        )
    except requests.RequestException as e:
        print(f"Error fetching news for {stock_name}: {e}")
        return []

    if response.status_code != 200:
        print(f"Error fetching news for {stock_name}: {response.status_code}")
        return []

    articles = response.json().get("articles") or []
    if not articles:
        return []

    # The ISIN is the same for every article, so look it up once.
    isin_code = df_selected_stocks.loc[
        df_selected_stocks["Company Name"] == stock_name, "ISIN Code"
    ].values[0]

    return [
        {
            "Company Name": stock_name,
            "ISIN Code": isin_code,
            "Date": article["publishedAt"],
            "News Title": article["title"],
            "Source": article["source"]["name"],
            "Link": article["url"],
        }
        for article in articles
    ]

# Loop through selected stocks and fetch news
all_news = []
try:
    for stock in df_selected_stocks["Company Name"]:
        all_news.extend(fetch_news_api(stock))
        wait()
finally:
    # Save news data in a ``finally`` so that an interrupted run (e.g. a
    # KeyboardInterrupt during the long sleeps) still keeps what was fetched.
    # Explicit columns give the CSV a header even when nothing was collected.
    df_news = pd.DataFrame(
        all_news,
        columns=["Company Name", "ISIN Code", "Date", "News Title", "Source", "Link"],
    )
    df_news.to_csv("sentiment_data/long_term_news_data.csv", index=False)
print("✅ Long-Term News Data Saved Successfully!")

# ---------------- STEP 2: SCRAPE NSE COMPANY FILINGS (With User Agent) ---------------- #

# Build a headless Chrome driver. The user-agent string is picked at random
# once, here, and then reused for every page this driver loads.
options = webdriver.ChromeOptions()
for chrome_flag in ("--headless", f"user-agent={UserAgent().random}"):
    options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=options)

def get_company_filings(stock_name):
    """Scrape up to ten filing entries for one company from its NSE page.

    Loads the page with the module-level Selenium ``driver``, waits a random
    delay, then parses ``div.report-item`` elements from the page source.

    Args:
        stock_name: Company name used in the page URL and to look up the
            ISIN code in ``df_selected_stocks``.

    Returns:
        A list of dicts with the keys "Company Name", "ISIN Code", "Date",
        "Title" and "Link". Empty when nothing is found or any error occurs
        (the error is printed, not raised).
    """
    url = f"https://www.nseindia.com/companies-listing/corporate-filings/{stock_name}"

    try:
        driver.get(url)
        wait()  # Random delay

        soup = BeautifulSoup(driver.page_source, "html.parser")
        reports = soup.find_all("div", class_="report-item")
        if not reports:
            return []

        # The ISIN is the same for every filing, so look it up once.
        isin_code = df_selected_stocks.loc[
            df_selected_stocks["Company Name"] == stock_name, "ISIN Code"
        ].values[0]

        filings = []
        for report in reports[:10]:  # Only the first 10 entries on the page
            date_tag = report.find("span", class_="date")
            title_tag = report.find("h3")
            link_tag = report.find("a")
            # Skip an incomplete entry instead of letting it raise and
            # discard every other filing for this company.
            if (
                date_tag is None
                or title_tag is None
                or link_tag is None
                or link_tag.get("href") is None
            ):
                continue
            filings.append({
                "Company Name": stock_name,
                "ISIN Code": isin_code,
                "Date": date_tag.text.strip(),
                "Title": title_tag.text.strip(),
                "Link": link_tag["href"],
            })

        return filings
    except Exception as e:
        print(f"Error fetching filings for {stock_name}: {e}")
        return []

# Get filings for selected stocks
all_filings = []
try:
    for stock in df_selected_stocks["Company Name"]:
        all_filings.extend(get_company_filings(stock))
        wait()
finally:
    # Always shut the browser down and keep whatever was scraped, even when
    # the loop is interrupted; otherwise the Chrome process is left running
    # and the partial results are lost.
    try:
        driver.quit()
    finally:
        # Save company filings data. Explicit columns give the CSV a header
        # even when nothing was collected.
        df_filings = pd.DataFrame(
            all_filings,
            columns=["Company Name", "ISIN Code", "Date", "Title", "Link"],
        )
        df_filings.to_csv("sentiment_data/historical_company_filings.csv", index=False)
print("✅ Company Filings Data Saved Successfully!")

# ---------------- STEP 3: FETCH SOCIAL MEDIA SENTIMENT FROM TWITTER ---------------- #

# Twitter API Credentials (Get from developer.twitter.com)
# Read from the TWITTER_BEARER_TOKEN environment variable when it is set, so
# the real token does not have to be written into this file; the placeholder
# remains the fallback.
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "your_bearer_token")

client = tweepy.Client(bearer_token=BEARER_TOKEN)

def fetch_tweets(stock_symbol):
    """Fetch up to 100 recent English, non-retweet tweets mentioning a stock.

    Args:
        stock_symbol: Text placed in the search query and used to look up the
            ISIN code by matching the "Company Name" column of
            ``df_selected_stocks``.

    Returns:
        A list of dicts with the keys "Company Name", "ISIN Code", "Date" and
        "Tweet". Empty when no tweets match or any error occurs (the error is
        printed, not raised).
    """
    query = f"{stock_symbol} stock -is:retweet lang:en"

    try:
        # ``created_at`` is not part of the default tweet payload; it must be
        # requested explicitly or the "Date" field would always be None.
        tweets = client.search_recent_tweets(
            query=query,
            tweet_fields=["created_at"],
            max_results=100,
        )

        # ``data`` is None when the search matched nothing; that is an empty
        # result, not an error.
        if not tweets.data:
            return []

        # The ISIN is the same for every tweet, so look it up once.
        isin_code = df_selected_stocks.loc[
            df_selected_stocks["Company Name"] == stock_symbol, "ISIN Code"
        ].values[0]

        return [
            {
                "Company Name": stock_symbol,
                "ISIN Code": isin_code,
                "Date": tweet.created_at,
                "Tweet": tweet.text,
            }
            for tweet in tweets.data
        ]
    except Exception as e:
        print(f"Error fetching tweets for {stock_symbol}: {e}")
        return []

# Fetch tweets for selected stocks
all_tweets = []
try:
    for stock in df_selected_stocks["Company Name"]:
        all_tweets.extend(fetch_tweets(stock))
        wait()
finally:
    # Save Twitter sentiment data in a ``finally`` so that an interrupted run
    # still keeps what was fetched. Explicit columns give the CSV a header
    # even when nothing was collected.
    df_tweets = pd.DataFrame(
        all_tweets,
        columns=["Company Name", "ISIN Code", "Date", "Tweet"],
    )
    df_tweets.to_csv("sentiment_data/long_term_twitter_sentiment.csv", index=False)
print("✅ Long-Term Twitter Sentiments Saved Successfully!")

# ---------------- FINAL OUTPUT ---------------- #
print("\n✅ Sentiment Analysis Data Collection Completed Successfully!")


Sleeping for 7 seconds to avoid rate limits...
Sleeping for 13 seconds to avoid rate limits...
Sleeping for 7 seconds to avoid rate limits...
Sleeping for 14 seconds to avoid rate limits...
Sleeping for 12 seconds to avoid rate limits...
Sleeping for 6 seconds to avoid rate limits...
Sleeping for 8 seconds to avoid rate limits...
Sleeping for 6 seconds to avoid rate limits...
Sleeping for 10 seconds to avoid rate limits...
Sleeping for 14 seconds to avoid rate limits...
Sleeping for 15 seconds to avoid rate limits...
Sleeping for 7 seconds to avoid rate limits...
Sleeping for 6 seconds to avoid rate limits...
Sleeping for 5 seconds to avoid rate limits...
Sleeping for 10 seconds to avoid rate limits...
Sleeping for 6 seconds to avoid rate limits...
Sleeping for 14 seconds to avoid rate limits...
Sleeping for 8 seconds to avoid rate limits...
Sleeping for 10 seconds to avoid rate limits...
Sleeping for 10 seconds to avoid rate limits...
Sleeping for 10 seconds to avoid rate limits...
Sl

KeyboardInterrupt: 