


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import requests
import pandas as pd
import time
import random
from bs4 import BeautifulSoup
from googlesearch import search  # Alternative to GoogleNews API
from fake_useragent import UserAgent
import tweepy
from selenium import webdriver

# The output directory below is created with the standard-library os module.
import os

# Read the list of the 250 previously selected stocks; every later step
# iterates over its "Company Name" column.
df_selected_stocks = pd.read_csv(
    "../data/top_250_stocks.csv",
)

# Make sure the folder that receives all generated CSV files exists
# (no error if it is already there).
os.makedirs(
    "sentiment_data",
    exist_ok=True,
)

# Function to introduce long random delays
def wait(min_seconds: int = 30, max_seconds: int = 60) -> None:
    """Sleep for a random whole number of seconds to throttle requests.

    Args:
        min_seconds: Smallest delay that may be drawn (inclusive).
            Defaults to 30.
        max_seconds: Largest delay that may be drawn (inclusive).
            Defaults to 60.

    Raises:
        ValueError: If ``min_seconds`` is greater than ``max_seconds``
            (raised by ``random.randint``).
    """
    # randint is inclusive on both ends, so the defaults reproduce the
    # original fixed 30-60 second window.
    delay = random.randint(min_seconds, max_seconds)
    print(f"Sleeping for {delay} seconds to avoid detection...")
    time.sleep(delay)

# ---------------- STEP 1: FETCH FINANCIAL NEWS USING GOOGLE SEARCH ---------------- #

def fetch_google_news(stock_name):
    """Collect page titles of the top Google-search news hits for a company.

    Runs a Google search restricted to moneycontrol.com,
    economictimes.indiatimes.com and bloombergquint.com, downloads each
    result page and records the text of its ``<title>`` tag.

    Args:
        stock_name: Company name as it appears in the "Company Name" column
            of ``df_selected_stocks``.

    Returns:
        A list of dicts with the keys "Company Name", "ISIN Code",
        "News URL" and "News Title", one per result page that could be
        downloaded and parsed. An empty list if the ISIN lookup or the
        search itself fails.
    """
    query = f"{stock_name} stock news site:moneycontrol.com OR site:economictimes.indiatimes.com OR site:bloombergquint.com"
    headers = {"User-Agent": UserAgent().random}  # Use random user-agent

    try:
        # The ISIN does not depend on the link, so look it up once instead
        # of once per search result.
        isin_code = df_selected_stocks.loc[
            df_selected_stocks["Company Name"] == stock_name, "ISIN Code"
        ].values[0]
        # Search Google for the top 5 news links
        search_results = list(search(query, num_results=5, lang="en"))
    except Exception as e:
        print(f"Error fetching news for {stock_name}: {e}")
        return []

    news_data = []
    for link in search_results:
        # Handle failures per link: one unreachable article must not throw
        # away the articles already collected for this company.
        try:
            response = requests.get(link, headers=headers, timeout=10)
            # Skip HTTP error pages instead of recording their titles
            # (e.g. "403 Forbidden") as if they were news.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            print(f"Error fetching news for {stock_name} from {link}: {e}")
            continue

        # soup.title.string is None for an empty title or one with nested
        # markup, so fall back to the placeholder in that case as well.
        title = ""
        if soup.title and soup.title.string:
            title = soup.title.string.strip()

        news_data.append({
            "Company Name": stock_name,
            "ISIN Code": isin_code,
            "News URL": link,
            "News Title": title or "No Title",
        })

    return news_data

# Gather the news rows of every selected company into one flat list,
# pausing after each company to stay under the rate limit (HTTP 429).
all_news = []
for stock in df_selected_stocks["Company Name"]:
    company_news = fetch_google_news(stock)
    all_news += company_news
    wait()

# Persist the collected rows as CSV (without the DataFrame index).
df_news = pd.DataFrame(all_news)
df_news.to_csv(
    "sentiment_data/long_term_news_data.csv",
    index=False,
)
print("✅ Long-Term News Data Saved Successfully!")

# ---------------- STEP 2: SCRAPE NSE COMPANY FILINGS ---------------- #

# Configure one shared Chrome instance: headless (runs in the background)
# and with a randomly chosen user-agent string.
options = webdriver.ChromeOptions()
for _chrome_arg in ("--headless", f"user-agent={UserAgent().random}"):
    options.add_argument(_chrome_arg)

driver = webdriver.Chrome(options=options)

def get_company_filings(stock_name):
    """Scrape up to ten corporate-filing entries for a company from NSE.

    Loads the corporate-filings page with the shared Selenium ``driver``,
    waits, then parses every ``<div class="report-item">`` found in the
    rendered page.
    NOTE(review): the URL pattern and the ``report-item`` / ``date`` class
    names are taken as-is from the original code - confirm they match the
    live NSE page.

    Args:
        stock_name: Company name as it appears in the "Company Name" column
            of ``df_selected_stocks``; it is also inserted into the URL.

    Returns:
        A list of dicts with the keys "Company Name", "ISIN Code", "Date",
        "Title" and "Link". An empty list if the ISIN lookup or the page
        load fails.
    """
    url = f"https://www.nseindia.com/companies-listing/corporate-filings/{stock_name}"

    try:
        # The ISIN is the same for every report, so look it up once.
        isin_code = df_selected_stocks.loc[
            df_selected_stocks["Company Name"] == stock_name, "ISIN Code"
        ].values[0]

        driver.get(url)
        wait()  # Random delay

        soup = BeautifulSoup(driver.page_source, "html.parser")
        reports = soup.find_all("div", class_="report-item")
    except Exception as e:
        print(f"Error fetching filings for {stock_name}: {e}")
        return []

    filings = []
    # Only the first 10 report items (in page order) are kept.
    for report in reports[:10]:
        # A report missing its date span, heading or link is skipped on its
        # own rather than discarding every filing for the company.
        try:
            filings.append({
                "Company Name": stock_name,
                "ISIN Code": isin_code,
                "Date": report.find("span", class_="date").text.strip(),
                "Title": report.find("h3").text.strip(),
                "Link": report.find("a")["href"],
            })
        except (AttributeError, KeyError, TypeError) as e:
            print(f"Skipping malformed filing for {stock_name}: {e}")

    return filings

# Get filings for selected stocks
all_filings = []
try:
    for stock in df_selected_stocks["Company Name"]:
        all_filings.extend(get_company_filings(stock))
        wait()
finally:
    # Always shut the browser down, even when the long-running loop is
    # interrupted (e.g. KeyboardInterrupt during a sleep), so no headless
    # Chrome process is left behind.
    driver.quit()

# Save company filings data
df_filings = pd.DataFrame(all_filings)
df_filings.to_csv("sentiment_data/historical_company_filings.csv", index=False)
print("✅ Company Filings Data Saved Successfully!")

# ---------------- STEP 3: FETCH SOCIAL MEDIA SENTIMENT FROM TWITTER ---------------- #

# Twitter API credentials (get from developer.twitter.com).
# The token is read from the TWITTER_BEARER_TOKEN environment variable so the
# real secret never has to be pasted into this file; the original placeholder
# is kept as the fallback when the variable is not set.
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "your_bearer_token")

client = tweepy.Client(bearer_token=BEARER_TOKEN)

def fetch_tweets(stock_symbol):
    """Fetch up to 100 recent English, non-retweet tweets about a stock.

    Args:
        stock_symbol: Text placed in the search query. It is also matched
            against the "Company Name" column of ``df_selected_stocks`` to
            look up the ISIN, so callers pass the company name.

    Returns:
        A list of dicts with the keys "Company Name", "ISIN Code", "Date"
        and "Tweet". An empty list if nothing matches or if the ISIN lookup
        or the API request fails.
    """
    query = f"{stock_symbol} stock -is:retweet lang:en"

    try:
        # The ISIN is the same for every tweet, so look it up once.
        isin_code = df_selected_stocks.loc[
            df_selected_stocks["Company Name"] == stock_symbol, "ISIN Code"
        ].values[0]
        # The API returns only id and text by default; created_at must be
        # requested explicitly or the "Date" column is always None.
        tweets = client.search_recent_tweets(
            query=query,
            max_results=100,
            tweet_fields=["created_at"],
        )
    except Exception as e:
        print(f"Error fetching tweets for {stock_symbol}: {e}")
        return []

    # Response.data is None (not an empty list) when the search has no hits.
    return [
        {
            "Company Name": stock_symbol,
            "ISIN Code": isin_code,
            "Date": tweet.created_at,
            "Tweet": tweet.text,
        }
        for tweet in (tweets.data or [])
    ]

# Gather the tweet rows of every selected company into one flat list,
# pausing after each company.
all_tweets = []
for stock in df_selected_stocks["Company Name"]:
    company_tweets = fetch_tweets(stock)
    all_tweets += company_tweets
    wait()

# Persist the collected rows as CSV (without the DataFrame index).
df_tweets = pd.DataFrame(all_tweets)
df_tweets.to_csv(
    "sentiment_data/long_term_twitter_sentiment.csv",
    index=False,
)
print("✅ Long-Term Twitter Sentiments Saved Successfully!")

# ---------------- FINAL OUTPUT ---------------- #
print("\n✅ Sentiment Analysis Data Collection Completed Successfully!")


Error fetching news for 360 ONE WAM Ltd.: 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3D360%2BONE%2BWAM%2BLtd.%2Bstock%2Bnews%2Bsite%253Amoneycontrol.com%2BOR%2Bsite%253Aeconomictimes.indiatimes.com%2BOR%2Bsite%253Abloombergquint.com%26num%3D7%26hl%3Den%26start%3D0%26safe%3Dactive&hl=en&q=EgSVKD42GNWSmb4GIjACo9gWuXbsJkagf5cUJN4k93cArGH12sTn94yDMdiU8g6xxl_lsX1fqBBxEcVhF48yAXJaAUM
Sleeping for 43 seconds to avoid detection...
Error fetching news for AU Small Finance Bank Ltd.: 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3DAU%2BSmall%2BFinance%2BBank%2BLtd.%2Bstock%2Bnews%2Bsite%253Amoneycontrol.com%2BOR%2Bsite%253Aeconomictimes.indiatimes.com%2BOR%2Bsite%253Abloombergquint.com%26num%3D7%26hl%3Den%26start%3D0%26safe%3Dactive&hl=en&q=EgSVKD42GIKTmb4GIjDtGtPVi9wvLNcg1nCjonxINUunKQxe3O1LfMJOcGkY5nhXPgU_aa2B6G-4mD5h7WUyAXJaAUM
Sleeping f