In [5]:
import requests
import pandas as pd
import feedparser
from bs4 import BeautifulSoup
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import datetime
import tweepy
import pytz  # Import pytz for timezone handling

# Shared VADER sentiment analyzer, created once at module load and reused by
# analyze_sentiment() for every headline and tweet.
analyzer = SentimentIntensityAnalyzer()

# --- 1. Scrape News from MoneyControl ---
def get_moneycontrol_news(stock_name, start_date, end_date):
    """Scrape the MoneyControl tag page for a stock and label headline sentiment.

    Args:
        stock_name: Tag slug inserted into the MoneyControl tag URL
            (e.g. "tata-consultancy-services").
        start_date: Inclusive lower bound as a ``datetime.datetime``; may be
            naive or timezone-aware.
        end_date: Inclusive upper bound as a ``datetime.datetime``; may be
            naive or timezone-aware.

    Returns:
        A list of dicts with keys "title", "date", "link" and "sentiment",
        one per article whose date parses and falls inside the range. An
        empty list is returned when the page cannot be fetched.
    """
    base_url = f"https://www.moneycontrol.com/news/tags/{stock_name}.html"
    headers = {
        # The header name must be exactly "User-Agent"; a name with embedded
        # spaces is not a valid HTTP header field name.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(base_url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print(f"Error fetching MoneyControl news: {e}")
        return []
    if response.status_code != 200:
        print(f"Error fetching MoneyControl news: {response.status_code} - {response.text}")
        return []

    # Scraped dates carry no timezone, so compare on wall-clock time. Dropping
    # tzinfo from the bounds avoids "can't compare offset-naive and
    # offset-aware datetimes" when the caller passes aware datetimes.
    range_start = start_date.replace(tzinfo=None)
    range_end = end_date.replace(tzinfo=None)

    soup = BeautifulSoup(response.text, 'html.parser')

    news_data = []
    for article in soup.find_all('li', class_="clearfix"):
        heading = article.find("h2")
        anchor = article.find("a")
        date_tag = article.find("span", class_="date")
        # Not every <li class="clearfix"> is an article; skip items that lack
        # a headline, a link or a date instead of crashing on None.
        if heading is None or anchor is None or date_tag is None:
            continue

        link = anchor.get("href")
        if not link:
            continue

        title = heading.text.strip()
        date_str = date_tag.text.strip()

        # Convert date string to datetime object
        try:
            date = datetime.datetime.strptime(date_str, "%d %b %Y")
        except ValueError:
            continue  # Skip if date format is not as expected

        # Filter by date range
        if range_start <= date <= range_end:
            news_data.append({
                "title": title,
                "date": date_str,
                "link": link,
                "sentiment": analyze_sentiment(title),
            })

    return news_data

# --- 2. Fetch Economic Times RSS Feeds ---
def get_economic_times_news(start_date, end_date):
    """Fetch the Economic Times markets RSS feed and label headline sentiment.

    Args:
        start_date: Inclusive lower bound as a timezone-aware
            ``datetime.datetime``. Entry dates are parsed with ``%z`` and are
            therefore aware, so a naive bound would not be comparable.
        end_date: Inclusive upper bound, timezone-aware.

    Returns:
        A list of dicts with keys "title", "date", "link" and "sentiment",
        one per feed entry whose publication date parses and falls inside
        the range.
    """
    rss_url = "https://economictimes.indiatimes.com/markets/rssfeeds/1977021501.cms"
    feed = feedparser.parse(rss_url)

    news_data = []
    for entry in feed.entries:
        # Feed entries are not guaranteed to carry every element; attribute
        # access on a missing one raises AttributeError, so look them up
        # defensively and skip entries that cannot be dated or titled.
        date_str = entry.get("published")
        title = entry.get("title")
        if not date_str or not title:
            continue

        try:
            date = datetime.datetime.strptime(date_str, "%a, %d %b %Y %H:%M:%S %z")
        except ValueError:
            continue  # Skip if date format is not as expected

        # Filter by date range
        if start_date <= date <= end_date:
            news_data.append({
                "title": title,
                "date": date_str,
                "link": entry.get("link"),
                "sentiment": analyze_sentiment(title),
            })

    return news_data

# --- 3. Fetch Tweets for Stock Sentiment ---
def get_twitter_sentiment(stock_symbol):
    """Search recent English tweets about a stock and label their sentiment.

    The API bearer token is read from the ``TWITTER_BEARER_TOKEN`` environment
    variable so that the credential never lives in source control.

    Args:
        stock_symbol: Term placed at the front of the search query.

    Returns:
        A list of dicts with keys "tweet" and "sentiment". An empty list is
        returned when no token is configured, the API call fails, or the
        search yields no tweets.
    """
    import os  # local import: only this function needs the environment

    bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")
    if not bearer_token:
        print("Error fetching Twitter data: TWITTER_BEARER_TOKEN environment variable is not set")
        return []

    client = tweepy.Client(bearer_token)
    query = f"{stock_symbol} stock news lang:en -is:retweet"

    try:
        tweets = client.search_recent_tweets(query=query, max_results=10)
    except tweepy.TweepyException as e:
        print(f"Error fetching Twitter data: {e}")
        return []

    # tweets.data is falsy when the search matched nothing.
    if not tweets.data:
        return []

    return [
        {"tweet": tweet.text, "sentiment": analyze_sentiment(tweet.text)}
        for tweet in tweets.data
    ]

# --- 4. Perform Sentiment Analysis ---
def analyze_sentiment(text):
    """Classify *text* as "Positive", "Negative" or "Neutral".

    The label is derived from the ``compound`` value returned by the
    module-level VADER ``analyzer``: strictly above 0.05 is "Positive",
    strictly below -0.05 is "Negative", anything else is "Neutral".

    Args:
        text: The headline or tweet text to score.

    Returns:
        One of the strings "Positive", "Negative" or "Neutral".
    """
    # Only the VADER score decides the label; the TextBlob object that was
    # previously built here was never read, so it is no longer constructed.
    compound = analyzer.polarity_scores(text)['compound']

    if compound > 0.05:
        return "Positive"
    if compound < -0.05:
        return "Negative"
    return "Neutral"

# --- 5. Main Execution ---
def main(stock_name, start_date_str, end_date_str):
    """Collect news and tweet sentiment for one stock and write two CSV files.

    Args:
        stock_name: Identifier passed to the MoneyControl scraper and the
            Twitter search, and used as the prefix of both output file names.
        start_date_str: Start of the date range, formatted "YYYY-MM-DD".
        end_date_str: End of the date range, formatted "YYYY-MM-DD".

    Side effects:
        Prints progress messages and writes
        "<stock_name>_historical_news_sentiment.csv" and
        "<stock_name>_twitter_sentiment.csv" to the working directory.
    """
    # Both bounds are parsed as midnight and localized to Indian Standard Time.
    ist = pytz.timezone('Asia/Kolkata')
    start_date, end_date = (
        ist.localize(datetime.datetime.strptime(raw, "%Y-%m-%d"))
        for raw in (start_date_str, end_date_str)
    )

    print(f"Fetching historical news data for {stock_name} from {start_date_str} to {end_date_str}...\n")

    # Gather every source first: news sites, then Twitter.
    mc_rows = get_moneycontrol_news(stock_name, start_date, end_date)
    et_rows = get_economic_times_news(start_date, end_date)
    tweet_rows = get_twitter_sentiment(stock_name)

    # News from both sites goes into one table; tweets get their own.
    news_frame = pd.DataFrame(mc_rows + et_rows)
    tweet_frame = pd.DataFrame(tweet_rows)

    news_csv = f"{stock_name}_historical_news_sentiment.csv"
    twitter_csv = f"{stock_name}_twitter_sentiment.csv"
    news_frame.to_csv(news_csv, index=False)
    tweet_frame.to_csv(twitter_csv, index=False)

    print(f"✅ Data saved: {stock_name}_historical_news_sentiment.csv & {stock_name}_twitter_sentiment.csv")

# Run for TCS (change the arguments for other stocks). The first argument is
# used as the MoneyControl tag slug in the URL, as the Twitter search term and
# as the prefix of the output CSV file names; the other two are the start and
# end of the date range in "YYYY-MM-DD" form.
main("tata-consultancy-services", "2020-01-01", "2025-01-01")

Fetching historical news data for tata-consultancy-services from 2020-01-01 to 2025-01-01...

Error fetching MoneyControl news: 400 - <HTML><HEAD><TITLE>Error</TITLE></HEAD><BODY>
An error occurred while processing your request.<p>
Reference&#32;&#35;253&#46;c825c017&#46;1741128523&#46;1ed497b
<P>https&#58;&#47;&#47;errors&#46;edgesuite&#46;net&#47;253&#46;c825c017&#46;1741128523&#46;1ed497b</P>
</BODY></HTML>



TypeError: can't compare offset-naive and offset-aware datetimes