In [1]:
!export FINNHUB_API_KEY='cshqdo9r01qg5burvvogcshqdo9r01qg5burvvp0' # Get an API hey from https://finnhub.io/

In [2]:
import os
os.environ['FINNHUB_API_KEY'] = 'cshqdo9r01qg5burvvogcshqdo9r01qg5burvvp0'

In [3]:
import os
import finnhub
import pandas as pd
import re
from datetime import datetime, timedelta

class StockNewsExtractor:
    def __init__(self, api_key=None, output_dir='stock_news'):
        """
        Initialize the Stock News Extractor
        
        Args:
            api_key (str, optional): Finnhub API key. If not provided, 
                                     will look for environment variable.
            output_dir (str, optional): Directory to save news files
        """
        # Prioritize passed API key, then environment variable
        if api_key:
            self.api_key = api_key
        else:
            self.api_key = os.environ.get('FINNHUB_API_KEY')
        
        if not self.api_key:
            raise ValueError("No Finnhub API key found. Set FINNHUB_API_KEY environment variable or pass key directly.")
        
        self.output_dir = output_dir
        
        # Ensure output directories exist
        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(os.path.join(output_dir, 'summaries'), exist_ok=True)

    def _is_valid_news_article(self, article):
        """
        Validate if the article is a legitimate news piece
        
        Args:
            article (dict): News article dictionary
        
        Returns:
            bool: True if article appears to be valid, False otherwise
        """
        # Comprehensive ad and spam detection
        ad_keywords = [
            'advertisement', 'sponsored', 'promoted', 'advertisement content', 
            'paid content', 'sponsored content', 'this is an advertisement', 
            'ad by', 'ads by', 'sponsored by', 'promoted by',
            'zacks.com', 'financial research', 'stock market analysis'
        ]
        
        # Check for minimum required fields
        if not all(key in article for key in ['headline', 'summary', 'url']):
            return False
        
        # Convert text to lowercase for case-insensitive checking
        headline_lower = article['headline'].lower()
        summary_lower = article.get('summary', '').lower()
        
        # Advanced ad detection
        if any(keyword in headline_lower or keyword in summary_lower for keyword in ad_keywords):
            return False
        
        # Check for minimum length of summary
        if len(str(article.get('summary', '')).strip()) < 20:
            return False
        
        # Reject articles with obvious spam patterns
        spam_patterns = [
            r'\b(buy|sell)\s+now\b',
            r'limited\s+time\s+offer',
            r'exclusive\s+deal',
            r'\$\d+\s*profit'
        ]
        
        if any(re.search(pattern, headline_lower) or re.search(pattern, summary_lower) 
               for pattern in spam_patterns):
            return False
        
        return True

    def extract_and_save_news_summaries(self, ticker_symbol, days_back=30):
        """
        Extract news, save to CSV, and save summaries to a text file
        
        Args:
            ticker_symbol (str): Stock ticker symbol
            days_back (int, optional): Number of days to look back
        
        Returns:
            list: List of valid news summaries with dates
        """
        try:
            # Initialize Finnhub client
            finnhub_client = finnhub.Client(api_key=self.api_key)
            
            # Define time frame
            today = datetime.utcnow()
            past_date = today - timedelta(days=days_back)
            
            # Fetch company news
            news = finnhub_client.company_news(
                ticker_symbol, 
                _from=past_date.strftime('%Y-%m-%d'), 
                to=today.strftime('%Y-%m-%d')
            )
            
            if not news:
                print(f"No news found for ticker {ticker_symbol}")
                return []
            
            # Filter out invalid articles
            valid_news = [article for article in news if self._is_valid_news_article(article)]
            
            if not valid_news:
                print(f"No valid news articles found for ticker {ticker_symbol}")
                return []
            
            # Convert to DataFrame
            news_df = pd.DataFrame(valid_news)
            news_df['ticker'] = ticker_symbol
            # Convert timestamp to readable datetime
            news_df['readable_datetime'] = pd.to_datetime(news_df['datetime'], unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')
            news_df.columns = list(news_df.columns)
            # Create timestamp for filenames
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            
            # Save news to CSV
            csv_filename = f"{ticker_symbol}_news_{timestamp}.csv"
            csv_filepath = os.path.join(self.output_dir, csv_filename)
            news_df.to_csv(csv_filepath, index=False, encoding='utf-8')
            print(f"News saved to {csv_filepath}")
            
            # Prepare summaries with dates
            detailed_summaries = [
                f"[{row['readable_datetime']}] {row['headline']}: {row['summary']}"
                for _, row in news_df.iterrows()
            ]
            
            # Save summaries to text file
            summary_filename = f"{ticker_symbol}_summaries_{timestamp}.txt"
            summary_filepath = os.path.join(self.output_dir, 'summaries', summary_filename)
            with open(summary_filepath, 'w', encoding='utf-8') as f:
                f.write(f"News Summaries for {ticker_symbol} on {timestamp}\n")
                f.write("=" * 50 + "\n\n")
                for idx, summary in enumerate(detailed_summaries, 1):
                    f.write(f"{idx}. {summary}\n\n")
            
            print(f"Summaries saved to {summary_filepath}")
            print(f"Total valid news articles: {len(detailed_summaries)}")
            
            return detailed_summaries
        
        except Exception as e:
            print(f"Error extracting news: {e}")
            return []

def get_nifty_50_tickers():
    """
    Returns a list of NIFTY 50 stock tickers (as of 2024)
    
    Returns:
        list: NIFTY 50 stock tickers
    """
    return [
        'AAPL', 'MSFT', 'AMZN', 'GOOGL', 'GOOG', 'TSLA', 'NVDA', 'JPM'
    ]

def main():
    try:
        # Create extractor without hardcoding the API key
        extractor = StockNewsExtractor()
        
        # Get all NIFTY 50 tickers
        nifty_tickers = get_nifty_50_tickers()
        
        # Collector for all news
        all_news = []
        
        # Extract news for each ticker
        for ticker in nifty_tickers:
            #print(f"\nFetching news for {ticker}")
            ticker_news = extractor.extract_and_save_news_summaries(ticker, days_back=7)
            all_news.extend(ticker_news)
        
        # Display all news
        #print("\n--- Consolidated NIFTY 50 News ---")
        #for idx, news in enumerate(all_news, 1):
            #print(f"{idx}. {news}")
    
    except ValueError as ve:
        print(f"Configuration Error: {ve}")
    except Exception as e:
        print(f"Unexpected error: {e}")

main()

  today = datetime.utcnow()


News saved to stock_news/AAPL_news_20241103_074144.csv
Summaries saved to stock_news/summaries/AAPL_summaries_20241103_074144.txt
Total valid news articles: 132


  today = datetime.utcnow()


News saved to stock_news/MSFT_news_20241103_074145.csv
Summaries saved to stock_news/summaries/MSFT_summaries_20241103_074145.txt
Total valid news articles: 106


  today = datetime.utcnow()


News saved to stock_news/AMZN_news_20241103_074147.csv
Summaries saved to stock_news/summaries/AMZN_summaries_20241103_074147.txt
Total valid news articles: 134


  today = datetime.utcnow()


News saved to stock_news/GOOGL_news_20241103_074148.csv
Summaries saved to stock_news/summaries/GOOGL_summaries_20241103_074148.txt
Total valid news articles: 145


  today = datetime.utcnow()


News saved to stock_news/GOOG_news_20241103_074149.csv
Summaries saved to stock_news/summaries/GOOG_summaries_20241103_074149.txt
Total valid news articles: 153


  today = datetime.utcnow()


News saved to stock_news/TSLA_news_20241103_074151.csv
Summaries saved to stock_news/summaries/TSLA_summaries_20241103_074151.txt
Total valid news articles: 115


  today = datetime.utcnow()


News saved to stock_news/NVDA_news_20241103_074152.csv
Summaries saved to stock_news/summaries/NVDA_summaries_20241103_074152.txt
Total valid news articles: 112


  today = datetime.utcnow()


News saved to stock_news/JPM_news_20241103_074153.csv
Summaries saved to stock_news/summaries/JPM_summaries_20241103_074153.txt
Total valid news articles: 44


In [4]:
import pandas as pd
import glob

# Path to the summaries directory
path = 'stock_news/'

# Use glob to find all CSV files in the directory
csv_files = glob.glob(path + "*.csv")

# Create a dictionary to store each CSV with the stock ticker as the key
dataframes = {}

# Loop through each file and read it into a DataFrame
for file in csv_files:
    # Extract the stock ticker from the filename (e.g., AAPL, AMZN)
    stock_ticker = file.split('/')[-1].split('_')[0]
    
    # Read the CSV file and store it in the dictionary
    dataframes[stock_ticker] = pd.read_csv(file)



In [5]:
import pandas as pd

In [6]:
df = pd.concat(dataframes.values(), ignore_index=True)

In [7]:
dfg = df.groupby('ticker')

In [8]:
import sqlite3

# Database file path
db_file_path = 'stock_news.db'

# Connect to the SQLite database (it will be created if it doesn't exist)
connection = sqlite3.connect(db_file_path)
cursor = connection.cursor()

# Create a table for news articles
cursor.execute('''
CREATE TABLE IF NOT EXISTS news_articles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    stock_symbol TEXT,
    headline TEXT,
    published_date TEXT,
    sentiment REAL
)
''')


<sqlite3.Cursor at 0x7bc084ba58c0>

In [9]:
for ticker, group in dfg:
    # Insert each row as a record into the database
    for idx, row in group.iterrows():
        cursor.execute('''
        INSERT INTO news_articles (stock_symbol, headline, published_date)
        VALUES (?, ?, ?)
        ''', (row['ticker'], row['headline'], row['readable_datetime']))
        print(f"Inserted {row['headline']}")

Inserted CVS Health, Apollo Global Management, And Novo Nordisk Reported Results
Inserted Warren Buffett is sitting on over $325 billion cash as Berkshire Hathaway keeps selling Apple stock
Inserted Decoding Apple Inc (AAPL): A Strategic SWOT Insight
Inserted The Noisiest 10 Days Of The Year
Inserted Apple acquires photo editor Pixelmator
Inserted Breaking Down Magnificent 7 Earnings Results
Inserted Mag 7 earnings, election volatility, bitcoin: Market Takeaways
Inserted AI Spending Dominated Big Tech Earnings. Why These 2 Stocks Got a Pass From Investors.
Inserted Globalstar Soars 64% on Expanded Apple Partnership, Forecasts Revenue Doubling
Inserted Apple needs AI hype to drive upgrades but can't disappoint users
Inserted Stock Market Today: Indexes Fade Into The Close; Amazon Holds Above Buy Point (Live Coverage)
Inserted Apple, Amazon, Intel, Boeing, Super Micro, Exxon, and More Stock Market Movers
Inserted Sector Update: Tech Stocks Gain Late Afternoon
Inserted Globalstar stock su

In [10]:
print(dfg)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7bc09a7e5a90>


In [11]:
connection.commit()

In [None]:
query = 'SELECT * FROM news_articles'

In [24]:
stock_req = get_nifty_50_tickers()

In [20]:
articles = cursor.execute(query).fetchall()

In [25]:
query = 'SELECT * FROM news_articles WHERE stock_symbol = ?'

In [None]:
for ticker in stock_req:
    articles = cursor.execute(query, (ticker,)).fetchall()
    print(f"\n--- {ticker} News ---")
    for article in articles:
        print(article)