In [44]:
import requests
import json
import datetime
import os
import re
import feedparser
from bs4 import BeautifulSoup
from transformers import pipeline

In [45]:
# Create a txt file with financial keywords
def create_keyword_file_if_not_exists():
    """Seed ``financial_keywords.txt`` with the default keyword list.

    The file is created in the current working directory with one keyword
    per line (no trailing newline). If the file already exists it is left
    untouched, so user edits to the list are preserved.
    """
    target = 'financial_keywords.txt'

    # Nothing to do when a keyword file is already in place.
    if os.path.exists(target):
        return

    default_terms = (
        "bank", "finance", "market", "invest", "stock",
        "share", "bond", "asset", "capital", "fund",
        "trading", "financial", "securities", "investment", "debt",
        "corporate", "equity", "merger", "acquisition", "investor",
        "portfolio", "management", "risk", "liquidity", "profit",
        "revenue", "growth", "fiscal", "economic", "global",
        "advisor",
    )

    with open(target, 'w') as handle:
        handle.write('\n'.join(default_terms))


def read_keywords(filename):
    """Load keywords from a text file, one keyword per line.

    Each line is stripped of surrounding whitespace and lower-cased;
    lines that are blank after stripping are skipped.
    """
    collected = []
    with open(filename, 'r') as source:
        for raw_line in source:
            term = raw_line.strip()
            if term:
                collected.append(term.lower())
    return collected

def calculate_relevance_score(text, keywords):
    """Score ``text`` by how often the given keywords occur in it.

    Matching is case-insensitive for both the text and the keywords.
    Single-word keywords are matched on word boundaries (so "bank" does not
    match "banking"); multi-word keywords are matched as plain substrings.
    Each occurrence contributes the number of words in the keyword, so
    multi-word phrases weigh more than single words.

    Args:
        text: The text to score. Falsy values (``None``, ``""``) score 0.
        keywords: Iterable of keyword strings.

    Returns:
        int: The accumulated relevance score.
    """
    if not text:
        return 0

    haystack = text.lower()
    score = 0

    for keyword in keywords:
        # Normalise once so both branches compare lower-case against
        # lower-case; previously the single-word branch used the keyword's
        # original casing and could never match a capitalised keyword.
        needle = keyword.lower().strip()
        word_count = len(needle.split())

        # Blank keywords carry no weight; skip them explicitly.
        if word_count == 0:
            continue

        if word_count == 1:
            # Word boundaries prevent partial matches inside longer words.
            count = len(re.findall(r'\b' + re.escape(needle) + r'\b', haystack))
        else:
            count = haystack.count(needle)

        # Multi-word keywords get proportionally higher weight.
        score += count * word_count

    return score


def clean_html_content(html_content):
    """Extract readable text from HTML content, capped at 300 characters.

    Args:
        html_content: Raw HTML or plain text. Anything that is falsy or not
            a ``str`` yields an empty string.

    Returns:
        str: The visible text with whitespace collapsed. When the text is
        longer than 300 characters it is cut to 300 and ``"..."`` is
        appended. Input that does not look like HTML (no ``<`` and ``>``)
        is returned as-is, subject to the same length cap.
    """
    max_chars = 300

    def _truncate(value):
        # Single place for the length cap so every return path agrees.
        return value[:max_chars] + "..." if len(value) > max_chars else value

    if not html_content or not isinstance(html_content, str):
        return ""

    # Not HTML: just apply the length limit.
    if '<' not in html_content or '>' not in html_content:
        return _truncate(html_content)

    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        # Script and style bodies are not readable text; drop them entirely.
        for tag in soup(["script", "style"]):
            tag.decompose()
        # Join text nodes with a space: without a separator, text from
        # adjacent elements is glued together (e.g. "<a>Title</a><font>Src</font>"
        # became "TitleSrc").
        text = soup.get_text(separator=' ', strip=True)
        # Collapse runs of whitespace (including non-breaking spaces).
        text = re.sub(r'\s+', ' ', text).strip()
    except Exception as e:
        # Best effort: fall back to the raw input rather than failing the caller.
        print(f"Error parsing HTML: {e}")
        return _truncate(html_content)

    return _truncate(text)


def create_custom_summary(article):
    """Build a short three-line text summary for an article.

    The result has the form::

        Title: <title>
        Summary: <cleaned description or content>
        Source: <source name>

    The description is preferred; the content is used when the cleaned
    description is empty, shorter than 20 characters, or still starts with
    a raw ``<a href=`` tag. If neither yields text, the summary line reads
    "No summary available.".

    Args:
        article: Dict with optional ``title``, ``description``, ``content``
            and ``source`` (a dict with a ``name``) keys. Missing keys and
            ``None`` values are tolerated.

    Returns:
        str: The formatted summary.
    """
    # `or` also covers keys that are present but set to None.
    title = article.get('title') or ''
    description = article.get('description') or ''
    content = article.get('content') or ''
    source_info = article.get('source') or {}
    source_name = source_info.get('name') if isinstance(source_info, dict) else None
    source = source_name or 'Unknown Source'

    # First try to get a clean description.
    clean_desc = ""
    if description:
        clean_desc = clean_html_content(description)
        # Strip common site-boilerplate words from the description.
        clean_desc = re.sub(r'(subscribe|cookie|privacy|terms|browser)', '', clean_desc, flags=re.IGNORECASE)

    description_usable = (
        bool(clean_desc)
        and len(clean_desc) >= 20
        and not clean_desc.startswith("<a href=")
    )

    if description_usable:
        body = clean_desc
    else:
        # Description didn't yield good results; fall back to the content.
        body = clean_html_content(content) if content else ""

    # Never emit an empty "Summary: " line.
    if not body:
        body = "No summary available."

    return f"Title: {title}\nSummary: {body}\nSource: {source}"

In [46]:
def fetch_news_api_articles(query, days_back=7, api_key=None):
    """Fetch articles matching ``query`` from the News API ``/v2/everything`` endpoint.

    Args:
        query: Search phrase; it is sent wrapped in double quotes.
        days_back: How many days before now the ``from`` date is set to.
        api_key: News API key. When omitted, the ``NEWS_API_KEY``
            environment variable is used.

    Returns:
        list: The ``articles`` list from the response, or ``[]`` when no key
        is configured, the API reports an error, or the request fails.

    NOTE(security): a key used to be hard-coded as the default argument.
    Credentials must not live in source or notebooks; any key that was
    committed should be treated as leaked and rotated.
    """
    print(f"Fetching articles from News API for: {query}")

    api_key = api_key or os.environ.get("NEWS_API_KEY")
    if not api_key:
        print("Error fetching from News API: no API key provided (set NEWS_API_KEY or pass api_key)")
        return []

    since = datetime.datetime.now() - datetime.timedelta(days=days_back)

    # Let requests build the query string so the phrase is URL-encoded.
    params = {
        'q': f'"{query}"',
        'from': since.strftime("%Y-%m-%d"),
        'sortBy': 'popularity',
        'apiKey': api_key,
    }

    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get('https://newsapi.org/v2/everything', params=params, timeout=10)
        data = response.json()

        if isinstance(data, dict) and data.get('status') == 'ok':
            articles = data.get('articles', [])
            print(f"Retrieved {len(articles)} articles from News API")
            return articles

        message = data.get('message', 'Unknown error') if isinstance(data, dict) else 'Unknown error'
        print(f"Error fetching from News API: {message}")
        return []
    except Exception as e:
        # Best effort: a failed source must not abort the whole run.
        print(f"Exception when fetching from News API: {e}")
        return []

def fetch_google_news_articles(query):
    """Fetch articles matching ``query`` from the Google News RSS search feed.

    Each feed entry is converted to a dict with the keys ``title``,
    ``description``, ``content``, ``url``, ``publishedAt`` and ``source``
    (``{'name': ...}``). When the entry title has the form
    "Headline - Publisher", the trailing part is used as the source name
    and removed from the title; otherwise the source is "Google News".

    Args:
        query: Search phrase.

    Returns:
        list: The converted articles, or ``[]`` if fetching or parsing fails.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote_plus

    print(f"Fetching articles from Google News for: {query}")
    # quote_plus encodes spaces as '+' and escapes characters such as
    # '&' or '#' that would otherwise corrupt the query string.
    query_formatted = quote_plus(query)
    rss_url = f"https://news.google.com/rss/search?q={query_formatted}&hl=en-US&gl=US&ceid=US:en"

    try:
        feed = feedparser.parse(rss_url)

        articles = []
        for entry in feed.entries:
            # .get() so that one malformed entry cannot abort the whole feed.
            raw_title = entry.get('title', '') or ''

            # `content` may arrive as a list of dicts (each carrying a
            # 'value') rather than a string; flatten it so downstream
            # string handling always receives text.
            raw_content = entry.get('content', '')
            if isinstance(raw_content, str):
                content = raw_content
            elif isinstance(raw_content, (list, tuple)):
                content = ' '.join(
                    (part.get('value') or '')
                    for part in raw_content
                    if isinstance(part, dict)
                ).strip()
            else:
                content = ''

            # Try to get the actual source from the title ("Headline - Publisher").
            headline, separator, publisher = raw_title.rpartition(' - ')
            if separator:
                title, source_name = headline, publisher
            else:
                title, source_name = raw_title, 'Google News'

            articles.append({
                'title': title,
                'description': entry.get('description', ''),
                'content': content,
                'url': entry.get('link', ''),
                'publishedAt': entry.get('published', ''),
                'source': {'name': source_name},
            })

        print(f"Retrieved {len(articles)} articles from Google News")
        return articles
    except Exception as e:
        # Best effort: a failed source must not abort the whole run.
        print(f"Exception when fetching from Google News: {e}")
        return []

def process_articles(articles, keywords, required_term='nomura'):
    """Keep articles that mention ``required_term`` and score their relevance.

    An article is kept when ``required_term`` occurs (case-insensitively) in
    its title, description or content. Kept articles get a
    ``relevance_score`` key, computed by ``calculate_relevance_score`` over
    the three fields joined with spaces.

    Args:
        articles: Iterable of article dicts.
        keywords: Keywords forwarded to ``calculate_relevance_score``.
        required_term: Substring an article must contain to be kept.
            Defaults to ``'nomura'``; an empty value or ``None`` disables
            the filter.

    Returns:
        list: The kept article dicts. These are the same objects that were
        passed in, annotated in place with ``relevance_score``.
    """
    def _as_text(value):
        # Fields can be missing, None, or a non-string (e.g. a list-valued
        # `content`); treat anything that is not a str as empty so that
        # .lower() cannot raise.
        return value if isinstance(value, str) else ''

    term = (required_term or '').lower()
    filtered_articles = []

    for article in articles:
        title = _as_text(article.get('title'))
        description = _as_text(article.get('description'))
        content = _as_text(article.get('content'))

        # Check each field separately so a match can never straddle two fields.
        if not any(term in field.lower() for field in (title, description, content)):
            continue

        # Combine all text for relevance scoring.
        combined_text = f"{title} {description} {content}"
        article['relevance_score'] = calculate_relevance_score(combined_text, keywords)
        filtered_articles.append(article)

    return filtered_articles


In [47]:
def analyze_sentiment(articles, max_articles=5):
    """Run sentiment classification on the first ``max_articles`` articles.

    For each article a summary is built with ``create_custom_summary`` and
    passed to a Hugging Face ``text-classification`` pipeline; the label of
    the first result is recorded as the article's sentiment.

    Args:
        articles: List of article dicts, assumed to be ordered by relevance
            already (this function does not sort).
        max_articles: Maximum number of articles to analyse.

    Returns:
        tuple: ``(analyzed_articles, sentiments)``. ``analyzed_articles`` is
        a list of dicts with the keys ``index`` (1-based), ``title``,
        ``summary``, ``relevance_score``, ``sentiment`` and ``url``;
        ``sentiments`` is the list of labels in the same order. Both are
        empty when there is nothing to analyse.
    """
    # Bail out before loading the model when there is no work to do. A
    # non-positive limit is treated as "analyse nothing" rather than being
    # fed into a slice (where a negative value would count from the end).
    if not articles or max_articles <= 0:
        return [], []

    top_articles = articles[:max_articles]
    print(f"Analyzing sentiment for top {len(top_articles)} articles:")

    # Use a sentiment analysis pipeline.
    sentiment_analyzer = pipeline("text-classification", model="tabularisai/multilingual-sentiment-analysis")

    analyzed_articles = []
    sentiments = []

    for index, article in enumerate(top_articles, start=1):
        # Create a custom summary and classify it.
        summary = create_custom_summary(article)
        sentiment = sentiment_analyzer(summary)[0]['label']

        # .get() with defaults so an article lacking a title or a score
        # does not abort the whole batch.
        analyzed_articles.append({
            'index': index,
            'title': article.get('title') or '',
            'summary': summary,
            'relevance_score': article.get('relevance_score', 0),
            'sentiment': sentiment,
            'url': article.get('url', ''),
        })
        sentiments.append(sentiment)

    return analyzed_articles, sentiments

def generate_analysis_report(articles, sentiments):
    """Render a Markdown report for the analysed articles.

    The report contains a header with today's date and the article count,
    a per-label sentiment breakdown (count and percentage of ``sentiments``),
    and one section per article showing its relevance score, sentiment,
    summary (HTML tags, the leading "Title:" line and blank lines removed)
    and URL.

    Returns the fixed message "No articles available for analysis." when
    ``articles`` is empty.
    """
    if not articles:
        return "No articles available for analysis."

    today = datetime.datetime.now().strftime('%Y-%m-%d')
    parts = [
        "# Nomura Holdings News Analysis Report\n\n",
        f"Analysis Date: {today}\n",
        f"Articles Analyzed: {len(articles)}\n\n",
        "## Sentiment Breakdown\n\n",
    ]

    # Tally labels in first-seen order.
    tally = {}
    for label in sentiments:
        tally.setdefault(label, 0)
        tally[label] += 1

    total = len(sentiments)
    parts.extend(
        f"- {label}: {occurrences} articles ({(occurrences / total) * 100:.1f}%)\n"
        for label, occurrences in tally.items()
    )

    parts.append("\n## Most Relevant Articles\n\n")
    for entry in articles:
        parts.append(f"### [{entry['index']}] {entry['title']}\n")
        parts.append(f"Relevance Score: {entry['relevance_score']}\n")
        parts.append(f"Sentiment: {entry['sentiment']}\n")

        # Strip any HTML tags, then the leading "Title: ..." line (the title
        # is already in the section heading), then blank lines.
        body = re.sub(r'<.*?>', '', entry['summary'])
        body = re.sub(r'^Title: .*\n', '', body)
        body = '\n'.join(filter(str.strip, body.split('\n')))

        parts.append(f"{body}\n")
        parts.append(f"URL: {entry['url']}\n\n")

    return ''.join(parts)


In [49]:
def main():
    """Run the full pipeline: fetch, filter, rank, analyse and report.

    Steps:
      1. Ensure the keyword file exists and load the keywords.
      2. Fetch "Nomura Holdings" articles from News API and Google News.
      3. Filter and score them, then sort by relevance (highest first).
      4. Print the top articles, run sentiment analysis on them, and print
         the generated report.
    """
    # How many articles are listed and sent to sentiment analysis.
    top_n = 10

    # Initialize keyword file and read keywords.
    create_keyword_file_if_not_exists()
    keywords = read_keywords('financial_keywords.txt')
    # Derive the remainder from the preview actually shown, so the count
    # cannot go negative when the file holds fewer than 10 keywords.
    preview = keywords[:10]
    remaining = len(keywords) - len(preview)
    print(f"Using keywords: {', '.join(preview)}... and {remaining} more")

    # Fetch articles from both sources.
    query = "Nomura Holdings"
    news_api_articles = fetch_news_api_articles(query)
    google_news_articles = fetch_google_news_articles(query)

    # Combine articles from both sources.
    all_articles = news_api_articles + google_news_articles
    print(f"Total articles collected: {len(all_articles)}")

    # Process and filter articles.
    filtered_articles = process_articles(all_articles, keywords)

    # Sort articles by relevance score, highest first.
    filtered_articles.sort(key=lambda x: x.get('relevance_score', 0), reverse=True)

    # Print only the top filtered articles.
    print('\nNomura Holdings news from the last week (Top 10):')
    max_articles_to_print = min(top_n, len(filtered_articles))
    for i, article in enumerate(filtered_articles[:max_articles_to_print]):
        print(f"[{i+1}] Relevance Score: {article['relevance_score']}")
        print(article['title'])
        print(f"Source: {article.get('source', {}).get('name', 'Unknown')}")
        print(article.get('url', 'No URL'))
        print()

    print(f"Total relevant articles: {len(filtered_articles)} (showing top {max_articles_to_print})")

    # Analyze sentiment for top articles.
    if filtered_articles:
        analyzed_articles, sentiments = analyze_sentiment(filtered_articles, max_articles=top_n)

        # Generate and print analysis report.
        if analyzed_articles:
            report = generate_analysis_report(analyzed_articles, sentiments)
            print("\n" + "="*50 + "\n")
            print(report)

    else:
        print("No relevant Nomura Holdings articles found.")

# Entry point: run the pipeline only when this code is executed directly
# (as a script or in a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Using keywords: bank, finance, market, invest, stock, share, bond, asset, capital, fund... and 21 more
Fetching articles from News API for: Nomura Holdings
Retrieved 11 articles from News API
Fetching articles from Google News for: Nomura Holdings
Retrieved 100 articles from Google News
Total articles collected: 111

Nomura Holdings news from the last week (Top 10):
[1] Relevance Score: 6
Nomura Asset Management Co. Ltd. Has $37.51 Million Stock Holdings in SharkNinja, Inc. (NYSE:SN)
Source: MarketBeat
https://news.google.com/rss/articles/CBMiwgFBVV95cUxQaFpTaThSZzgyTjlzUGNER01wbzJtR2x4MGtzS2htUXVLcElaRkJCNDdESVVuVXBZdTdOcWJOZm5CMldwX2xHNW0wZWVKMEx2QzJmWWhBbWNONU9LS0tNcFkwekNIV2U0QnZscWVxZzVjcnNCSFc0TFNlNTNhZXlRR3B0X25MbG5OQ3hick1kYTlJc1Rtb0dVUkFyN1dtMjhRb2pCckhHd0hqU0lncTVja0E4X0stOFJCTUQxYlFSeGwwZw?oc=5

[2] Relevance Score: 6
Nomura Asset Management Co. Ltd. Grows Stock Position in Booz Allen Hamilton Holding Co. (NYSE:BAH)
Source: MarketBeat
https://news.google.com/rss/articles/CBM