# Task
The task is to run a financial news pipeline. The pipeline scrapes financial news articles from the specified sources, deduplicates them and filters them for quality, and then attempts to rewrite a selection of these articles using the free models available through the OpenRouter API. The final output is a summary of the pipeline's execution, including the number of articles scraped, aggregated, and rewritten, along with a sample of the rewritten content if the model calls succeeded.

In [1]:
# ========================================
# INSTALLATION - RUN THIS FIRST!
# ========================================

# # Fast installation without compilation
# !pip install beautifulsoup4 requests pandas lxml fake-useragent cloudscraper huggingface-hub --quiet

# # Install pre-built llama-cpp-python (no compilation!)
# !pip install llama-cpp-python --quiet

# !pip install crewai crewai-tools beautifulsoup4 requests pandas newspaper3k lxml[html_clean] litellm -q


!pip install beautifulsoup4 requests pandas newspaper3k lxml[html_clean] openai -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/7.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.4/7.4 MB[0m [31m10.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[90m‚ï∫[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.3/7.4 MB[0m [31m31.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.3/7.4 MB[0m [31m57.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [4]:
"""
Financial News AI Pipeline - OPTIMIZED VERSION
✓ High-quality article filtering
✓ Human-like conversational rewriting
✓ Less jargon, more accessible
‚úì Optimized for Google Colab
"""

# ===========================
# IMPORTS
# ===========================

import json
import os
import random
import time
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import quote_plus

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup
from newspaper import Article
from openai import OpenAI

# ===========================
# CONFIGURATION
# ===========================

class Config:
    """Static settings shared by the pipeline."""

    # Model used when the interactive selection cannot be resolved
    # (best for conversational writing).
    DEFAULT_MODEL = "google/gemini-2.0-flash-exp:free"

    # Model identifiers offered in the selection menu, in display order.
    FREE_MODELS = [
        "mistralai/mistral-7b-instruct:free",
        "meta-llama/llama-4-maverick:free",
        DEFAULT_MODEL,
        "minimax/minimax-m2:free",
    ]

# ===========================
# HELPER FUNCTIONS
# ===========================

def get_headers():
    """Build browser-like HTTP request headers.

    Returns:
        A dict with 'User-Agent' (picked at random from a small pool),
        'Accept' and 'Accept-Language' entries.
    """
    agent_pool = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    )
    headers = {}
    # A fresh agent is drawn on every call so consecutive requests vary.
    headers['User-Agent'] = random.choice(agent_pool)
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    headers['Accept-Language'] = 'en-US,en;q=0.9'
    return headers

# ===========================
# QUALITY SCORING FUNCTION
# ===========================

def calculate_quality_score(article: Dict) -> float:
    """Score an article's quality on a 0-100 scale (higher is better).

    Points are awarded as follows:
      * content length: 30 (>1000 chars), 20 (>500), 10 (>200)
      * title length:   20 (8-15 words), 10 (5-20 words); 'N/A' scores 0
      * numeric data:   15 if the content contains any digit
      * trusted source: 20 if the source name contains a known outlet
      * date present:   15 if a date other than 'N/A' is set

    Args:
        article: Dict that may carry 'content', 'title', 'source' and
            'date'. Missing keys and ``None`` values are treated as empty.

    Returns:
        The total score, between 0 and 100.
    """
    # `or ''` (rather than a .get default) also covers keys that are present
    # but hold None, which previously raised inside len()/.lower().
    content = str(article.get('content') or '')
    title = str(article.get('title') or '')
    source = str(article.get('source') or '').lower()
    article_date = article.get('date') or ''

    score = 0

    # Content length (30 points max)
    content_length = len(content)
    if content_length > 1000:
        score += 30
    elif content_length > 500:
        score += 20
    elif content_length > 200:
        score += 10

    # Title quality (20 points max)
    if title and title != 'N/A':
        title_words = len(title.split())
        if 8 <= title_words <= 15:  # Optimal title length
            score += 20
        elif 5 <= title_words <= 20:
            score += 10

    # Has numerical data (15 points) - quality financial articles have numbers
    if any(char.isdigit() for char in content):
        score += 15

    # Source credibility (20 points max)
    trusted_sources = ['LiveMint', 'Moneycontrol', 'Economic Times', 'Business Standard']
    if any(trusted.lower() in source for trusted in trusted_sources):
        score += 20

    # Date present (15 points). NOTE: only presence is checked; the value is
    # not parsed, so an old article with a date scores the same as a new one.
    if article_date and article_date != 'N/A':
        score += 15

    return score

# ===========================
# SCRAPING FUNCTIONS
# ===========================

def scrape_nse_news(max_articles: int = 15) -> List[Dict]:
    """Scrape NSE-related news from the Google News RSS search feed.

    For each feed entry the Google News link is followed (best effort) to the
    publisher URL, and the full article text is fetched (best effort) to
    replace the RSS summary when it is longer.

    Args:
        max_articles: Maximum number of feed entries to process.

    Returns:
        A list of dicts with 'source', 'title', 'content', 'url' and 'date'.
        Errors are printed and whatever was collected so far is returned.
    """
    articles = []

    try:
        query = "NSE India stock market"
        encoded_query = quote_plus(query)
        rss_url = f"https://news.google.com/rss/search?q={encoded_query}&hl=en-IN&gl=IN&ceid=IN:en"

        feed = feedparser.parse(rss_url)

        for entry in feed.entries[:max_articles]:
            title = entry.get('title', '')
            actual_url = entry.get('link', '')
            if not actual_url:
                # An entry without a link cannot be fetched or cited; skip it
                # instead of aborting the rest of the feed.
                continue

            # Resolve the Google News redirect to the publisher URL.
            if 'news.google.com' in actual_url:
                try:
                    response = requests.head(actual_url, allow_redirects=True, timeout=5)
                    actual_url = response.url
                except Exception:
                    # Best effort: keep the Google News link if resolution fails.
                    pass

            # Get summary (RSS summaries are HTML fragments)
            summary = entry.get('summary', '')
            if summary:
                summary = BeautifulSoup(summary, 'html.parser').get_text()

            # Try to get full article content
            try:
                article_obj = Article(actual_url)
                article_obj.download()
                article_obj.parse()
                if len(article_obj.text) > len(summary):
                    summary = article_obj.text[:2000]
            except Exception:
                # Best effort: fall back to the RSS summary.
                pass

            articles.append({
                'source': 'Google News (NSE)',
                'title': title,
                'content': summary or title,
                'url': actual_url,
                'date': entry.get('published', datetime.now().strftime('%Y-%m-%d')),
            })

        # Always report the count (also for an empty feed), matching the
        # other scrapers.
        print(f"‚úì Google News (NSE): {len(articles)} articles")

    except Exception as e:
        print(f"‚ö†Ô∏è Google News (NSE) error: {e}")

    return articles

def scrape_livemint(max_articles: int = 15) -> List[Dict]:
    """Scrape LiveMint market-section articles with a minimum-length filter.

    Collects links from the market landing page whose path contains one of
    the finance sections, then downloads up to ``max_articles`` of them.

    Args:
        max_articles: Maximum number of collected links to download.

    Returns:
        A list of dicts with 'source', 'title', 'content', 'url' and 'date'.
        Articles that fail to download or are too short are skipped.
    """
    articles = []

    try:
        base_url = "https://www.livemint.com/market"
        headers = get_headers()

        response = requests.get(base_url, headers=headers, timeout=15)
        soup = BeautifulSoup(response.content, 'lxml')

        sections = ('/market/', '/companies/', '/money/', '/economy/')
        article_links = []
        seen = set()  # O(1) duplicate check; the list keeps page order
        for link in soup.find_all('a', href=True):
            href = link['href']
            if not any(s in href for s in sections):
                continue
            if href.startswith('/'):
                href = 'https://www.livemint.com' + href
            if href.startswith('http') and href not in seen:
                seen.add(href)
                article_links.append(href)

        for url in article_links[:max_articles]:
            try:
                article = Article(url)
                article.download()
                article.parse()

                # Quality filter: more than 300 characters of extracted text
                if len(article.text) > 300:
                    articles.append({
                        'source': 'LiveMint',
                        'title': article.title or 'N/A',
                        'content': article.text[:2000],
                        'url': url,
                        'date': str(article.publish_date) if article.publish_date else datetime.now().strftime('%Y-%m-%d'),
                    })
                time.sleep(1)
            except Exception:
                # Skip articles that cannot be fetched or parsed. Not a bare
                # except, so Ctrl-C still interrupts the loop.
                continue

        print(f"‚úì LiveMint: {len(articles)} articles")

    except Exception as e:
        print(f"‚ùå LiveMint: {e}")

    return articles

def scrape_moneycontrol(max_articles: int = 15) -> List[Dict]:
    """Scrape Moneycontrol business articles with a minimum-length filter.

    Collects '/news/business/' links from three section pages, then
    downloads up to ``max_articles`` of them.

    Args:
        max_articles: Maximum number of collected links to download.

    Returns:
        A list of dicts with 'source', 'title', 'content', 'url' and 'date'.
        Sections or articles that fail to download are skipped.
    """
    articles = []

    try:
        sections = [
            'https://www.moneycontrol.com/news/business',
            'https://www.moneycontrol.com/news/business/markets',
            'https://www.moneycontrol.com/news/business/economy',
        ]

        all_links = []
        seen = set()
        for section_url in sections:
            try:
                response = requests.get(section_url, headers=get_headers(), timeout=15)
                soup = BeautifulSoup(response.content, 'lxml')

                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if '/news/business/' not in href:
                        continue
                    if not href.startswith('http'):
                        href = 'https://www.moneycontrol.com' + href
                    # Deduplicate AFTER normalizing: checking the raw href
                    # let the same relative link be added more than once.
                    if href not in seen:
                        seen.add(href)
                        all_links.append(href)
                time.sleep(1)
            except Exception:
                # A failing section should not prevent scraping the others.
                continue

        for url in all_links[:max_articles]:
            try:
                article = Article(url)
                article.download()
                article.parse()

                # Quality filter: more than 300 characters of extracted text
                if len(article.text) > 300:
                    articles.append({
                        'source': 'Moneycontrol',
                        'title': article.title or 'N/A',
                        'content': article.text[:2000],
                        'url': url,
                        'date': str(article.publish_date) if article.publish_date else datetime.now().strftime('%Y-%m-%d'),
                    })
                time.sleep(1)
            except Exception:
                # Skip articles that cannot be fetched or parsed.
                continue

        print(f"‚úì Moneycontrol: {len(articles)} articles")

    except Exception as e:
        print(f"‚ùå Moneycontrol: {e}")

    return articles

# ===========================
# AI PROCESSING (Optimized)
# ===========================

class AIProcessor:
    """OpenRouter-backed helper that selects and rewrites articles.

    Uses the OpenAI client pointed at the OpenRouter base URL. Article dicts
    passed to the methods are mutated in place ('quality_score', 'category',
    'rewritten_title', 'rewritten_content').
    """

    def __init__(self, api_key: str, model: str):
        """Create the API client.

        Args:
            api_key: OpenRouter API key.
            model: Model identifier sent with every completion request.
        """
        self.client = OpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key=api_key,
        )
        self.model = model

    def call_llm(self, system_prompt: str, user_prompt: str, temperature: float = 0.7) -> Optional[str]:
        """Run one chat completion.

        Returns:
            The stripped reply text, or ``None`` if the request failed or
            returned no text (the error is printed, not raised).
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=temperature,
                max_tokens=2500,  # Increased for better content
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"‚ùå LLM Error: {e}")
            return None

    @staticmethod
    def _match_selected(selected, candidates: List[Dict]) -> List[Dict]:
        """Map LLM-selected titles back onto the candidate articles.

        Each selection is matched to the first candidate whose title contains
        the selected title (case-insensitive). Malformed selections, empty
        titles and selections that hit an already-chosen article are ignored.
        Matched articles get their 'category' set.
        """
        matched = []
        if not isinstance(selected, list):
            return matched

        chosen = set()  # candidate indexes already picked
        for sel in selected:
            if not isinstance(sel, dict):
                continue
            wanted = str(sel.get('title') or '').strip().lower()
            if not wanted:
                # An empty string is a substring of every title.
                continue
            for idx, article in enumerate(candidates):
                if wanted in str(article.get('title', '')).lower():
                    if idx not in chosen:
                        chosen.add(idx)
                        article['category'] = sel.get('category', 'general')
                        matched.append(article)
                    break
        return matched

    @staticmethod
    def _parse_rewrite(response: str):
        """Split a reply into (headline, content) using the output markers.

        Returns ``None`` unless 'HEADLINE:' is present and followed by
        'CONTENT:'.
        """
        headline_marker = "HEADLINE:"
        content_marker = "CONTENT:"

        headline_start = response.find(headline_marker)
        if headline_start == -1:
            return None
        headline_end = headline_start + len(headline_marker)
        # Search after the headline marker so a reversed marker order is
        # rejected instead of yielding an empty headline.
        content_start = response.find(content_marker, headline_end)
        if content_start == -1:
            return None

        headline = response[headline_end:content_start].strip()
        content = response[content_start + len(content_marker):].strip()
        return headline, content

    def aggregate_articles(self, articles: List[Dict]) -> List[Dict]:
        """Score articles, then ask the LLM to pick unique, high-value ones.

        Args:
            articles: Scraped article dicts; each gets a 'quality_score'.

        Returns:
            The LLM-selected articles (with 'category' set). If the LLM call
            fails, its reply cannot be parsed, or none of its titles match,
            the 8 top-scored articles are returned instead. Empty input
            returns an empty list without calling the LLM.
        """
        if not articles:
            return []

        # Calculate quality scores
        for article in articles:
            article['quality_score'] = calculate_quality_score(article)

        # Sort by quality score
        articles_sorted = sorted(articles, key=lambda x: x['quality_score'], reverse=True)

        print(f"\nüìä Quality Scores:")
        for i, article in enumerate(articles_sorted[:5], 1):
            print(f"  {i}. {article['title'][:60]}... (Score: {article['quality_score']:.0f}/100)")

        # Use LLM to deduplicate and categorize top articles
        top_articles = articles_sorted[:20]
        articles_text = json.dumps([{
            'title': a['title'],
            'source': a['source'],
            'content': a['content'][:500]
        } for a in top_articles], indent=2)

        system_prompt = """You are a financial content curator for Indian markets.
Your task is to select the BEST, most unique articles."""

        user_prompt = f"""From these high-quality articles, select the top 8-10 UNIQUE ones:

{articles_text}

Remove:
- Duplicates (similar topics/titles)
- Low-quality summaries
- Generic market updates

Select articles with:
- Specific company news
- Policy changes
- Economic data
- Market analysis

Return ONLY a JSON array with article titles and categories.
Format: [{{"title": "...", "category": "stocks|markets|economy|banking|policy"}}]"""

        response = self.call_llm(system_prompt, user_prompt, temperature=0.3)

        if response:
            try:
                # The model may wrap the array in prose; cut out the outermost
                # [...] span before parsing.
                json_start = response.find('[')
                json_end = response.rfind(']') + 1
                if json_start != -1 and json_end > json_start:
                    selected = json.loads(response[json_start:json_end])

                    # Match selected titles with original articles
                    filtered = self._match_selected(selected, top_articles)
                    if filtered:
                        print(f"‚úì Selected {len(filtered)} high-quality unique articles")
                        return filtered
                    # No title matched: fall through to the top-scored list
                    # instead of returning an empty selection.
            except ValueError as e:  # includes json.JSONDecodeError
                print(f"‚ö†Ô∏è Selection error: {e}, using top scored articles")

        # Fallback: return top scored
        return articles_sorted[:8]

    def rewrite_article(self, article: Dict) -> Dict:
        """Rewrite an article in a human, conversational style.

        Args:
            article: Dict with 'title', 'content' and 'source'.

        Returns:
            The same dict. When the LLM replies, 'rewritten_title' and
            'rewritten_content' are set: parsed from the HEADLINE/CONTENT
            markers if present, otherwise the original title and the raw
            reply. When the LLM call fails the dict is returned unchanged.
        """

        system_prompt = """You are a conversational financial writer for everyday readers in India.

Writing style:
- Write like you're explaining to a friend over coffee
- Use simple, everyday language
- Avoid jargon - explain financial terms naturally
- Use short sentences and paragraphs
- Add context that makes sense to regular people
- Keep it engaging and easy to understand
- Use "you" and "we" to connect with readers

Avoid:
- Complex financial terminology without explanation
- Long, dense paragraphs
- Corporate speak or press release language
- Overly formal tone"""

        user_prompt = f"""Rewrite this financial news in a simple, conversational way:

Original Title: {article['title']}
Content: {article['content'][:1200]}
Source: {article['source']}

Instructions:
1. Create a catchy, simple headline (like BuzzFeed, not Wall Street Journal)
2. Rewrite in 350-450 words
3. Start with the most interesting fact
4. Explain any financial terms simply
5. Use Indian context (‚Çπ in crores/lakhs, relate to daily life)
6. Make it feel like a conversation, not a report
7. Keep ALL facts and numbers accurate

Example tone:
"The stock market had a rough day today. If you've been watching your portfolio, you probably noticed..."

Output format:
HEADLINE: [Your simple, catchy headline]

CONTENT:
[Your conversational rewrite - 350-450 words]"""

        response = self.call_llm(system_prompt, user_prompt, temperature=0.8)  # Higher temp for creativity

        if not response:
            return article

        parsed = self._parse_rewrite(response)
        if parsed is not None:
            new_headline, new_content = parsed
            # Keep the original title if the model left the headline blank.
            article['rewritten_title'] = new_headline or article['title']
            article['rewritten_content'] = new_content

            print(f"‚úì Rewrote: {article['title'][:50]}...")
        else:
            # Markers missing: keep the whole reply as the rewritten body.
            article['rewritten_title'] = article['title']
            article['rewritten_content'] = response

        return article

# ===========================
# MAIN PIPELINE
# ===========================

class FinancialNewsPipeline:
    """Three-phase pipeline: scrape, select by quality, rewrite."""

    def __init__(self, api_key: str, model: str):
        """Store the credentials and build the AI processor.

        Args:
            api_key: OpenRouter API key.
            model: Model identifier used for selection and rewriting.
        """
        self.api_key = api_key
        self.model = model
        self.ai = AIProcessor(api_key, model)

    def run_pipeline(self, max_articles_per_source: int = 15, max_rewrites: int = 8):
        """Execute the pipeline end to end.

        Args:
            max_articles_per_source: Limit passed to each scraper.
            max_rewrites: Maximum number of selected articles to rewrite.

        Returns:
            The list of article dicts that were sent for rewriting.
        """

        print("\n" + "="*80)
        print("üöÄ FINANCIAL NEWS AI PIPELINE - OPTIMIZED")
        print("="*80)

        # PHASE 1: SCRAPING (More articles for better selection)
        print("\n[PHASE 1: SCRAPING]")
        print("-"*80)

        all_articles = []

        print("\nüì∞ Scraping Google News (NSE)...")
        all_articles.extend(scrape_nse_news(max_articles_per_source))

        print("\nüì∞ Scraping LiveMint...")
        all_articles.extend(scrape_livemint(max_articles_per_source))

        print("\nüì∞ Scraping Moneycontrol...")
        all_articles.extend(scrape_moneycontrol(max_articles_per_source))

        print(f"\n‚úì Total scraped: {len(all_articles)} articles")

        # PHASE 2: QUALITY FILTERING & AGGREGATION
        print("\n[PHASE 2: QUALITY FILTERING & SELECTION]")
        print("-"*80)

        filtered_articles = self.ai.aggregate_articles(all_articles)
        print(f"‚úì Selected {len(filtered_articles)} high-quality articles")

        # PHASE 3: HUMAN-STYLE REWRITING
        print("\n[PHASE 3: CONVERSATIONAL REWRITING]")
        print("-"*80)

        to_rewrite = filtered_articles[:max_rewrites]
        rewritten_articles = []
        for i, article in enumerate(to_rewrite, 1):
            print(f"\n[{i}/{len(to_rewrite)}] Rewriting...")
            rewritten = self.ai.rewrite_article(article)
            rewritten_articles.append(rewritten)
            time.sleep(2)  # Rate limiting

        print("\n" + "="*80)
        print("‚úÖ PIPELINE COMPLETED!")
        print("="*80)

        return rewritten_articles

    @staticmethod
    def _summarize(articles: List[Dict]):
        """Return (average quality score, {category: count}) for articles.

        An empty list yields ``(0.0, {})`` instead of dividing by zero.
        """
        categories = {}
        total = 0
        for a in articles:
            total += a.get('quality_score', 0)
            cat = a.get('category', 'general')
            categories[cat] = categories.get(cat, 0) + 1
        avg_score = total / len(articles) if articles else 0.0
        return avg_score, categories

    def save_results(self, articles: List[Dict]):
        """Write the articles to timestamped JSON and text files.

        Files are created in the current working directory. In Colab a
        browser download is also triggered; elsewhere that step is skipped.
        A short summary is printed at the end.

        Args:
            articles: Article dicts as returned by ``run_pipeline``.
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        # Save as JSON
        json_file = f'financial_news_{timestamp}.json'
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, ensure_ascii=False)
        print(f"\n‚úì Saved JSON: {json_file}")

        # Save as readable text
        txt_file = f'financial_news_{timestamp}.txt'
        with open(txt_file, 'w', encoding='utf-8') as f:
            f.write("="*80 + "\n")
            f.write("CONVERSATIONAL FINANCIAL NEWS ARTICLES\n")
            f.write("="*80 + "\n\n")

            for i, article in enumerate(articles, 1):
                f.write(f"\n{'='*80}\n")
                f.write(f"ARTICLE {i}\n")
                f.write(f"{'='*80}\n\n")
                f.write(f"Source: {article['source']}\n")
                f.write(f"Category: {article.get('category', 'N/A')}\n")
                f.write(f"Quality Score: {article.get('quality_score', 0):.0f}/100\n")
                f.write(f"Original: {article['title']}\n")
                f.write(f"URL: {article['url']}\n\n")
                f.write(f"NEW HEADLINE:\n{article.get('rewritten_title', 'N/A')}\n\n")
                f.write(f"REWRITTEN (CONVERSATIONAL):\n{article.get('rewritten_content', 'N/A')}\n\n")

        print(f"‚úì Saved Text: {txt_file}")

        # Try Colab download
        try:
            from google.colab import files
            files.download(json_file)
            files.download(txt_file)
            print("‚úì Downloads initiated")
        except Exception:
            # Not running in Colab, or the download failed; the files are
            # already on disk either way.
            print("‚ÑπÔ∏è Files saved locally")

        # Display summary
        avg_score, categories = self._summarize(articles)
        print(f"\nüìä SUMMARY:")
        print(f"High-quality articles: {len(articles)}")
        print(f"Average quality score: {avg_score:.1f}/100")
        print(f"\nBy category:")
        for cat, count in categories.items():
            print(f"  - {cat}: {count}")

# ===========================
# MAIN EXECUTION
# ===========================

def main():
    """Interactive entry point: read the API key, pick a model, run, save.

    The API key is read from the Colab secret 'OPENROUTER_API_KEY' when
    running in Colab, otherwise from the environment variable of the same
    name. Invalid menu input falls back to the defaults instead of raising.
    """

    print("="*80)
    print("üìä FINANCIAL NEWS AI PIPELINE - OPTIMIZED")
    print("‚úì Quality-focused scraping")
    print("‚úì Human-like conversational rewriting")
    print("="*80)

    print("\nüîë Get API key: https://openrouter.ai/keys")

    api_key = None
    try:
        from google.colab import userdata
    except ImportError:
        userdata = None  # not running inside Colab
    if userdata is not None:
        try:
            api_key = userdata.get('OPENROUTER_API_KEY')
        except Exception as e:
            # Secret missing or notebook access denied; try the environment.
            print(f"Warning: could not read Colab secret: {e}")
    if not api_key:
        api_key = os.environ.get('OPENROUTER_API_KEY', '').strip()
    # api_key = input("Enter OpenRouter API key: ").strip()

    if not api_key:
        print("‚ùå API key required!")
        return

    # Model selection
    print("\nü§ñ Free Models:")
    for i, model in enumerate(Config.FREE_MODELS, 1):
        print(f"{i}. {model}")

    choice = input(f"\nSelect (1-{len(Config.FREE_MODELS)}, default=3 Gemini): ").strip()

    model = Config.DEFAULT_MODEL
    if choice:
        try:
            idx = int(choice) - 1
        except ValueError:
            idx = -1
        # Explicit range check: "0" or a negative number would otherwise
        # select a model through negative indexing.
        if 0 <= idx < len(Config.FREE_MODELS):
            model = Config.FREE_MODELS[idx]

    print(f"\n‚úì Using: {model}")

    # Articles per source
    raw_count = input("\nArticles per source (default=15): ").strip()
    try:
        max_articles = int(raw_count) if raw_count else 15
    except ValueError:
        max_articles = 15
    if max_articles <= 0:
        max_articles = 15

    # Run pipeline
    pipeline = FinancialNewsPipeline(api_key, model)

    try:
        results = pipeline.run_pipeline(max_articles)

        if results:
            pipeline.save_results(results)
            print("\n‚úÖ SUCCESS! Check your conversational articles!")
        else:
            print("\n‚ö†Ô∏è No results generated")

    except Exception as e:
        print(f"\n‚ùå Error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()


üìä FINANCIAL NEWS AI PIPELINE - OPTIMIZED
‚úì Quality-focused scraping
‚úì Human-like conversational rewriting

üîë Get API key: https://openrouter.ai/keys

ü§ñ Free Models:
1. mistralai/mistral-7b-instruct:free
2. meta-llama/llama-4-maverick:free
3. google/gemini-2.0-flash-exp:free
4. minimax/minimax-m2:free

Select (1-4, default=3 Gemini): 2

‚úì Using: meta-llama/llama-4-maverick:free

Articles per source (default=15): 16

üöÄ FINANCIAL NEWS AI PIPELINE - OPTIMIZED

[PHASE 1: SCRAPING]
--------------------------------------------------------------------------------

üì∞ Scraping Google News (NSE)...
‚úì Google News (NSE): 16 articles

üì∞ Scraping LiveMint...
‚úì LiveMint: 3 articles

üì∞ Scraping Moneycontrol...
‚úì Moneycontrol: 16 articles

‚úì Total scraped: 35 articles

[PHASE 2: QUALITY FILTERING & SELECTION]
--------------------------------------------------------------------------------

üìä Quality Scores:
  1. Tata Motors Share Price LIVE: Stock falls over 3% ahea

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úì Downloads initiated

üìä SUMMARY:
High-quality articles: 4
Average quality score: 80.0/100

By category:
  - markets: 3
  - stocks: 1

‚úÖ SUCCESS! Check your conversational articles!
