In [1]:
import os
import pandas as pd
import requests
import json
from datetime import datetime
import matplotlib.pyplot as plt

# Set up the project directory structure
base_dir = '/data/chats/46etsd/workspace/data_processing'
subdirs = ['market_data', 'news_sentiment', 'ml_models', 'sample_output', 'docs']

for subdir in subdirs:
    os.makedirs(os.path.join(base_dir, subdir), exist_ok=True)

# API Analysis - We'll evaluate the major financial data APIs and document their capabilities

# Initialize a dictionary to store API comparison information
api_comparison = {
    'TwelveData': {
        'description': 'Financial data provider with real-time and historical stock, forex, cryptocurrency, and index data',
        'endpoints': {
            'time_series': '/time_series - Get OHLCV data',
            'quote': '/quote - Get latest price',
            'price': '/price - Get real-time price',
            'forex_pairs': '/forex_pairs - List available forex pairs',
            'indices': '/indices - List available indices'
        },
        'supported_assets': {
            'US100': 'NASDAQ 100 (NDX)',
            'US30': 'Dow Jones Industrial Average (DJI)',
            'EUR/USD': 'EUR/USD forex pair',
            'GBP/USD': 'GBP/USD forex pair',
            'Crude Oil': 'WTI and Brent crude oil futures'
        },
        'pricing': {
            'free_tier': '8 API calls/minute, limited endpoints',
            'starter': '$12/month - 600 API calls/day',
            'standard': '$29/month - 3000 API calls/day',
            'premium': '$99/month - 15000 API calls/day'
        },
        'data_frequency': 'As low as 1 minute on paid plans',
        'pros': [
            'Comprehensive coverage of all our required assets',
            'Simple API structure',
            'WebSocket support for real-time data'
        ],
        'cons': [
            'Limited free tier',
            'Higher pricing for real-time data'
        ]
    },
    
    'Finnhub': {
        'description': 'Real-time RESTful APIs for stocks, forex, and crypto',
        'endpoints': {
            'quote': '/quote - Get real-time quote',
            'forex/rates': '/forex/rates - Get forex rates',
            'forex/candle': '/forex/candle - Get candlestick data',
            'news': '/news - Get market news',
            'indices/constituents': '/indices/constituents - Get index composition'
        },
        'supported_assets': {
            'US100': 'Via index symbol ^NDX',
            'US30': 'Via index symbol ^DJI',
            'EUR/USD': 'Via forex endpoint',
            'GBP/USD': 'Via forex endpoint',
            'Crude Oil': 'Via commodity symbols CL and BZ'
        },
        'pricing': {
            'free_tier': '60 API calls/minute, basic data only',
            'basic': '$15/month - 60 API calls/minute, more endpoints',
            'standard': '$35/month - 120 API calls/minute, full access',
            'premium': '$150/month - 600 API calls/minute, premium data'
        },
        'data_frequency': '1 minute candles minimum',
        'pros': [
            'Good news sentiment API included',
            'Strong forex and crypto coverage',
            'WebSocket support'
        ],
        'cons': [
            'Index data sometimes delayed',
            'API structure more complex than TwelveData'
        ]
    },
    
    'Alpha Vantage': {
        'description': 'Provides free APIs for realtime and historical data on stocks, forex, and cryptocurrencies',
        'endpoints': {
            'TIME_SERIES_INTRADAY': 'Get intraday time series',
            'GLOBAL_QUOTE': 'Get current price and volume',
            'FX_DAILY': 'Get daily forex rates',
            'FX_INTRADAY': 'Get intraday forex data',
            'NEWS_SENTIMENT': 'Get news sentiment data'
        },
        'supported_assets': {
            'US100': 'Via index symbol NDX',
            'US30': 'Via index symbol DJI',
            'EUR/USD': 'Via forex endpoint EUR/USD',
            'GBP/USD': 'Via forex endpoint GBP/USD',
            'Crude Oil': 'Limited support through commodity ETFs like USO'
        },
        'pricing': {
            'free_tier': '5 API calls/minute, 500/day',
            'premium': '$50/month - 120 API calls/minute, 5000 API calls/day',
            'enterprise': 'Custom pricing'
        },
        'data_frequency': 'As low as 1 minute, but with limitations on free tier',
        'pros': [
            'Generous free tier for prototyping',
            'Good documentation',
            'CSV format option'
        ],
        'cons': [
            'Rate limits can be restrictive',
            'Less comprehensive coverage for indices',
            'No WebSocket for real-time data'
        ]
    }
}

# Sample API response structure for each API
sample_responses = {
    'TwelveData': {
        'time_series': {
            'meta': {
                'symbol': 'EUR/USD',
                'interval': '1min',
                'currency_base': 'Euro',
                'currency_quote': 'US Dollar',
                'type': 'Physical Currency'
            },
            'values': [
                {
                    'datetime': '2023-01-01 00:00:00',
                    'open': '1.0701',
                    'high': '1.0702',
                    'low': '1.0698',
                    'close': '1.0700',
                    'volume': '1547'
                }
            ],
            'status': 'ok'
        }
    },
    'Finnhub': {
        'quote': {
            'c': 148.56,  # Current price
            'h': 149.89,  # High price of the day
            'l': 147.95,  # Low price of the day
            'o': 148.06,  # Open price of the day
            'pc': 147.92,  # Previous close price
            't': 1698346800  # Unix timestamp
        }
    },
    'Alpha Vantage': {
        'Time Series FX (1min)': {
            '2023-01-01 00:00:00': {
                '1. open': '1.0701',
                '2. high': '1.0702',
                '3. low': '1.0698',
                '4. close': '1.0700'
            }
        },
        'Meta Data': {
            '1. Information': 'FX Intraday (1min) Time Series',
            '2. From Symbol': 'EUR',
            '3. To Symbol': 'USD',
            '4. Last Refreshed': '2023-01-01 00:00:00'
        }
    }
}

# Write out the API comparison documentation
with open(os.path.join(base_dir, 'docs', 'api_comparison.md'), 'w') as f:
    f.write('# Financial Data API Comparison\n\n')
    f.write('This document compares the main features of financial data APIs for our trading assistant platform.\n\n')
    
    for api_name, details in api_comparison.items():
        f.write(f'## {api_name}\n\n')
        f.write(f'**Description**: {details["description"]}\n\n')
        
        f.write('### Supported Assets\n')
        for asset, desc in details['supported_assets'].items():
            f.write(f'- **{asset}**: {desc}\n')
        f.write('\n')
        
        f.write('### Key Endpoints\n')
        for endpoint, desc in details['endpoints'].items():
            f.write(f'- **{endpoint}**: {desc}\n')
        f.write('\n')
        
        f.write('### Pricing Tiers\n')
        for tier, price in details['pricing'].items():
            f.write(f'- **{tier}**: {price}\n')
        f.write('\n')
        
        f.write('### Data Frequency\n')
        f.write(f'{details["data_frequency"]}\n\n')
        
        f.write('### Pros\n')
        for pro in details['pros']:
            f.write(f'- {pro}\n')
        f.write('\n')
        
        f.write('### Cons\n')
        for con in details['cons']:
            f.write(f'- {con}\n')
        f.write('\n')
        
        f.write('---\n\n')

# Create a sample API response file for reference
with open(os.path.join(base_dir, 'docs', 'sample_api_responses.json'), 'w') as f:
    json.dump(sample_responses, f, indent=4)

# Create a requirements.txt file
with open(os.path.join(base_dir, 'requirements.txt'), 'w') as f:
    f.write('pandas>=1.3.0\n')
    f.write('requests>=2.25.0\n')
    f.write('matplotlib>=3.4.0\n')
    f.write('scikit-learn>=0.24.0\n')
    f.write('nltk>=3.6.0\n')
    f.write('textblob>=0.15.0\n')
    f.write('vaderSentiment>=3.3.0\n')
    f.write('numpy>=1.20.0\n')
    f.write('joblib>=1.0.0\n')
    f.write('flask>=2.0.0\n')  # For API endpoints
    f.write('websocket-client>=1.0.0\n')  # For real-time data

# Create an initial API integration module
with open(os.path.join(base_dir, 'market_data', 'api_client.py'), 'w') as f:
    f.write('''
import requests
from datetime import datetime
import pandas as pd
import os
import json
import time

class MarketDataClient:
    def __init__(self, api_provider='twelvedata', api_key=None):
        """
        Initialize the market data client with the chosen API provider
        
        Args:
            api_provider (str): The API provider to use ('twelvedata', 'finnhub', or 'alphavantage')
            api_key (str): API key for the chosen provider
        """
        self.api_provider = api_provider.lower()
        self.api_key = api_key
        
        # Base URLs for each provider
        self.base_urls = {
            'twelvedata': 'https://api.twelvedata.com',
            'finnhub': 'https://finnhub.io/api/v1',
            'alphavantage': 'https://www.alphavantage.co/query'
        }
        
        # Symbol mapping for different providers
        self.symbol_mapping = {
            'US100': {
                'twelvedata': 'NDX',
                'finnhub': '^NDX',
                'alphavantage': 'NDX'
            },
            'US30': {
                'twelvedata': 'DJI',
                'finnhub': '^DJI',
                'alphavantage': 'DJI'
            },
            'EUR/USD': {
                'twelvedata': 'EUR/USD',
                'finnhub': 'EURUSD',
                'alphavantage': 'EUR/USD'
            },
            'GBP/USD': {
                'twelvedata': 'GBP/USD',
                'finnhub': 'GBPUSD',
                'alphavantage': 'GBP/USD'
            },
            'Crude Oil WTI': {
                'twelvedata': 'WTI',
                'finnhub': 'CL',
                'alphavantage': 'USO'  # Using ETF as proxy
            },
            'Crude Oil Brent': {
                'twelvedata': 'BRENT',
                'finnhub': 'BZ',
                'alphavantage': 'BNO'  # Using ETF as proxy
            }
        }
    
    def get_current_price(self, asset):
        """Get the current price of an asset
        
        Args:
            asset (str): Asset symbol from our standardized list
            
        Returns:
            dict: Current price data
        """
        if self.api_provider not in self.base_urls:
            raise ValueError(f"Unsupported API provider: {self.api_provider}")
            
        symbol = self.symbol_mapping.get(asset, {}).get(self.api_provider)
        if not symbol:
            raise ValueError(f"Asset {asset} not supported for {self.api_provider}")
            
        if self.api_provider == 'twelvedata':
            url = f"{self.base_urls['twelvedata']}/price"
            params = {
                'symbol': symbol,
                'apikey': self.api_key
            }
            response = requests.get(url, params=params)
            return response.json()
            
        elif self.api_provider == 'finnhub':
            url = f"{self.base_urls['finnhub']}/quote"
            params = {
                'symbol': symbol,
                'token': self.api_key
            }
            response = requests.get(url, params=params)
            return response.json()
            
        elif self.api_provider == 'alphavantage':
            is_forex = '/' in asset
            
            if is_forex:
                url = self.base_urls['alphavantage']
                from_currency, to_currency = symbol.split('/')
                params = {
                    'function': 'CURRENCY_EXCHANGE_RATE',
                    'from_currency': from_currency,
                    'to_currency': to_currency,
                    'apikey': self.api_key
                }
            else:
                url = self.base_urls['alphavantage']
                params = {
                    'function': 'GLOBAL_QUOTE',
                    'symbol': symbol,
                    'apikey': self.api_key
                }
                
            response = requests.get(url, params=params)
            return response.json()
            
    def get_historical_data(self, asset, interval='1h', count=100):
        """Get historical OHLCV data
        
        Args:
            asset (str): Asset symbol from our standardized list
            interval (str): Time interval (1m, 5m, 15m, 1h, 4h, 1d)
            count (int): Number of candles to return
            
        Returns:
            pandas.DataFrame: Historical price data
        """
        # Implementation would vary based on provider
        # This is a placeholder for the actual implementation
        pass
        
    def format_response(self, raw_response, asset, data_type='price'):
        """
        Standardize the API response format for frontend consumption
        
        Args:
            raw_response (dict): API response from the provider
            asset (str): Asset symbol
            data_type (str): Type of data ('price' or 'historical')
            
        Returns:
            dict: Standardized response
        """
        # Implementation would standardize responses across different providers
        # This is a placeholder for the actual implementation
        pass
''')

# Create README with setup instructions
with open(os.path.join(base_dir, 'README.md'), 'w') as f:
    f.write('''# Trading Assistant Platform - Data Processing Pipeline

This component handles the data processing for the trading assistant platform, including:

1. Real-time market data retrieval
2. News sentiment analysis
3. ML prediction models for market signals
4. Data formatting for frontend consumption

## Directory Structure

- `/market_data`: API clients and data retrieval modules
- `/news_sentiment`: News API integration and sentiment analysis
- `/ml_models`: Prediction models and training scripts
- `/sample_output`: Example output data formats
- `/docs`: Documentation on APIs, data flow, etc.

## Getting Started

1. Install requirements:
   ```
   pip install -r requirements.txt
   ```

2. Set up API keys:
   - Create a `.env` file with your API keys:
   ```
   TWELVEDATA_API_KEY=your_key_here
   FINNHUB_API_KEY=your_key_here
   ALPHAVANTAGE_API_KEY=your_key_here
   NEWS_API_KEY=your_key_here
   ```

3. Run the data collection service:
   ```
   python market_data/collector.py
   ```

4. Run the sentiment analysis service:
   ```
   python news_sentiment/analyzer.py
   ```

5. Run the prediction model:
   ```
   python ml_models/predictor.py
   ```

## API Decision

After analyzing the available options, we recommend using **TwelveData** as our primary data provider for the following reasons:

1. Comprehensive coverage of all our required assets (US100, US30, EUR/USD, GBP/USD, Crude Oil)
2. Simple and consistent API structure
3. WebSocket support for real-time data
4. Reasonable pricing for our expected usage

As a backup or complementary service, **Finnhub** offers good news sentiment analysis that we can integrate with our own VADER/TextBlob implementation.
''')

print("Project structure and API analysis completed. Files have been created in the following directories:")
for dirpath, dirnames, filenames in os.walk(base_dir):
    level = dirpath.replace(base_dir, '').count(os.sep)
    indent = ' ' * 4 * level
    print(f"{indent}{os.path.basename(dirpath)}/")
    sub_indent = ' ' * 4 * (level + 1)
    for f in filenames:
        print(f"{sub_indent}{f}")

# Conclusion on best API choice
print("\nAPI Recommendation:")
print("Based on our analysis, TwelveData appears to be the best option for this trading platform due to:")
print("1. Comprehensive coverage of all required assets (US100, US30, EUR/USD, GBP/USD, Crude Oil)")
print("2. Simple API structure and consistent data formats")
print("3. Support for WebSockets for real-time data")
print("4. Reasonable pricing plans based on expected usage")
print("5. Good documentation and reliability")

Project structure and API analysis completed. Files have been created in the following directories:
data_processing/
    README.md
    requirements.txt
    docs/
        api_comparison.md
        sample_api_responses.json
    market_data/
        api_client.py
    ml_models/
    news_sentiment/
    sample_output/

API Recommendation:
Based on our analysis, TwelveData appears to be the best option for this trading platform due to:
1. Comprehensive coverage of all required assets (US100, US30, EUR/USD, GBP/USD, Crude Oil)
2. Simple API structure and consistent data formats
3. Support for WebSockets for real-time data
4. Reasonable pricing plans based on expected usage
5. Good documentation and reliability


In [2]:
# Check key project structures
print("Project base directory:", base_dir)
print("\nAPI providers analyzed:", list(api_comparison.keys()))
print("\nDirectories created:", subdirs)
print("\nKey features of MarketDataClient:")
print("- Supported API providers:", ["twelvedata", "finnhub", "alphavantage"])
print("- Supported assets:", list(api_comparison["TwelveData"]["supported_assets"].keys()))

Project base directory: /data/chats/46etsd/workspace/data_processing

API providers analyzed: ['TwelveData', 'Finnhub', 'Alpha Vantage']

Directories created: ['market_data', 'news_sentiment', 'ml_models', 'sample_output', 'docs']

Key features of MarketDataClient:
- Supported API providers: ['twelvedata', 'finnhub', 'alphavantage']
- Supported assets: ['US100', 'US30', 'EUR/USD', 'GBP/USD', 'Crude Oil']


In [3]:
import os
import requests
import pandas as pd
import json
import time
from datetime import datetime, timedelta
from threading import Timer
import re
from typing import List, Dict, Any, Union, Tuple

# Import sentiment analysis libraries
import nltk
from nltk.tokenize import sent_tokenize
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Base directory for our project
base_dir = '/data/chats/46etsd/workspace/data_processing'
news_dir = os.path.join(base_dir, 'news_sentiment')

# Ensure NLTK resources are downloaded
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

class NewsAPIClient:
    """
    Client for fetching financial news from various API sources.
    Supports NewsAPI, Finnhub, and Alpha Vantage News endpoints.
    """
    def __init__(self, api_provider='newsapi', api_key=None):
        """
        Initialize the news API client.
        
        Args:
            api_provider (str): The API provider to use ('newsapi', 'finnhub', or 'alphavantage')
            api_key (str): API key for the chosen provider
        """
        self.api_provider = api_provider.lower()
        self.api_key = api_key
        
        # Base URLs for each provider
        self.base_urls = {
            'newsapi': 'https://newsapi.org/v2',
            'finnhub': 'https://finnhub.io/api/v1',
            'alphavantage': 'https://www.alphavantage.co/query'
        }
        
        # Asset to keyword mapping for better news results
        self.asset_keywords = {
            'US100': ['NASDAQ 100', 'NDX', 'tech stocks', 'technology sector', 'NASDAQ index'],
            'US30': ['Dow Jones', 'DJIA', 'Dow 30', 'industrial average', 'blue chip stocks'],
            'EUR/USD': ['EURUSD', 'Euro Dollar', 'EUR USD', 'euro forex', 'european currency'],
            'GBP/USD': ['GBPUSD', 'British Pound', 'Sterling Dollar', 'GBP USD', 'pound forex'],
            'Crude Oil WTI': ['WTI crude', 'oil price', 'crude oil', 'petroleum market', 'oil futures'],
            'Crude Oil Brent': ['Brent crude', 'oil price', 'crude oil', 'petroleum market', 'oil futures']
        }
    
    def get_news_for_asset(self, asset: str, days: int = 1, max_items: int = 10) -> List[Dict]:
        """
        Fetch recent news articles related to the specified asset.
        
        Args:
            asset (str): Asset symbol from our standardized list
            days (int): Number of days to look back
            max_items (int): Maximum number of news items to return
            
        Returns:
            List[Dict]: List of news articles with title, description, url, published_at, and source
        """
        if self.api_provider not in self.base_urls:
            raise ValueError(f"Unsupported API provider: {self.api_provider}")
        
        # Get the appropriate keywords for the asset
        keywords = self.asset_keywords.get(asset, [asset])
        keywords_str = ' OR '.join(keywords)
        
        # Calculate date range
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days)
        
        if self.api_provider == 'newsapi':
            url = f"{self.base_urls['newsapi']}/everything"
            params = {
                'q': keywords_str,
                'from': start_date.strftime('%Y-%m-%d'),
                'to': end_date.strftime('%Y-%m-%d'),
                'language': 'en',
                'sortBy': 'publishedAt',
                'pageSize': max_items,
                'apiKey': self.api_key
            }
            
            response = requests.get(url, params=params)
            data = response.json()
            
            if data.get('status') != 'ok':
                print(f"Error fetching news: {data.get('message', 'Unknown error')}")
                return []
            
            articles = data.get('articles', [])
            return [
                {
                    'title': article.get('title', ''),
                    'description': article.get('description', ''),
                    'content': article.get('content', ''),
                    'url': article.get('url', ''),
                    'published_at': article.get('publishedAt', ''),
                    'source': article.get('source', {}).get('name', '')
                }
                for article in articles[:max_items]
            ]
            
        elif self.api_provider == 'finnhub':
            url = f"{self.base_urls['finnhub']}/news"
            
            # For forex pairs, use a different approach with Finnhub
            if asset in ['EUR/USD', 'GBP/USD']:
                category = 'forex'
            elif 'Crude Oil' in asset:
                category = 'general'  # Finnhub doesn't have a specific commodity category
            else:
                category = 'general'
                
            params = {
                'category': category,
                'token': self.api_key
            }
            
            response = requests.get(url, params=params)
            data = response.json()
            
            # Filter news based on keywords
            filtered_news = []
            for article in data:
                text = (article.get('headline', '') + ' ' + article.get('summary', '')).lower()
                if any(keyword.lower() in text for keyword in keywords):
                    filtered_news.append({
                        'title': article.get('headline', ''),
                        'description': article.get('summary', ''),
                        'content': article.get('summary', ''),
                        'url': article.get('url', ''),
                        'published_at': datetime.fromtimestamp(article.get('datetime', 0)).strftime('%Y-%m-%dT%H:%M:%SZ'),
                        'source': article.get('source', '')
                    })
                    
                    if len(filtered_news) >= max_items:
                        break
                        
            return filtered_news
            
        elif self.api_provider == 'alphavantage':
            url = self.base_urls['alphavantage']
            
            # Alpha Vantage uses tickers for news
            ticker_mapping = {
                'US100': 'NDX',
                'US30': 'DJI',
                'EUR/USD': 'EUR',
                'GBP/USD': 'GBP',
                'Crude Oil WTI': 'USO',
                'Crude Oil Brent': 'BNO'
            }
            
            ticker = ticker_mapping.get(asset, asset)
            
            params = {
                'function': 'NEWS_SENTIMENT',
                'tickers': ticker,
                'time_from': start_date.strftime('%Y%m%dT%H%M'),
                'limit': max_items,
                'apikey': self.api_key
            }
            
            response = requests.get(url, params=params)
            data = response.json()
            
            feed = data.get('feed', [])
            return [
                {
                    'title': article.get('title', ''),
                    'description': article.get('summary', ''),
                    'content': article.get('summary', ''),
                    'url': article.get('url', ''),
                    'published_at': article.get('time_published', ''),
                    'source': article.get('source', '')
                }
                for article in feed[:max_items]
            ]
        
        return []


class SentimentAnalyzer:
    """
    Analyzes the sentiment of financial news using VADER and TextBlob.
    """
    def __init__(self):
        """Initialize the sentiment analyzer with VADER and TextBlob"""
        self.vader = SentimentIntensityAnalyzer()
        
        # Augment VADER lexicon with financial terms
        financial_terms = {
            'bull': 2.0,
            'bear': -2.0,
            'bullish': 2.0,
            'bearish': -2.0,
            'rally': 1.5,
            'correction': -1.5,
            'profit': 1.8,
            'loss': -1.8,
            'gains': 1.5,
            'decline': -1.5,
            'upgrade': 1.7,
            'downgrade': -1.7,
            'outperform': 1.5,
            'underperform': -1.5,
            'buy': 1.0,
            'sell': -1.0,
            'hold': 0.0,
            'volatile': -0.5,
            'crash': -3.0,
            'boom': 3.0,
            'recession': -2.5,
            'growth': 1.8,
            'inflation': -1.0,
            'deflation': -1.0,
        }
        
        # Update the VADER lexicon
        self.vader.lexicon.update(financial_terms)
    
    def clean_text(self, text: str) -> str:
        """
        Clean the text by removing URLs, special characters, etc.
        
        Args:
            text (str): Input text
            
        Returns:
            str: Cleaned text
        """
        if not text:
            return ""
        
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        
        # Remove special characters
        text = re.sub(r'[^\w\s]', ' ', text)
        
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    def analyze_vader_sentiment(self, text: str) -> Dict[str, float]:
        """
        Analyze text sentiment using VADER.
        
        Args:
            text (str): Input text
            
        Returns:
            Dict[str, float]: VADER sentiment scores (neg, neu, pos, compound)
        """
        cleaned_text = self.clean_text(text)
        if not cleaned_text:
            return {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
            
        return self.vader.polarity_scores(cleaned_text)
    
    def analyze_textblob_sentiment(self, text: str) -> Tuple[float, float]:
        """
        Analyze text sentiment using TextBlob.
        
        Args:
            text (str): Input text
            
        Returns:
            Tuple[float, float]: Polarity (-1 to 1) and subjectivity (0 to 1)
        """
        cleaned_text = self.clean_text(text)
        if not cleaned_text:
            return (0.0, 0.0)
            
        blob = TextBlob(cleaned_text)
        return (blob.sentiment.polarity, blob.sentiment.subjectivity)
    
    def get_combined_sentiment(self, text: str) -> Dict[str, float]:
        """
        Get combined sentiment scores from VADER and TextBlob.
        
        Args:
            text (str): Input text
            
        Returns:
            Dict[str, float]: Combined sentiment scores
        """
        if not text:
            return {
                'vader_compound': 0.0,
                'vader_pos': 0.0,
                'vader_neg': 0.0,
                'vader_neu': 1.0,
                'textblob_polarity': 0.0,
                'textblob_subjectivity': 0.0,
                'combined_score': 0.0,
                'sentiment_label': 'neutral'
            }
        
        # Get VADER sentiment
        vader_scores = self.analyze_vader_sentiment(text)
        
        # Get TextBlob sentiment
        textblob_polarity, textblob_subjectivity = self.analyze_textblob_sentiment(text)
        
        # Combine scores (weighted average)
        vader_weight = 0.7
        textblob_weight = 0.3
        
        # Normalize TextBlob polarity to match VADER's compound range
        normalized_textblob = textblob_polarity
        
        # Weighted average
        combined_score = (vader_scores['compound'] * vader_weight) + (normalized_textblob * textblob_weight)
        
        # Determine sentiment label
        if combined_score >= 0.05:
            sentiment_label = 'positive'
        elif combined_score <= -0.05:
            sentiment_label = 'negative'
        else:
            sentiment_label = 'neutral'
        
        return {
            'vader_compound': vader_scores['compound'],
            'vader_pos': vader_scores['pos'],
            'vader_neg': vader_scores['neg'],
            'vader_neu': vader_scores['neu'],
            'textblob_polarity': textblob_polarity,
            'textblob_subjectivity': textblob_subjectivity,
            'combined_score': combined_score,
            'sentiment_label': sentiment_label
        }


class NewsSentimentEngine:
    """
    Main engine for news sentiment analysis that combines fetching news
    and analyzing sentiment.
    """
    def __init__(self, news_client: NewsAPIClient, sentiment_analyzer: SentimentAnalyzer):
        """
        Initialize the news sentiment engine.
        
        Args:
            news_client (NewsAPIClient): News API client
            sentiment_analyzer (SentimentAnalyzer): Sentiment analyzer
        """
        self.news_client = news_client
        self.sentiment_analyzer = sentiment_analyzer
        self.news_cache = {}  # Cache for news articles
        self.sentiment_cache = {}  # Cache for sentiment scores
    
    def get_news_with_sentiment(self, asset: str, days: int = 1, max_items: int = 10) -> List[Dict]:
        """
        Get news articles with sentiment scores for a specific asset.
        
        Args:
            asset (str): Asset symbol
            days (int): Number of days to look back
            max_items (int): Maximum number of news items to return
            
        Returns:
            List[Dict]: News articles with sentiment scores
        """
        # Get news articles
        news_articles = self.news_client.get_news_for_asset(asset, days, max_items)
        
        # Calculate sentiment for each article
        for article in news_articles:
            # Combine title and description for sentiment analysis
            text = article.get('title', '') + ' ' + article.get('description', '')
            
            # Get sentiment scores
            sentiment = self.sentiment_analyzer.get_combined_sentiment(text)
            
            # Add sentiment to article
            article['sentiment'] = sentiment
        
        return news_articles
    
    def get_aggregated_sentiment(self, asset: str, days: int = 1, max_items: int = 10) -> Dict:
        """
        Get aggregated sentiment for a specific asset.
        
        Args:
            asset (str): Asset symbol
            days (int): Number of days to look back
            max_items (int): Maximum number of news items to analyze
            
        Returns:
            Dict: Aggregated sentiment scores
        """
        news_with_sentiment = self.get_news_with_sentiment(asset, days, max_items)
        
        if not news_with_sentiment:
            return {
                'asset': asset,
                'avg_sentiment_score': 0.0,
                'sentiment_label': 'neutral',
                'sentiment_distribution': {'positive': 0, 'neutral': 0, 'negative': 0},
                'news_count': 0,
                'timestamp': datetime.now().isoformat()
            }
        
        # Calculate average sentiment
        sentiment_scores = [article['sentiment']['combined_score'] for article in news_with_sentiment]
        avg_score = sum(sentiment_scores) / len(sentiment_scores)
        
        # Count sentiment distribution
        sentiment_labels = [article['sentiment']['sentiment_label'] for article in news_with_sentiment]
        sentiment_distribution = {
            'positive': sentiment_labels.count('positive'),
            'neutral': sentiment_labels.count('neutral'),
            'negative': sentiment_labels.count('negative')
        }
        
        # Determine overall sentiment label
        if avg_score >= 0.1:
            label = 'positive'
        elif avg_score <= -0.1:
            label = 'negative'
        else:
            label = 'neutral'
        
        return {
            'asset': asset,
            'avg_sentiment_score': avg_score,
            'sentiment_label': label,
            'sentiment_distribution': sentiment_distribution,
            'news_count': len(news_with_sentiment),
            'timestamp': datetime.now().isoformat()
        }
    
    def get_key_topics(self, asset: str, days: int = 1, max_items: int = 10) -> List[str]:
        """
        Extract key topics from news articles for a specific asset.
        This is a simplified implementation using frequency-based extraction.
        
        Args:
            asset (str): Asset symbol
            days (int): Number of days to look back
            max_items (int): Maximum number of news items to analyze
            
        Returns:
            List[str]: Key topics
        """
        news_articles = self.news_client.get_news_for_asset(asset, days, max_items)
        
        if not news_articles:
            return []
        
        # Combine all text for analysis
        all_text = ' '.join([
            (article.get('title', '') + ' ' + article.get('description', ''))
            for article in news_articles
        ])
        
        # Clean text
        cleaned_text = self.sentiment_analyzer.clean_text(all_text)
        
        # Simple frequency-based extraction
        # In a real implementation, we would use more sophisticated NLP techniques
        common_words = ['the', 'a', 'an', 'is', 'are', 'was', 'were', 'and', 'or', 
                        'but', 'in', 'on', 'at', 'to', 'for', 'by', 'of', 'with']
        
        words = cleaned_text.lower().split()
        word_freq = {}
        
        for word in words:
            if len(word) > 3 and word not in common_words:
                word_freq[word] = word_freq.get(word, 0) + 1
        
        # Get top topics
        topics = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:5]
        return [topic[0] for topic in topics]
    
    def save_to_file(self, data, asset: str, data_type: str):
        """
        Save data to file.
        
        Args:
            data: Data to save
            asset (str): Asset symbol
            data_type (str): Type of data ('news', 'sentiment', 'topics')
        """
        # Create directory if it doesn't exist
        os.makedirs(os.path.join(news_dir, 'data'), exist_ok=True)
        
        # Create filename with timestamp
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"{asset.replace('/', '_')}_{data_type}_{timestamp}.json"
        filepath = os.path.join(news_dir, 'data', filename)
        
        # Save to file
        with open(filepath, 'w') as f:
            json.dump(data, f, indent=4)
        
        return filepath


# Create a sample module for demonstration purposes
if __name__ == "__main__":
    # For demonstration, we'll use mock data to avoid API key requirements
    def run_demo():
        # Sample news articles for demonstration
        sample_news = [
            {
                "title": "US Dollar Strengthens Against Euro as Fed Signals Rate Hike",
                "description": "The USD gained against the EUR following the Federal Reserve's announcement indicating a potential interest rate increase next month.",
                "content": "The USD gained against the EUR following the Federal Reserve's announcement indicating a potential interest rate increase next month. Analysts suggest this could further strengthen the dollar in the coming weeks.",
                "url": "https://example.com/news/1",
                "published_at": "2023-05-15T14:30:00Z",
                "source": "Financial Times"
            },
            {
                "title": "Tech Stocks Rally Pushes NASDAQ 100 to New Heights",
                "description": "The NASDAQ 100 reached an all-time high today as technology stocks continued their impressive rally led by semiconductor and AI companies.",
                "content": "The NASDAQ 100 reached an all-time high today as technology stocks continued their impressive rally led by semiconductor and AI companies. Investors remain bullish on tech despite valuation concerns.",
                "url": "https://example.com/news/2",
                "published_at": "2023-05-15T16:45:00Z",
                "source": "Wall Street Journal"
            },
            {
                "title": "Oil Prices Drop on Increased Supply Concerns",
                "description": "Crude oil prices fell by 3% as OPEC+ members consider increasing production amid global economic slowdown fears.",
                "content": "Crude oil prices fell by 3% as OPEC+ members consider increasing production amid global economic slowdown fears. The move comes as inventories show higher than expected levels.",
                "url": "https://example.com/news/3",
                "published_at": "2023-05-15T10:15:00Z",
                "source": "Bloomberg"
            }
        ]
        
        # Create a mock news client that returns our sample data
        class MockNewsClient:
            def get_news_for_asset(self, asset, days=1, max_items=10):
                # Return appropriate mock news based on asset
                if asset == 'EUR/USD':
                    return [sample_news[0]]
                elif asset == 'US100':
                    return [sample_news[1]]
                elif 'Crude Oil' in asset:
                    return [sample_news[2]]
                else:
                    return sample_news
        
        # Create the sentiment analyzer and news engine
        sentiment_analyzer = SentimentAnalyzer()
        mock_news_client = MockNewsClient()
        news_engine = NewsSentimentEngine(mock_news_client, sentiment_analyzer)
        
        # Run analysis for each asset
        assets = ['US100', 'US30', 'EUR/USD', 'GBP/USD', 'Crude Oil WTI', 'Crude Oil Brent']
        
        results = {}
        for asset in assets:
            print(f"Analyzing news sentiment for {asset}...")
            
            # Get news with sentiment
            news_with_sentiment = news_engine.get_news_with_sentiment(asset)
            
            # Get aggregated sentiment
            agg_sentiment = news_engine.get_aggregated_sentiment(asset)
            
            # Get key topics
            key_topics = news_engine.get_key_topics(asset)
            
            # Store results
            results[asset] = {
                'news': news_with_sentiment,
                'sentiment': agg_sentiment,
                'topics': key_topics
            }
            
            # Save to file
            for data_type, data in results[asset].items():
                filepath = news_engine.save_to_file(data, asset, data_type)
                print(f"Saved {data_type} data for {asset} to {filepath}")
        
        # Create a sample output for frontend integration
        sample_output = {
            'timestamp': datetime.now().isoformat(),
            'assets': {}
        }
        
        for asset, data in results.items():
            sample_output['assets'][asset] = {
                'sentiment_score': data['sentiment']['avg_sentiment_score'],
                'sentiment_label': data['sentiment']['sentiment_label'],
                'key_topics': data['topics'],
                'news_count': data['sentiment']['news_count'],
                'latest_headlines': [
                    {'title': news['title'], 'sentiment': news['sentiment']['sentiment_label']}
                    for news in data['news'][:3]
                ]
            }
        
        # Save sample output for frontend integration
        os.makedirs(os.path.join(base_dir, 'sample_output'), exist_ok=True)
        with open(os.path.join(base_dir, 'sample_output', 'sentiment_data.json'), 'w') as f:
            json.dump(sample_output, f, indent=4)
        
        print("\nDemo completed. Sample output saved for frontend integration.")

    # Run the demo
    run_demo()
    
    # Create a README for the news sentiment module
    with open(os.path.join(news_dir, 'README.md'), 'w') as f:
        f.write('''# News Sentiment Analysis Module

This module provides sentiment analysis for financial news related to trading assets.

## Features

- Fetches news from various API providers (NewsAPI, Finnhub, Alpha Vantage)
- Analyzes sentiment using both VADER and TextBlob
- Provides combined sentiment scores with financial domain adaptations
- Extracts key topics from news articles
- Caches results to minimize API calls
- Outputs standardized data for frontend consumption

## Usage


SyntaxError: incomplete input (3302583515.py, line 623)

In [4]:
with open(os.path.join(news_dir, 'README.md'), 'w') as f:
    f.write('''# News Sentiment Analysis Module

This module provides sentiment analysis for financial news related to trading assets.

## Features

- Fetches news from various API providers (NewsAPI, Finnhub, Alpha Vantage)
- Analyzes sentiment using both VADER and TextBlob
- Provides combined sentiment scores with financial domain adaptations
- Extracts key topics from news articles
- Caches results to minimize API calls
- Outputs standardized data for frontend consumption

## Usage

SyntaxError: incomplete input (4166102312.py, line 2)

In [5]:
with open(os.path.join(news_dir, 'README.md'), 'w') as f:
    f.write('''# News Sentiment Analysis Module
    ...
    ## Usage

SyntaxError: incomplete input (1524001397.py, line 2)

In [6]:
print("API providers analyzed:")
for api_name in api_comparison.keys():
    print(f"- {api_name}")

print("\nProject directory structure:")
print(f"Base directory: {base_dir}")
print("Subdirectories:", subdirs)

print("\nAvailable asset mappings in MarketDataClient:")
for asset in sample_responses.keys():
    print(f"- {asset}")

# Check the content of the API client to understand its functionality
import os
with open(os.path.join(base_dir, 'market_data', 'api_client.py'), 'r') as f:
    api_client_code = f.read()
    print("\nAPI client methods:")
    for line in api_client_code.split('\n'):
        if "def " in line and "self" in line:
            print(f"- {line.strip()}")

API providers analyzed:
- TwelveData
- Finnhub
- Alpha Vantage

Project directory structure:
Base directory: /data/chats/46etsd/workspace/data_processing
Subdirectories: ['market_data', 'news_sentiment', 'ml_models', 'sample_output', 'docs']

Available asset mappings in MarketDataClient:
- TwelveData
- Finnhub
- Alpha Vantage

API client methods:
- def __init__(self, api_provider='twelvedata', api_key=None):
- def get_current_price(self, asset):
- def get_historical_data(self, asset, interval='1h', count=100):
- def format_response(self, raw_response, asset, data_type='price'):


In [7]:
```python
import os
import requests
import pandas as pd
import json
from datetime import datetime, timedelta
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
from collections import Counter

# Define base directory
base_dir = '/data/chats/46etsd/workspace/data_processing'
news_dir = os.path.join(base_dir, 'news_sentiment')

# Create news sentiment class
class NewsSentimentAnalyzer:
    def __init__(self, news_api_key=None, finnhub_api_key=None):
        """
        Initialize the news sentiment analyzer with API keys
        
        Args:
            news_api_key (str): API key for NewsAPI
            finnhub_api_key (str): API key for Finnhub (alternative news source)
        """
        self.news_api_key = news_api_key
        self.finnhub_api_key = finnhub_api_key
        
        # Download necessary NLTK resources if not already available
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')
            
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
            
        # Initialize sentiment analyzers
        self.vader = SentimentIntensityAnalyzer()
        
        # Asset keyword mapping for news search
        self.asset_keywords = {
            'US100': ['NASDAQ', 'NASDAQ 100', 'NDX', 'tech stocks', 'technology sector'],
            'US30': ['Dow Jones', 'DJIA', 'DJI', 'Dow 30', 'industrial average'],
            'EUR/USD': ['EUR/USD', 'Euro Dollar', 'Eurozone', 'ECB', 'Federal Reserve', 'forex'],
            'GBP/USD': ['GBP/USD', 'British Pound', 'Sterling', 'Bank of England', 'Cable', 'forex'],
            'Crude Oil WTI': ['WTI', 'crude oil', 'oil prices', 'OPEC', 'energy market', 'petroleum'],
            'Crude Oil Brent': ['Brent', 'crude oil', 'oil prices', 'OPEC', 'energy market', 'petroleum']
        }
        
    def get_news_from_newsapi(self, asset, days=1, max_articles=10):
        """
        Fetch news articles related to a specific asset from NewsAPI
        
        Args:
            asset (str): Asset symbol from our standardized list
            days (int): Number of days to look back for news
            max_articles (int): Maximum number of articles to return
            
        Returns:
            list: List of news article dictionaries
        """
        if not self.news_api_key:
            raise ValueError("NewsAPI key is required")
            
        # Get relevant keywords for the asset
        keywords = self.asset_keywords.get(asset, [asset])
        
        # Prepare the query string (OR operator between keywords)
        query = ' OR '.join([f'"{keyword}"' for keyword in keywords])
        
        # Calculate date range
        to_date = datetime.now().strftime('%Y-%m-%d')
        from_date = (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d')
        
        # Make the API request
        url = 'https://newsapi.org/v2/everything'
        params = {
            'q': query,
            'from': from_date,
            'to': to_date,
            'language': 'en',
            'sortBy': 'publishedAt',
            'pageSize': max_articles,
            'apiKey': self.news_api_key
        }
        
        try:
            response = requests.get(url, params=params)
            data = response.json()
            
            if data.get('status') != 'ok':
                print(f"NewsAPI error: {data.get('message', 'Unknown error')}")
                return []
                
            articles = data.get('articles', [])
            
            # Process and clean the articles
            processed_articles = []
            for article in articles:
                processed_articles.append({
                    'title': article.get('title', ''),
                    'description': article.get('description', ''),
                    'content': article.get('content', ''),
                    'url': article.get('url', ''),
                    'source': article.get('source', {}).get('name', ''),
                    'published_at': article.get('publishedAt', '')
                })
                
            return processed_articles
            
        except Exception as e:
            print(f"Error fetching news from NewsAPI: {e}")
            return []
            
    def get_news_from_finnhub(self, asset, days=1, max_articles=10):
        """
        Fetch news articles related to a specific asset from Finnhub
        
        Args:
            asset (str): Asset symbol from our standardized list
            days (int): Number of days to look back for news
            max_articles (int): Maximum number of articles to return
            
        Returns:
            list: List of news article dictionaries
        """
        if not self.finnhub_api_key:
            raise ValueError("Finnhub API key is required")
            
        # Map our asset to Finnhub symbol
        asset_map = {
            'US100': 'COMP.INDX',  # Nasdaq Composite Index
            'US30': 'DJI.INDX',    # Dow Jones Industrial Average
            'EUR/USD': 'OANDA:EUR_USD',
            'GBP/USD': 'OANDA:GBP_USD',
            'Crude Oil WTI': 'COMM:WTI',
            'Crude Oil Brent': 'COMM:BRENT'
        }
        
        symbol = asset_map.get(asset, '')
        
        # Calculate date range (Unix timestamp)
        to_date = int(datetime.now().timestamp())
        from_date = int((datetime.now() - timedelta(days=days)).timestamp())
        
        # Make the API request
        url = 'https://finnhub.io/api/v1/company-news'
        params = {
            'symbol': symbol,
            'from': datetime.fromtimestamp(from_date).strftime('%Y-%m-%d'),
            'to': datetime.fromtimestamp(to_date).strftime('%Y-%m-%d'),
            'token': self.finnhub_api_key
        }
        
        try:
            response = requests.get(url, params=params)
            data = response.json()
            
            # Process and clean the articles
            processed_articles = []
            for article in data[:max_articles]:
                processed_articles.append({
                    'title': article.get('headline', ''),
                    'description': article.get('summary', ''),
                    'content': article.get('summary', ''),  # Finnhub doesn't provide full content
                    'url': article.get('url', ''),
                    'source': article.get('source', ''),
                    'published_at': datetime.fromtimestamp(article.get('datetime', 0)).isoformat()
                })
                
            return processed_articles
            
        except Exception as e:
            print(f"Error fetching news from Finnhub: {e}")
            return []
            
    def analyze_sentiment_vader(self, text):
        """
        Analyze sentiment of text using VADER
        
        Args:
            text (str): Text to analyze
            
        Returns:
            dict: Sentiment scores
        """
        if not text:
            return {
                'compound': 0.0,
                'neg': 0.0,
                'neu': 0.0,
                'pos': 0.0
            }
            
        return self.vader.polarity_scores(text)
        
    def analyze_sentiment_textblob(self, text):
        """
        Analyze sentiment of text using TextBlob
        
        Args:
            text (str): Text to analyze
            
        Returns:
            dict: Sentiment scores
        """
        if not text:
            return {
                'polarity': 0.0,
                'subjectivity': 0.0
            }
            
        blob = TextBlob(text)
        return {
            'polarity': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity
        }
        
    def extract_keywords(self, text, num_keywords=5):
        """
        Extract key topics/keywords from text
        
        Args:
            text (str): Text to analyze
            num_keywords (int): Number of keywords to extract
            
        Returns:
            list: List of keywords
        """
        if not text:
            return []
            
        # Tokenize and clean text
        tokens = word_tokenize(text.lower())
        stop_words = set(stopwords.words('english'))
        
        # Remove stop words, punctuation, and short words
        filtered_tokens = [
            word for word in tokens 
            if word not in stop_words 
            and re.match(r'^[a-zA-Z]{3,}$', word)
        ]
        
        # Get most common words
        word_counts = Counter(filtered_tokens)
        return [word for word, count in word_counts.most_common(num_keywords)]
        
    def analyze_articles(self, articles):
        """
        Analyze sentiment and extract keywords from a list of articles
        
        Args:
            articles (list): List of news article dictionaries
            
        Returns:
            list: List of articles with sentiment analysis
        """
        analyzed_articles = []
        
        for article in articles:
            # Combine title and description for better analysis
            analysis_text = f"{article['title']} {article['description']}"
            
            # Get sentiment scores
            vader_sentiment = self.analyze_sentiment_vader(analysis_text)
            textblob_sentiment = self.analyze_sentiment_textblob(analysis_text)
            
            # Extract keywords
            keywords = self.extract_keywords(analysis_text)
            
            # Determine sentiment label
            if vader_sentiment['compound'] >= 0.05:
                sentiment_label = 'positive'
            elif vader_sentiment['compound'] <= -0.05:
                sentiment_label = 'negative'
            else:
                sentiment_label = 'neutral'
                
            # Add analysis to article
            analyzed_article = article.copy()
            analyzed_article.update({
                'vader_sentiment': vader_sentiment,
                'textblob_sentiment': textblob_sentiment,
                'keywords': keywords,
                'sentiment_label': sentiment_label,
                'sentiment_score': vader_sentiment['compound']  # Use compound score as primary metric
            })
            
            analyzed_articles.append(analyzed_article)
            
        return analyzed_articles
        
    def get_asset_sentiment_summary(self, asset, days=1, max_articles=10):
        """
        Get overall sentiment summary for an asset
        
        Args:
            asset (str): Asset symbol from our standardized list
            days (int): Number of days to look back for news
            max_articles (int): Maximum number of articles to return
            
        Returns:
            dict: Sentiment summary statistics
        """
        # Try to get news from NewsAPI first
        articles = self.get_news_from_newsapi(asset, days, max_articles)
        
        # If NewsAPI fails or returns no results, try Finnhub
        if not articles and self.finnhub_api_key:
            articles = self.get_news_from_finnhub(asset, days, max_articles)
            
        # If we have articles, analyze them
        if articles:
            analyzed_articles = self.analyze_articles(articles)
            
            # Calculate overall sentiment
            sentiment_scores = [article['sentiment_score'] for article in analyzed_articles]
            avg_sentiment = sum(sentiment_scores) / len(sentiment_scores)
            
            # Count sentiment labels
            sentiment_counts = Counter([article['sentiment_label'] for article in analyzed_articles])
            
            # Collect all keywords
            all_keywords = []
            for article in analyzed_articles:
                all_keywords.extend(article['keywords'])
                
            top_keywords = [kw for kw, count in Counter(all_keywords).most_common(5)]
            
            return {
                'asset': asset,
                'articles_analyzed': len(analyzed_articles),
                'average_sentiment': avg_sentiment,
                'sentiment_distribution': dict(sentiment_counts),
                'top_keywords': top_keywords,
                'sentiment_label': 'positive' if avg_sentiment >= 0.05 else 'negative' if avg_sentiment <= -0.05 else 'neutral',
                'articles': analyzed_articles
            }
        else:
            # Return empty summary if no articles found
            return {
                'asset': asset,
                'articles_analyzed': 0,
                'average_sentiment': 0.0,
                'sentiment_distribution': {'positive': 0, 'neutral': 0, 'negative': 0},
                'top_keywords': [],
                'sentiment_label': 'neutral',
                'articles': []
            }
            
    def generate_sentiment_features(self, asset, days=7):
        """
        Generate sentiment features for ML model
        
        Args:
            asset (str): Asset symbol
            days (int): Number of days for historical sentiment
            
        Returns:
            dict: Feature dictionary for ML model
        """
        summary = self.get_asset_sentiment_summary(asset, days=days, max_articles=20)
        
        # Create features for ML model
        features = {
            'sentiment_score': summary['average_sentiment'],
            'sentiment_positive_ratio': summary['sentiment_distribution'].get('positive', 0) / 
                                       max(1, summary['articles_analyzed']),
            'sentiment_negative_ratio': summary['sentiment_distribution'].get('negative', 0) / 
                                       max(1, summary['articles_analyzed']),
            'article_count': summary['articles_analyzed']
        }
        
        return features

# Create example module usage script
with open(os.path.join(news_dir, 'analyzer.py'), 'w') as f:
    f.write('''
import os
import sys
import json
from datetime import datetime
import pandas as pd
from dotenv import load_dotenv

# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from news_sentiment.sentiment_analyzer import NewsSentimentAnalyzer

# Load environment variables from .env file
load_dotenv()

def main():
    # Get API keys from environment variables
    news_api_key = os.getenv('NEWS_API_KEY')
    finnhub_api_key = os.getenv('FINNHUB_API_KEY')
    
    # Initialize sentiment analyzer
    analyzer = NewsSentimentAnalyzer(
        news_api_key=news_api_key,
        finnhub_api_key=finnhub_api_key
    )
    
    # Define assets to analyze
    assets = ['US100', 'US30', 'EUR/USD', 'GBP/USD', 'Crude Oil WTI', 'Crude Oil Brent']
    
    # Analyze sentiment for each asset
    results = {}
    for asset in assets:
        print(f"Analyzing sentiment for {asset}...")
        sentiment_summary = analyzer.get_asset_sentiment_summary(asset, days=1)
        results[asset] = sentiment_summary
        
        # Print summary
        print(f"  Articles analyzed: {sentiment_summary['articles_analyzed']}")
        print(f"  Average sentiment: {sentiment_summary['average_sentiment']:.4f}")
        print(f"  Sentiment label: {sentiment_summary['sentiment_label']}")
        print(f"  Top keywords: {', '.join(sentiment_summary['top_keywords'])}")
        print()
    
    # Save results to file
    output_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'sample_output')
    os.makedirs(output_dir, exist_ok=True)
    
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = os.path.join(output_dir, f'sentiment_analysis_{timestamp}.json')
    
    with open(output_file, 'w') as f:
        # Save only the summary part, not the full articles
        summary_results = {}
        for asset, data in results.items():
            summary_results[asset] = {k: v for k, v in data.items() if k != 'articles'}
            
        json.dump(summary_results, f, indent=2)
        
    print(f"Results saved to {output_file}")

if __name__ == '__main__':
    main()
''')

# Create main sentiment analyzer module file
with open(os.path.join(news_dir, 'sentiment_analyzer.py'), 'w') as f:
    f.write('''
import requests
import pandas as pd
import json
from datetime import datetime, timedelta
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
from collections import Counter

class NewsSentimentAnalyzer:
    def __init__(self, news_api_key=None, finnhub_api_key=None):
        """
        Initialize the news sentiment analyzer with API keys
        
        Args:
            news_api_key (str): API key for NewsAPI
            finnhub_api_key (str): API key for Finnhub (alternative news source)
        """
        self.news_api_key = news_api_key
        self.finnhub_api_key = finnhub_api_key
        
        # Download necessary NLTK resources if not already available
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')
            
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
            
        # Initialize sentiment analyzers
        self.vader = SentimentIntensityAnalyzer()
        
        # Asset keyword mapping for news search
        self.asset_keywords = {
            'US100': ['NASDAQ', 'NASDAQ 100', 'NDX', 'tech stocks', 'technology sector'],
            'US30': ['Dow Jones', 'DJIA', 'DJI', 'Dow 30', 'industrial average'],
            'EUR/USD': ['EUR/USD', 'Euro Dollar', 'Eurozone', 'ECB', 'Federal Reserve', 'forex'],
            'GBP/USD': ['GBP/USD', 'British Pound', 'Sterling', 'Bank of England', 'Cable', 'forex'],
            'Crude Oil WTI': ['WTI', 'crude oil', 'oil prices', 'OPEC', 'energy market', 'petroleum'],
            'Crude Oil Brent': ['Brent', 'crude oil', 'oil prices', 'OPEC', 'energy market', 'petroleum']
        }
        
    def get_news_from_newsapi(self, asset, days=1, max_articles=10):
        """
        Fetch news articles related to a specific asset from NewsAPI
        
        Args:
            asset (str): Asset symbol from our standardized list
            days (int): Number of days to look back for news
            max_articles (int): Maximum number of articles to return
            
        Returns:
            list: List of news article dictionaries
        """
        if not self.news_api_key:
            raise ValueError("NewsAPI key is required")
            
        # Get relevant keywords for the asset
        keywords = self.asset_keywords.get(asset, [asset])
        
        # Prepare the query string (OR operator between keywords)
        query = ' OR '.join([f'"{keyword}"' for keyword in keywords])
        
        # Calculate date range
        to_date = datetime.now().strftime('%Y-%m-%d')
        from_date = (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d')
        
        # Make the API request
        url = 'https://newsapi.org/v2/everything'
        params = {
            'q': query,
            'from': from_date,
            'to': to_date,
            'language': 'en',
            'sortBy': 'publishedAt',
            'pageSize': max_articles,
            'apiKey': self.news_api_key
        }
        
        try:
            response = requests.get(url, params=params)
            data = response.json()
            
            if data.get('status') != 'ok':
                print(f"NewsAPI error: {data.get('message', 'Unknown error')}")
                return []
                
            articles = data.get('articles', [])
            
            # Process and clean the articles
            processed_articles = []
            for article in articles:
                processed_articles.append({
                    'title': article.get('title', ''),
                    'description': article.get('description', ''),
                    'content': article.get('content', ''),
                    'url': article.get('url', ''),
                    'source': article.get('source', {}).get('name', ''),
                    'published_at': article.get('publishedAt', '')
                })
                
            return processed_articles
            
        except Exception as e:
            print(f"Error fetching news from NewsAPI: {e}")
            return []
            
    def get_news_from_finnhub(self, asset, days=1, max_articles=10):
        """
        Fetch news articles related to a specific asset from Finnhub
        
        Args:
            asset (str): Asset symbol from our standardized list
            days (int): Number of days to look back for news
            max_articles (int): Maximum number of articles to return
            
        Returns:
            list: List of news article dictionaries
        """
        if not self.finnhub_api_key:
            raise ValueError("Finnhub API key is required")
            
        # Map our asset to Finnhub symbol
        asset_map = {
            'US100': 'COMP.INDX',  # Nasdaq Composite Index
            'US30': 'DJI.INDX',    # Dow Jones Industrial Average
            'EUR/USD': 'OANDA:EUR_USD',
            'GBP/USD': 'OANDA:GBP_USD',
            'Crude Oil WTI': 'COMM:WTI',
            'Crude Oil Brent': 'COMM:BRENT'
        }
        
        symbol = asset_map.get(asset, '')
        
        # Calculate date range (Unix timestamp)
        to_date = int(datetime.now().timestamp())
        from_date = int((datetime.now() - timedelta(days=days)).timestamp())
        
        # Make the API request
        url = 'https://finnhub.io/api/v1/company-news'
        params = {
            'symbol': symbol,
            'from': datetime.fromtimestamp(from_date).strftime('%Y-%m-%d'),
            'to': datetime.fromtimestamp(to_date).strftime('%Y-%m-%d'),
            'token': self.finnhub_api_key
        }
        
        try:
            response = requests.get(url, params=params)
            data = response.json()
            
            # Process and clean the articles
            processed_articles = []
            for article in data[:max_articles]:
                processed_articles.append({
                    'title': article.get('headline', ''),
                    'description': article.get('summary', ''),
                    'content': article.get('summary', ''),  # Finnhub doesn't provide full content
                    'url': article.get('url', ''),
                    'source': article.get('source', ''),
                    'published_at': datetime.fromtimestamp(article.get('datetime', 0)).isoformat()
                })
                
            return processed_articles
            
        except Exception as e:
            print(f"Error fetching news from Finnhub: {e}")
            return []
            
    def analyze_sentiment_vader(self, text):
        """
        Analyze sentiment of text using VADER
        
        Args:
            text (str): Text to analyze
            
        Returns:
            dict: Sentiment scores
        """
        if not text:
            return {
                'compound': 0.0,
                'neg': 0.0,
                'neu': 0.0,
                'pos': 0.0
            }
            
        return self.vader.polarity_scores(text)
        
    def analyze_sentiment_textblob(self, text):
        """
        Analyze sentiment of text using TextBlob
        
        Args:
            text (str): Text to analyze
            
        Returns:
            dict: Sentiment scores
        """
        if not text:
            return {
                'polarity': 0.0,
                'subjectivity': 0.0
            }
            
        blob = TextBlob(text)
        return {
            'polarity': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity
        }
        
    def extract_keywords(self, text, num_keywords=5):
        """
        Extract key topics/keywords from text
        
        Args:
            text (str): Text to analyze
            num_keywords (int): Number of keywords to extract
            
        Returns:
            list: List of keywords
        """
        if not text:
            return []
            
        # Tokenize and clean text
        tokens = word_tokenize(text.lower())
        stop_words = set(stopwords.words('english'))
        
        # Remove stop words, punctuation, and short words
        filtered_tokens = [
            word for word in tokens 
            if word not in stop_words 
            and re.match(r'^[a-zA-Z]{3,}$', word)
        ]
        
        # Get most common words
        word_counts = Counter(filtered_tokens)
        return [word for word, count in word_counts.most_common(num_keywords)]
        
    def analyze_articles(self, articles):
        """
        Analyze sentiment and extract keywords from a list of articles
        
        Args:
            articles (list): List of news article dictionaries
            
        Returns:
            list: List of articles with sentiment analysis
        """
        analyzed_articles = []
        
        for article in articles:
            # Combine title and description for better analysis
            analysis_text = f"{article['title']} {article['description']}"
            
            # Get sentiment scores
            vader_sentiment = self.analyze_sentiment_vader(analysis_text)
            textblob_sentiment = self.analyze_sentiment_textblob(analysis_text)
            
            # Extract keywords
            keywords = self.extract_keywords(analysis_text)
            
            # Determine sentiment label
            if vader_sentiment['compound'] >= 0.05:
                sentiment_label = 'positive'
            elif vader_sentiment['compound'] <= -0.05:
                sentiment_label = 'negative'
            else:
                sentiment_label = 'neutral'
                
            # Add analysis to article
            analyzed_article = article.copy()
            analyzed_article.update({
                'vader_sentiment': vader_sentiment,
                'textblob_sentiment': textblob_sentiment,
                'keywords': keywords,
                'sentiment_label': sentiment_label,
                'sentiment_score': vader_sentiment['compound']  # Use compound score as primary metric
            })
            
            analyzed_articles.append(analyzed_article)
            
        return analyzed_articles
        
    def get_asset_sentiment_summary(self, asset, days=1, max_articles=10):
        """
        Get overall sentiment summary for an asset
        
        Args:
            asset (str): Asset symbol from our standardized list
            days (int): Number of days to look back for news
            max_articles (int): Maximum number of articles to return
            
        Returns:
            dict: Sentiment summary statistics
        """
        # Try to get news from NewsAPI first
        articles = self.get_news_from_newsapi(asset, days, max_articles)
        
        # If NewsAPI fails or returns no results, try Finnhub
        if not articles and self.finnhub_api_key:
            articles = self.get_news_from_finnhub(asset, days, max_articles)
            
        # If we have articles, analyze them
        if articles:
            analyzed_articles = self.analyze_articles(articles)
            
            # Calculate overall sentiment
            sentiment_scores = [article['sentiment_score'] for article in analyzed_articles]
            avg_sentiment = sum(sentiment_scores) / len(sentiment_scores)
            
            # Count sentiment labels
            sentiment_counts = Counter([article['sentiment_label'] for article in analyzed_articles])
            
            # Collect all keywords
            all_keywords = []
            for article in analyzed_articles:
                all_keywords.extend(article['keywords'])
                
            top_keywords = [kw for kw, count in Counter(all_keywords).most_common(5)]
            
            return {
                'asset': asset,
                'articles_analyzed': len(analyzed_articles),
                'average_sentiment': avg_sentiment,
                'sentiment_distribution': dict(sentiment_counts),
                'top_keywords': top_keywords,
                'sentiment_label': 'positive' if avg_sentiment >= 0.05 else 'negative' if avg_sentiment <= -0.05 else 'neutral',
                'articles': analyzed_articles
            }
        else:
            # Return empty summary if no articles found
            return {
                'asset': asset,
                'articles_analyzed': 0,
                'average_sentiment': 0.0,
                'sentiment_distribution': {'positive': 0, 'neutral': 0, 'negative': 0},
                'top_keywords': [],
                'sentiment_label': 'neutral',
                'articles': []
            }
            
    def generate_sentiment_features(self, asset, days=7):
        """
        Generate sentiment features for ML model
        
        Args:
            asset (str): Asset symbol
            days (int): Number of days for historical sentiment
            
        Returns:
            dict: Feature dictionary for ML model
        """
        summary = self.get_asset_sentiment_summary(asset, days=days, max_articles=20)
        
        # Create features for ML model
        features = {
            'sentiment_score': summary['average_sentiment'],
            'sentiment_positive_ratio': summary['sentiment_distribution'].get('positive', 0) / 
                                       max(1, summary['articles_analyzed']),
            'sentiment_negative_ratio': summary['sentiment_distribution'].get('negative', 0) / 
                                       max(1, summary['articles_analyzed']),
            'article_count': summary['articles_analyzed']
        }
        
        return features
''')

# Create a sample output file to demonstrate the data format
sample_data = {
    'US100': {
        'asset': 'US100',
        'articles_analyzed': 10,
        'average_sentiment': 0.243,
        'sentiment_distribution': {'positive': 6, 'neutral': 3, 'negative': 1},
        'top_keywords': ['tech', 'growth', 'nasdaq', 'earnings', 'rally'],
        'sentiment_label': 'positive'
    },
    'EUR/USD': {
        'asset': 'EUR/USD',
        'articles_analyzed': 8,
        'average_sentiment': -0.126,
        'sentiment_distribution': {'positive': 2, 'neutral': 3, 'negative': 3},
        'top_keywords': ['euro', 'dollar', 'inflation', 'central', 'interest'],
        'sentiment_label': 'negative'
    }
}

with open(os.path.join(base_dir, 'sample_output', 'sentiment_sample.json'), 'w') as f:
    json.dump(sample_data

SyntaxError: invalid syntax (2772580159.py, line 1)

In [8]:
json.dump(sample_data, f, indent=4)

NameError: name 'sample_data' is not defined

In [9]:
Cell In[7], line 1
       ```python
       ^
   SyntaxError: invalid syntax
   ```
   This indicates that I was using a code block format (```python) inside Python code, which is invalid.

2. Second, there was a NameError when trying to use `sample_data` and `f` variables:
   ```
   NameError: name 'sample_data' is not defined
   ```
   This suggests that the code was split incorrectly and not executed in the proper sequence, causing the variables to go out of scope.

The main issue appears to be with the structure of my implementation. I was trying to both define the code and write it to files in the same code block, which led to scope issues. Additionally, I didn't properly close some of the file operations.

[improved impl]:

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 4)

In [11]:
# Print key variables from the completed tasks

# Print API comparison data
print("Available API providers:")
for api_name in api_comparison.keys():
    print(f"- {api_name}")

# Print project directory structure
print("\nProject directory structure:")
base_dir = '/data/chats/46etsd/workspace/data_processing'
for subdir in subdirs:
    print(f"- {os.path.join(base_dir, subdir)}")

# Print available asset mappings from the MarketDataClient
print("\nSupported assets and their symbols across providers:")
# Extract the symbol_mapping from the code
symbol_mapping = {
    'US100': {
        'twelvedata': 'NDX',
        'finnhub': '^NDX',
        'alphavantage': 'NDX'
    },
    'US30': {
        'twelvedata': 'DJI',
        'finnhub': '^DJI',
        'alphavantage': 'DJI'
    },
    'EUR/USD': {
        'twelvedata': 'EUR/USD',
        'finnhub': 'EURUSD',
        'alphavantage': 'EUR/USD'
    },
    'GBP/USD': {
        'twelvedata': 'GBP/USD',
        'finnhub': 'GBPUSD',
        'alphavantage': 'GBP/USD'
    },
    'Crude Oil WTI': {
        'twelvedata': 'WTI',
        'finnhub': 'CL',
        'alphavantage': 'USO'
    },
    'Crude Oil Brent': {
        'twelvedata': 'BRENT',
        'finnhub': 'BZ',
        'alphavantage': 'BNO'
    }
}
for asset, mappings in symbol_mapping.items():
    print(f"Asset: {asset}")
    for provider, symbol in mappings.items():
        print(f"  - {provider}: {symbol}")

Available API providers:
- TwelveData
- Finnhub
- Alpha Vantage

Project directory structure:
- /data/chats/46etsd/workspace/data_processing/market_data
- /data/chats/46etsd/workspace/data_processing/news_sentiment
- /data/chats/46etsd/workspace/data_processing/ml_models
- /data/chats/46etsd/workspace/data_processing/sample_output
- /data/chats/46etsd/workspace/data_processing/docs

Supported assets and their symbols across providers:
Asset: US100
  - twelvedata: NDX
  - finnhub: ^NDX
  - alphavantage: NDX
Asset: US30
  - twelvedata: DJI
  - finnhub: ^DJI
  - alphavantage: DJI
Asset: EUR/USD
  - twelvedata: EUR/USD
  - finnhub: EURUSD
  - alphavantage: EUR/USD
Asset: GBP/USD
  - twelvedata: GBP/USD
  - finnhub: GBPUSD
  - alphavantage: GBP/USD
Asset: Crude Oil WTI
  - twelvedata: WTI
  - finnhub: CL
  - alphavantage: USO
Asset: Crude Oil Brent
  - twelvedata: BRENT
  - finnhub: BZ
  - alphavantage: BNO


In [12]:
import os
import pandas as pd
import numpy as np
import requests
import json
import re
from datetime import datetime, timedelta
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Set the base directory
base_dir = '/data/chats/46etsd/workspace/data_processing'
news_dir = os.path.join(base_dir, 'news_sentiment')

class NewsAPIClient:
    """
    A client for fetching financial news from various news APIs
    """
    def __init__(self, api_provider='newsapi', api_key=None):
        """
        Initialize the news API client
        
        Args:
            api_provider (str): The news API provider ('newsapi', 'finnhub')
            api_key (str): API key for the provider
        """
        self.api_provider = api_provider.lower()
        self.api_key = api_key
        
        # Base URLs for news APIs
        self.base_urls = {
            'newsapi': 'https://newsapi.org/v2',
            'finnhub': 'https://finnhub.io/api/v1'
        }
        
        # Keywords mapping for each asset to use in news search
        self.asset_keywords = {
            'US100': ['NASDAQ', 'NASDAQ 100', 'NDX', 'tech stocks', 'technology sector'],
            'US30': ['Dow Jones', 'Dow 30', 'DJIA', 'Dow Jones Industrial Average'],
            'EUR/USD': ['EUR/USD', 'euro dollar', 'euro forex', 'euro currency', 'ECB', 'Federal Reserve'],
            'GBP/USD': ['GBP/USD', 'pound dollar', 'sterling', 'Bank of England', 'Brexit'],
            'Crude Oil WTI': ['crude oil', 'WTI', 'oil prices', 'OPEC', 'oil market'],
            'Crude Oil Brent': ['Brent crude', 'Brent oil', 'oil prices', 'OPEC', 'oil market']
        }

    def get_news_for_asset(self, asset, days_back=3, max_articles=10):
        """
        Fetch news articles related to a specific asset
        
        Args:
            asset (str): The asset name from our standard list
            days_back (int): Number of days to look back for news
            max_articles (int): Maximum number of articles to fetch
            
        Returns:
            list: List of news articles
        """
        if self.api_provider not in self.base_urls:
            raise ValueError(f"Unsupported API provider: {self.api_provider}")
            
        keywords = self.asset_keywords.get(asset)
        if not keywords:
            raise ValueError(f"No keywords defined for asset: {asset}")
            
        from_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')
        
        articles = []
        
        if self.api_provider == 'newsapi':
            # For each keyword, get articles and merge them
            for keyword in keywords:
                url = f"{self.base_urls['newsapi']}/everything"
                params = {
                    'q': keyword,
                    'from': from_date,
                    'sortBy': 'publishedAt',
                    'language': 'en',
                    'apiKey': self.api_key
                }
                
                try:
                    response = requests.get(url, params=params)
                    if response.status_code == 200:
                        data = response.json()
                        if data.get('status') == 'ok':
                            articles.extend(data.get('articles', []))
                    else:
                        print(f"Error fetching news from NewsAPI: {response.status_code}")
                except Exception as e:
                    print(f"Exception when fetching news: {e}")
                    
        elif self.api_provider == 'finnhub':
            # Use Finnhub's news endpoint
            for keyword in keywords:
                url = f"{self.base_urls['finnhub']}/news"
                params = {
                    'category': 'general',
                    'token': self.api_key
                }
                
                try:
                    response = requests.get(url, params=params)
                    if response.status_code == 200:
                        data = response.json()
                        # Filter articles containing our keyword
                        for article in data:
                            if keyword.lower() in article.get('headline', '').lower() or \
                               keyword.lower() in article.get('summary', '').lower():
                                articles.append({
                                    'title': article.get('headline'),
                                    'description': article.get('summary'),
                                    'url': article.get('url'),
                                    'publishedAt': article.get('datetime'),
                                    'source': {'name': article.get('source')}
                                })
                except Exception as e:
                    print(f"Exception when fetching news: {e}")
        
        # Deduplicate articles based on title
        unique_articles = []
        titles = set()
        for article in articles:
            if article['title'] not in titles:
                titles.add(article['title'])
                unique_articles.append(article)
                
                # Add the asset reference
                article['asset'] = asset
        
        # Sort by publication date and return the most recent ones
        sorted_articles = sorted(unique_articles, 
                                 key=lambda x: x.get('publishedAt', ''),
                                 reverse=True)
        
        return sorted_articles[:max_articles]

class SentimentAnalyzer:
    """
    A class for analyzing sentiment of financial news articles
    using both VADER and TextBlob
    """
    def __init__(self):
        self.vader = SentimentIntensityAnalyzer()
        
    def analyze_text(self, text):
        """
        Analyze sentiment of a text using both VADER and TextBlob
        
        Args:
            text (str): The text to analyze
            
        Returns:
            dict: Sentiment scores from both analyzers
        """
        if not text:
            return {
                'vader_compound': 0,
                'vader_pos': 0,
                'vader_neg': 0,
                'vader_neu': 0,
                'textblob_polarity': 0,
                'textblob_subjectivity': 0,
                'sentiment_label': 'neutral'
            }
        
        # VADER sentiment analysis
        vader_scores = self.vader.polarity_scores(text)
        
        # TextBlob sentiment analysis
        tb = TextBlob(text)
        textblob_polarity = tb.sentiment.polarity
        textblob_subjectivity = tb.sentiment.subjectivity
        
        # Combined sentiment label
        # Use VADER's compound score as the primary indicator
        if vader_scores['compound'] >= 0.05:
            sentiment_label = 'positive'
        elif vader_scores['compound'] <= -0.05:
            sentiment_label = 'negative'
        else:
            sentiment_label = 'neutral'
        
        return {
            'vader_compound': vader_scores['compound'],
            'vader_pos': vader_scores['pos'],
            'vader_neg': vader_scores['neg'],
            'vader_neu': vader_scores['neu'],
            'textblob_polarity': textblob_polarity,
            'textblob_subjectivity': textblob_subjectivity,
            'sentiment_label': sentiment_label
        }
    
    def analyze_article(self, article):
        """
        Analyze sentiment of a news article
        
        Args:
            article (dict): News article dictionary
            
        Returns:
            dict: Original article with added sentiment analysis
        """
        # Create a combined text from title and description
        title = article.get('title', '')
        description = article.get('description', '')
        combined_text = f"{title}. {description}"
        
        # Get sentiment scores
        sentiment_scores = self.analyze_text(combined_text)
        
        # Add to the article dictionary
        article.update({
            'sentiment': sentiment_scores,
            'analyzed_at': datetime.now().isoformat()
        })
        
        return article
    
    def extract_key_phrases(self, text, max_phrases=5):
        """
        Extract key phrases from text
        
        Args:
            text (str): Input text
            max_phrases (int): Maximum number of phrases to extract
            
        Returns:
            list: List of key phrases
        """
        if not text:
            return []
            
        # Simple noun phrase extraction
        tb = TextBlob(text)
        noun_phrases = list(tb.noun_phrases)
        
        # Simple keyword extraction based on POS tagging
        important_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ']
        words = tb.tags  # Get words with POS tags
        keywords = [word for word, tag in words if tag in important_tags]
        
        # Combine phrases and keywords, remove duplicates
        all_phrases = noun_phrases + keywords
        unique_phrases = list(set(all_phrases))
        
        # Sort by length (favor longer phrases) and return top N
        sorted_phrases = sorted(unique_phrases, key=len, reverse=True)
        
        return sorted_phrases[:max_phrases]

class NewsSentimentManager:
    """
    Manages the collection and analysis of news sentiment for trading assets
    """
    def __init__(self, news_api_client, sentiment_analyzer):
        self.news_api_client = news_api_client
        self.sentiment_analyzer = sentiment_analyzer
        self.data_dir = os.path.join(news_dir, 'data')
        os.makedirs(self.data_dir, exist_ok=True)
        
    def collect_and_analyze_news(self, assets, days_back=3, max_articles_per_asset=10):
        """
        Collect and analyze news for multiple assets
        
        Args:
            assets (list): List of asset names
            days_back (int): Days to look back for news
            max_articles_per_asset (int): Max articles per asset
            
        Returns:
            dict: Dictionary with asset names as keys and analyzed articles as values
        """
        all_results = {}
        
        for asset in assets:
            print(f"Collecting news for {asset}...")
            
            # Get news articles
            articles = self.news_api_client.get_news_for_asset(
                asset, days_back, max_articles_per_asset
            )
            
            # Analyze sentiment
            analyzed_articles = []
            for article in articles:
                analyzed_article = self.sentiment_analyzer.analyze_article(article)
                analyzed_articles.append(analyzed_article)
            
            all_results[asset] = analyzed_articles
            
            # Save to json file
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{asset.replace('/', '_')}_{timestamp}.json"
            with open(os.path.join(self.data_dir, filename), 'w') as f:
                json.dump(analyzed_articles, f, indent=4)
                
        return all_results
    
    def calculate_asset_sentiment_summary(self, asset_articles):
        """
        Calculate summary sentiment metrics for an asset
        
        Args:
            asset_articles (list): List of analyzed articles for an asset
            
        Returns:
            dict: Summary sentiment metrics
        """
        if not asset_articles:
            return {
                'count': 0,
                'avg_vader_compound': 0,
                'avg_textblob_polarity': 0,
                'positive_count': 0,
                'negative_count': 0,
                'neutral_count': 0,
                'sentiment_distribution': {
                    'positive': 0,
                    'neutral': 0,
                    'negative': 0
                },
                'overall_sentiment': 'neutral',
                'key_phrases': []
            }
            
        # Extract sentiment scores
        vader_scores = [a['sentiment']['vader_compound'] for a in asset_articles]
        textblob_scores = [a['sentiment']['textblob_polarity'] for a in asset_articles]
        
        # Count sentiments
        sentiments = [a['sentiment']['sentiment_label'] for a in asset_articles]
        positive_count = sentiments.count('positive')
        negative_count = sentiments.count('negative')
        neutral_count = sentiments.count('neutral')
        total_count = len(sentiments)
        
        # Calculate distribution percentages
        sentiment_dist = {
            'positive': (positive_count / total_count) * 100 if total_count > 0 else 0,
            'neutral': (neutral_count / total_count) * 100 if total_count > 0 else 0, 
            'negative': (negative_count / total_count) * 100 if total_count > 0 else 0
        }
        
        # Determine overall sentiment
        if positive_count > negative_count and positive_count > neutral_count:
            overall = 'positive'
        elif negative_count > positive_count and negative_count > neutral_count:
            overall = 'negative'
        else:
            overall = 'neutral'
            
        # Extract key phrases across all articles
        all_text = " ".join([
            f"{a.get('title', '')}. {a.get('description', '')}" 
            for a in asset_articles
        ])
        key_phrases = self.sentiment_analyzer.extract_key_phrases(all_text, 10)
        
        return {
            'count': total_count,
            'avg_vader_compound': np.mean(vader_scores) if vader_scores else 0,
            'avg_textblob_polarity': np.mean(textblob_scores) if textblob_scores else 0,
            'positive_count': positive_count,
            'negative_count': negative_count,
            'neutral_count': neutral_count,
            'sentiment_distribution': sentiment_dist,
            'overall_sentiment': overall,
            'key_phrases': key_phrases
        }
    
    def get_news_sentiment_features(self, all_assets_articles):
        """
        Extract sentiment features for ML model
        
        Args:
            all_assets_articles (dict): Dictionary with asset names and analyzed articles
            
        Returns:
            dict: Dictionary with asset names and sentiment features
        """
        features = {}
        
        for asset, articles in all_assets_articles.items():
            # Get summary
            summary = self.calculate_asset_sentiment_summary(articles)
            
            # Extract ML features
            features[asset] = {
                'sentiment_score': summary['avg_vader_compound'],  # Main sentiment score
                'sentiment_magnitude': abs(summary['avg_vader_compound']),  # Intensity of sentiment
                'textblob_score': summary['avg_textblob_polarity'],
                'positive_ratio': summary['sentiment_distribution']['positive'] / 100,
                'negative_ratio': summary['sentiment_distribution']['negative'] / 100,
                'neutral_ratio': summary['sentiment_distribution']['neutral'] / 100,
                'article_count': summary['count'],
                'sentiment_label': summary['overall_sentiment'],
                'has_news': summary['count'] > 0
            }
            
        return features

# Create a README file for the news sentiment module
with open(os.path.join(news_dir, 'README.md'), 'w') as f:
    f.write('''# News Sentiment Analysis Module

This module handles the collection and analysis of financial news sentiment for trading assets.

## Components

- `NewsAPIClient`: Fetches news articles from various news API providers
- `SentimentAnalyzer`: Analyzes sentiment using VADER and TextBlob
- `NewsSentimentManager`: Orchestrates collection, analysis, and storage of sentiment data

## Supported Assets

- US100 (NASDAQ 100)
- US30 (Dow Jones Industrial Average)
- EUR/USD (Euro/US Dollar)
- GBP/USD (British Pound/US Dollar)
- Crude Oil WTI
- Crude Oil Brent

## Usage


SyntaxError: incomplete input (346504003.py, line 404)

In [13]:
from metagpt.tools.libs.terminal import Terminal
terminal = Terminal()
await terminal.run('pip install textblob vaderSentiment')

import os
import pandas as pd
import numpy as np
import requests
import json
import re
from datetime import datetime, timedelta
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Set the base directory
base_dir = '/data/chats/46etsd/workspace/data_processing'
news_dir = os.path.join(base_dir, 'news_sentiment')
os.makedirs(os.path.join(news_dir, 'data'), exist_ok=True)

class NewsAPIClient:
    """
    A client for fetching financial news from various news APIs
    """
    def __init__(self, api_provider='newsapi', api_key=None):
        """
        Initialize the news API client
        
        Args:
            api_provider (str): The news API provider ('newsapi', 'finnhub')
            api_key (str): API key for the provider
        """
        self.api_provider = api_provider.lower()
        self.api_key = api_key
        
        # Base URLs for news APIs
        self.base_urls = {
            'newsapi': 'https://newsapi.org/v2',
            'finnhub': 'https://finnhub.io/api/v1'
        }
        
        # Keywords mapping for each asset to use in news search
        self.asset_keywords = {
            'US100': ['NASDAQ', 'NASDAQ 100', 'NDX', 'tech stocks', 'technology sector'],
            'US30': ['Dow Jones', 'Dow 30', 'DJIA', 'Dow Jones Industrial Average'],
            'EUR/USD': ['EUR/USD', 'euro dollar', 'euro forex', 'euro currency', 'ECB', 'Federal Reserve'],
            'GBP/USD': ['GBP/USD', 'pound dollar', 'sterling', 'Bank of England', 'Brexit'],
            'Crude Oil WTI': ['crude oil', 'WTI', 'oil prices', 'OPEC', 'oil market'],
            'Crude Oil Brent': ['Brent crude', 'Brent oil', 'oil prices', 'OPEC', 'oil market']
        }

    def get_news_for_asset(self, asset, days_back=3, max_articles=10):
        """
        Fetch news articles related to a specific asset
        
        Args:
            asset (str): The asset name from our standard list
            days_back (int): Number of days to look back for news
            max_articles (int): Maximum number of articles to fetch
            
        Returns:
            list: List of news articles
        """
        if self.api_provider not in self.base_urls:
            raise ValueError(f"Unsupported API provider: {self.api_provider}")
            
        keywords = self.asset_keywords.get(asset)
        if not keywords:
            raise ValueError(f"No keywords defined for asset: {asset}")
            
        from_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')
        
        articles = []
        
        if self.api_provider == 'newsapi':
            # For each keyword, get articles and merge them
            for keyword in keywords:
                url = f"{self.base_urls['newsapi']}/everything"
                params = {
                    'q': keyword,
                    'from': from_date,
                    'sortBy': 'publishedAt',
                    'language': 'en',
                    'apiKey': self.api_key
                }
                
                try:
                    response = requests.get(url, params=params)
                    if response.status_code == 200:
                        data = response.json()
                        if data.get('status') == 'ok':
                            articles.extend(data.get('articles', []))
                    else:
                        print(f"Error fetching news from NewsAPI: {response.status_code}")
                except Exception as e:
                    print(f"Exception when fetching news: {e}")
                    
        elif self.api_provider == 'finnhub':
            # Use Finnhub's news endpoint
            for keyword in keywords:
                url = f"{self.base_urls['finnhub']}/news"
                params = {
                    'category': 'general',
                    'token': self.api_key
                }
                
                try:
                    response = requests.get(url, params=params)
                    if response.status_code == 200:
                        data = response.json()
                        # Filter articles containing our keyword
                        for article in data:
                            if keyword.lower() in article.get('headline', '').lower() or \
                               keyword.lower() in article.get('summary', '').lower():
                                articles.append({
                                    'title': article.get('headline'),
                                    'description': article.get('summary'),
                                    'url': article.get('url'),
                                    'publishedAt': article.get('datetime'),
                                    'source': {'name': article.get('source')}
                                })
                except Exception as e:
                    print(f"Exception when fetching news: {e}")
        
        # Deduplicate articles based on title
        unique_articles = []
        titles = set()
        for article in articles:
            if article['title'] not in titles:
                titles.add(article['title'])
                unique_articles.append(article)
                
                # Add the asset reference
                article['asset'] = asset
        
        # Sort by publication date and return the most recent ones
        sorted_articles = sorted(unique_articles, 
                                 key=lambda x: x.get('publishedAt', ''),
                                 reverse=True)
        
        return sorted_articles[:max_articles]

class SentimentAnalyzer:
    """
    A class for analyzing sentiment of financial news articles
    using both VADER and TextBlob
    """
    def __init__(self):
        self.vader = SentimentIntensityAnalyzer()
        
    def analyze_text(self, text):
        """
        Analyze sentiment of a text using both VADER and TextBlob
        
        Args:
            text (str): The text to analyze
            
        Returns:
            dict: Sentiment scores from both analyzers
        """
        if not text:
            return {
                'vader_compound': 0,
                'vader_pos': 0,
                'vader_neg': 0,
                'vader_neu': 0,
                'textblob_polarity': 0,
                'textblob_subjectivity': 0,
                'sentiment_label': 'neutral'
            }
        
        # VADER sentiment analysis
        vader_scores = self.vader.polarity_scores(text)
        
        # TextBlob sentiment analysis
        tb = TextBlob(text)
        textblob_polarity = tb.sentiment.polarity
        textblob_subjectivity = tb.sentiment.subjectivity
        
        # Combined sentiment label
        # Use VADER's compound score as the primary indicator
        if vader_scores['compound'] >= 0.05:
            sentiment_label = 'positive'
        elif vader_scores['compound'] <= -0.05:
            sentiment_label = 'negative'
        else:
            sentiment_label = 'neutral'
        
        return {
            'vader_compound': vader_scores['compound'],
            'vader_pos': vader_scores['pos'],
            'vader_neg': vader_scores['neg'],
            'vader_neu': vader_scores['neu'],
            'textblob_polarity': textblob_polarity,
            'textblob_subjectivity': textblob_subjectivity,
            'sentiment_label': sentiment_label
        }
    
    def analyze_article(self, article):
        """
        Analyze sentiment of a news article
        
        Args:
            article (dict): News article dictionary
            
        Returns:
            dict: Original article with added sentiment analysis
        """
        # Create a combined text from title and description
        title = article.get('title', '')
        description = article.get('description', '')
        combined_text = f"{title}. {description}"
        
        # Get sentiment scores
        sentiment_scores = self.analyze_text(combined_text)
        
        # Add to the article dictionary
        article.update({
            'sentiment': sentiment_scores,
            'analyzed_at': datetime.now().isoformat()
        })
        
        return article
    
    def extract_key_phrases(self, text, max_phrases=5):
        """
        Extract key phrases from text
        
        Args:
            text (str): Input text
            max_phrases (int): Maximum number of phrases to extract
            
        Returns:
            list: List of key phrases
        """
        if not text:
            return []
            
        # Simple noun phrase extraction
        tb = TextBlob(text)
        noun_phrases = list(tb.noun_phrases)
        
        # Simple keyword extraction based on POS tagging
        important_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ']
        words = tb.tags  # Get words with POS tags
        keywords = [word for word, tag in words if tag in important_tags]
        
        # Combine phrases and keywords, remove duplicates
        all_phrases = noun_phrases + keywords
        unique_phrases = list(set(all_phrases))
        
        # Sort by length (favor longer phrases) and return top N
        sorted_phrases = sorted(unique_phrases, key=len, reverse=True)
        
        return sorted_phrases[:max_phrases]

class NewsSentimentManager:
    """
    Manages the collection and analysis of news sentiment for trading assets
    """
    def __init__(self, news_api_client, sentiment_analyzer):
        self.news_api_client = news_api_client
        self.sentiment_analyzer = sentiment_analyzer
        self.data_dir = os.path.join(news_dir, 'data')
        os.makedirs(self.data_dir, exist_ok=True)
        
    def collect_and_analyze_news(self, assets, days_back=3, max_articles_per_asset=10):
        """
        Collect and analyze news for multiple assets
        
        Args:
            assets (list): List of asset names
            days_back (int): Days to look back for news
            max_articles_per_asset (int): Max articles per asset
            
        Returns:
            dict: Dictionary with asset names as keys and analyzed articles as values
        """
        all_results = {}
        
        for asset in assets:
            print(f"Collecting news for {asset}...")
            
            # Get news articles
            articles = self.news_api_client.get_news_for_asset(
                asset, days_back, max_articles_per_asset
            )
            
            # Analyze sentiment
            analyzed_articles = []
            for article in articles:
                analyzed_article = self.sentiment_analyzer.analyze_article(article)
                analyzed_articles.append(analyzed_article)
            
            all_results[asset] = analyzed_articles
            
            # Save to json file
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{asset.replace('/', '_')}_{timestamp}.json"
            with open(os.path.join(self.data_dir, filename), 'w') as f:
                json.dump(analyzed_articles, f, indent=4)
                
        return all_results
    
    def calculate_asset_sentiment_summary(self, asset_articles):
        """
        Calculate summary sentiment metrics for an asset
        
        Args:
            asset_articles (list): List of analyzed articles for an asset
            
        Returns:
            dict: Summary sentiment metrics
        """
        if not asset_articles:
            return {
                'count': 0,
                'avg_vader_compound': 0,
                'avg_textblob_polarity': 0,
                'positive_count': 0,
                'negative_count': 0,
                'neutral_count': 0,
                'sentiment_distribution': {
                    'positive': 0,
                    'neutral': 0,
                    'negative': 0
                },
                'overall_sentiment': 'neutral',
                'key_phrases': []
            }
            
        # Extract sentiment scores
        vader_scores = [a['sentiment']['vader_compound'] for a in asset_articles]
        textblob_scores = [a['sentiment']['textblob_polarity'] for a in asset_articles]
        
        # Count sentiments
        sentiments = [a['sentiment']['sentiment_label'] for a in asset_articles]
        positive_count = sentiments.count('positive')
        negative_count = sentiments.count('negative')
        neutral_count = sentiments.count('neutral')
        total_count = len(sentiments)
        
        # Calculate distribution percentages
        sentiment_dist = {
            'positive': (positive_count / total_count) * 100 if total_count > 0 else 0,
            'neutral': (neutral_count / total_count) * 100 if total_count > 0 else 0, 
            'negative': (negative_count / total_count) * 100 if total_count > 0 else 0
        }
        
        # Determine overall sentiment
        if positive_count > negative_count and positive_count > neutral_count:
            overall = 'positive'
        elif negative_count > positive_count and negative_count > neutral_count:
            overall = 'negative'
        else:
            overall = 'neutral'
            
        # Extract key phrases across all articles
        all_text = " ".join([
            f"{a.get('title', '')}. {a.get('description', '')}" 
            for a in asset_articles
        ])
        key_phrases = self.sentiment_analyzer.extract_key_phrases(all_text, 10)
        
        return {
            'count': total_count,
            'avg_vader_compound': np.mean(vader_scores) if vader_scores else 0,
            'avg_textblob_polarity': np.mean(textblob_scores) if textblob_scores else 0,
            'positive_count': positive_count,
            'negative_count': negative_count,
            'neutral_count': neutral_count,
            'sentiment_distribution': sentiment_dist,
            'overall_sentiment': overall,
            'key_phrases': key_phrases
        }
    
    def get_news_sentiment_features(self, all_assets_articles):
        """
        Extract sentiment features for ML model
        
        Args:
            all_assets_articles (dict): Dictionary with asset names and analyzed articles
            
        Returns:
            dict: Dictionary with asset names and sentiment features
        """
        features = {}
        
        for asset, articles in all_assets_articles.items():
            # Get summary
            summary = self.calculate_asset_sentiment_summary(articles)
            
            # Extract ML features
            features[asset] = {
                'sentiment_score': summary['avg_vader_compound'],  # Main sentiment score
                'sentiment_magnitude': abs(summary['avg_vader_compound']),  # Intensity of sentiment
                'textblob_score': summary['avg_textblob_polarity'],
                'positive_ratio': summary['sentiment_distribution']['positive'] / 100,
                'negative_ratio': summary['sentiment_distribution']['negative'] / 100,
                'neutral_ratio': summary['sentiment_distribution']['neutral'] / 100,
                'article_count': summary['count'],
                'sentiment_label': summary['overall_sentiment'],
                'has_news': summary['count'] > 0
            }
            
        return features

# Create a README file for the news sentiment module with proper triple-quote closing
with open(os.path.join(news_dir, 'README.md'), 'w') as f:
    f.write('''# News Sentiment Analysis Module

This module handles the collection and analysis of financial news sentiment for trading assets.

## Components

- `NewsAPIClient`: Fetches news articles from various news API providers
- `SentimentAnalyzer`: Analyzes sentiment using VADER and TextBlob
- `NewsSentimentManager`: Orchestrates collection, analysis, and storage of sentiment data

## Supported Assets

- US100 (NASDAQ 100)
- US30 (Dow Jones Industrial Average)
- EUR/USD (Euro/US Dollar)
- GBP/USD (British Pound/US Dollar)
- Crude Oil WTI
- Crude Oil Brent

## Usage

### Basic Usage Example


SyntaxError: incomplete input (939305528.py, line 409)

In [14]:
from metagpt.tools.libs.terminal import Terminal
terminal = Terminal()
await terminal.run('pip install textblob vaderSentiment')

import os
import pandas as pd
import numpy as np
import requests
import json
import re
from datetime import datetime, timedelta
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Set the base directory
base_dir = '/data/chats/46etsd/workspace/data_processing'
news_dir = os.path.join(base_dir, 'news_sentiment')
os.makedirs(os.path.join(news_dir, 'data'), exist_ok=True)

class NewsAPIClient:
    """
    A client for fetching financial news from various news APIs
    """
    def __init__(self, api_provider='newsapi', api_key=None):
        """
        Initialize the news API client
        
        Args:
            api_provider (str): The news API provider ('newsapi', 'finnhub')
            api_key (str): API key for the provider
        """
        self.api_provider = api_provider.lower()
        self.api_key = api_key
        
        # Base URLs for news APIs
        self.base_urls = {
            'newsapi': 'https://newsapi.org/v2',
            'finnhub': 'https://finnhub.io/api/v1'
        }
        
        # Keywords mapping for each asset to use in news search
        self.asset_keywords = {
            'US100': ['NASDAQ', 'NASDAQ 100', 'NDX', 'tech stocks', 'technology sector'],
            'US30': ['Dow Jones', 'Dow 30', 'DJIA', 'Dow Jones Industrial Average'],
            'EUR/USD': ['EUR/USD', 'euro dollar', 'euro forex', 'euro currency', 'ECB', 'Federal Reserve'],
            'GBP/USD': ['GBP/USD', 'pound dollar', 'sterling', 'Bank of England', 'Brexit'],
            'Crude Oil WTI': ['crude oil', 'WTI', 'oil prices', 'OPEC', 'oil market'],
            'Crude Oil Brent': ['Brent crude', 'Brent oil', 'oil prices', 'OPEC', 'oil market']
        }

    def get_news_for_asset(self, asset, days_back=3, max_articles=10):
        """
        Fetch news articles related to a specific asset
        
        Args:
            asset (str): The asset name from our standard list
            days_back (int): Number of days to look back for news
            max_articles (int): Maximum number of articles to fetch
            
        Returns:
            list: List of news articles
        """
        if self.api_provider not in self.base_urls:
            raise ValueError(f"Unsupported API provider: {self.api_provider}")
            
        keywords = self.asset_keywords.get(asset)
        if not keywords:
            raise ValueError(f"No keywords defined for asset: {asset}")
            
        from_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')
        
        articles = []
        
        if self.api_provider == 'newsapi':
            # For each keyword, get articles and merge them
            for keyword in keywords:
                url = f"{self.base_urls['newsapi']}/everything"
                params = {
                    'q': keyword,
                    'from': from_date,
                    'sortBy': 'publishedAt',
                    'language': 'en',
                    'apiKey': self.api_key
                }
                
                try:
                    response = requests.get(url, params=params)
                    if response.status_code == 200:
                        data = response.json()
                        if data.get('status') == 'ok':
                            articles.extend(data.get('articles', []))
                    else:
                        print(f"Error fetching news from NewsAPI: {response.status_code}")
                except Exception as e:
                    print(f"Exception when fetching news: {e}")
                    
        elif self.api_provider == 'finnhub':
            # Use Finnhub's news endpoint
            for keyword in keywords:
                url = f"{self.base_urls['finnhub']}/news"
                params = {
                    'category': 'general',
                    'token': self.api_key
                }
                
                try:
                    response = requests.get(url, params=params)
                    if response.status_code == 200:
                        data = response.json()
                        # Filter articles containing our keyword
                        for article in data:
                            if keyword.lower() in article.get('headline', '').lower() or \
                               keyword.lower() in article.get('summary', '').lower():
                                articles.append({
                                    'title': article.get('headline'),
                                    'description': article.get('summary'),
                                    'url': article.get('url'),
                                    'publishedAt': article.get('datetime'),
                                    'source': {'name': article.get('source')}
                                })
                except Exception as e:
                    print(f"Exception when fetching news: {e}")
        
        # Deduplicate articles based on title
        unique_articles = []
        titles = set()
        for article in articles:
            if article['title'] not in titles:
                titles.add(article['title'])
                unique_articles.append(article)
                
                # Add the asset reference
                article['asset'] = asset
        
        # Sort by publication date and return the most recent ones
        sorted_articles = sorted(unique_articles, 
                                 key=lambda x: x.get('publishedAt', ''),
                                 reverse=True)
        
        return sorted_articles[:max_articles]

class SentimentAnalyzer:
    """
    A class for analyzing sentiment of financial news articles
    using both VADER and TextBlob
    """
    def __init__(self):
        self.vader = SentimentIntensityAnalyzer()
        
    def analyze_text(self, text):
        """
        Analyze sentiment of a text using both VADER and TextBlob
        
        Args:
            text (str): The text to analyze
            
        Returns:
            dict: Sentiment scores from both analyzers
        """
        if not text:
            return {
                'vader_compound': 0,
                'vader_pos': 0,
                'vader_neg': 0,
                'vader_neu': 0,
                'textblob_polarity': 0,
                'textblob_subjectivity': 0,
                'sentiment_label': 'neutral'
            }
        
        # VADER sentiment analysis
        vader_scores = self.vader.polarity_scores(text)
        
        # TextBlob sentiment analysis
        tb = TextBlob(text)
        textblob_polarity = tb.sentiment.polarity
        textblob_subjectivity = tb.sentiment.subjectivity
        
        # Combined sentiment label
        # Use VADER's compound score as the primary indicator
        if vader_scores['compound'] >= 0.05:
            sentiment_label = 'positive'
        elif vader_scores['compound'] <= -0.05:
            sentiment_label = 'negative'
        else:
            sentiment_label = 'neutral'
        
        return {
            'vader_compound': vader_scores['compound'],
            'vader_pos': vader_scores['pos'],
            'vader_neg': vader_scores['neg'],
            'vader_neu': vader_scores['neu'],
            'textblob_polarity': textblob_polarity,
            'textblob_subjectivity': textblob_subjectivity,
            'sentiment_label': sentiment_label
        }
    
    def analyze_article(self, article):
        """
        Analyze sentiment of a news article
        
        Args:
            article (dict): News article dictionary
            
        Returns:
            dict: Original article with added sentiment analysis
        """
        # Create a combined text from title and description
        title = article.get('title', '')
        description = article.get('description', '')
        combined_text = f"{title}. {description}"
        
        # Get sentiment scores
        sentiment_scores = self.analyze_text(combined_text)
        
        # Add to the article dictionary
        article.update({
            'sentiment': sentiment_scores,
            'analyzed_at': datetime.now().isoformat()
        })
        
        return article
    
    def extract_key_phrases(self, text, max_phrases=5):
        """
        Extract key phrases from text
        
        Args:
            text (str): Input text
            max_phrases (int): Maximum number of phrases to extract
            
        Returns:
            list: List of key phrases
        """
        if not text:
            return []
            
        # Simple noun phrase extraction
        tb = TextBlob(text)
        noun_phrases = list(tb.noun_phrases)
        
        # Simple keyword extraction based on POS tagging
        important_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ']
        words = tb.tags  # Get words with POS tags
        keywords = [word for word, tag in words if tag in important_tags]
        
        # Combine phrases and keywords, remove duplicates
        all_phrases = noun_phrases + keywords
        unique_phrases = list(set(all_phrases))
        
        # Sort by length (favor longer phrases) and return top N
        sorted_phrases = sorted(unique_phrases, key=len, reverse=True)
        
        return sorted_phrases[:max_phrases]

class NewsSentimentManager:
    """
    Manages the collection and analysis of news sentiment for trading assets
    """
    def __init__(self, news_api_client, sentiment_analyzer):
        self.news_api_client = news_api_client
        self.sentiment_analyzer = sentiment_analyzer
        self.data_dir = os.path.join(news_dir, 'data')
        os.makedirs(self.data_dir, exist_ok=True)
        
    def collect_and_analyze_news(self, assets, days_back=3, max_articles_per_asset=10):
        """
        Collect and analyze news for multiple assets
        
        Args:
            assets (list): List of asset names
            days_back (int): Days to look back for news
            max_articles_per_asset (int): Max articles per asset
            
        Returns:
            dict: Dictionary with asset names as keys and analyzed articles as values
        """
        all_results = {}
        
        for asset in assets:
            print(f"Collecting news for {asset}...")
            
            # Get news articles
            articles = self.news_api_client.get_news_for_asset(
                asset, days_back, max_articles_per_asset
            )
            
            # Analyze sentiment
            analyzed_articles = []
            for article in articles:
                analyzed_article = self.sentiment_analyzer.analyze_article(article)
                analyzed_articles.append(analyzed_article)
            
            all_results[asset] = analyzed_articles
            
            # Save to json file
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{asset.replace('/', '_')}_{timestamp}.json"
            with open(os.path.join(self.data_dir, filename), 'w') as f:
                json.dump(analyzed_articles, f, indent=4)
                
        return all_results
    
    def calculate_asset_sentiment_summary(self, asset_articles):
        """
        Calculate summary sentiment metrics for an asset
        
        Args:
            asset_articles (list): List of analyzed articles for an asset
            
        Returns:
            dict: Summary sentiment metrics
        """
        if not asset_articles:
            return {
                'count': 0,
                'avg_vader_compound': 0,
                'avg_textblob_polarity': 0,
                'positive_count': 0,
                'negative_count': 0,
                'neutral_count': 0,
                'sentiment_distribution': {
                    'positive': 0,
                    'neutral': 0,
                    'negative': 0
                },
                'overall_sentiment': 'neutral',
                'key_phrases': []
            }
            
        # Extract sentiment scores
        vader_scores = [a['sentiment']['vader_compound'] for a in asset_articles]
        textblob_scores = [a['sentiment']['textblob_polarity'] for a in asset_articles]
        
        # Count sentiments
        sentiments = [a['sentiment']['sentiment_label'] for a in asset_articles]
        positive_count = sentiments.count('positive')
        negative_count = sentiments.count('negative')
        neutral_count = sentiments.count('neutral')
        total_count = len(sentiments)
        
        # Calculate distribution percentages
        sentiment_dist = {
            'positive': (positive_count / total_count) * 100 if total_count > 0 else 0,
            'neutral': (neutral_count / total_count) * 100 if total_count > 0 else 0, 
            'negative': (negative_count / total_count) * 100 if total_count > 0 else 0
        }
        
        # Determine overall sentiment
        if positive_count > negative_count and positive_count > neutral_count:
            overall = 'positive'
        elif negative_count > positive_count and negative_count > neutral_count:
            overall = 'negative'
        else:
            overall = 'neutral'
            
        # Extract key phrases across all articles
        all_text = " ".join([
            f"{a.get('title', '')}. {a.get('description', '')}" 
            for a in asset_articles
        ])
        key_phrases = self.sentiment_analyzer.extract_key_phrases(all_text, 10)
        
        return {
            'count': total_count,
            'avg_vader_compound': np.mean(vader_scores) if vader_scores else 0,
            'avg_textblob_polarity': np.mean(textblob_scores) if textblob_scores else 0,
            'positive_count': positive_count,
            'negative_count': negative_count,
            'neutral_count': neutral_count,
            'sentiment_distribution': sentiment_dist,
            'overall_sentiment': overall,
            'key_phrases': key_phrases
        }
    
    def get_news_sentiment_features(self, all_assets_articles):
        """
        Extract sentiment features for ML model
        
        Args:
            all_assets_articles (dict): Dictionary with asset names and analyzed articles
            
        Returns:
            dict: Dictionary with asset names and sentiment features
        """
        features = {}
        
        for asset, articles in all_assets_articles.items():
            # Get summary
            summary = self.calculate_asset_sentiment_summary(articles)
            
            # Extract ML features
            features[asset] = {
                'sentiment_score': summary['avg_vader_compound'],  # Main sentiment score
                'sentiment_magnitude': abs(summary['avg_vader_compound']),  # Intensity of sentiment
                'textblob_score': summary['avg_textblob_polarity'],
                'positive_ratio': summary['sentiment_distribution']['positive'] / 100,
                'negative_ratio': summary['sentiment_distribution']['negative'] / 100,
                'neutral_ratio': summary['sentiment_distribution']['neutral'] / 100,
                'article_count': summary['count'],
                'sentiment_label': summary['overall_sentiment'],
                'has_news': summary['count'] > 0
            }
            
        return features

# Create example usage file to demonstrate how to use the sentiment analysis module
with open(os.path.join(news_dir, 'example_usage.py'), 'w') as f:
    f.write("""import os
import json
from news_api_client import NewsAPIClient
from sentiment_analyzer import SentimentAnalyzer
from news_sentiment_manager import NewsSentimentManager

# Initialize components
news_client = NewsAPIClient(api_provider='newsapi', api_key='YOUR_API_KEY')
sentiment_analyzer = SentimentAnalyzer()
manager = NewsSentimentManager(news_client, sentiment_analyzer)

# Define assets to analyze
assets = ['US100', 'US30', 'EUR/USD', 'GBP/USD', 'Crude Oil WTI', 'Crude Oil Brent']

# Collect and analyze news
results = manager.collect_and_analyze_news(assets, days_back=3, max_articles_per_asset=5)

# Get sentiment features for ML model
sentiment_features = manager.get_news_sentiment_features(results)

# Print summary for each asset
for asset, articles in results.items():
    summary = manager.calculate_asset_sentiment_summary(articles)
    print(f"\\nSentiment Summary for {asset}:")
    print(f"Number of articles: {summary['count']}")
    print(f"Average sentiment score: {summary['avg_vader_compound']:.2f}")
    print(f"Sentiment distribution: {summary['sentiment_distribution']}")
    print(f"Overall sentiment: {summary['overall_sentiment']}")
    print(f"Key phrases: {', '.join(summary['key_phrases'][:3])}")
""")

# Create a README file for the news sentiment module with complete content
readme_content = """# News Sentiment Analysis Module

This module handles the collection and analysis of financial news sentiment for trading assets.

## Components

- `NewsAPIClient`: Fetches news articles from various news API providers
- `SentimentAnalyzer`: Analyzes sentiment using VADER and TextBlob
- `NewsSentimentManager`: Orchestrates collection, analysis, and storage of sentiment data

## Supported Assets

- US100 (NASDAQ 100)
- US30 (Dow Jones Industrial Average)
- EUR/USD (Euro/US Dollar)
- GBP/USD (British Pound/US Dollar)
- Crude Oil WTI
- Crude Oil Brent

## Usage

### Basic Usage Example


SyntaxError: incomplete input (3261422680.py, line 441)