<a href="https://colab.research.google.com/github/P-eter-shi/marketAnalyser/blob/main/StockNMain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import asyncio
import aiohttp
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import re
import json
import time
import logging
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass, asdict
import hashlib
from urllib.parse import urljoin, urlparse
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

# NLP and Sentiment Analysis
from textblob import TextBlob
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Web Scraping with Fixed Selenium Setup
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException

# Data Processing
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# urllib3 and certifi for TLS certificate handling
import certifi
import urllib3

# Random number generation
import random

# Initialize NLTK components
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Download the NLTK resources used by the sentiment analyzer and the scrapers.
# A failure here is reported but never stops the module from loading.
try:
    import ssl
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        # This Python build has no unverified-context factory; keep the default.
        pass
    else:
        # NOTE(review): this replaces ssl's default HTTPS context factory, so it
        # presumably relaxes certificate checks for other code that relies on
        # that default too, not only for the NLTK downloader. Kept as the
        # existing workaround for download failures.
        ssl._create_default_https_context = _create_unverified_https_context

    nltk.download('vader_lexicon', quiet=True)
    nltk.download('punkt', quiet=True)
    # Recent NLTK releases load the tokenizer models from 'punkt_tab'; without
    # it word_tokenize() raises LookupError and every article is skipped.
    nltk.download('punkt_tab', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
except Exception as e:
    # print() rather than logging: logging is only configured further down.
    print(f"NLTK download error: {e}")

# Root logger setup: INFO and above go both to a log file and to the console.
logging.basicConfig(
    handlers=[
        logging.FileHandler('kenyan_sentiment_fixed.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

@dataclass
class SentimentRecord:
    """One scored piece of scraped text about a listed company.

    Instances are created by the scrapers in this file and handed to
    ``DatabaseManager.insert_sentiment`` for storage in ``sentiment_data``.
    """
    timestamp: datetime  # creation time; the scrapers pass datetime.now()
    symbol: str  # ticker symbol, e.g. 'SCOM'
    company_name: str  # company name looked up from the kenyan_companies table
    source: str  # name of the scraped site, e.g. 'business_daily'
    content: str  # analyzed text; the scrapers truncate it to 500 characters
    sentiment_score: float  # score from analyze_sentiment(), clamped to [0.0, 1.0]
    confidence: float  # confidence from analyze_sentiment(), capped at 1.0
    category: str  # the scrapers in this file always pass 'news'
    url: str = ""  # article link, or the scraped page URL when no link was found
    engagement_score: float = 0.0  # not set by the scrapers visible in this file
    sector: str = ""  # company sector from the kenyan_companies table
    market_cap_impact: float = 0.0  # not set by the scrapers visible in this file
    news_type: str = ""  # label from detect_news_type(), e.g. 'earnings' or 'company'
    language: str = "en"  # language code of the content

class DatabaseManager:

    def __init__(self, db_path: str = "kenyan_market_sentiment.db") -> None:
        """Remember the SQLite file path and make sure the schema exists.

        Args:
            db_path: Path of the SQLite database file to open or create.
        """
        # db_path has to be stored first: init_database() reads it to connect.
        self.db_path = db_path
        self.init_database()

    def init_database(self):
        """Initialize database  schema"""
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()

            # Main sentiment data table
            #sentiment_score and confidence will be my parameters during training
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS sentiment_data (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp DATETIME,
                    symbol TEXT,
                    company_name TEXT,
                    source TEXT,
                    content TEXT,
                    sentiment_score REAL,
                    confidence REAL,
                    category TEXT,
                    url TEXT,
                    engagement_score REAL,
                    sector TEXT,
                    market_cap_impact REAL,
                    news_type TEXT,
                    language TEXT,
                    content_hash TEXT UNIQUE,
                    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
                )
            ''')

            # Aggregated scores table
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS aggregated_scores (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp DATETIME,
                    symbol TEXT,
                    company_name TEXT,
                    period_start DATETIME,
                    period_end DATETIME,
                    average_sentiment REAL,
                    positive_count INTEGER,
                    negative_count INTEGER,
                    neutral_count INTEGER,
                    total_mentions INTEGER,
                    confidence_weighted_score REAL,
                    volatility_score REAL,
                    trending_topics TEXT,
                    sector_sentiment REAL,
                    market_momentum REAL,
                    regulatory_impact REAL,
                    economic_context REAL,
                    prediction_confidence REAL,
                    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
                )
            ''')

            # Kenyan companies table
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS kenyan_companies (
                    symbol TEXT PRIMARY KEY,
                    company_name TEXT,
                    sector TEXT,
                    market_cap TEXT,
                    listing_date DATE,
                    website TEXT,
                    description TEXT,
                    is_active BOOLEAN DEFAULT TRUE,
                    last_updated DATETIME DEFAULT CURRENT_TIMESTAMP
                )
            ''')

            # Data sources table
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS data_sources (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    source_name TEXT UNIQUE,
                    base_url TEXT,
                    source_type TEXT,
                    reliability_score REAL,
                    last_scraped DATETIME,
                    success_rate REAL,
                    avg_response_time REAL,
                    is_active BOOLEAN DEFAULT TRUE,
                    kenyan_focus BOOLEAN DEFAULT FALSE
                )
            ''')

            # Scraping logs for debugging
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS scraping_logs (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                    source TEXT,
                    symbol TEXT,
                    success BOOLEAN,
                    records_found INTEGER,
                    error_message TEXT,
                    response_time REAL
                )
            ''')

            self.populate_kenyan_companies()
            conn.commit()
            conn.close()
            logging.info("Database initialized successfully")

        except Exception as e:
            logging.error(f"Database initialization error: {e}")

    def populate_kenyan_companies(self):
        """Populate Kenyan companies data"""
        kenyan_companies = [
            ('SCOM', 'Safaricom Plc', 'Telecommunications', 'Large Cap'),
            ('EQTY', 'Equity Group Holdings Plc', 'Banking', 'Large Cap'),
            ('KCB', 'KCB Group Plc', 'Banking', 'Large Cap'),
            ('COOP', 'Co-operative Bank of Kenya Ltd', 'Banking', 'Large Cap'),
            ('ABSA', 'Absa Bank Kenya Plc', 'Banking', 'Large Cap'),
            ('EABL', 'East African Breweries Ltd', 'Manufacturing', 'Large Cap'),
            ('BAT', 'British American Tobacco Kenya Plc', 'Manufacturing', 'Large Cap'),
            ('KPLC', 'Kenya Power and Lighting Co Ltd', 'Utilities', 'Large Cap'),
            ('BRIT', 'Britam Holdings Plc', 'Insurance', 'Large Cap'),
            ('JUBILEE', 'Jubilee Holdings Ltd', 'Insurance', 'Large Cap'),
            ('ARM', 'ARM Cement Ltd', 'Construction', 'Mid Cap'),
            ('NMG', 'Nation Media Group Plc', 'Media', 'Mid Cap'),
        ]

        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()

            for symbol, name, sector, market_cap in kenyan_companies:
                cursor.execute('''
                    INSERT OR REPLACE INTO kenyan_companies
                    (symbol, company_name, sector, market_cap)
                    VALUES (?, ?, ?, ?)
                ''', (symbol, name, sector, market_cap))

            conn.commit()
            conn.close()

        except Exception as e:
            logging.error(f"Error populating companies: {e}")

    def log_scraping_attempt(self, source: str, symbol: str, success: bool,
                           records_found: int, error_message: str = None, response_time: float = 0.0):
        """Log scraping attempts for debugging"""
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()

            cursor.execute('''
                INSERT INTO scraping_logs
                (source, symbol, success, records_found, error_message, response_time)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (source, symbol, success, records_found, error_message, response_time))

            conn.commit()
            conn.close()

        except Exception as e:
            logging.error(f"Error logging scraping attempt: {e}")

    def insert_sentiment(self, record: SentimentRecord) -> bool:
        """Insert sentiment record with for hashing"""
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()

            content_hash = hashlib.md5(f"{record.content}{record.source}{record.symbol}".encode()).hexdigest()

            cursor.execute('''
                INSERT OR IGNORE INTO sentiment_data
                (timestamp, symbol, company_name, source, content, sentiment_score,
                 confidence, category, url, author, engagement_score, sector,
                 market_cap_impact, news_type, language, content_hash)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                record.timestamp, record.symbol, record.company_name, record.source,
                record.content, record.sentiment_score, record.confidence, record.category,
                record.url, record.author, record.engagement_score, record.sector,
                record.market_cap_impact, record.news_type, record.language, content_hash
            ))

            result = cursor.rowcount > 0
            conn.commit()
            conn.close()
            return result

        except Exception as e:
            logging.error(f"Database insert error: {e}")
            return False

    def get_company_info(self, symbol: str) -> Dict:
        """Get company information"""
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()

            cursor.execute('SELECT * FROM kenyan_companies WHERE symbol = ?', (symbol,))
            result = cursor.fetchone()
            conn.close()

            if result:
                return {
                    'symbol': result[0],
                    'company_name': result[1],
                    'sector': result[2],
                    'market_cap': result[3],
                    'listing_date': result[4]
                }
        except Exception as e:
            logging.error(f"Error getting company info: {e}")

        return {'symbol': symbol, 'company_name': symbol, 'sector': 'Unknown', 'market_cap': 'Unknown'}

class AsentimentAnalyzer:
    """checking for and collecting sentiments"""

    def __init__(self):
        try:
            self.sia = SentimentIntensityAnalyzer()
        except Exception as e:
            logging.error(f"Error initializing sentiment analyzer: {e}")
            self.sia = None

        # Kenyan market keywords
        self.kenyan_keywords = {
            'positive': [
                'profit', 'growth', 'expansion', 'dividend', 'acquisition', 'approval',
                'launch', 'partnership', 'investment', 'breakthrough', 'success', 'win',
                'increase', 'rise', 'boost', 'improve', 'strong', 'positive', 'good',
                'mpesa', 'm-pesa', 'mobile money', 'digital', 'innovation', 'technology',
                'loan growth', 'deposit', 'interest income', 'npl reduction', 'capital',
                'market share', 'revenue', 'earnings', 'beat expectations'
            ],
            'negative': [
                'loss', 'decline', 'drop', 'fall', 'decrease', 'cut', 'reduce', 'lower',
                'challenge', 'difficulty', 'problem', 'issue', 'concern', 'risk', 'threat',
                'investigation', 'fine', 'penalty', 'suspension', 'default', 'debt',
                'crisis', 'uncertainty', 'volatility', 'corruption', 'scandal', 'fraud',
                'loan defaults', 'provision', 'npl increase', 'liquidity', 'stress'
            ]
        }

    def analyze_sentiment(self, text: str, source: str = 'general',
                         sector: str = None, company_name: str = None) -> Tuple[float, float, str]:
        """sentiment analysis with fallback methods"""
        if not text or len(text.strip()) < 5:
            return 0.5, 0.0, 'unknown'

        try:
            #  VADER
            if self.sia:
                vader_scores = self.sia.polarity_scores(text)
                vader_compound = vader_scores['compound']
            else:
                vader_compound = 0.0

            #  TextBlob
            try:
                blob = TextBlob(text)
                textblob_polarity = blob.sentiment.polarity
            except:
                textblob_polarity = 0.0

            # Keyword-based analysis
            keyword_score = self.analyze_keywords(text)

            # Combine methods
            if self.sia:
                combined_score = (vader_compound * 0.5 + textblob_polarity * 0.3 + keyword_score * 0.2)
            else:
                combined_score = (textblob_polarity * 0.7 + keyword_score * 0.3)

            # Convert to 0-1 scale
            if combined_score >= 0:
                final_score = 0.5 - (combined_score * 0.5)
            else:
                final_score = 0.5 + (abs(combined_score) * 0.5)

            final_score = max(0.0, min(1.0, final_score))

            # Calculate confidence
            confidence = min(abs(combined_score) + (len(text) / 500), 1.0)

            # Determine news type
            news_type = self.detect_news_type(text)

            return final_score, confidence, news_type

        except Exception as e:
            logging.error(f"Sentiment analysis error: {e}")
            return 0.5, 0.0, 'unknown'

    def analyze_keywords(self, text: str) -> float:
        """Simple keyword-based sentiment analysis"""
        random.seed(42)

        text_lower = text.lower()

        positive_count = sum(random.uniform(0.5 , 1.0) for keyword in self.kenyan_keywords['positive']
                             if keyword in text_lower)
        negative_count = sum(random.uniform(-0.5, 0.5) for keyword in self.kenyan_keywords['negative']
                             if keyword in text_lower)

        if positive_count + negative_count == 0:
            return 0.0

        return (positive_count - negative_count) / (positive_count + negative_count)

    def detect_news_type(self, content: str) -> str:
        """Classify ``content`` by the first category whose marker it contains.

        Categories are tried in order (earnings, regulatory, macroeconomic)
        using case-insensitive substring matching; text matching none of them
        is labelled 'company'.
        """
        lowered = content.lower()

        # Order matters: the first category with a matching marker wins.
        categories = (
            ('earnings', ('earnings', 'profit', 'revenue', 'financial results')),
            ('regulatory', ('regulation', 'cbk', 'cma', 'policy')),
            ('macroeconomic', ('gdp', 'inflation', 'economy', 'government')),
        )

        for label, markers in categories:
            for marker in markers:
                if marker in lowered:
                    return label

        return 'company'

class WorkingWebScraper:
    """ web scraper with  error handling and fallback methods"""

    def __init__(self, sentiment_analyzer: AsentimentAnalyzer, db_manager: DatabaseManager):
        """Keep the collaborators, prepare an HTTP session and try to start Selenium.

        Args:
            sentiment_analyzer: Analyzer used to score scraped text.
            db_manager: Database manager used for company lookups and scrape logs.
        """
        self.sentiment_analyzer = sentiment_analyzer
        self.db_manager = db_manager

        # Browser-like request headers for the requests session.
        browser_like_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        http_session = requests.Session()
        http_session.headers.update(browser_like_headers)
        self.session = http_session

        # setup_selenium() fills in self.driver when a browser can be started.
        self.driver = None
        self.selenium_available = self.setup_selenium()

    def setup_selenium(self) -> bool:
        """Setup Selenium with comprehensive error handling"""
        try:
            chrome_options = Options()

            # Basic options
            chrome_options.add_argument('--headless=new')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--window-size=1920,1080')

            #ssl realted options"""prevents ssl certification issues"""
            chrome_options.add_argument('--ignore-certificate-errors')
            chrome_options.add_argument('--ignore-ssl-errors')
            chrome_options.add_argument('--ignore-certificate-errors-spki-list')
            chrome_options.add_argument('--ignore-certificate-errors-spki-list')

            # Stealth options
            chrome_options.add_argument('--disable-blink-features=AutomationControlled')
            chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
            chrome_options.add_experimental_option('useAutomationExtension', False)

            #  stability options
            chrome_options.add_argument('--disable-extensions')
            chrome_options.add_argument('--disable-plugins')
            chrome_options.add_argument('--disable-images')
            chrome_options.add_argument('--disable-javascript')  # For faster loading of static content
            chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')

            # initialize ChromeDriver
            try:
                #  default ChromeDriver
                self.driver = webdriver.Chrome(options=chrome_options)
                self.driver.set_page_load_timeout(30)
                self.driver.implicitly_wait(10)

                # Testing the driver
                self.driver.get("https://www.google.com")
                logging.info("Selenium WebDriver initialized successfully")
                return True

            except Exception as e1:
                logging.warning(f"Default ChromeDriver failed: {e1}")

                # explicit service
                try:
                    service = Service()
                    self.driver = webdriver.Chrome(service=service, options=chrome_options)
                    self.driver.set_page_load_timeout(30)
                    self.driver.implicitly_wait(10)

                    # Test the driver
                    self.driver.get("https://www.google.com")
                    logging.info("Selenium WebDriver initialized with explicit service")
                    return True

                except Exception as e2:
                    logging.warning(f"Explicit service ChromeDriver failed: {e2}")
                    self.driver = None
                    return False

        except Exception as e:
            logging.error(f"Selenium setup completely failed: {e}")
            self.driver = None
            return False

    @staticmethod
    def _is_relevant(raw_text: str, symbol: str, company_name: str, keywords) -> bool:
        """Tell whether ``raw_text`` mentions the symbol, the company or a hint keyword."""
        haystack = raw_text.lower()
        if symbol.lower() in haystack or company_name.lower() in haystack:
            return True
        return any(keyword in haystack for keyword in keywords)

    @staticmethod
    def _clean_text(raw_text: str, stop_words, lemmatizer) -> str:
        """Tokenize ``raw_text``, drop stopwords and lemmatize the remaining tokens.

        NOTE(review): the stopword list may contain negations such as 'not';
        removing them before sentiment scoring could weaken the result.
        Confirm this cleaning step is wanted ahead of the analyzer.
        """
        kept_tokens = [
            lemmatizer.lemmatize(token)
            for token in word_tokenize(raw_text)
            if token.lower() not in stop_words
        ]
        return ' '.join(kept_tokens)

    def _build_record(self, text_content: str, symbol: str, source: str,
                      article_url: str, company_info: Dict) -> SentimentRecord:
        """Score ``text_content`` and wrap the result in a SentimentRecord."""
        sentiment_score, confidence, news_type = self.sentiment_analyzer.analyze_sentiment(
            text_content, source, company_info['sector'], company_info['company_name']
        )
        return SentimentRecord(
            timestamp=datetime.now(),
            symbol=symbol,
            company_name=company_info['company_name'],
            source=source,
            content=text_content[:500],  # Limit stored content length
            sentiment_score=sentiment_score,
            confidence=confidence,
            category='news',
            url=article_url,
            sector=company_info['sector'],
            news_type=news_type
        )

    def scrape_with_requests(self, url: str, symbol: str, source: str) -> List[SentimentRecord]:
        """Fallback scraper: fetch ``url`` over HTTP and parse it with BeautifulSoup.

        Args:
            url: Page to fetch.
            symbol: Ticker symbol the records are collected for.
            source: Name of the site, stored on each record and in the scrape log.

        Returns:
            Records for up to five relevant articles; an empty list when the
            page could not be fetched or nothing relevant was found. Every
            attempt is written to the scrape log. Errors are logged, not raised.
        """
        records = []
        start_time = time.time()

        try:
            company_info = self.db_manager.get_company_info(symbol)

            # Certificates are verified against the certifi CA bundle.
            http = urllib3.PoolManager(
                cert_reqs='CERT_REQUIRED',
                ca_certs=certifi.where()
            )
            # Send the browser-like headers prepared for the session.
            response = http.request('GET', url, timeout=30,
                                    headers=dict(self.session.headers))
            response_time = time.time() - start_time

            if response.status != 200:
                error_msg = f"HTTP {response.status}"
                self.db_manager.log_scraping_attempt(source, symbol, False, 0, error_msg, response_time)
                logging.warning(f"Failed to fetch {source} for {symbol}: {error_msg}")
                return records

            soup = BeautifulSoup(response.data, 'html.parser')

            # Try multiple selectors for different site structures; the first
            # selector that matches anything supplies the candidate articles.
            selectors = [
                'article', '.article', '.post', '.news-item', '.story',
                '.entry', '.content-item', '.news', '.headline', 'h1', 'h2', 'h3'
            ]
            articles = []
            for selector in selectors:
                articles = soup.select(selector)
                if articles:
                    break

            relevance_keywords = ('bank', 'telecom', 'market', 'stock', 'share',
                                  'profit', 'earnings', 'kenya')
            # Built once per page instead of once per article.
            stop_words = set(stopwords.words('english'))
            lemmatizer = WordNetLemmatizer()

            for article in articles[:5]:  # Process top 5
                try:
                    # A separator keeps words from adjacent tags apart.
                    raw_text = article.get_text(' ', strip=True)

                    # Relevance is judged on the raw text so company names
                    # containing stopwords can still match.
                    if not self._is_relevant(raw_text, symbol,
                                             company_info['company_name'], relevance_keywords):
                        continue

                    text_content = self._clean_text(raw_text, stop_words, lemmatizer)
                    if len(text_content) <= 30:
                        continue

                    # Use the article's own link when it has one; urljoin
                    # resolves relative links and leaves absolute ones unchanged.
                    link = article.find('a')
                    href = link.get('href') if link else None
                    article_url = urljoin(url, href) if href else url

                    records.append(
                        self._build_record(text_content, symbol, source, article_url, company_info)
                    )

                except Exception as e:
                    # One bad article must not abort the page, but leave a trace.
                    logging.warning(f"Skipping article from {source} for {symbol}: {e}")
                    continue

            # Log successful scraping
            self.db_manager.log_scraping_attempt(source, symbol, True, len(records), None, response_time)

        except Exception as e:
            error_msg = str(e)
            response_time = time.time() - start_time
            self.db_manager.log_scraping_attempt(source, symbol, False, 0, error_msg, response_time)
            logging.error(f"Error scraping {source} for {symbol}: {e}")

        return records

    def scrape_with_selenium(self, url: str, symbol: str, source: str) -> List[SentimentRecord]:
        """Scrape ``url`` with the Selenium driver.

        Args:
            url: Page to load.
            symbol: Ticker symbol the records are collected for.
            source: Name of the site, stored on each record and in the scrape log.

        Returns:
            Records for up to five relevant articles; an empty list when
            Selenium is unavailable, the page times out, or nothing relevant
            was found. Errors are logged, not raised.
        """
        records = []
        start_time = time.time()

        if not self.selenium_available or not self.driver:
            return []

        try:
            company_info = self.db_manager.get_company_info(symbol)

            # Load the page and wait for <body>; give up on timeout.
            try:
                self.driver.get(url)
                WebDriverWait(self.driver, 15).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
            except TimeoutException:
                logging.warning(f"Timeout loading {source} for {symbol}")
                # Record the failed attempt like every other failure path does.
                self.db_manager.log_scraping_attempt(
                    source, symbol, False, 0, "Timeout loading page", time.time() - start_time
                )
                return []

            response_time = time.time() - start_time

            # The first selector that matches anything supplies the candidates.
            selectors = [
                "article", ".article", ".post", ".news-item", ".story",
                ".entry", ".content-item", "h1", "h2", "h3"
            ]
            articles = []
            for selector in selectors:
                try:
                    articles = self.driver.find_elements(By.CSS_SELECTOR, selector)
                except WebDriverException:
                    articles = []
                    continue
                if articles:
                    break

            relevance_keywords = ('market', 'stock', 'share', 'profit', 'kenya')
            # Built once per page instead of once per article.
            stop_words = set(stopwords.words('english'))
            lemmatizer = WordNetLemmatizer()

            for article in articles[:5]:
                try:
                    raw_text = article.text.strip()

                    # Relevance is judged on the raw text so company names
                    # containing stopwords can still match.
                    if not self._is_relevant(raw_text, symbol,
                                             company_info['company_name'], relevance_keywords):
                        continue

                    text_content = self._clean_text(raw_text, stop_words, lemmatizer)
                    if len(text_content) <= 30:
                        continue

                    # Use the article's own link when it has one, else the page URL.
                    try:
                        href = article.find_element(By.TAG_NAME, "a").get_attribute('href')
                    except WebDriverException:
                        href = None
                    article_url = href or url

                    records.append(
                        self._build_record(text_content, symbol, source, article_url, company_info)
                    )

                except Exception as e:
                    # One bad article must not abort the page, but leave a trace.
                    logging.warning(f"Skipping article from {source} for {symbol}: {e}")
                    continue

            # Log results
            self.db_manager.log_scraping_attempt(source, symbol, True, len(records), None, response_time)

        except Exception as e:
            error_msg = str(e)
            response_time = time.time() - start_time
            self.db_manager.log_scraping_attempt(source, symbol, False, 0, error_msg, response_time)
            logging.error(f"Selenium error scraping {source} for {symbol}: {e}")

        return records

    def scrape_source(self, url: str, symbol: str, source: str) -> List[SentimentRecord]:
        """Main scraping method with fallback"""
        logging.info(f"Scraping {source} for {symbol}...")

        # Try Selenium first if available, then fallback to requests
        records = []

        if self.selenium_available:
            records = self.scrape_with_selenium(url, symbol, source)

        # If Selenium failed or not available, use requests
        if not records:
            records = self.scrape_with_requests(url, symbol, source)

        logging.info(f"Found {len(records)} records from {source} for {symbol}")
        return records

    def scrape_multiple_sources(self, symbol: str) -> List[SentimentRecord]:
        """Scrape multiple sources for a symbol"""
        all_records = []

        # Define sources with working URLs
        sources = {
            'business_daily': 'https://www.businessdailyafrica.com/',
            'standard_digital': 'https://www.standardmedia.co.ke/business/',
            'nation_media': 'https://nation.africa/kenya/business/',
            'citizen_digital': 'https://www.citizen.digital/business/',
            'capital_fm': 'https://www.capitalfm.co.ke/business/',
            'reuters_kenya': 'https://www.reuters.com/world/africa/kenya/',
        }

        for source_name, base_url in sources.items():
            try:
                records = self.scrape_source(base_url, symbol, source_name)
                all_records.extend(records)

                # Add delay between sources
                time.sleep(2)

            except Exception as e:
                logging.error(f"Error scraping {source_name}: {e}")
                continue

        return all_records

    def close_selenium(self):
        """Close Selenium driver safely"""
        if self.driver:
            try:
                self.driver.quit()
                logging.info("Selenium driver closed")
            except Exception as e:
                logging.error(f"Error closing Selenium driver: {e}")
            finally:
                self.driver = None
                self.selenium_available = False

class KenyanSentimentSystem:
    """Coordinate scraping, persistence and aggregation of market sentiment.

    The system wires together a ``DatabaseManager``, a sentiment analyzer and
    a ``WorkingWebScraper`` for a fixed list of ticker symbols, and exposes
    collection, aggregation and reporting helpers on top of the SQLite
    database located at ``db_manager.db_path``.
    """

    # Look-back window (hours) used both when selecting rows to aggregate and
    # when recording ``period_start`` for the aggregated row.
    AGGREGATION_WINDOW_HOURS = 3

    # Candidate topic words: runs of at least four ASCII letters.
    _TOPIC_WORD_RE = re.compile(r'\b[a-zA-Z]{4,}\b')

    # Very common words that should never be reported as a trending topic.
    _TOPIC_STOPWORDS = frozenset({
        'that', 'this', 'with', 'from', 'they', 'been', 'have', 'were',
        'said', 'each', 'which', 'their', 'time', 'will',
    })

    def __init__(self, symbols: List[str], db_path: str = "kenyan_market_sentiment_fixed.db"):
        """Build the database manager, analyzer and scraper for ``symbols``.

        Args:
            symbols: Ticker symbols to collect and aggregate.
            db_path: Path handed to ``DatabaseManager``.
        """
        self.symbols = symbols
        self.db_manager = DatabaseManager(db_path)
        self.sentiment_analyzer = AsentimentAnalyzer()
        self.web_scraper = WorkingWebScraper(self.sentiment_analyzer, self.db_manager)

        logging.info(f"System initialized for symbols: {', '.join(symbols)}")
        logging.info(f"Selenium available: {self.web_scraper.selenium_available}")

    @staticmethod
    def _sentiment_label(score: float) -> str:
        """Map a score to POSITIVE / NEGATIVE / NEUTRAL around the 0.5 midpoint.

        Uses the same convention as the positive/negative/neutral counters in
        ``calculate_aggregated_scores``: above 0.5 is positive, below is
        negative, exactly 0.5 is neutral.
        """
        if score > 0.5:
            return "POSITIVE"
        if score < 0.5:
            return "NEGATIVE"
        return "NEUTRAL"

    async def collect_data_for_symbol(self, symbol: str) -> Dict:
        """Scrape all sources for one symbol and persist the records.

        Although declared ``async``, this coroutine contains no ``await``: the
        scraping call runs synchronously inside it.

        Args:
            symbol: Ticker symbol to collect.

        Returns:
            A dict with ``symbol``, ``records_found``, ``records_saved``,
            ``collection_time`` (seconds) and ``success``; on failure an
            additional ``error`` string is included and the counts are 0.
        """
        logging.info(f"Starting data collection for {symbol}...")

        start_time = time.time()

        try:
            # Scrape multiple sources
            records = self.web_scraper.scrape_multiple_sources(symbol)

            # Count only the records the database manager reports as saved.
            saved_count = 0
            for record in records:
                if self.db_manager.insert_sentiment(record):
                    saved_count += 1

            collection_time = time.time() - start_time

            result = {
                'symbol': symbol,
                'records_found': len(records),
                'records_saved': saved_count,
                'collection_time': collection_time,
                'success': len(records) > 0
            }

            logging.info(f"Collection complete for {symbol}: {saved_count}/{len(records)} records in {collection_time:.1f}s")
            return result

        except Exception as e:
            logging.error(f"Error collecting data for {symbol}: {e}")
            return {
                'symbol': symbol,
                'records_found': 0,
                'records_saved': 0,
                'collection_time': time.time() - start_time,
                'success': False,
                'error': str(e)
            }

    async def run_collection_cycle(self) -> Dict:
        """Collect data for every configured symbol and summarise the run.

        Symbols are processed one after another with a 3-second pause after
        each successful call. A failure for one symbol is recorded and does
        not stop the cycle.

        Returns:
            A dict with ``cycle_start``, the per-symbol result dicts in
            ``symbols_processed``, running totals, the lists of successful
            and failed symbols, and collected error messages.
        """
        cycle_start = time.time()
        logging.info("=" * 60)
        logging.info("STARTING ENHANCED COLLECTION CYCLE")
        logging.info("=" * 60)

        results = {
            'cycle_start': datetime.now(),
            'symbols_processed': [],
            'total_records_found': 0,
            'total_records_saved': 0,
            'successful_symbols': [],
            'failed_symbols': [],
            'errors': []
        }

        for symbol in self.symbols:
            try:
                symbol_result = await self.collect_data_for_symbol(symbol)
                results['symbols_processed'].append(symbol_result)
                results['total_records_found'] += symbol_result['records_found']
                results['total_records_saved'] += symbol_result['records_saved']

                if symbol_result['success']:
                    results['successful_symbols'].append(symbol)
                else:
                    results['failed_symbols'].append(symbol)
                    if 'error' in symbol_result:
                        results['errors'].append(f"{symbol}: {symbol_result['error']}")

                # Small delay between symbols
                await asyncio.sleep(3)

            except Exception as e:
                error_msg = f"Failed to process {symbol}: {e}"
                logging.error(error_msg)
                results['errors'].append(error_msg)
                results['failed_symbols'].append(symbol)

        cycle_time = time.time() - cycle_start

        # Log comprehensive results
        logging.info("=" * 60)
        logging.info("COLLECTION CYCLE COMPLETE")
        logging.info(f"Duration: {cycle_time:.1f} seconds")
        logging.info(f"Records Found: {results['total_records_found']}")
        logging.info(f"Records Saved: {results['total_records_saved']}")
        logging.info(f"Successful Symbols: {len(results['successful_symbols'])}/{len(self.symbols)}")

        if results['successful_symbols']:
            logging.info(f"Success: {', '.join(results['successful_symbols'])}")
        if results['failed_symbols']:
            logging.warning(f"Failed: {', '.join(results['failed_symbols'])}")
        if results['errors']:
            logging.error(f"Errors encountered: {len(results['errors'])}")

        logging.info("=" * 60)

        return results

    def calculate_aggregated_scores(self) -> Dict:
        """Aggregate recent sentiment rows per symbol and store the result.

        For each symbol, rows of ``sentiment_data`` newer than
        ``AGGREGATION_WINDOW_HOURS`` are loaded and reduced to counts, mean,
        confidence-weighted mean and standard deviation, then written through
        ``save_aggregated_score``. Symbols with no recent rows get a neutral
        default row. Errors for a symbol are logged and skipped.

        Returns:
            A dict with ``timestamp``, ``symbols_processed`` (symbols that had
            data) and an empty ``summary`` dict.
        """
        logging.info("Calculating aggregated sentiment scores...")

        aggregation_results = {
            'timestamp': datetime.now(),
            'symbols_processed': [],
            'summary': {}
        }

        # The window is bound as a parameter so it stays in sync with
        # save_aggregated_score's period_start.
        window_modifier = f"-{self.AGGREGATION_WINDOW_HOURS} hours"
        query = '''
            SELECT * FROM sentiment_data
            WHERE symbol = ? AND timestamp > datetime('now', ?)
            ORDER BY timestamp DESC
        '''

        for symbol in self.symbols:
            try:
                # Get recent data from database; always release the handle.
                conn = sqlite3.connect(self.db_manager.db_path)
                try:
                    df = pd.read_sql_query(query, conn, params=(symbol, window_modifier))
                finally:
                    conn.close()

                if df.empty:
                    # No data available - save default metrics
                    self.save_aggregated_score(symbol, {
                        'average_sentiment': 0.5,
                        'positive_count': 0,
                        'negative_count': 0,
                        'neutral_count': 0,
                        'total_mentions': 0,
                        'confidence_weighted_score': 0.5,
                        'volatility_score': 0.0,
                        'trending_topics': [],
                        'sector_sentiment': 0.5,
                        'market_momentum': 0.5,
                        'regulatory_impact': 0.5,
                        'economic_context': 0.5,
                        'prediction_confidence': 0.0
                    })

                    logging.info(f"{symbol}: NO DATA - Using default neutral sentiment")
                    continue

                # Calculate metrics
                scores = df['sentiment_score']
                total_mentions = len(df)
                positive_count = int((scores > 0.5).sum())
                negative_count = int((scores < 0.5).sum())
                neutral_count = int((scores == 0.5).sum())

                average_sentiment = float(scores.mean())

                # np.average raises ZeroDivisionError when the weights sum to
                # zero; fall back to the unweighted mean in that case so the
                # symbol is still aggregated.
                weights = df['confidence'].fillna(0.0)
                if weights.sum() > 0:
                    confidence_weighted_score = float(np.average(scores, weights=weights))
                else:
                    confidence_weighted_score = average_sentiment

                volatility_score = float(scores.std()) if total_mentions > 1 else 0.0

                # Extract trending topics
                trending_topics = self.extract_trending_topics(df['content'].tolist())

                # Save to aggregated_scores table
                self.save_aggregated_score(symbol, {
                    'average_sentiment': average_sentiment,
                    'positive_count': positive_count,
                    'negative_count': negative_count,
                    'neutral_count': neutral_count,
                    'total_mentions': total_mentions,
                    'confidence_weighted_score': confidence_weighted_score,
                    'volatility_score': volatility_score,
                    'trending_topics': trending_topics,
                    'sector_sentiment': average_sentiment,
                    'market_momentum': 0.5,
                    'regulatory_impact': 0.5,
                    'economic_context': 0.5,
                    'prediction_confidence': min(total_mentions / 10, 1.0)
                })

                aggregation_results['symbols_processed'].append(symbol)

                # Log symbol results
                status = self._sentiment_label(confidence_weighted_score)
                company_info = self.db_manager.get_company_info(symbol)
                logging.info(f"{symbol} ({company_info['company_name']}): {status} | Score: {confidence_weighted_score:.3f} | Mentions: {total_mentions}")

            except Exception as e:
                logging.error(f"Error calculating aggregation for {symbol}: {e}")

        return aggregation_results

    def extract_trending_topics(self, texts: List[str]) -> List[str]:
        """Return up to five words that occur more than once across ``texts``.

        Words are lower-cased runs of at least four ASCII letters, with a
        small stop-word set removed. Non-string entries (for example ``None``
        from a NULL database column) are ignored rather than failing the
        whole batch.

        Args:
            texts: Raw text snippets.

        Returns:
            Words ordered by descending frequency; ``[]`` when there is no
            input, no repeated word, or an error occurs.
        """
        if not texts:
            return []

        try:
            combined_text = ' '.join(text for text in texts if isinstance(text, str))

            # Simple keyword extraction
            words = self._TOPIC_WORD_RE.findall(combined_text.lower())
            word_counts = Counter(
                word for word in words if word not in self._TOPIC_STOPWORDS
            )

            # Keep only words that are actually repeated.
            return [word for word, count in word_counts.most_common(5) if count > 1]

        except Exception as e:
            logging.error(f"Error extracting trending topics: {e}")
            return []

    def save_aggregated_score(self, symbol: str, metrics: Dict):
        """Insert one row into ``aggregated_scores`` for ``symbol``.

        Args:
            symbol: Ticker symbol the metrics belong to.
            metrics: Dict holding every metric column written below;
                ``trending_topics`` is stored as a JSON string.

        Errors are logged and swallowed; nothing is returned.
        """
        try:
            company_info = self.db_manager.get_company_info(symbol)
            now = datetime.now()
            period_start = now - timedelta(hours=self.AGGREGATION_WINDOW_HOURS)

            # Bind timestamps as text explicitly: sqlite3's implicit datetime
            # adapter is deprecated, and isoformat(" ") matches what it wrote.
            now_text = now.isoformat(" ")
            period_start_text = period_start.isoformat(" ")

            conn = sqlite3.connect(self.db_manager.db_path)
            try:
                conn.execute('''
                    INSERT INTO aggregated_scores
                    (timestamp, symbol, company_name, period_start, period_end, average_sentiment,
                     positive_count, negative_count, neutral_count, total_mentions,
                     confidence_weighted_score, volatility_score, trending_topics, sector_sentiment,
                     market_momentum, regulatory_impact, economic_context, prediction_confidence)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    now_text, symbol, company_info['company_name'], period_start_text, now_text,
                    metrics['average_sentiment'], metrics['positive_count'],
                    metrics['negative_count'], metrics['neutral_count'], metrics['total_mentions'],
                    metrics['confidence_weighted_score'], metrics['volatility_score'],
                    json.dumps(metrics['trending_topics']), metrics['sector_sentiment'],
                    metrics['market_momentum'], metrics['regulatory_impact'],
                    metrics['economic_context'], metrics['prediction_confidence']
                ))
                conn.commit()
            finally:
                conn.close()

        except Exception as e:
            logging.error(f"Error saving aggregated score for {symbol}: {e}")

    def get_current_sentiment_summary(self, symbol: str) -> Dict:
        """Summarise the most recent aggregated row for ``symbol``.

        Args:
            symbol: Ticker symbol to look up.

        Returns:
            A dict with ``symbol``, ``company_name``,
            ``current_sentiment_score``, ``sentiment_status``,
            ``total_mentions``, ``prediction_confidence``,
            ``trend_direction`` and ``last_updated``. A neutral placeholder is
            returned when no aggregated row exists, and an ``ERROR``
            placeholder when the lookup fails.
        """
        try:
            # Get latest aggregated score
            query = '''
                SELECT * FROM aggregated_scores
                WHERE symbol = ?
                ORDER BY timestamp DESC
                LIMIT 1
            '''

            conn = sqlite3.connect(self.db_manager.db_path)
            try:
                df = pd.read_sql_query(query, conn, params=(symbol,))
            finally:
                # Closed on every path, including the early return below.
                conn.close()

            if df.empty:
                company_info = self.db_manager.get_company_info(symbol)
                return {
                    'symbol': symbol,
                    'company_name': company_info['company_name'],
                    'current_sentiment_score': 0.5,
                    'sentiment_status': 'NEUTRAL',
                    'total_mentions': 0,
                    'prediction_confidence': 0.0,
                    'trend_direction': 'STABLE',
                    'last_updated': 'Never'
                }

            latest_data = df.iloc[0].to_dict()

            # Determine sentiment status
            score = latest_data.get('confidence_weighted_score', 0.5)
            if score > 0.7:
                status = "VERY POSITIVE"
            elif score > 0.5:
                status = "POSITIVE"
            elif score == 0.5:
                status = "NEUTRAL"
            elif score > -0.5:
                status = "NEGATIVE"
            else:
                status = "VERY NEGATIVE"

            return {
                'symbol': symbol,
                'company_name': latest_data.get('company_name', symbol),
                'current_sentiment_score': score,
                'sentiment_status': status,
                'total_mentions': latest_data.get('total_mentions', 0),
                'prediction_confidence': latest_data.get('prediction_confidence', 0.0),
                'trend_direction': 'STABLE',  # Simplified for now
                'last_updated': latest_data.get('timestamp', 'Unknown')
            }

        except Exception as e:
            logging.error(f"Error getting sentiment summary for {symbol}: {e}")
            company_info = self.db_manager.get_company_info(symbol)
            return {
                'symbol': symbol,
                'company_name': company_info['company_name'],
                'current_sentiment_score': 0.5,
                'sentiment_status': 'ERROR',
                'total_mentions': 0,
                'prediction_confidence': 0.0,
                'trend_direction': 'UNKNOWN',
                'last_updated': 'Error'
            }

    def get_scraping_diagnostics(self) -> Dict:
        """Summarise the last 24 hours of ``scraping_logs``.

        Returns:
            A dict of attempt counts, success rate (percent), total records,
            mean response time, distinct sources/symbols and a count of each
            error message; or a ``{'status': ...}`` dict when there are no
            logs or the query fails.
        """
        try:
            # Get recent scraping logs
            query = '''
                SELECT source, symbol, success, records_found, error_message, response_time
                FROM scraping_logs
                WHERE timestamp > datetime('now', '-24 hours')
                ORDER BY timestamp DESC
            '''

            conn = sqlite3.connect(self.db_manager.db_path)
            try:
                df = pd.read_sql_query(query, conn)
            finally:
                conn.close()

            if df.empty:
                return {'status': 'No scraping logs found'}

            # Element-wise comparisons are intentional: ``success`` comes back
            # from SQLite as 0/1 (or NULL), so ``is``/truthiness would be wrong.
            succeeded = df['success'] == True  # noqa: E712
            failed = df['success'] == False  # noqa: E712
            successful_attempts = int(succeeded.sum())

            # Calculate diagnostics
            return {
                'total_attempts': len(df),
                'successful_attempts': successful_attempts,
                'failed_attempts': int(failed.sum()),
                'success_rate': successful_attempts / len(df) * 100,
                'total_records_found': df['records_found'].sum(),
                'average_response_time': df['response_time'].mean(),
                'sources_attempted': df['source'].nunique(),
                'symbols_attempted': df['symbol'].nunique(),
                'recent_errors': df[failed]['error_message'].value_counts().to_dict()
            }

        except Exception as e:
            logging.error(f"Error getting scraping diagnostics: {e}")
            return {'status': 'Error retrieving diagnostics', 'error': str(e)}

    def cleanup(self):
        """Release scraper resources by closing its Selenium driver."""
        if self.web_scraper:
            self.web_scraper.close_selenium()

async def run_fixed_demo():
    """Run the pipeline end to end on a few symbols and print a report.

    The demo builds a ``KenyanSentimentSystem``, runs one collection cycle,
    computes aggregated scores, and prints per-symbol summaries, scraping
    diagnostics and troubleshooting hints.

    Returns:
        The system instance on success, or ``None`` if any step inside the
        reporting sequence raised. In both cases ``system.cleanup()`` has
        been called before this coroutine returns.
    """
    banner = "=" * 80
    underline = "-" * 50

    def heading(title):
        # Section title followed by the 50-dash rule used by every section.
        print(title)
        print(underline)

    print(banner)
    print(" KENYAN STOCK MARKET SENTIMENT ANALYSIS SYSTEM")
    print(banner)
    print("Enhanced with improved web scraping and comprehensive error handling")
    print()

    # Test with fewer symbols initially
    demo_symbols = ['SCOM', 'EQTY', 'KCB', 'EABL']

    print(f"Testing with symbols: {', '.join(demo_symbols)}")
    print("Sentiment Scale: 0.5-1.0 = POSITIVE, -0.5-0.5 = NEGATIVE")
    print()

    # Initialize system
    system = KenyanSentimentSystem(symbols=demo_symbols)

    try:
        heading("1. SYSTEM DIAGNOSTICS")
        print(f"✓ Database initialized: {system.db_manager.db_path}")
        print(f"✓ Sentiment analyzer ready: {system.sentiment_analyzer.sia is not None}")
        print(f"✓ Selenium available: {system.web_scraper.selenium_available}")
        print(f"✓ Requests session ready: {system.web_scraper.session is not None}")
        print()

        heading("2. RUNNING ENHANCED DATA COLLECTION...")

        # Run collection cycle
        collected = await system.run_collection_cycle()
        succeeded = collected['successful_symbols']
        failed = collected['failed_symbols']

        print(f"✓ Symbols processed: {len(collected['symbols_processed'])}")
        print(f"✓ Total records found: {collected['total_records_found']}")
        print(f"✓ Total records saved: {collected['total_records_saved']}")
        print(f"✓ Success rate: {len(succeeded)}/{len(demo_symbols)} symbols")

        if succeeded:
            print(f"✓ Successful: {', '.join(succeeded)}")
        if failed:
            print(f"⚠ Failed: {', '.join(failed)}")

        print()

        heading("3. CALCULATING SENTIMENT AGGREGATIONS...")

        # Calculate aggregations
        aggregated = system.calculate_aggregated_scores()

        print(f"✓ Symbols with aggregations: {len(aggregated['symbols_processed'])}")
        print()

        heading("4. CURRENT SENTIMENT RESULTS")
        print(f"{'Symbol':<6} | {'Company':<25} | {'Status':<13} | {'Score':<7} | {'Mentions':<8} | {'Confidence':<10}")
        print("-" * 80)

        # One table row per symbol.
        for ticker in demo_symbols:
            row = system.get_current_sentiment_summary(ticker)

            name = row['company_name'][:25]
            label = row['sentiment_status']
            value = row['current_sentiment_score']
            hits = row['total_mentions']
            certainty = row['prediction_confidence']

            print(f"{ticker:<6} | {name:<25} | {label:<13} | {value:<7.3f} | {hits:<8} | {certainty:<10.2f}")

        print()

        heading("5. SCRAPING DIAGNOSTICS")

        diag = system.get_scraping_diagnostics()

        if 'total_attempts' not in diag:
            print("No diagnostics available yet")
        else:
            print(f"✓ Total scraping attempts: {diag['total_attempts']}")
            print(f"✓ Successful attempts: {diag['successful_attempts']}")
            print(f"✓ Success rate: {diag['success_rate']:.1f}%")
            print(f"✓ Total records found: {diag['total_records_found']}")
            print(f"✓ Average response time: {diag['average_response_time']:.2f}s")
            print(f"✓ Sources attempted: {diag['sources_attempted']}")

            if diag['recent_errors']:
                print("\nRecent errors:")
                # Only the first three distinct messages are shown.
                top_errors = list(diag['recent_errors'].items())[:3]
                for message, occurrences in top_errors:
                    print(f"  - {message}: {occurrences} times")

        print()

        heading("6. TROUBLESHOOTING TIPS")

        if collected['total_records_found'] == 0:
            for line in (
                "⚠ No records found. Possible issues:",
                "  - Websites may have changed their structure",
                "  - Network connectivity issues",
                "  - Anti-scraping measures blocking requests",
                "  - Content may not contain the target keywords",
            ):
                print(line)
            print()
            for line in (
                "✓ Solutions:",
                "  - Check internet connection",
                "  - Update Chrome/ChromeDriver",
                "  - Try running again (some sites have rate limiting)",
                "  - Check the scraping logs in the database",
            ):
                print(line)
        else:
            print("✓ Data collection working!")
            print(f"  - Successfully collected {collected['total_records_found']} records")
            print(f"  - {len(collected['successful_symbols'])} symbols processed successfully")

        print()

        heading("7. NEXT ")
        for line in (
            "✓ System is ready for continuous monitoring",
            "✓ Database contains structured sentiment data",
            "✓ Data format optimized for predictive algorithms",
        ):
            print(line)
        print()
        for line in (
            "Manual commands:",
            "• Get diagnostics: system.get_scraping_diagnostics()",
            "• Run collection: await system.run_collection_cycle()",
            "• Calculate scores: system.calculate_aggregated_scores()",
            "• Get summary: system.get_current_sentiment_summary('SCOM')",
        ):
            print(line)

        print()
        print(banner)
        print("FIXED SYSTEM READY FOR KENYAN MARKET ANALYSIS!")
        print(banner)

        return system

    except Exception as exc:
        logging.error(f"Demo error: {exc}")
        print(f"❌ Demo error: {exc}")
        return None

    finally:
        # Cleanup runs on both the success and the failure path.
        system.cleanup()

if __name__ == "__main__":
    # Run the fixed demo on a fresh event loop created by asyncio.run().
    # NOTE(review): inside a notebook an event loop is presumably already
    # running, in which case asyncio.run() needs nest_asyncio applied first
    # (or `await run_fixed_demo()` used instead) - confirm for the target runtime.
    asyncio.run(run_fixed_demo())

FIXED KENYAN STOCK MARKET SENTIMENT ANALYSIS SYSTEM
Enhanced with improved web scraping and comprehensive error handling

Testing with symbols: SCOM, EQTY, KCB, EABL
Sentiment Scale: 0.5-1.0 = POSITIVE, -0.5-0.5 = NEGATIVE

1. SYSTEM DIAGNOSTICS
--------------------------------------------------
✓ Database initialized: kenyan_market_sentiment_fixed.db
✓ Sentiment analyzer ready: True
✓ Selenium available: True
✓ Requests session ready: True

2. RUNNING ENHANCED DATA COLLECTION...
--------------------------------------------------


ERROR:root:Selenium error scraping capital_fm for SCOM: HTTPConnectionPool(host='localhost', port=37489): Max retries exceeded with url: /session/10840ccaea01c191cacb8002f792d987/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7a868f2c68a0>: Failed to establish a new connection: [Errno 111] Connection refused'))


KeyboardInterrupt: 

In [3]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.36.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio<1.0,>=0.30.0 (from selenium)
  Downloading trio-0.31.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio<1.0,>=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket<1.0,>=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.36.0-py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.31.0-py3-none-any.whl (512 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.7/512.7 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloadin

In [6]:
# Patch asyncio so that a nested event loop can be run from within an
# already-running one. NOTE(review): this presumably has to execute before
# the demo cell that calls asyncio.run() - confirm the intended cell order.
import nest_asyncio
nest_asyncio.apply()

In [None]:
!pip install certifi

