In [1]:
# Data manipulation and analysis
import numpy as np
import pandas as pd

# Optimization libraries
import scipy.optimize as sco

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')

# API requests
import requests
from datetime import datetime, timedelta
import json
import os

# For model persistence
import pickle

# Warning suppression
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Function to fetch NSE stock data from Yahoo Finance
def fetch_nse_data(tickers, start_date, end_date):
    """
    Fetch historical stock data for NSE-listed companies
    
    Parameters:
    - tickers (list): List of NSE stock symbols with '.NR' suffix for Nairobi exchange
    - start_date (str): Start date in YYYY-MM-DD format
    - end_date (str): End date in YYYY-MM-DD format
    
    Returns:
    - DataFrame with historical stock prices
    """
    try:
        # Yahoo Finance requires headers to prevent blocking
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        # Create empty DataFrame to store all stock data
        all_data = pd.DataFrame()
        
        for ticker in tickers:
            # Convert dates to Unix timestamp for Yahoo Finance API
            start_timestamp = int(datetime.strptime(start_date, '%Y-%m-%d').timestamp())
            end_timestamp = int(datetime.strptime(end_date, '%Y-%m-%d').timestamp())
            
            # Construct URL
            url = f"https://query1.finance.yahoo.com/v7/finance/download/{ticker}?period1={start_timestamp}&period2={end_timestamp}&interval=1d&events=history"
            
            # Fetch data
            response = requests.get(url, headers=headers)
            
            if response.status_code == 200:
                # Create DataFrame from CSV content
                stock_data = pd.read_csv(pd.io.common.StringIO(response.text))
                stock_data['Symbol'] = ticker
                
                # Append to main DataFrame
                if all_data.empty:
                    all_data = stock_data
                else:
                    all_data = pd.concat([all_data, stock_data])
            else:
                print(f"Failed to fetch data for {ticker}: Status code {response.status_code}")
        
        # Convert date to datetime
        all_data['Date'] = pd.to_datetime(all_data['Date'])
        
        return all_data
    
    except Exception as e:
        print(f"Error fetching stock data: {str(e)}")
        return pd.DataFrame()

In [3]:
# Load sample NSE data if API fetch fails
def load_sample_nse_data():
    """
    Load sample NSE stock data for demonstration purposes
    
    Returns:
    - DataFrame with sample historical stock prices
    """
    # Create sample data for key NSE stocks with realistic values
    # We'll generate synthetic data based on real NSE stock behavior
    np.random.seed(42)  # For reproducibility
    
    # Define stock parameters (based on historical NSE performance)
    stocks = {
        'SCOM.NR': {'mean': 0.0005, 'std': 0.015, 'start_price': 42.75},  # Safaricom
        'EQTY.NR': {'mean': 0.0003, 'std': 0.012, 'start_price': 51.00},  # Equity Group
        'KCB.NR': {'mean': 0.0002, 'std': 0.014, 'start_price': 45.50},   # KCB Group
        'COOP.NR': {'mean': 0.0003, 'std': 0.013, 'start_price': 16.00},  # Cooperative Bank
        'EABL.NR': {'mean': 0.0001, 'std': 0.011, 'start_price': 170.00}, # East African Breweries
        'BAT.NR': {'mean': -0.0001, 'std': 0.010, 'start_price': 800.00}, # BAT Kenya
        'ABSA.NR': {'mean': 0.0002, 'std': 0.012, 'start_price': 11.75},  # Absa Bank Kenya
        'JUB.NR': {'mean': 0.0004, 'std': 0.014, 'start_price': 305.00}   # Jubilee Holdings
    }
    
    # Generate dates (2 years of trading days - approximately 504 days)
    end_date = datetime.now()
    dates = [end_date - timedelta(days=i) for i in range(504)]
    dates.reverse()  # Sort chronologically
    dates = [date for date in dates if date.weekday() < 5]  # Keep only weekdays
    
    # Create empty DataFrame
    all_data = pd.DataFrame()
    
    # Generate data for each stock
    for ticker, params in stocks.items():
        # Generate returns using random walk with drift
        returns = np.random.normal(params['mean'], params['std'], size=len(dates))
        
        # Convert returns to prices
        price = params['start_price']
        prices = [price]
        
        for ret in returns[1:]:
            price = price * (1 + ret)
            prices.append(price)
        
        # Create DataFrame for this stock
        stock_data = pd.DataFrame({
            'Date': dates,
            'Open': prices,
            'High': [p * (1 + abs(np.random.normal(0, 0.005))) for p in prices],
            'Low': [p * (1 - abs(np.random.normal(0, 0.005))) for p in prices],
            'Close': prices,
            'Adj Close': prices,
            'Volume': [int(np.random.normal(500000, 100000)) for _ in range(len(dates))],
            'Symbol': ticker
        })
        
        # Append to main DataFrame
        if all_data.empty:
            all_data = stock_data
        else:
            all_data = pd.concat([all_data, stock_data])
    
    return all_data

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import os

BASE_URL = "https://www.nse.co.ke/dataservices/historical-data/"
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Define the different categories to scrape
data_categories = {
    "daily_equities": "daily-equities-pricelist",
    "daily_bonds": "daily-bonds-pricelist",
    "weekly_equities": "weekly-equities-pricelist",
    "monthly_bulletin": "monthly-bulletin",
}

# Date range from 2015 to 2025
start_date = datetime.date(2015, 1, 1)
end_date = datetime.date(2025, 12, 31)

# Function to scrape data
def scrape_nse_data(category, start_date, end_date):
    current_date = start_date
    all_data = []
    while current_date <= end_date:
        formatted_date = current_date.strftime("%Y-%m-%d")
        url = f"{BASE_URL}{data_categories[category]}?date={formatted_date}"
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            tables = soup.find_all("table")
            for table in tables:
                df = pd.read_html(str(table))[0]
                df["Date"] = formatted_date
                all_data.append(df)
        else:
            print(f"Failed to fetch data for {formatted_date}: {response.status_code}")
        current_date += datetime.timedelta(days=1)
    return pd.concat(all_data, ignore_index=True) if all_data else None

# Create a directory for storing data
if not os.path.exists("nse_data"): 
    os.makedirs("nse_data")

# Scrape and save data
for category in data_categories.keys():
    print(f"Scraping {category} data...")
    df = scrape_nse_data(category, start_date, end_date)
    if df is not None:
        file_path = f"nse_data/{category}.csv"
        df.to_csv(file_path, index=False)
        print(f"Saved {category} data to {file_path}")
    else:
        print(f"No data found for {category}")

Scraping daily_equities data...


ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
from datetime import datetime
import logging
import re

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("nse_scraper.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

class NSEDataScraper:
    def __init__(self):
        """Initialize the NSE data scraper with necessary parameters"""
        self.base_dir = 'nse_data'
        os.makedirs(self.base_dir, exist_ok=True)
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Define NSE stock symbols from the provided data
        self.stock_symbols = [
            'EQTY', 'KCB', 'COOP', 'ABSA', 'SCBK', 'DTK', 'IMH', 'NCBA',
            'SCOM', 'EABL', 'BAT', 'BAMB', 'KEGN', 'KPLC', 'JUB', 'BRIT',
            'CIC', 'CTUM', 'NBV', 'HAFR', 'TOTL', 'KENO', 'KUKZ', 'SASN',
            'KAPC', 'SIC', 'SCAN', 'NMG', 'LKL', 'UMME'
        ]
        
        # Company codes with NSE suffix
        self.nse_codes = [f"{symbol}.NR" for symbol in self.stock_symbols]
        
        # HTTP headers to mimic browser behavior
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': 'https://www.google.com/'
        }
        
        logger.info(f"NSE Scraper initialized with {len(self.stock_symbols)} stocks")

    def scrape_nse_market_statistics(self):
        """Scrape market statistics from the NSE official website"""
        url = "https://www.nse.co.ke/dataservices/market-statistics/"
        logger.info(f"Scraping market statistics from {url}")
        
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find all tables on the page
            tables = soup.find_all('table')
            logger.info(f"Found {len(tables)} tables on market statistics page")
            
            # Process and store each table
            for i, table in enumerate(tables):
                try:
                    df = pd.read_html(str(table))[0]
                    
                    # Create directory if it doesn't exist
                    output_dir = f"{self.base_dir}/market_statistics"
                    os.makedirs(output_dir, exist_ok=True)
                    
                    # Save to CSV
                    file_path = f"{output_dir}/table_{i+1}_{self.timestamp}.csv"
                    df.to_csv(file_path, index=False)
                    logger.info(f"Saved table {i+1} with shape {df.shape} to {file_path}")
                except Exception as e:
                    logger.error(f"Error processing table {i+1}: {e}")
            
            return True
        
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching market statistics: {e}")
            return False
    
    def scrape_nse_historical_data(self):
        """Scrape historical data links from the NSE website"""
        url = "https://www.nse.co.ke/dataservices/historical-data/"
        logger.info(f"Scraping historical data links from {url}")
        
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find all download links
            download_links = []
            for link in soup.find_all('a', href=True):
                href = link.get('href', '')
                if href.endswith(('.xlsx', '.xls', '.csv', '.pdf')) or 'download' in href.lower():
                    download_links.append({
                        'text': link.text.strip() or "Unnamed Link",
                        'href': href,
                        'date_scraped': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    })
            
            logger.info(f"Found {len(download_links)} download links")
            
            # Save links to CSV
            if download_links:
                links_df = pd.DataFrame(download_links)
                output_dir = f"{self.base_dir}/historical_data"
                os.makedirs(output_dir, exist_ok=True)
                file_path = f"{output_dir}/download_links_{self.timestamp}.csv"
                links_df.to_csv(file_path, index=False)
                logger.info(f"Saved download links to {file_path}")
                
                # Now download the files
                self.download_files(links_df, f"{output_dir}/files")
            
            return True
        
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching historical data links: {e}")
            return False
    
    def download_files(self, links_df, output_dir):
        """Download files from the extracted links"""
        if links_df.empty:
            logger.warning("No links provided for download")
            return
        
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Downloading {len(links_df)} files to {output_dir}")
        
        for i, row in links_df.iterrows():
            try:
                url = row['href']
                # Handle relative URLs
                if not url.startswith(('http://', 'https://')):
                    url = f"https://www.nse.co.ke{'' if url.startswith('/') else '/'}{url}"
                
                logger.info(f"Downloading file {i+1}/{len(links_df)}: {url}")
                response = requests.get(url, headers=self.headers, timeout=60)
                response.raise_for_status()
                
                # Extract filename from URL or generate one
                if '/' in url:
                    filename = url.split('/')[-1]
                    # Clean up filename
                    filename = re.sub(r'[\\/*?:"<>|]', "_", filename)
                    if not filename or len(filename) < 3:
                        filename = f"file_{i+1}_{self.timestamp}.dat"
                else:
                    filename = f"file_{i+1}_{self.timestamp}.dat"
                
                # Save the file
                with open(f"{output_dir}/{filename}", 'wb') as f:
                    f.write(response.content)
                logger.info(f"Successfully downloaded {filename}")
                
                # Be polite to the server
                time.sleep(2)
            
            except Exception as e:
                logger.error(f"Error downloading file {i+1}: {e}")
    
    def scrape_mystocks_data(self):
        """Scrape stock data from myStocks Kenya website"""
        logger.info("Scraping data from myStocks Kenya")
        
        all_stock_data = []
        
        for symbol in self.stock_symbols:
            url = f"https://live.mystocks.co.ke/stock={symbol}"
            logger.info(f"Scraping {symbol} from myStocks Kenya")
            
            try:
                response = requests.get(url, headers=self.headers, timeout=30)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.content, 'html.parser')
                
                # Extract stock price
                price = None
                price_elem = soup.select_one('.roundbox .price')
                if price_elem:
                    price = price_elem.text.strip()
                
                # Extract price change
                change = None
                change_elem = soup.select_one('.roundbox .change')
                if change_elem:
                    change = change_elem.text.strip()
                
                # Extract volume
                volume = None
                volume_elem = soup.select_one('.roundbox .volume')
                if volume_elem:
                    volume = volume_elem.text.strip()
                
                # Find stock details table
                details = {}
                details_table = soup.select_one('table.infotable')
                if details_table:
                    rows = details_table.select('tr')
                    for row in rows:
                        cells = row.select('td')
                        if len(cells) >= 2:
                            key = cells[0].text.strip().rstrip(':')
                            value = cells[1].text.strip()
                            details[key] = value
                
                stock_data = {
                    'Symbol': symbol,
                    'Price': price,
                    'Change': change,
                    'Volume': volume,
                    'Timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    **details
                }
                
                all_stock_data.append(stock_data)
                logger.info(f"Successfully scraped {symbol} data")
                
                # Be polite to the server
                time.sleep(2)
            
            except Exception as e:
                logger.error(f"Error scraping {symbol} from myStocks: {e}")
        
        # Save the collected data
        if all_stock_data:
            df = pd.DataFrame(all_stock_data)
            output_dir = f"{self.base_dir}/mystocks"
            os.makedirs(output_dir, exist_ok=True)
            file_path = f"{output_dir}/stock_data_{self.timestamp}.csv"
            df.to_csv(file_path, index=False)
            logger.info(f"Saved myStocks data to {file_path}")
            
            return df
        else:
            logger.warning("No stock data collected from myStocks Kenya")
            return pd.DataFrame()
    
    def scrape_investing_data(self):
        """Scrape NSE indices data from Investing.com"""
        logger.info("Scraping NSE indices data from Investing.com")
        
        indices = [
            {
                'name': 'NSE_20',
                'url': 'https://www.investing.com/indices/kenya-nse-20-historical-data'
            },
            {
                'name': 'NASI',
                'url': 'https://www.investing.com/indices/nairobi-all-share-historical-data'
            }
        ]
        
        for index in indices:
            logger.info(f"Scraping {index['name']} from {index['url']}")
            
            try:
                # Custom headers for Investing.com
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Referer': 'https://www.investing.com/'
                }
                
                response = requests.get(index['url'], headers=headers, timeout=30)
                response.raise_for_status()
                
                # Parse HTML
                soup = BeautifulSoup(response.content, 'html.parser')
                
                # Find the historical data table
                table = soup.find('table', {'id': 'curr_table'})
                
                if table:
                    # Extract historical data
                    df = pd.read_html(str(table))[0]
                    
                    # Save to CSV
                    output_dir = f"{self.base_dir}/investing"
                    os.makedirs(output_dir, exist_ok=True)
                    file_path = f"{output_dir}/{index['name']}_{self.timestamp}.csv"
                    df.to_csv(file_path, index=False)
                    logger.info(f"Saved {index['name']} data with {len(df)} rows to {file_path}")
                else:
                    logger.warning(f"No data table found for {index['name']}")
                
                # Be polite to the server
                time.sleep(3)
                
            except Exception as e:
                logger.error(f"Error scraping {index['name']} from Investing.com: {e}")
    
    def scrape_mendeley_datasets(self):
        """Get metadata about NSE datasets from Mendeley Data"""
        logger.info("Collecting information about NSE datasets from Mendeley Data")
        
        datasets = [
            {
                'name': 'NSE_Stocks_2022',
                'url': 'https://data.mendeley.com/datasets/jmcdmnyh2s'
            },
            {
                'name': 'NSE_Stocks_2023_2024',
                'url': 'https://data.mendeley.com/datasets/ss5pfw8xnk/1'
            }
        ]
        
        dataset_info = []
        
        for dataset in datasets:
            logger.info(f"Getting metadata for {dataset['name']} from {dataset['url']}")
            
            try:
                response = requests.get(dataset['url'], headers=self.headers, timeout=30)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.content, 'html.parser')
                
                # Extract dataset metadata
                title_elem = soup.select_one('h1.title')
                title = title_elem.text.strip() if title_elem else dataset['name']
                
                # Extract description
                desc_elem = soup.select_one('.description')
                description = desc_elem.text.strip() if desc_elem else "No description available"
                
                # Extract file information
                files = []
                file_elems = soup.select('.file-list-item')
                for file_elem in file_elems:
                    file_name_elem = file_elem.select_one('.name')
                    file_name = file_name_elem.text.strip() if file_name_elem else "Unknown file"
                    files.append(file_name)
                
                dataset_info.append({
                    'Dataset': dataset['name'],
                    'Title': title,
                    'URL': dataset['url'],
                    'Description': description,
                    'Files': ", ".join(files) if files else "File list not available",
                    'Timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })
                
                logger.info(f"Successfully collected metadata for {dataset['name']}")
                
                # Be polite to the server
                time.sleep(3)
                
            except Exception as e:
                logger.error(f"Error collecting metadata for {dataset['name']}: {e}")
                dataset_info.append({
                    'Dataset': dataset['name'],
                    'Title': dataset['name'],
                    'URL': dataset['url'],
                    'Description': "Error collecting metadata",
                    'Files': "Not available",
                    'Timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })
        
        # Save dataset information
        if dataset_info:
            df = pd.DataFrame(dataset_info)
            output_dir = f"{self.base_dir}/mendeley"
            os.makedirs(output_dir, exist_ok=True)
            file_path = f"{output_dir}/dataset_info_{self.timestamp}.csv"
            df.to_csv(file_path, index=False)
            logger.info(f"Saved Mendeley dataset information to {file_path}")
            
            # Note: Direct download may require authentication,
            # so we're just collecting metadata here
            logger.info("Note: To download the actual datasets, please visit the Mendeley Data website")
    
    def run(self):
        """Run all scraping functions"""
        logger.info(f"Starting NSE data scraping at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        
        # Create the main directory
        os.makedirs(self.base_dir, exist_ok=True)
        
        # Scrape NSE official website
        self.scrape_nse_market_statistics()
        self.scrape_nse_historical_data()
        
        # Scrape additional sources
        self.scrape_mystocks_data()
        self.scrape_investing_data()
        self.scrape_mendeley_datasets()
        
        logger.info(f"Scraping completed! All data saved to {os.path.abspath(self.base_dir)}")
        return os.path.abspath(self.base_dir)

if __name__ == "__main__":
    scraper = NSEDataScraper()
    output_path = scraper.run()
    print(f"\nScraping completed successfully!\nData saved to: {output_path}")

2025-03-13 15:33:46,506 - INFO - NSE Scraper initialized with 30 stocks
2025-03-13 15:33:46,508 - INFO - Starting NSE data scraping at 2025-03-13 15:33:46
2025-03-13 15:33:46,510 - INFO - Scraping market statistics from https://www.nse.co.ke/dataservices/market-statistics/
2025-03-13 15:33:47,505 - INFO - Found 6 tables on market statistics page
2025-03-13 15:33:47,524 - INFO - Saved table 1 with shape (16, 3) to nse_data/market_statistics/table_1_20250313_153346.csv
2025-03-13 15:33:47,532 - INFO - Saved table 2 with shape (0, 8) to nse_data/market_statistics/table_2_20250313_153346.csv
2025-03-13 15:33:47,539 - INFO - Saved table 3 with shape (4, 6) to nse_data/market_statistics/table_3_20250313_153346.csv
2025-03-13 15:33:47,545 - INFO - Saved table 4 with shape (1, 6) to nse_data/market_statistics/table_4_20250313_153346.csv
2025-03-13 15:33:47,554 - INFO - Saved table 5 with shape (2, 3) to nse_data/market_statistics/table_5_20250313_153346.csv
2025-03-13 15:33:47,558 - INFO - Sav


Scraping completed successfully!
Data saved to: c:\xampp\htdocs\PesaGuru\notebooks\predictive_modeling\nse_data


In [4]:
# Load sample NSE data if API fetch fails
def load_sample_nse_data():
    """
    Load sample NSE stock data for demonstration purposes
    
    Returns:
    - DataFrame with sample historical stock prices
    """
    # Create sample data for key NSE stocks with realistic values
    # We'll generate synthetic data based on real NSE stock behavior
    np.random.seed(42)  # For reproducibility
    
    # Define stock parameters (based on historical NSE performance)
    stocks = {
        'SCOM.NR': {'mean': 0.0005, 'std': 0.015, 'start_price': 42.75},  # Safaricom
        'EQTY.NR': {'mean': 0.0003, 'std': 0.012, 'start_price': 51.00},  # Equity Group
        'KCB.NR': {'mean': 0.0002, 'std': 0.014, 'start_price': 45.50},   # KCB Group
        'COOP.NR': {'mean': 0.0003, 'std': 0.013, 'start_price': 16.00},  # Cooperative Bank
        'EABL.NR': {'mean': 0.0001, 'std': 0.011, 'start_price': 170.00}, # East African Breweries
        'BAT.NR': {'mean': -0.0001, 'std': 0.010, 'start_price': 800.00}, # BAT Kenya
        'ABSA.NR': {'mean': 0.0002, 'std': 0.012, 'start_price': 11.75},  # Absa Bank Kenya
        'JUB.NR': {'mean': 0.0004, 'std': 0.014, 'start_price': 305.00}   # Jubilee Holdings
    }
    
    # Generate dates (2 years of trading days - approximately 504 days)
    end_date = datetime.now()
    dates = [end_date - timedelta(days=i) for i in range(504)]
    dates.reverse()  # Sort chronologically
    dates = [date for date in dates if date.weekday() < 5]  # Keep only weekdays
    
    # Create empty DataFrame
    all_data = pd.DataFrame()
    
    # Generate data for each stock
    for ticker, params in stocks.items():
        # Generate returns using random walk with drift
        returns = np.random.normal(params['mean'], params['std'], size=len(dates))
        
        # Convert returns to prices
        price = params['start_price']
        prices = [price]
        
        for ret in returns[1:]:
            price = price * (1 + ret)
            prices.append(price)
        
        # Create DataFrame for this stock
        stock_data = pd.DataFrame({
            'Date': dates,
            'Open': prices,
            'High': [p * (1 + abs(np.random.normal(0, 0.005))) for p in prices],
            'Low': [p * (1 - abs(np.random.normal(0, 0.005))) for p in prices],
            'Close': prices,
            'Adj Close': prices,
            'Volume': [int(np.random.normal(500000, 100000)) for _ in range(len(dates))],
            'Symbol': ticker
        })
        
        # Append to main DataFrame
        if all_data.empty:
            all_data = stock_data
        else:
            all_data = pd.concat([all_data, stock_data])
    
    return all_data

In [12]:
# Function to preprocess stock data
def preprocess_stock_data(data):
    """
    Preprocess stock data for portfolio optimization
    
    Parameters:
    - data (DataFrame): Raw stock data
    
    Returns:
    - DataFrame with daily returns
    - DataFrame with monthly returns
    """
    # Make a copy of the data
    df = data.copy()
    
    # Pivot the data to have dates as index and tickers as columns
    pivot_df = df.pivot(index='Date', columns='Symbol', values='Adj Close')
    
    # Calculate daily returns
    daily_returns = pivot_df.pct_change().dropna()
    
    # Calculate monthly returns (resample to month end)
    monthly_returns = pivot_df.resample('M').last().pct_change().dropna()
    
    return daily_returns, monthly_returns

# Preprocess the stock data
daily_returns, monthly_returns = preprocess_stock_data(stock_data)

# Display first few rows of daily returns
print("Daily Returns:")
daily_returns.head()

Daily Returns:


Symbol,ABSA.NR,BAT.NR,COOP.NR,EABL.NR,EQTY.NR,JUB.NR,KCB.NR,SCOM.NR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-10-30 14:28:08.729639,-0.009045,0.00055,-0.001094,0.005783,0.014382,-0.017907,-0.02949,-0.001574
2023-10-31 14:28:08.729639,0.01392,0.008033,0.016164,-0.010695,0.002472,0.004231,-0.008296,0.010215
2023-11-01 14:28:08.729639,0.002418,-0.008539,-0.013697,-0.015353,-0.015262,0.022336,0.006608,0.023345
2023-11-02 14:28:08.729639,0.003226,-0.017537,-0.009613,0.001279,0.005096,0.008787,-0.038265,-0.003012
2023-11-03 14:28:08.729639,-0.023444,0.001468,0.013772,-0.003407,-0.007516,0.002859,-0.006796,-0.003012


In [None]:
# Calculate and visualize correlation matrix
correlation_matrix = daily_returns.corr()

# Plot correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, linewidths=0.5)
plt.title('Correlation Matrix of NSE Stocks', fontsize=16)
plt.show()

In [None]:
# Calculate and visualize annualized returns and volatility
# We'll use daily data for more accuracy
mean_returns = daily_returns.mean() * 252  # Annualize (252 trading days)
volatility = daily_returns.std() * np.sqrt(252)  # Annualized volatility

# Create DataFrame with return and risk metrics
risk_return = pd.DataFrame({
    'Annualized Return': mean_returns,
    'Annualized Volatility': volatility
})

# Display risk-return metrics
print("Risk-Return Metrics:")
risk_return.sort_values('Annualized Return', ascending=False)

In [None]:
# Plot risk-return scatter plot
plt.figure(figsize=(12, 8))
plt.scatter(risk_return['Annualized Volatility'], risk_return['Annualized Return'], s=100)

# Add labels for each stock
for i, ticker in enumerate(risk_return.index):
    plt.annotate(ticker.replace('.NR', ''), 
                 (risk_return['Annualized Volatility'][i], risk_return['Annualized Return'][i]),
                 xytext=(10, 5), textcoords='offset points', fontsize=12)

plt.xlabel('Annualized Volatility (Risk)', fontsize=14)
plt.ylabel('Annualized Return', fontsize=14)
plt.title('Risk-Return Profile of NSE Stocks', fontsize=16)
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Define functions for portfolio calculations

def portfolio_performance(weights, mean_returns, cov_matrix):
    """
    Calculate portfolio return and volatility
    
    Parameters:
    - weights: Portfolio weights
    - mean_returns: Expected returns
    - cov_matrix: Covariance matrix
    
    Returns:
    - returns, volatility, sharpe_ratio
    """
    returns = np.sum(mean_returns * weights) * 252  # Annualized return
    volatility = np.sqrt(np.dot(weights.T, np.dot(cov_matrix * 252, weights)))  # Annualized volatility
    sharpe_ratio = returns / volatility  # Sharpe ratio (assuming 0% risk-free rate)
    return returns, volatility, sharpe_ratio

def negative_sharpe(weights, mean_returns, cov_matrix):
    """
    Return negative Sharpe ratio (for minimization)
    """
    returns, volatility, sharpe = portfolio_performance(weights, mean_returns, cov_matrix)
    return -sharpe

def portfolio_volatility(weights, mean_returns, cov_matrix):
    """
    Return portfolio volatility (for minimization)
    """
    return portfolio_performance(weights, mean_returns, cov_matrix)[1]

def portfolio_return(weights, mean_returns, cov_matrix):
    """
    Return portfolio return (for minimization)
    """
    return portfolio_performance(weights, mean_returns, cov_matrix)[0]

def max_sharpe_ratio(mean_returns, cov_matrix, num_assets):
    """
    Find portfolio with maximum Sharpe ratio
    """
    # Initial guess (equal weights)
    args = (mean_returns, cov_matrix)
    constraints = ({'type': 'eq', 'fun': lambda x: np.sum(x) - 1})
    bounds = tuple((0, 1) for _ in range(num_assets))
    
    # Optimize
    result = sco.minimize(negative_sharpe, num_assets * [1./num_assets], args=args,
                        method='SLSQP', bounds=bounds, constraints=constraints)
    
    return result

def min_volatility(mean_returns, cov_matrix, num_assets):
    """
    Find portfolio with minimum volatility
    """
    # Initial guess (equal weights)
    args = (mean_returns, cov_matrix)
    constraints = ({'type': 'eq', 'fun': lambda x: np.sum(x) - 1})
    bounds = tuple((0, 1) for _ in range(num_assets))
    
    # Optimize
    result = sco.minimize(portfolio_volatility, num_assets * [1./num_assets], args=args,
                        method='SLSQP', bounds=bounds, constraints=constraints)
    
    return result

def efficient_frontier(mean_returns, cov_matrix, num_assets, returns_range):
    """
    Calculate efficient frontier for a range of target returns
    """
    efficient_portfolios = []
    
    for target_return in returns_range:
        args = (mean_returns, cov_matrix)
        constraints = (
            {'type': 'eq', 'fun': lambda x: portfolio_return(x, mean_returns, cov_matrix) - target_return},
            {'type': 'eq', 'fun': lambda x: np.sum(x) - 1}
        )
        bounds = tuple((0, 1) for _ in range(num_assets))
        
        # Find minimum volatility for target return
        result = sco.minimize(portfolio_volatility, num_assets * [1./num_assets], args=args,
                            method='SLSQP', bounds=bounds, constraints=constraints)
        
        if result['success']:
            efficient_portfolios.append({
                'return': target_return,
                'volatility': result['fun'],
                'weights': result['x']
            })
    
    return efficient_portfolios

In [None]:
# Calculate efficient frontier

# Get mean returns and covariance matrix
mean_returns_daily = daily_returns.mean()
cov_matrix = daily_returns.cov()
num_assets = len(mean_returns_daily)

# Find portfolio with maximum Sharpe ratio
max_sharpe_result = max_sharpe_ratio(mean_returns_daily, cov_matrix, num_assets)
max_sharpe_weights = max_sharpe_result['x']
max_sharpe_returns, max_sharpe_volatility, max_sharpe = portfolio_performance(max_sharpe_weights, mean_returns_daily, cov_matrix)

# Find portfolio with minimum volatility
min_vol_result = min_volatility(mean_returns_daily, cov_matrix, num_assets)
min_vol_weights = min_vol_result['x']
min_vol_returns, min_vol_volatility, min_vol_sharpe = portfolio_performance(min_vol_weights, mean_returns_daily, cov_matrix)

# Print results
print("Maximum Sharpe Ratio Portfolio:")
print(f"Annual Return: {max_sharpe_returns:.4f}")
print(f"Annual Volatility: {max_sharpe_volatility:.4f}")
print(f"Sharpe Ratio: {max_sharpe:.4f}")
print("\nWeights:")
for i, ticker in enumerate(daily_returns.columns):
    print(f"{ticker}: {max_sharpe_weights[i]:.4f}")

print("\nMinimum Volatility Portfolio:")
print(f"Annual Return: {min_vol_returns:.4f}")
print(f"Annual Volatility: {min_vol_volatility:.4f}")
print(f"Sharpe Ratio: {min_vol_sharpe:.4f}")
print("\nWeights:")
for i, ticker in enumerate(daily_returns.columns):
    print(f"{ticker}: {min_vol_weights[i]:.4f}")

In [None]:
# Calculate efficient frontier
target_returns = np.linspace(min_vol_returns, max_sharpe_returns, 30)
efficient_portfolios = efficient_frontier(mean_returns_daily, cov_matrix, num_assets, target_returns)

# Extract results
ef_returns = [p['return'] for p in efficient_portfolios]
ef_volatility = [p['volatility'] for p in efficient_portfolios]

# Generate random portfolios for visualization
num_random_portfolios = 5000
random_returns = []
random_volatility = []

for _ in range(num_random_portfolios):
    # Generate random weights
    weights = np.random.random(num_assets)
    weights /= np.sum(weights)
    
    # Calculate portfolio performance
    returns, volatility, _ = portfolio_performance(weights, mean_returns_daily, cov_matrix)
    random_returns.append(returns)
    random_volatility.append(volatility)

# Plot efficient frontier
plt.figure(figsize=(14, 10))

# Plot random portfolios
plt.scatter(random_volatility, random_returns, c='lightgray', marker='.', s=10, alpha=0.3, label='Random Portfolios')

# Plot efficient frontier
plt.plot(ef_volatility, ef_returns, 'b-', linewidth=3, label='Efficient Frontier')

# Plot maximum Sharpe ratio portfolio
plt.scatter(max_sharpe_volatility, max_sharpe_returns, marker='*', color='red', s=300, label='Maximum Sharpe Ratio')

# Plot minimum volatility portfolio
plt.scatter(min_vol_volatility, min_vol_returns, marker='o', color='green', s=200, label='Minimum Volatility')

# Plot individual assets
for i, ticker in enumerate(daily_returns.columns):
    asset_vol = volatility[i]
    asset_ret = mean_returns[i]
    plt.scatter(asset_vol, asset_ret, marker='x', s=100, label=ticker)
    plt.annotate(ticker.replace('.NR', ''), (asset_vol, asset_ret), xytext=(10, 5), textcoords='offset points', fontsize=10)

plt.title('Efficient Frontier with NSE Stocks', fontsize=16)
plt.xlabel('Annualized Volatility', fontsize=14)
plt.ylabel('Annualized Return', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Function to recommend portfolio based on risk profile
def recommend_portfolio(risk_profile, mean_returns, cov_matrix, tickers):
    """
    Recommend a portfolio based on user's risk profile
    
    Parameters:
    - risk_profile: 'conservative', 'moderate', 'aggressive'
    - mean_returns: Mean returns Series
    - cov_matrix: Covariance matrix
    - tickers: List of stock tickers
    
    Returns:
    - Dictionary with portfolio details
    """
    num_assets = len(mean_returns)
    
    # Get minimum volatility and maximum Sharpe ratio portfolios
    min_vol_result = min_volatility(mean_returns, cov_matrix, num_assets)
    min_vol_weights = min_vol_result['x']
    min_vol_returns, min_vol_volatility, _ = portfolio_performance(min_vol_weights, mean_returns, cov_matrix)
    
    max_sharpe_result = max_sharpe_ratio(mean_returns, cov_matrix, num_assets)
    max_sharpe_weights = max_sharpe_result['x']
    max_sharpe_returns, max_sharpe_volatility, _ = portfolio_performance(max_sharpe_weights, mean_returns, cov_matrix)
    
    # Define target return based on risk profile
    if risk_profile.lower() == 'conservative':
        # Conservative investors prioritize capital preservation
        target_return = min_vol_returns  # Use minimum volatility portfolio
        weights = min_vol_weights
        expected_return = min_vol_returns
        expected_volatility = min_vol_volatility
    
    elif risk_profile.lower() == 'aggressive':
        # Aggressive investors prioritize maximum returns
        target_return = max_sharpe_returns  # Use maximum Sharpe ratio portfolio
        weights = max_sharpe_weights
        expected_return = max_sharpe_returns
        expected_volatility = max_sharpe_volatility
    
    elif risk_profile.lower() == 'moderate':
        # Moderate investors seek balance between risk and return
        # Target return halfway between min volatility and max Sharpe
        target_return = (min_vol_returns + max_sharpe_returns) / 2
        
        # Find portfolio with this target return
        args = (mean_returns, cov_matrix)
        constraints = (
            {'type': 'eq', 'fun': lambda x: portfolio_return(x, mean_returns, cov_matrix) - target_return},
            {'type': 'eq', 'fun': lambda x: np.sum(x) - 1}
        )
        bounds = tuple((0, 1) for _ in range(num_assets))
        
        result = sco.minimize(portfolio_volatility, num_assets * [1./num_assets], args=args,
                            method='SLSQP', bounds=bounds, constraints=constraints)
        
        weights = result['x']
        expected_return, expected_volatility, _ = portfolio_performance(weights, mean_returns, cov_matrix)
    
    else:
        raise ValueError("Risk profile must be 'conservative', 'moderate', or 'aggressive'")
    
    # Create portfolio recommendation
    portfolio = {
        'risk_profile': risk_profile,
        'expected_annual_return': expected_return,
        'expected_annual_volatility': expected_volatility,
        'allocations': []
    }
    
    # Add allocations with minimum 0.5% threshold to avoid tiny positions
    for i, ticker in enumerate(tickers):
        weight = weights[i]
        if weight >= 0.005:  # 0.5% minimum allocation
            portfolio['allocations'].append({
                'ticker': ticker.replace('.NR', ''),  # Remove suffix for display
                'weight': weight,
                'amount_per_10k': weight * 10000  # Amount to invest per KES 10,000
            })
    
    # Sort allocations by weight (descending)
    portfolio['allocations'] = sorted(portfolio['allocations'], key=lambda x: x['weight'], reverse=True)
    
    return portfolio

In [None]:
# Generate portfolio recommendations for different risk profiles
risk_profiles = ['Conservative', 'Moderate', 'Aggressive']
portfolio_recommendations = {}

for profile in risk_profiles:
    portfolio = recommend_portfolio(profile, mean_returns_daily, cov_matrix, daily_returns.columns)
    portfolio_recommendations[profile] = portfolio

# Display portfolio recommendations
for profile, portfolio in portfolio_recommendations.items():
    print(f"\n{profile} Portfolio Recommendation:")
    print(f"Expected Annual Return: {portfolio['expected_annual_return']:.2%}")
    print(f"Expected Annual Volatility: {portfolio['expected_annual_volatility']:.2%}")
    print("\nAllocations:")
    for allocation in portfolio['allocations']:
        print(f"{allocation['ticker']}: {allocation['weight']:.2%} (KES {allocation['amount_per_10k']:.0f} per KES 10,000)")

In [None]:
# Create a visualization of asset allocations for different risk profiles
fig, axes = plt.subplots(1, 3, figsize=(18, 8))

for i, profile in enumerate(risk_profiles):
    portfolio = portfolio_recommendations[profile]
    
    # Extract data for plotting
    labels = [alloc['ticker'] for alloc in portfolio['allocations']]
    sizes = [alloc['weight'] for alloc in portfolio['allocations']]
    
    # Create pie chart
    axes[i].pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, shadow=False)
    axes[i].set_title(f"{profile} Portfolio\nReturn: {portfolio['expected_annual_return']:.2%}, Risk: {portfolio['expected_annual_volatility']:.2%}", fontsize=14)
    axes[i].axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle

plt.suptitle('Portfolio Allocations by Risk Profile', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Create a portfolio optimization class for export
class PortfolioOptimizer:
    """
    Portfolio Optimization for Kenyan market
    """
    def __init__(self, mean_returns, cov_matrix, tickers):
        self.mean_returns = mean_returns
        self.cov_matrix = cov_matrix
        self.tickers = tickers
        self.num_assets = len(tickers)
    
    def portfolio_performance(self, weights):
        """
        Calculate portfolio performance metrics
        """
        returns = np.sum(self.mean_returns * weights) * 252
        volatility = np.sqrt(np.dot(weights.T, np.dot(self.cov_matrix * 252, weights)))
        sharpe_ratio = returns / volatility
        return returns, volatility, sharpe_ratio
    
    def optimize_portfolio(self, risk_profile):
        """
        Optimize portfolio based on risk profile
        """
        return recommend_portfolio(risk_profile, self.mean_returns, self.cov_matrix, self.tickers)
    
    def generate_efficient_frontier(self, num_points=30):
        """
        Generate efficient frontier
        """
        # Get min vol and max Sharpe portfolios
        min_vol_result = min_volatility(self.mean_returns, self.cov_matrix, self.num_assets)
        min_vol_weights = min_vol_result['x']
        min_vol_returns, min_vol_volatility, _ = portfolio_performance(min_vol_weights, self.mean_returns, self.cov_matrix)
        
        max_sharpe_result = max_sharpe_ratio(self.mean_returns, self.cov_matrix, self.num_assets)
        max_sharpe_weights = max_sharpe_result['x']
        max_sharpe_returns, max_sharpe_volatility, _ = portfolio_performance(max_sharpe_weights, self.mean_returns, self.cov_matrix)
        
        # Calculate efficient frontier
        target_returns = np.linspace(min_vol_returns, max_sharpe_returns * 1.2, num_points)
        efficient_portfolios = efficient_frontier(self.mean_returns, self.cov_matrix, self.num_assets, target_returns)
        
        return {
            'min_vol': {'return': min_vol_returns, 'volatility': min_vol_volatility, 'weights': min_vol_weights},
            'max_sharpe': {'return': max_sharpe_returns, 'volatility': max_sharpe_volatility, 'weights': max_sharpe_weights},
            'efficient_frontier': efficient_portfolios
        }

# Create and export the model
optimizer = PortfolioOptimizer(mean_returns_daily, cov_matrix, daily_returns.columns)

# Save the model
output_path = '../models/portfolio_optimizer.pkl'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'wb') as f:
    pickle.dump(optimizer, f)

print(f"Portfolio optimization model saved to {output_path}")

In [None]:
# Example of how to use the model in the PesaGuru chatbot
def sample_chatbot_integration():
    """
    Demonstrate how the PesaGuru chatbot would use the portfolio optimizer
    """
    # Load the model
    with open('../models/portfolio_optimizer.pkl', 'rb') as f:
        optimizer = pickle.load(f)
    
    # Example user input
    user_risk_profile = 'moderate'  # From user questionnaire
    investment_amount = 100000  # KES 100,000
    
    # Get portfolio recommendation
    portfolio = optimizer.optimize_portfolio(user_risk_profile)
    
    # Format response for chatbot
    response = {
        'message': f"Based on your {user_risk_profile} risk profile, here's your recommended investment portfolio:",
        'expected_return': f"{portfolio['expected_annual_return']:.2%} per year",
        'expected_risk': f"{portfolio['expected_annual_volatility']:.2%} annual volatility",
        'allocations': []
    }
    
    # Calculate allocation amounts
    for allocation in portfolio['allocations']:
        amount = allocation['weight'] * investment_amount
        response['allocations'].append({
            'ticker': allocation['ticker'],
            'percentage': f"{allocation['weight']:.2%}",
            'amount': f"KES {amount:,.0f}"
        })
    
    return response

# Show example response
chatbot_response = sample_chatbot_integration()
print(chatbot_response['message'])
print(f"Expected return: {chatbot_response['expected_return']}")
print(f"Expected risk: {chatbot_response['expected_risk']}")
print("\nRecommended allocations:")
for alloc in chatbot_response['allocations']:
    print(f"{alloc['ticker']}: {alloc['percentage']} ({alloc['amount']})")