In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import re
import warnings
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
warnings.filterwarnings('ignore')

# Set plotting style and size
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (14, 8)
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Create a function to read and clean NSE data files
def read_nse_data(file_path):
    """
    Read one NSE daily-price CSV file and return it in a normalised form.

    Column headers are stripped and lower-cased, then mapped onto the names
    used by the rest of the notebook (``date``, ``code``, ``name``,
    ``day_price``, ``volume``, ``previous``, ``change``, ``change_pct``).
    Text-valued numeric columns have thousands separators (and ``%`` for
    ``change_pct``) removed before conversion. Rows without a date, code or
    day price are dropped. If the file *name* contains a four-digit year,
    only rows dated in that year are kept.

    Args:
        file_path (str): Path to the NSE data CSV file

    Returns:
        pd.DataFrame: Cleaned NSE data. An empty DataFrame is returned (and a
        message printed) when the file cannot be read or lacks one of the
        essential columns.
    """
    # Headers are lower-cased before renaming, so only the lower-case
    # spellings that differ from the target name need an entry.
    column_mapping = {
        'day price': 'day_price',
        'change%': 'change_pct',
    }
    numeric_cols = ['day_price', 'volume', 'previous', 'change', 'change_pct']
    essential_cols = ['date', 'code', 'day_price']

    try:
        df = pd.read_csv(file_path)

        # Standardize column names (handle different cases/formats)
        df.columns = [col.strip().lower() for col in df.columns]
        df = df.rename(columns=column_mapping)

        if 'date' not in df.columns:
            print(f"Date column not found in {file_path}")
            return pd.DataFrame()

        missing_cols = [col for col in essential_cols if col not in df.columns]
        if missing_cols:
            print(f"Error processing {file_path}: missing columns {missing_cols}")
            return pd.DataFrame()

        # Unparseable dates become NaT and are dropped with the other
        # incomplete rows below.
        df['date'] = pd.to_datetime(df['date'], errors='coerce')

        for col in numeric_cols:
            # Test "not numeric" rather than "dtype == object" so that
            # pandas string dtypes are cleaned as well.
            if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
                cleaned = df[col].astype(str).str.replace(',', '', regex=False)
                if col == 'change_pct':
                    cleaned = cleaned.str.replace('%', '', regex=False)
                df[col] = pd.to_numeric(cleaned, errors='coerce')

        # Drop rows with missing essential values
        df = df.dropna(subset=essential_cols)

        # Take the expected year from the file name only: searching the whole
        # path would pick up four-digit runs in directory names first.
        year_match = re.search(r'(\d{4})', os.path.basename(file_path))
        if year_match:
            expected_year = int(year_match.group(1))
            df = df[df['date'].dt.year == expected_year]

        return df

    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the
        # whole load, so report it and hand back an empty frame.
        print(f"Error processing {file_path}: {str(e)}")
        return pd.DataFrame()

# Function to read sector data
def read_sector_data(file_path):
    """
    Load a sector classification CSV with normalised column names.

    Headers are stripped and lower-cased; ``stock_code`` and ``stock_name``
    are then renamed to ``code`` and ``name``.

    Args:
        file_path (str): Path to the sector data CSV file

    Returns:
        pd.DataFrame: Sector data, or an empty DataFrame if reading fails
        (the error is printed).
    """
    # After lower-casing, these two aliases are the only headers whose
    # name actually changes.
    header_aliases = {'stock_code': 'code', 'stock_name': 'name'}

    try:
        table = pd.read_csv(file_path)
        table.columns = [header.strip().lower() for header in table.columns]
        return table.rename(columns=header_aliases)
    except Exception as err:
        print(f"Error processing sector data {file_path}: {str(err)}")
        return pd.DataFrame()

# Let's identify the NSE data files and sector data files
def get_nse_files(data_path):
    """Return full paths of the known NSE stock and sector CSV files.

    The file names are fixed; nothing is checked against the file system.

    Args:
        data_path (str): Path to the directory containing NSE data files

    Returns:
        tuple: (list of stock data file paths, list of sector data file
        paths), each in the order listed below.
    """
    # NOTE(review): no 2022 stock file is listed - confirm this is intentional.
    stock_names = (
        'NSE_data_all_stocks_2013.csv',
        'NSE_data_all_stocks_2014.csv',
        'NSE_data_all_stocks_2015.csv',
        'NSE_data_all_stocks_2016.csv',
        'NSE_data_all_stocks_2017.csv',
        'NSE_data_all_stocks_2018.csv',
        'NSE_data_all_stocks_2019.csv',
        'NSE_data_all_stocks_2020.csv',
        'NSE_data_all_stocks_2021_upto_31dec2021.csv',
        'NSE_data_all_stocks_2023.csv',
        'NSE_data_all_stocks_2024_jan_to_oct.csv',
    )
    sector_names = (
        'NSE_data_stock_market_sectors_2020.csv',
        'NSE_data_stock_market_sectors_as_at_31dec2021.csv',
        'NSE_data_stock_market_sectors_2023_2024.csv',
    )

    def _in_data_dir(names):
        # Prefix every file name with the data directory
        return [os.path.join(data_path, name) for name in names]

    return _in_data_dir(stock_names), _in_data_dir(sector_names)

In [2]:
# Load and combine all NSE data
def load_all_nse_data(data_path):
    """
    Load and combine all NSE data files.

    Every file returned by ``get_nse_files`` that exists on disk is read with
    ``read_nse_data``; missing files and files that yield no rows are
    reported and skipped.

    Args:
        data_path (str): Path to the directory containing NSE data files

    Returns:
        pd.DataFrame: Combined NSE data sorted by date and code, or an empty
        DataFrame when nothing could be loaded.
    """
    nse_files, _ = get_nse_files(data_path)

    # Collect the per-file frames and concatenate once at the end;
    # concatenating inside the loop re-copies all earlier rows every time.
    frames = []
    for file in nse_files:
        filename = os.path.basename(file)

        if not os.path.exists(file):
            print(f"Warning: File not found: {filename}")
            continue

        df = read_nse_data(file)
        if df.empty:
            print(f"Warning: No data loaded from {filename}")
            continue

        print(f"Loaded {filename}: {len(df)} rows")
        frames.append(df)

    if not frames:
        return pd.DataFrame()

    all_data = pd.concat(frames, ignore_index=True)

    # Sort by date and code
    if 'date' in all_data.columns and 'code' in all_data.columns:
        all_data = all_data.sort_values(['date', 'code'])

    return all_data

# Load the most recent sector data
def load_sector_data(data_path):
    """
    Load the most recent sector classification data.

    The sector files from ``get_nse_files`` are tried from last to first
    (the list ends with the 2023_2024 file), and the first one that exists
    and yields rows is returned.

    Args:
        data_path (str): Path to the directory containing sector data files

    Returns:
        pd.DataFrame: Sector classification data, or an empty DataFrame if no
        sector file could be loaded.
    """
    _, sector_files = get_nse_files(data_path)

    # A single newest-first pass is enough: a separate "2023_2024 only" pass
    # would just read that file a second time whenever it fails to load.
    for file in reversed(sector_files):
        if not os.path.exists(file):
            continue

        sector_data = read_sector_data(file)
        if not sector_data.empty:
            print(f"Loaded sector data from {os.path.basename(file)}: {len(sector_data)} rows")
            return sector_data

    print("Warning: No sector data loaded")
    return pd.DataFrame()

# Define the path to your NSE data
# NOTE: this is an absolute Windows path (raw string, so the backslashes are
# kept literally); adjust it to the directory that holds the NSE CSV files.
data_path = r"C:\xampp\htdocs\PesaGuru\notebooks\data\external\nse_historical_data"

In [3]:
# Make sure the data directory is there; create it (empty) when it is not
if not os.path.exists(data_path):
    print(f"WARNING: Data directory not found: {data_path}")
    os.makedirs(data_path, exist_ok=True)
    print(f"Created directory: {data_path}")
else:
    print(f"Data directory found: {data_path}")

# Read the stock price files first, then the sector classification
print("Loading NSE stock data...")
all_nse_data = load_all_nse_data(data_path)
sector_data = load_sector_data(data_path)

# Use the real data when it loaded; otherwise fall back to synthetic data
if not all_nse_data.empty:
    print(f"Successfully loaded actual NSE data: {len(all_nse_data)} data points across {all_nse_data['code'].nunique()} stocks")
else:
    print("No actual data loaded. Generating sample data for demonstration purposes.")

    def create_sample_data():
        """Build synthetic stock and sector tables for demonstration.

        Prices follow a per-stock exponential trend with a yearly sine
        component and random noise, one row per stock per business day from
        2020-01-01 to 2024-10-31. Results differ between runs because the
        random generator is not seeded here.

        Returns:
            tuple: (stock price DataFrame, sector DataFrame)
        """
        business_days = pd.date_range(start='2020-01-01', end='2024-10-31', freq='B')
        first_day = business_days[0]
        tickers = ['SCOM', 'EQTY', 'KCB', 'SBIC', 'BAT', 'EABL', 'JUB', 'COOP']

        rows = []
        for ticker in tickers:
            # Starting price between 10 and 200, and a small daily drift
            start_price = np.random.uniform(10, 200)
            drift = np.random.choice([-0.0001, 0.0001, 0.0002])

            for day in business_days:
                # Random shock plus a seasonal term based on the day of year
                shock = np.random.normal(0, 0.02)
                seasonal = 0.01 * np.sin(2 * np.pi * (day.dayofyear / 365.0))

                # Price from elapsed calendar days, drift, season and shock
                elapsed_days = (day - first_day).days
                price = start_price * np.exp(drift * elapsed_days + seasonal) * (1 + shock)

                # "Previous" price is the current price nudged by up to 2%
                prev_price = price * (1 - np.random.uniform(-0.02, 0.02))

                change = price - prev_price
                if prev_price > 0:
                    change_pct = (change / prev_price) * 100
                else:
                    change_pct = 0

                volume = int(np.random.uniform(5000, 1000000))

                rows.append({
                    'date': day,
                    'code': ticker,
                    'name': f'{ticker} Company Ltd',
                    'day_price': price,
                    'previous': prev_price,
                    'change': change,
                    'change_pct': change_pct,
                    'volume': volume
                })

        stock_df = pd.DataFrame(rows)

        # Sector assigned to each sample ticker
        sector_of = {
            'SCOM': 'Telecommunications',
            'EQTY': 'Banking',
            'KCB': 'Banking',
            'SBIC': 'Insurance',
            'BAT': 'Manufacturing',
            'EABL': 'Manufacturing',
            'JUB': 'Insurance',
            'COOP': 'Banking'
        }
        sector_df = pd.DataFrame([
            {'sector': sector, 'code': code, 'name': f'{code} Company Ltd'}
            for code, sector in sector_of.items()
        ])

        return stock_df, sector_df

    # Replace both the (empty) stock table and the sector table
    all_nse_data, sector_data = create_sample_data()
    print(f"Sample data created: {len(all_nse_data)} stock data points across {all_nse_data['code'].nunique()} stocks")
    print(f"Sample sector data created: {len(sector_data)} sector classifications")

# Coverage of the loaded stock data
first_date = all_nse_data['date'].min()
last_date = all_nse_data['date'].max()
stock_count = all_nse_data['code'].nunique()
trading_day_count = all_nse_data['date'].nunique()

print("\nData Overview:")
print(f"Date range: {first_date} to {last_date}")
print(f"Number of unique stocks: {stock_count}")
print(f"Number of trading days: {trading_day_count}")

# Preview of the stock rows and the full sector table
print("\nFirst few rows of stock data:")
print(all_nse_data.head())

print("\nSector data:")
print(sector_data)

# Per-column missing-value counts and dtypes
missing_per_column = all_nse_data.isnull().sum()
column_dtypes = all_nse_data.dtypes

print("\nBasic data quality checks:")
print(f"Missing values in stock data:\n{missing_per_column}")
print(f"\nData types in stock data:\n{column_dtypes}")

# Merge sector data with stock data
print("\nMerging sector information with stock data...")
if {'code', 'sector'}.issubset(sector_data.columns):
    # Keep one sector per code: a code listed twice would duplicate every one
    # of its price rows in the left join and inflate later sums and averages.
    sector_lookup = sector_data[['code', 'sector']].drop_duplicates(subset='code')
    all_nse_data_with_sector = all_nse_data.merge(sector_lookup, on='code', how='left')
else:
    # Happens when no sector file could be loaded (empty DataFrame)
    print("Warning: sector data has no 'code'/'sector' columns; sector left empty")
    all_nse_data_with_sector = all_nse_data.assign(sector=np.nan)

print(f"Sectors represented: {all_nse_data_with_sector['sector'].nunique()}")
# Count distinct stock codes without a sector, not the number of price rows
codes_without_sector = all_nse_data_with_sector.loc[
    all_nse_data_with_sector['sector'].isna(), 'code'
].nunique()
print(f"Stocks with missing sector information: {codes_without_sector}")

Data directory found: C:\xampp\htdocs\PesaGuru\notebooks\data\external\nse_historical_data
Loading NSE stock data...
Loaded NSE_data_all_stocks_2013.csv: 14889 rows
Loaded NSE_data_all_stocks_2014.csv: 15695 rows
Loaded NSE_data_all_stocks_2015.csv: 16386 rows
Loaded NSE_data_all_stocks_2016.csv: 16800 rows
Loaded NSE_data_all_stocks_2017.csv: 16911 rows
Loaded NSE_data_all_stocks_2018.csv: 17650 rows
Loaded NSE_data_all_stocks_2019.csv: 18146 rows
Loaded NSE_data_all_stocks_2020.csv: 17892 rows
Loaded NSE_data_all_stocks_2021_upto_31dec2021.csv: 17746 rows
Loaded NSE_data_all_stocks_2023.csv: 17274 rows
Loaded NSE_data_all_stocks_2024_jan_to_oct.csv: 15258 rows
Loaded sector data from NSE_data_stock_market_sectors_2023_2024.csv: 75 rows
Successfully loaded actual NSE data: 184647 data points across 77 stocks

Data Overview:
Date range: 2013-01-02 00:00:00 to 2024-10-31 00:00:00
Number of unique stocks: 77
Number of trading days: 2694

First few rows of stock data:
         date  code 

In [4]:
# Overall Market Performance Analysis
# Builds daily, monthly and yearly market aggregates from all_nse_data and
# saves four PNG charts to the working directory.
print("\n\n# Overall Market Performance Analysis")

# Calculate daily market indices (simple average of stocks)
print("\nCalculating daily market indices...")
daily_market = all_nse_data.groupby('date').agg({
    'day_price': 'mean',
    'volume': 'sum',
    'change_pct': 'mean'
}).reset_index()

daily_market = daily_market.rename(columns={
    'day_price': 'avg_price',
    'change_pct': 'avg_change_pct'
})

# Label the charts with the years actually present in the data rather than a
# fixed range, so titles stay correct whichever files were loaded.
first_year = daily_market['date'].min().year
last_year = daily_market['date'].max().year
period_label = str(first_year) if first_year == last_year else f"{first_year}-{last_year}"

# Plot the overall market trend: price on top, volume below, shared x axis.
# Only this one figure is created, so nothing is left open after closing it.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 12), gridspec_kw={'height_ratios': [3, 1]}, sharex=True)

# Plot average price
ax1.plot(daily_market['date'], daily_market['avg_price'], color='blue', linewidth=2)
ax1.set_title(f'NSE Market Average Price ({period_label})', fontsize=16)
ax1.set_ylabel('Average Price (KES)', fontsize=14)
ax1.grid(True, alpha=0.3)

# Plot volume as a bar chart
ax2.bar(daily_market['date'], daily_market['volume'], color='green', alpha=0.7)
ax2.set_title('Daily Trading Volume', fontsize=16)
ax2.set_ylabel('Volume', fontsize=14)
ax2.set_xlabel('Date', fontsize=14)
ax2.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('nse_market_trend.png')
plt.close(fig)

# Calculate monthly market performance
print("\nCalculating monthly market performance...")
daily_market['year_month'] = daily_market['date'].dt.to_period('M')
monthly_market = daily_market.groupby('year_month').agg({
    'avg_price': 'mean',
    'volume': 'mean',
    'avg_change_pct': 'mean'
}).reset_index()

# Periods are converted to month-start timestamps for plotting
monthly_market['year_month'] = monthly_market['year_month'].dt.to_timestamp()

# Plot monthly market performance
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(monthly_market['year_month'], monthly_market['avg_price'], marker='o', linewidth=2)
ax.set_title(f'NSE Monthly Average Price ({period_label})', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Monthly Average Price (KES)', fontsize=14)
ax.grid(True, alpha=0.3)
fig.savefig('nse_monthly_trend.png')
plt.close(fig)

# Calculate yearly market performance
print("\nCalculating yearly market performance...")
daily_market['year'] = daily_market['date'].dt.year
yearly_market = daily_market.groupby('year').agg({
    'avg_price': 'mean',
    'volume': 'mean',
    'avg_change_pct': 'mean'
}).reset_index()

# Plot yearly market performance (years as strings so they act as categories)
fig, ax = plt.subplots(figsize=(14, 8))
ax.bar(yearly_market['year'].astype(str), yearly_market['avg_price'], color='blue')
ax.set_title(f'NSE Yearly Average Price ({period_label})', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Yearly Average Price (KES)', fontsize=14)
ax.grid(True, alpha=0.3)
fig.savefig('nse_yearly_trend.png')
plt.close(fig)

# Calculate volatility (standard deviation of daily returns)
# The first 19 rows are NaN because the rolling window needs 20 observations.
print("\nCalculating market volatility...")
daily_market['volatility'] = daily_market['avg_change_pct'].rolling(window=20).std()

# Plot volatility
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(daily_market['date'], daily_market['volatility'], color='red', linewidth=2)
ax.set_title('NSE Market Volatility (20-Day Rolling Standard Deviation)', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Volatility (%)', fontsize=14)
ax.grid(True, alpha=0.3)
fig.savefig('nse_volatility.png')
plt.close(fig)




# Overall Market Performance Analysis

Calculating daily market indices...

Calculating monthly market performance...

Calculating yearly market performance...

Calculating market volatility...


<Figure size 1600x1000 with 0 Axes>

In [10]:
# Sector Performance Analysis
# Aggregates all_nse_data_with_sector per sector (daily, monthly, yearly) and
# saves two PNG charts. Rows without a sector are excluded by the groupby.
print("\n\n# Sector Performance Analysis")

# Calculate daily sector indices
print("\nCalculating sector performance...")
sector_daily = all_nse_data_with_sector.groupby(['date', 'sector']).agg({
    'day_price': 'mean',
    'volume': 'sum',
    'change_pct': 'mean'
}).reset_index()

sector_daily = sector_daily.rename(columns={
    'day_price': 'avg_price',
    'change_pct': 'avg_change_pct'
})

# Label the charts with the years actually present in the data
first_year = sector_daily['date'].min().year
last_year = sector_daily['date'].max().year
period_label = str(first_year) if first_year == last_year else f"{first_year}-{last_year}"

# Calculate sector performance over time
sector_monthly = sector_daily.copy()
sector_monthly['year_month'] = sector_monthly['date'].dt.to_period('M')
sector_monthly = sector_monthly.groupby(['year_month', 'sector']).agg({
    'avg_price': 'mean',
    'volume': 'mean',
    'avg_change_pct': 'mean'
}).reset_index()

sector_monthly['year_month'] = sector_monthly['year_month'].dt.to_timestamp()

# Plot sector performance
fig, ax = plt.subplots(figsize=(16, 10))
for sector in sector_monthly['sector'].unique():
    # Use a dedicated name for the per-sector slice so the global sector_data
    # (the sector classification table) is not overwritten by this loop.
    sector_slice = sector_monthly[sector_monthly['sector'] == sector]
    ax.plot(sector_slice['year_month'], sector_slice['avg_price'], label=sector, linewidth=2)

ax.set_title(f'NSE Sector Performance ({period_label})', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Average Price (KES)', fontsize=14)
ax.legend(loc='best')
ax.grid(True, alpha=0.3)
fig.savefig('nse_sector_performance.png')
plt.close(fig)

# Calculate yearly sector returns
print("\nCalculating yearly sector returns...")
sector_daily['year'] = sector_daily['date'].dt.year
sector_yearly = sector_daily.groupby(['year', 'sector']).agg({
    'avg_change_pct': 'mean'
}).reset_index()

# Simple (non-compounded) annualisation: mean daily % change times 252
sector_yearly['yearly_return'] = sector_yearly['avg_change_pct'] * 252  # Approximate trading days in a year

# Plot yearly sector returns
sector_pivot = sector_yearly.pivot(index='year', columns='sector', values='yearly_return')
# Draw onto an explicit axes: DataFrame.plot otherwise opens its own figure,
# which would leave a separately created figure empty and never closed.
fig, ax = plt.subplots(figsize=(16, 10))
sector_pivot.plot(kind='bar', ax=ax)
ax.set_title(f'NSE Yearly Sector Returns ({period_label})', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Yearly Return (%)', fontsize=14)
ax.grid(True, alpha=0.3)
ax.legend(loc='best')
fig.savefig('nse_sector_yearly_returns.png')
plt.close(fig)



# Sector Performance Analysis

Calculating sector performance...

Calculating yearly sector returns...


<Figure size 1600x1000 with 0 Axes>

In [11]:
# Top Performing Stocks Analysis
# Ranks stocks by a simple annualised return and saves bar charts of the ten
# best and ten worst performers.
print("\n\n# Top Performing Stocks Analysis")

# Calculate overall stock performance
print("\nCalculating top performing stocks...")
stock_performance = all_nse_data.groupby('code').agg(
    avg_price=('day_price', 'mean'),
    price_std=('day_price', 'std'),
    avg_volume=('volume', 'mean'),
    avg_change_pct=('change_pct', 'mean'),
    change_pct_std=('change_pct', 'std'),
).reset_index()

# Calculate annualized return and volatility (both in percent).
# Volatility is based on the dispersion of daily returns (change_pct); the
# spread of price levels (price_std) mostly reflects long-run price drift and
# is not in the same unit as the return.
stock_performance['annualized_return'] = stock_performance['avg_change_pct'] * 252  # Approximate trading days in a year
stock_performance['annualized_volatility'] = stock_performance['change_pct_std'] * np.sqrt(252)

# Calculate Sharpe ratio (assuming risk-free rate of 5%)
risk_free_rate = 5.0  # Kenya's risk-free rate approximation
# Zero volatility would divide by zero; treat the ratio as undefined instead
nonzero_volatility = stock_performance['annualized_volatility'].replace(0, np.nan)
stock_performance['sharpe_ratio'] = (stock_performance['annualized_return'] - risk_free_rate) / nonzero_volatility

# Sort by annualized return
top_performers = stock_performance.sort_values('annualized_return', ascending=False).head(10)
bottom_performers = stock_performance.sort_values('annualized_return').head(10)

# Label the charts with the years actually present in the data
first_year = all_nse_data['date'].min().year
last_year = all_nse_data['date'].max().year
period_label = str(first_year) if first_year == last_year else f"{first_year}-{last_year}"

# Plot top performers
fig, ax = plt.subplots(figsize=(14, 8))
ax.bar(top_performers['code'], top_performers['annualized_return'], color='green')
ax.set_title(f'NSE Top 10 Performing Stocks ({period_label})', fontsize=16)
ax.set_xlabel('Stock', fontsize=14)
ax.set_ylabel('Annualized Return (%)', fontsize=14)
ax.grid(True, alpha=0.3)
fig.savefig('nse_top_performers.png')
plt.close(fig)

# Plot bottom performers
fig, ax = plt.subplots(figsize=(14, 8))
ax.bar(bottom_performers['code'], bottom_performers['annualized_return'], color='red')
ax.set_title(f'NSE Bottom 10 Performing Stocks ({period_label})', fontsize=16)
ax.set_xlabel('Stock', fontsize=14)
ax.set_ylabel('Annualized Return (%)', fontsize=14)
ax.grid(True, alpha=0.3)
fig.savefig('nse_bottom_performers.png')
plt.close(fig)





# Top Performing Stocks Analysis

Calculating top performing stocks...


In [12]:
# Volume Analysis
# Totals traded volume per calendar month and ranks stocks by average daily
# volume; saves one PNG chart for each.
print("\n\n# Trading Volume Analysis")

# Calculate monthly trading volume
print("\nCalculating monthly trading volume...")
# Group on calendar-month periods instead of pd.Grouper(freq='M'): the 'M'
# offset alias is deprecated in recent pandas releases, while the period
# alias is not. Months are labelled by their first day.
monthly_volume = (
    all_nse_data.groupby(all_nse_data['date'].dt.to_period('M'))['volume']
    .sum()
    .reset_index()
)
monthly_volume['date'] = monthly_volume['date'].dt.to_timestamp()

# Label the charts with the years actually present in the data
first_year = all_nse_data['date'].min().year
last_year = all_nse_data['date'].max().year
period_label = str(first_year) if first_year == last_year else f"{first_year}-{last_year}"

# Plot monthly volume trend
fig, ax = plt.subplots(figsize=(16, 8))
# On a date axis the bar width is measured in days; the 0.8 default would
# draw each month as a hairline, so span most of the month from its start.
ax.bar(monthly_volume['date'], monthly_volume['volume'], width=25, align='edge', color='purple')
ax.set_title(f'NSE Monthly Trading Volume ({period_label})', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Volume', fontsize=14)
ax.grid(True, alpha=0.3)
fig.savefig('nse_monthly_volume.png')
plt.close(fig)

# Calculate top traded stocks by volume
top_volume_stocks = all_nse_data.groupby('code').agg({
    'volume': 'mean'
}).reset_index().sort_values('volume', ascending=False).head(10)

# Plot top traded stocks by volume
fig, ax = plt.subplots(figsize=(14, 8))
ax.bar(top_volume_stocks['code'], top_volume_stocks['volume'], color='blue')
ax.set_title(f'NSE Top 10 Stocks by Trading Volume ({period_label})', fontsize=16)
ax.set_xlabel('Stock', fontsize=14)
ax.set_ylabel('Average Daily Volume', fontsize=14)
ax.grid(True, alpha=0.3)
fig.savefig('nse_top_volume_stocks.png')
plt.close(fig)



# Trading Volume Analysis

Calculating monthly trading volume...


In [13]:
# Price-to-Volume Correlation Analysis
# Computes, per stock, the Pearson correlation between daily price and daily
# volume, and saves a bar chart of the results.
print("\n\n# Price-to-Volume Correlation Analysis")

# Calculate price-volume correlation for each stock
print("\nCalculating price-volume correlation...")
correlations = []

# groupby splits the table once instead of re-filtering it for every stock
for stock, stock_data in all_nse_data.groupby('code'):
    if len(stock_data) > 30:  # Ensure enough data points
        correlation = stock_data['day_price'].corr(stock_data['volume'])
        correlations.append({
            'code': stock,
            'price_volume_correlation': correlation
        })

# Naming the columns keeps the frame usable (and sortable) even when no stock
# has enough rows and the list is empty.
correlations_df = pd.DataFrame(correlations, columns=['code', 'price_volume_correlation'])
correlations_df = correlations_df.sort_values('price_volume_correlation')

# Label the chart with the years actually present in the data
first_year = all_nse_data['date'].min().year
last_year = all_nse_data['date'].max().year
period_label = str(first_year) if first_year == last_year else f"{first_year}-{last_year}"

# Plot price-volume correlations
# Undefined correlations (e.g. constant price or volume) are left off the
# chart rather than being drawn as zero-height "negative" bars.
plot_df = correlations_df.dropna(subset=['price_volume_correlation'])
bar_colors = ['green' if value > 0 else 'red' for value in plot_df['price_volume_correlation']]

fig, ax = plt.subplots(figsize=(16, 10))
ax.bar(plot_df['code'], plot_df['price_volume_correlation'], color=bar_colors)
ax.set_title(f'NSE Price-Volume Correlation by Stock ({period_label})', fontsize=16)
ax.set_xlabel('Stock', fontsize=14)
ax.set_ylabel('Correlation Coefficient', fontsize=14)
ax.grid(True, alpha=0.3)
ax.tick_params(axis='x', rotation=90)
fig.tight_layout()
fig.savefig('nse_price_volume_correlation.png')
plt.close(fig)



# Price-to-Volume Correlation Analysis

Calculating price-volume correlation...


In [15]:
# Seasonality Analysis
# Averages the daily % change by calendar month and by weekday and saves a
# bar chart for each. Adds 'month' and 'day_of_week' columns to all_nse_data.
print("\n\n# Seasonality Analysis")

# Label the charts with the years actually present in the data
first_year = all_nse_data['date'].min().year
last_year = all_nse_data['date'].max().year
period_label = str(first_year) if first_year == last_year else f"{first_year}-{last_year}"

# Analyze monthly seasonality
print("\nAnalyzing monthly seasonality...")
all_nse_data['month'] = all_nse_data['date'].dt.month
monthly_seasonality = all_nse_data.groupby('month').agg({
    'change_pct': 'mean'
}).reset_index()

monthly_seasonality['month_name'] = monthly_seasonality['month'].map({
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
    7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'
})

# Plot monthly seasonality
month_colors = ['green' if value > 0 else 'red' for value in monthly_seasonality['change_pct']]
fig, ax = plt.subplots(figsize=(14, 8))
ax.bar(monthly_seasonality['month_name'], monthly_seasonality['change_pct'], color=month_colors)
ax.set_title(f'NSE Monthly Seasonality - Average Daily Return ({period_label})', fontsize=16)
ax.set_xlabel('Month', fontsize=14)
ax.set_ylabel('Average Daily Return (%)', fontsize=14)
ax.grid(True, alpha=0.3)
fig.savefig('nse_monthly_seasonality.png')
plt.close(fig)

# Analyze day-of-week seasonality
print("\nAnalyzing day-of-week seasonality...")
all_nse_data['day_of_week'] = all_nse_data['date'].dt.dayofweek
day_seasonality = all_nse_data.groupby('day_of_week').agg({
    'change_pct': 'mean'
}).reset_index()

# Name all seven days: any row dated on a weekend would otherwise get a
# missing label, which cannot be used as a bar category.
day_seasonality['day_name'] = day_seasonality['day_of_week'].map({
    0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday',
    5: 'Saturday', 6: 'Sunday'
})

# Plot day-of-week seasonality
day_colors = ['green' if value > 0 else 'red' for value in day_seasonality['change_pct']]
fig, ax = plt.subplots(figsize=(14, 8))
ax.bar(day_seasonality['day_name'], day_seasonality['change_pct'], color=day_colors)
ax.set_title(f'NSE Day-of-Week Seasonality - Average Daily Return ({period_label})', fontsize=16)
ax.set_xlabel('Day of Week', fontsize=14)
ax.set_ylabel('Average Daily Return (%)', fontsize=14)
ax.grid(True, alpha=0.3)
fig.savefig('nse_day_seasonality.png')
plt.close(fig)



# Seasonality Analysis

Analyzing monthly seasonality...

Analyzing day-of-week seasonality...


In [17]:
# Sector Correlation Analysis
# Correlates the sectors' average daily % changes (from sector_daily) and
# saves the correlation matrix as a heatmap.
print("\n\n# Sector Correlation Analysis")

# Calculate correlation between sectors
print("\nCalculating sector correlations...")
# First, pivot the sector daily data: one row per date, one column per sector
sector_pivot_data = sector_daily.pivot_table(
    index='date',
    columns='sector',
    values='avg_change_pct'
)

# Calculate correlation matrix
sector_corr = sector_pivot_data.corr()

if sector_corr.empty:
    # Nothing to draw (e.g. no stock could be matched to a sector)
    print("Warning: no sector data available for the correlation heatmap")
else:
    # Label the chart with the years actually present in the data
    first_year = sector_pivot_data.index.min().year
    last_year = sector_pivot_data.index.max().year
    period_label = str(first_year) if first_year == last_year else f"{first_year}-{last_year}"

    # Plot sector correlation heatmap
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(sector_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1,
                fmt='.2f', linewidths=.5, ax=ax)
    ax.set_title(f'NSE Sector Return Correlations ({period_label})', fontsize=16)
    fig.tight_layout()
    fig.savefig('nse_sector_correlations.png')
    plt.close(fig)





# Sector Correlation Analysis

Calculating sector correlations...


In [None]:
# Dividend analysis: only a notice is printed, since the dataset has no
# dividend fields
print("\n\n# Dividend Analysis Placeholder")
print(
    "Note: Dividend data not available in the provided dataset.",
    "For a comprehensive dividend analysis, we would need additional data on:",
    "  - Dividend per share",
    "  - Dividend yield",
    "  - Dividend payment dates",
    "  - Dividend growth rates",
    sep="\n",
)

# Key insights for the financial advisory chatbot
# NOTE(review): the bracketed items in the text below (e.g. [rising/declining],
# [list sectors]) are template placeholders and are printed as-is; nothing in
# this cell fills them in from the analysis results.
print("\n\n# Key Insights for PesaGuru Financial Advisory Chatbot")

print("""
Based on the above analysis, here are key insights that can be incorporated into the PesaGuru 
financial advisory chatbot to provide personalized investment guidance to Kenyan users:

1. Overall Market Trends:
   - The NSE has shown [rising/declining] trends over the past years
   - Market volatility has been [increasing/decreasing] in recent months
   - Trading volumes indicate [growing/declining] investor participation

2. Sector Performance:
   - Top performing sectors: [Banking/Telecommunications/etc.]
   - Most stable sectors (lower volatility): [list sectors]
   - Sectors showing momentum: [list sectors]

3. Investment Recommendations:
   - High quality stocks (high Sharpe ratio): [list stocks]
   - Stocks showing positive momentum: [list stocks]
   - Diversification opportunities across uncorrelated sectors

4. Seasonal Patterns:
   - Best performing months: [list months]
   - Best performing days of the week: [list days]
   - Seasonal strategies for timing investments

5. Risk Management:
   - Optimal risk-return balance for different investor profiles
   - Sector allocation suggestions based on risk tolerance
   - Volatility patterns and hedging strategies

The chatbot can use this analysis to:
- Generate personalized portfolio recommendations based on user risk profile
- Provide educational content on market trends and sector performance
- Offer timely alerts on market movements and opportunities
- Guide users on optimal entry and exit timing
- Suggest diversification strategies across sectors
""")

print("\nAnalysis complete. This notebook has provided comprehensive market trend analysis for incorporation into the PesaGuru financial advisory chatbot.")



# Dividend Analysis Placeholder
Note: Dividend data not available in the provided dataset.
For a comprehensive dividend analysis, we would need additional data on:
  - Dividend per share
  - Dividend yield
  - Dividend payment dates
  - Dividend growth rates


# Key Insights for PesaGuru Financial Advisory Chatbot

Based on the above analysis, here are key insights that can be incorporated into the PesaGuru 
financial advisory chatbot to provide personalized investment guidance to Kenyan users:

1. Overall Market Trends:
   - The NSE has shown [rising/declining] trends over the past years
   - Market volatility has been [increasing/decreasing] in recent months
   - Trading volumes indicate [growing/declining] investor participation

2. Sector Performance:
   - Top performing sectors: [Banking/Telecommunications/etc.]
   - Most stable sectors (lower volatility): [list sectors]
   - Sectors showing momentum: [list sectors]

3. Investment Recommendations:
   - High quality stocks (high 