In [2]:
# Essential libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Machine learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Deep learning libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, Embedding, Flatten, Concatenate

# File handling and system libraries
import os
import sys
import json
import pickle
import requests
from dotenv import load_dotenv

# Visualization settings
# The matplotlib 'fivethirtyeight' style sheet is applied first, then seaborn's "whitegrid" theme.
# NOTE(review): the seaborn call runs second, so it presumably overrides any overlapping
# settings from the style sheet — confirm this is the intended look.
plt.style.use('fivethirtyeight')
sns.set(style="whitegrid")

# Suppress warnings
# NOTE(review): this filter has no category or module restriction, so every warning is
# hidden for the rest of the session — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# Load environment variables (API keys, etc.)
# This is the last expression of the cell, so its return value is echoed as the cell output.
load_dotenv()

True

In [3]:
# Locations of every dataset this notebook reads or writes
base_path = '../../data/'
user_profiles_path = os.path.join(base_path, 'processed/user_profiles.csv')
investment_products_path = os.path.join(base_path, 'processed/kenya_investment_products.csv')
market_data_path = os.path.join(base_path, 'processed/nse_historical_data.csv')
survey_data_path = os.path.join(base_path, 'external/financial_advisory_chatbot_survey.csv')

# Report each expected file that is not on disk yet
file_paths = [user_profiles_path, investment_products_path, market_data_path, survey_data_path]
for path in file_paths:
    if os.path.exists(path):
        continue
    print(f"Warning: {path} does not exist. Will need to prepare data.")

# Survey responses feed the user preference analysis; later cells treat
# `survey_data is None` as "no survey available" and fall back to sample data.
try:
    survey_data = pd.read_csv(survey_data_path)
except FileNotFoundError:
    print(f"Survey data file not found at {survey_data_path}")
    survey_data = None
else:
    print(f"Loaded survey data with {survey_data.shape[0]} responses and {survey_data.shape[1]} features")
    survey_data.head()


Survey data file not found at ../../data/external/financial_advisory_chatbot_survey.csv


In [None]:
# Function to fetch NSE data from API if not already downloaded
def fetch_nse_data(symbols, start_date, end_date):
    """Fetch historical daily stock data for NSE symbols from Alpha Vantage.

    Each symbol is queried as ``<symbol>.NBO`` with ``TIME_SERIES_DAILY``.
    The responses are filtered to ``[start_date, end_date]``, concatenated,
    written to ``market_data_path`` and returned.

    Args:
        symbols: Iterable of ticker strings (without the ``.NBO`` suffix).
        start_date: Inclusive lower bound, comparable with a DatetimeIndex
            (the notebook passes ``'YYYY-MM-DD'`` strings).
        end_date: Inclusive upper bound, same format as ``start_date``.

    Returns:
        On success, a DataFrame indexed by date with ``Open``, ``High``,
        ``Low``, ``Close``, ``Volume`` and ``Symbol`` columns.
        On any failure (missing API key, network error, no usable data),
        a 30-row random sample frame with ``Symbol``, ``Date`` and ``Close``
        columns, intended for development only.
    """
    try:
        alpha_vantage_key = os.getenv('ALPHA_VANTAGE_API_KEY')
        if not alpha_vantage_key:
            raise ValueError("Alpha Vantage API key not found in environment variables")

        all_data = []
        for symbol in symbols:
            # Pass query values via `params` so they are URL-encoded, and set a
            # timeout so a stalled connection cannot hang the notebook.
            response = requests.get(
                "https://www.alphavantage.co/query",
                params={
                    'function': 'TIME_SERIES_DAILY',
                    'symbol': f"{symbol}.NBO",
                    'outputsize': 'full',
                    'apikey': alpha_vantage_key,
                },
                timeout=30,
            )
            data = response.json()

            # A payload without the time-series key is reported and the symbol skipped
            if 'Time Series (Daily)' not in data:
                print(f"Error fetching data for {symbol}: {data}")
                continue

            time_series = data['Time Series (Daily)']
            df = pd.DataFrame(time_series).T
            df.index = pd.to_datetime(df.index)
            df = df.rename(columns={'1. open': 'Open', '2. high': 'High', '3. low': 'Low', '4. close': 'Close', '5. volume': 'Volume'})
            df = df.astype({'Open': float, 'High': float, 'Low': float, 'Close': float, 'Volume': float})
            df['Symbol'] = symbol

            # Filter by date range
            mask = (df.index >= start_date) & (df.index <= end_date)
            df = df.loc[mask]

            all_data.append(df)

        if not all_data:
            raise ValueError("No data fetched from API")

        # Combine all data and save; create the target directory if it is missing
        combined_data = pd.concat(all_data)
        output_dir = os.path.dirname(market_data_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        combined_data.to_csv(market_data_path)
        return combined_data

    except Exception as e:
        print(f"Error fetching NSE data: {e}")
        # Sample dataframe for development purposes. All columns must have the
        # same length, so the three sample symbols are cycled across the rows
        # (the previous 3-vs-30 length mismatch raised inside this handler).
        sample_symbols = ['SCOM', 'EQTY', 'KCB']
        n_rows = 30
        return pd.DataFrame({'Symbol': np.resize(sample_symbols, n_rows),
                             'Date': [datetime.now() - timedelta(days=i) for i in range(n_rows)],
                             'Close': np.random.uniform(10, 50, n_rows)})

In [None]:
# Investment products: read the saved CSV, or build and persist a sample catalogue
try:
    investment_products = pd.read_csv(investment_products_path)
except FileNotFoundError:
    print("Investment products dataset not found. Creating sample dataset...")

    # One record per product:
    # (name, category, min_investment, risk_level, avg_annual_return, liquidity, investment_horizon)
    sample_rows = [
        ('Safaricom Stock', 'Equity', 1000, 4, 0.15, 'High', 'Long-term'),
        ('Equity Bank Stock', 'Equity', 1000, 4, 0.12, 'High', 'Long-term'),
        ('KCB Stock', 'Equity', 1000, 4, 0.10, 'High', 'Long-term'),
        ('EABL Stock', 'Equity', 1000, 4, 0.09, 'High', 'Long-term'),
        ('T-Bill 91 Days', 'Government Security', 100000, 1, 0.06, 'Low', 'Short-term'),
        ('T-Bill 182 Days', 'Government Security', 100000, 1, 0.07, 'Medium', 'Short-term'),
        ('T-Bill 364 Days', 'Government Security', 100000, 2, 0.085, 'Medium', 'Short-term'),
        ('T-Bond 2 Year', 'Government Security', 50000, 2, 0.095, 'Low', 'Medium-term'),
        ('T-Bond 5 Year', 'Government Security', 50000, 3, 0.11, 'Low', 'Medium-term'),
        ('T-Bond 10 Year', 'Government Security', 50000, 3, 0.125, 'Low', 'Long-term'),
        ('T-Bond 15 Year', 'Government Security', 50000, 3, 0.13, 'Low', 'Long-term'),
        ('Money Market Fund - CIC', 'Money Market', 5000, 2, 0.09, 'High', 'Short-term'),
        ('Money Market Fund - Britam', 'Money Market', 5000, 2, 0.085, 'High', 'Short-term'),
        ('Balanced Fund - Old Mutual', 'Balanced Fund', 10000, 3, 0.11, 'Medium', 'Medium-term'),
        ('Equity Fund - Sanlam', 'Equity Fund', 10000, 4, 0.13, 'Medium', 'Long-term'),
        ('REIT - Stanlib Fahari', 'Real Estate', 20000, 3, 0.10, 'Low', 'Long-term'),
        ('Cytonn High Yield Fund', 'High Yield', 100000, 5, 0.18, 'Low', 'Medium-term'),
        ('Mwalimu SACCO Savings', 'SACCO', 1000, 2, 0.08, 'Medium', 'Medium-term'),
        ('Stima SACCO Deposits', 'SACCO', 1000, 2, 0.09, 'Medium', 'Medium-term'),
        ('M-Akiba Government Bond', 'Mobile Bond', 3000, 1, 0.10, 'Medium', 'Medium-term'),
    ]

    # Prefix each record with a 1-based product_id
    investment_products = pd.DataFrame(
        [(product_id, *record) for product_id, record in enumerate(sample_rows, start=1)],
        columns=[
            'product_id', 'name', 'category', 'min_investment', 'risk_level',
            'avg_annual_return', 'liquidity', 'investment_horizon',
        ],
    )

    # Persist the sample so the next run takes the read_csv path
    os.makedirs(os.path.dirname(investment_products_path), exist_ok=True)
    investment_products.to_csv(investment_products_path, index=False)
    print(f"Created and saved {len(investment_products)} sample investment products")
else:
    print(f"Loaded {len(investment_products)} investment products")

In [None]:
# Load or generate user profile data.
# Order of preference: saved CSV -> profiles derived from survey_data -> random sample profiles.
try:
    user_profiles = pd.read_csv(user_profiles_path)
    print(f"Loaded {len(user_profiles)} user profiles")
except FileNotFoundError:
    print("User profiles dataset not found. Creating dataset from survey data...")

    if survey_data is not None:
        # Generate user profiles from survey data
        user_profiles = pd.DataFrame()
        user_profiles['user_id'] = range(1, len(survey_data) + 1)
        user_profiles['age_group'] = survey_data['Age Group']
        user_profiles['location'] = survey_data['Location']
        user_profiles['employment_status'] = survey_data['Employment Status']
        user_profiles['income_range'] = survey_data['Monthly Income Range (KES)']
        user_profiles['financial_literacy'] = survey_data['Financial Literacy Level']
        user_profiles['primary_goal'] = survey_data['Primary Financial Goals']
        user_profiles['challenges'] = survey_data['Financial Challenges']

        # Map risk tolerance based on demographic factors and financial literacy
        # This is a simplification - in practice, risk tolerance would be assessed more thoroughly
        def map_risk_tolerance(row):
            """Derive a 1-5 risk tolerance from a profile row's age group and literacy."""
            # Start with a base risk tolerance of 3 (medium)
            base_risk = 3

            # Younger groups get +1, older groups -1
            if row['age_group'] in ('18-24', '25-34'):
                base_risk += 1
            elif row['age_group'] in ('55-64', '65+'):
                base_risk -= 1

            # Coerce literacy to a number first: a missing or non-numeric survey
            # answer becomes NaN, fails both comparisons and leaves the score
            # unchanged instead of raising a TypeError.
            literacy = pd.to_numeric(row['financial_literacy'], errors='coerce')
            if literacy >= 8:
                base_risk += 1
            elif literacy <= 4:
                base_risk -= 1

            # Clamp to 1-5 scale
            return max(1, min(5, base_risk))

        user_profiles['risk_tolerance'] = user_profiles.apply(map_risk_tolerance, axis=1)

        # Generate investment horizons based on financial goals
        def map_investment_horizon(goal):
            """Classify a free-text goal as 'Short-term', 'Medium-term' or 'Long-term'."""
            # Missing answers arrive as NaN (a float); treat them like "no match"
            if not isinstance(goal, str):
                return 'Medium-term'

            short_term_goals = ['Emergency fund', 'Paying off debt', 'Education funding (near term)']
            medium_term_goals = ['Buying a home', 'Starting a business', 'Education funding (long term)']
            long_term_goals = ['Retirement savings', 'Building wealth', 'Financial independence']

            # Categories are checked in this order; the first substring match wins
            goal_text = goal.lower()
            for horizon, keywords in (
                ('Short-term', short_term_goals),
                ('Medium-term', medium_term_goals),
                ('Long-term', long_term_goals),
            ):
                if any(keyword.lower() in goal_text for keyword in keywords):
                    return horizon

            # Default to medium-term if no match
            return 'Medium-term'

        user_profiles['investment_horizon'] = user_profiles['primary_goal'].apply(map_investment_horizon)

        # Save the processed user profiles
        os.makedirs(os.path.dirname(user_profiles_path), exist_ok=True)
        user_profiles.to_csv(user_profiles_path, index=False)
        print(f"Created and saved {len(user_profiles)} user profiles from survey data")
    else:
        # Create a sample user profiles dataset if survey data is not available
        print("Survey data not available. Creating sample user profiles...")
        # Fixed seed: the random draws below are made in column order, so the
        # generated sample is reproducible.
        np.random.seed(42)
        num_profiles = 500

        age_groups = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
        locations = ['Nairobi', 'Mombasa', 'Kisumu', 'Nakuru', 'Eldoret', 'Rural']
        employment = ['Employed', 'Self-employed', 'Unemployed', 'Student', 'Retired']
        income_ranges = ['Below 15,000', '15,000-30,000', '30,001-50,000', '50,001-100,000', 'Above 100,000']
        goals = [
            'Emergency fund', 'Buying a home', 'Retirement savings', 'Education funding',
            'Starting a business', 'Building wealth', 'Paying off debt', 'Financial independence'
        ]
        challenges = [
            'Low income', 'High debt', 'Irregular income', 'Poor financial knowledge',
            'High expenses', 'Lack of discipline', 'Market volatility', 'Access to financial services'
        ]
        horizons = ['Short-term', 'Medium-term', 'Long-term']

        user_profiles = pd.DataFrame({
            'user_id': range(1, num_profiles + 1),
            'age_group': np.random.choice(age_groups, num_profiles),
            'location': np.random.choice(locations, num_profiles),
            'employment_status': np.random.choice(employment, num_profiles),
            'income_range': np.random.choice(income_ranges, num_profiles),
            'financial_literacy': np.random.randint(1, 11, num_profiles),
            'primary_goal': np.random.choice(goals, num_profiles),
            'challenges': np.random.choice(challenges, num_profiles),
            'risk_tolerance': np.random.randint(1, 6, num_profiles),
            'investment_horizon': np.random.choice(horizons, num_profiles)
        })

        # Save the sample user profiles
        os.makedirs(os.path.dirname(user_profiles_path), exist_ok=True)
        user_profiles.to_csv(user_profiles_path, index=False)
        print(f"Created and saved {len(user_profiles)} sample user profiles")

In [None]:
# Market data: use the saved CSV when present, otherwise request prices via fetch_nse_data
try:
    market_data = pd.read_csv(market_data_path)
except FileNotFoundError:
    print("Market data not found. Fetching from NSE API...")
    # Tickers to request
    nse_symbols = ['SCOM', 'EQTY', 'KCB', 'EABL', 'BAT', 'COOP', 'SBIC', 'SCAN', 'ABSA', 'NCBA']
    # Window: today back to 730 days ago, formatted as YYYY-MM-DD strings
    end_date = f"{datetime.now():%Y-%m-%d}"
    start_date = f"{datetime.now() - timedelta(days=730):%Y-%m-%d}"

    market_data = fetch_nse_data(nse_symbols, start_date, end_date)
    print(f"Fetched and saved market data with {len(market_data)} records")
else:
    print(f"Loaded market data with {len(market_data)} records")

In [None]:
# Exploring user profiles
print("User profile summary statistics:")
# describe() was evaluated mid-cell and its result discarded (only a cell's last
# expression is displayed), so the summary is printed explicitly here.
print(user_profiles.describe(include='all'))

# Distribution of risk tolerance
plt.figure(figsize=(10, 6))
sns.countplot(x='risk_tolerance', data=user_profiles)
plt.title('Distribution of Risk Tolerance Levels')
plt.xlabel('Risk Tolerance (1=Low, 5=High)')
plt.ylabel('Count')
plt.show()

# Distribution of investment horizons
plt.figure(figsize=(10, 6))
sns.countplot(x='investment_horizon', data=user_profiles)
plt.title('Distribution of Investment Horizons')
plt.xlabel('Investment Horizon')
plt.ylabel('Count')
plt.show()

# Age group vs. risk tolerance
plt.figure(figsize=(12, 6))
sns.boxplot(x='age_group', y='risk_tolerance', data=user_profiles)
plt.title('Risk Tolerance by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Risk Tolerance (1=Low, 5=High)')
plt.show()

# Income range vs. risk tolerance (labels rotated because the range strings are long)
plt.figure(figsize=(12, 6))
sns.boxplot(x='income_range', y='risk_tolerance', data=user_profiles)
plt.title('Risk Tolerance by Income Range')
plt.xlabel('Monthly Income Range (KES)')
plt.ylabel('Risk Tolerance (1=Low, 5=High)')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Exploring investment products
print("Investment products summary:")
# describe() was evaluated mid-cell and its result discarded (only a cell's last
# expression is displayed), so the summary is printed explicitly here.
print(investment_products.describe(include='all'))

# Plot investment product returns vs. risk; colour encodes category, marker size the minimum investment
plt.figure(figsize=(12, 8))
sns.scatterplot(x='risk_level', y='avg_annual_return',
                hue='category', size='min_investment', sizes=(50, 500),
                data=investment_products, alpha=0.7)
plt.title('Investment Products: Risk vs. Return')
plt.xlabel('Risk Level (1=Low, 5=High)')
plt.ylabel('Average Annual Return')

# Add product names as text labels (the row index is not needed)
for _, row in investment_products.iterrows():
    plt.text(row['risk_level'], row['avg_annual_return'], row['name'],
             fontsize=9, ha='right', va='bottom')

plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Investment categories by risk level, sorted ascending by mean risk
plt.figure(figsize=(12, 6))
category_risk = investment_products.groupby('category')['risk_level'].mean().sort_values()
category_risk.plot(kind='barh')
plt.title('Average Risk Level by Investment Category')
plt.xlabel('Average Risk Level (1=Low, 5=High)')
plt.tight_layout()
plt.show()

In [None]:
# Convert categorical variables to numeric for modeling
def preprocess_user_profiles(df):
    """Return a copy of the user profiles with numeric and indicator columns added.

    Adds ``age_numeric``, ``income_numeric`` and ``horizon_numeric`` (ordinal
    codes; labels missing from the lookup tables become NaN) and appends
    one-hot columns for location, employment status, primary goal and
    challenges. The input frame is not modified.
    """
    # source column -> (new column, label-to-code lookup)
    ordinal_codes = {
        'age_group': ('age_numeric', {
            '18-24': 1, '25-34': 2, '35-44': 3, '45-54': 4, '55-64': 5, '65+': 6,
        }),
        'income_range': ('income_numeric', {
            'Below 15,000': 1, '15,000-30,000': 2, '30,001-50,000': 3,
            '50,001-100,000': 4, 'Above 100,000': 5,
        }),
        'investment_horizon': ('horizon_numeric', {
            'Short-term': 1, 'Medium-term': 2, 'Long-term': 3,
        }),
    }

    df_processed = df.copy()
    for source_col, (target_col, codes) in ordinal_codes.items():
        df_processed[target_col] = df_processed[source_col].map(codes)

    # Encode all nominal columns in one call; indicator names are "<column>_<value>"
    nominal_cols = ['location', 'employment_status', 'primary_goal', 'challenges']
    indicator_cols = pd.get_dummies(df_processed[nominal_cols], columns=nominal_cols)

    return pd.concat([df_processed, indicator_cols], axis=1)

# Preprocess investment products
def preprocess_investment_products(df):
    """Return a copy of the product table with numeric and indicator columns added.

    Adds ``liquidity_numeric`` and ``horizon_numeric`` (ordinal codes; labels
    missing from the lookup tables become NaN) and appends one-hot
    ``category_<value>`` columns. The input frame is not modified.
    """
    # source column -> (new column, label-to-code lookup)
    ordinal_codes = {
        'liquidity': ('liquidity_numeric', {'Low': 1, 'Medium': 2, 'High': 3}),
        'investment_horizon': ('horizon_numeric', {
            'Short-term': 1, 'Medium-term': 2, 'Long-term': 3,
        }),
    }

    df_processed = df.copy()
    for source_col, (target_col, codes) in ordinal_codes.items():
        df_processed[target_col] = df_processed[source_col].map(codes)

    # One-hot encode the product category
    category_indicators = pd.get_dummies(df_processed['category'], prefix='category')

    return pd.concat([df_processed, category_indicators], axis=1)

# Run both preprocessing helpers on the raw tables and report the resulting shapes
user_profiles_processed = preprocess_user_profiles(df=user_profiles)
investment_products_processed = preprocess_investment_products(df=investment_products)

print(f"Processed user profiles shape: {user_profiles_processed.shape}")
print(f"Processed investment products shape: {investment_products_processed.shape}")

In [None]:
# Create a simulated historical performance dataset
def simulate_historical_performance(users_df=None, products_df=None, num_samples=2000, seed=42):
    """Create a simulated investment performance history for training.

    Random (user, product) pairs are drawn, joined with their profile and
    product attributes, and scored for alignment. A noisy satisfaction score
    is derived from that and thresholded into a binary label.

    Args:
        users_df: Frame with ``user_id``, ``risk_tolerance`` and
            ``investment_horizon``. Defaults to the notebook-level
            ``user_profiles``.
        products_df: Frame with ``product_id``, ``risk_level``,
            ``avg_annual_return`` and ``investment_horizon``. Defaults to the
            notebook-level ``investment_products``.
        num_samples: Number of (user, product) pairs to draw.
        seed: Value passed to ``np.random.seed`` before any draw. Note that
            this reseeds NumPy's global generator.

    Returns:
        DataFrame with one row per simulated investment, including
        ``risk_alignment``, ``horizon_alignment``, ``recommendation_score``,
        ``user_satisfaction`` (clipped to 1-10) and ``good_recommendation``
        (1 when satisfaction >= 7).
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working
    if users_df is None:
        users_df = user_profiles
    if products_df is None:
        products_df = investment_products

    np.random.seed(seed)

    # Sample user IDs and product IDs randomly (with replacement)
    user_ids = np.random.choice(users_df['user_id'], num_samples)
    product_ids = np.random.choice(products_df['product_id'], num_samples)

    # Create investment history; one consecutive calendar day per sample from 2022-01-01
    investment_history = pd.DataFrame({
        'user_id': user_ids,
        'product_id': product_ids,
        'investment_date': pd.date_range(start='2022-01-01', periods=num_samples),
        'investment_amount': np.random.uniform(1000, 100000, num_samples),
        'duration_months': np.random.randint(1, 36, num_samples)
    })

    # Join with user profiles and investment products. Both tables carry an
    # 'investment_horizon' column, hence the suffixes on the second merge.
    investment_history = investment_history.merge(
        users_df[['user_id', 'risk_tolerance', 'investment_horizon']],
        on='user_id'
    )
    investment_history = investment_history.merge(
        products_df[['product_id', 'risk_level', 'avg_annual_return', 'investment_horizon']],
        on='product_id',
        suffixes=('_user', '_product')
    )

    # Size the noise vectors by the merged frame: the joins can change the row
    # count (e.g. duplicated ids), which would otherwise raise a length mismatch.
    n_rows = len(investment_history)

    # Risk alignment: 5 for a perfect match, minus the absolute gap
    investment_history['risk_alignment'] = 5 - (
        investment_history['risk_tolerance'] - investment_history['risk_level']
    ).abs()

    # Horizon alignment: 3 for a perfect match, minus the absolute gap
    horizon_map = {'Short-term': 1, 'Medium-term': 2, 'Long-term': 3}
    investment_history['horizon_user_numeric'] = investment_history['investment_horizon_user'].map(horizon_map)
    investment_history['horizon_product_numeric'] = investment_history['investment_horizon_product'].map(horizon_map)
    investment_history['horizon_alignment'] = 3 - (
        investment_history['horizon_user_numeric'] - investment_history['horizon_product_numeric']
    ).abs()

    # Overall recommendation score: weighted alignment plus a uniform random
    # factor that simulates real-world noise
    investment_history['recommendation_score'] = (
        investment_history['risk_alignment'] * 0.5 +
        investment_history['horizon_alignment'] * 0.3 +
        np.random.uniform(0, 1, n_rows) * 0.2
    )

    # Simulated user satisfaction (this would normally come from user feedback)
    base_satisfaction = investment_history['recommendation_score'] * 2
    returns_factor = investment_history['avg_annual_return'] * 5
    noise = np.random.normal(0, 0.5, n_rows)

    investment_history['user_satisfaction'] = (base_satisfaction + returns_factor + noise).clip(1, 10)

    # Binary target for classification: was this a good recommendation?
    investment_history['good_recommendation'] = (investment_history['user_satisfaction'] >= 7).astype(int)

    return investment_history

# Build the simulated history, report its size and preview the first rows
investment_history = simulate_historical_performance()
print(f"Generated {investment_history.shape[0]} investment history records")
investment_history.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Satisfaction distribution, with the "good recommendation" cut-off (7) drawn as a dashed line
plt.figure(figsize=(10, 6))
sns.histplot(data=investment_history, x='user_satisfaction', bins=20)
plt.title('Distribution of User Satisfaction Scores')
plt.axvline(x=7, color='red', linestyle='--')
plt.xlabel('Satisfaction Score (1-10)')
plt.show()

# One satisfaction box plot per alignment score (risk first, then horizon)
for alignment_col, plot_title, x_label in (
    ('risk_alignment',
     'Risk Alignment vs. User Satisfaction',
     'Risk Alignment Score (higher = better)'),
    ('horizon_alignment',
     'Investment Horizon Alignment vs. User Satisfaction',
     'Horizon Alignment Score (higher = better)'),
):
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=alignment_col, y='user_satisfaction', data=investment_history)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel('User Satisfaction (1-10)')
    plt.show()

# Pairwise correlations between the alignment scores, returns and outcomes
key_factors = [
    'risk_alignment', 'horizon_alignment', 'avg_annual_return',
    'recommendation_score', 'user_satisfaction', 'good_recommendation',
]
correlations = investment_history[key_factors].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Key Investment Factors')
plt.tight_layout()
plt.show()


In [None]:
# Prepare features for the classification model
def prepare_features(history_df, user_df, product_df):
    """Prepare features for the recommendation model.

    Joins the investment history with user and product attributes, derives
    interaction features and returns the model inputs.

    Column naming after the joins:
      * user horizon      -> ``horizon_numeric_user``
      * product horizon   -> ``horizon_numeric_product``
      * product liquidity -> ``liquidity_numeric_product``
      * ``risk_level``, ``avg_annual_return``, ``min_investment`` keep their
        plain names and are taken from ``product_df``.

    Args:
        history_df: Investment history with at least ``user_id``,
            ``product_id``, ``risk_tolerance``, ``investment_amount``,
            ``duration_months`` and ``good_recommendation``.
        user_df: Preprocessed user profiles (needs ``age_numeric``,
            ``income_numeric``, ``financial_literacy``, ``horizon_numeric``).
        product_df: Preprocessed products (needs ``risk_level``,
            ``avg_annual_return``, ``min_investment``, ``liquidity_numeric``,
            ``horizon_numeric``).

    Returns:
        Tuple ``(X_features, y_target, features_df)``: the feature matrix, the
        ``good_recommendation`` target, and the full joined frame.
    """
    user_features = user_df[
        ['user_id', 'age_numeric', 'income_numeric', 'financial_literacy', 'horizon_numeric']
    ].rename(columns={'horizon_numeric': 'horizon_numeric_user'})

    product_features = product_df[
        ['product_id', 'risk_level', 'avg_annual_return', 'min_investment',
         'liquidity_numeric', 'horizon_numeric']
    ].rename(columns={
        'liquidity_numeric': 'liquidity_numeric_product',
        'horizon_numeric': 'horizon_numeric_product',
    })

    def _merge_replacing(left, right, key):
        # Drop columns the history already carries (e.g. risk_level and
        # avg_annual_return) before merging. Otherwise pandas suffixes both
        # copies and the plain names used below no longer exist (KeyError).
        duplicated = [col for col in right.columns if col != key and col in left.columns]
        return left.drop(columns=duplicated).merge(right, on=key)

    features_df = _merge_replacing(history_df, user_features, 'user_id')
    features_df = _merge_replacing(features_df, product_features, 'product_id')

    # Create interaction features
    features_df['risk_diff'] = (features_df['risk_tolerance'] - features_df['risk_level']).abs()
    features_df['horizon_diff'] = (
        features_df['horizon_numeric_user'] - features_df['horizon_numeric_product']
    ).abs()
    features_df['risk_return_ratio'] = features_df['risk_level'] / features_df['avg_annual_return']
    # income_numeric is the ordinal income bracket code, not an amount in KES
    features_df['affordability_ratio'] = features_df['min_investment'] / features_df['income_numeric']

    # Get relevant features
    X_features = features_df[[
        'risk_tolerance', 'risk_level', 'risk_diff',
        'horizon_numeric_user', 'horizon_numeric_product', 'horizon_diff',
        'age_numeric', 'income_numeric', 'financial_literacy',
        'avg_annual_return', 'liquidity_numeric_product', 'risk_return_ratio', 'affordability_ratio',
        'investment_amount', 'duration_months'
    ]]

    # Get target variable
    y_target = features_df['good_recommendation']

    return X_features, y_target, features_df

# Assemble the feature matrix, target vector and joined frame for modelling
X, y, features_df = prepare_features(
    history_df=investment_history,
    user_df=user_profiles_processed,
    product_df=investment_products_processed,
)

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
# Class balance as fractions of all rows
print("Class distribution:\n", y.value_counts(normalize=True))

In [None]:
# Hold out 20% of the rows for testing; stratify so both splits keep the class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

In [None]:
# Standardise features: statistics come from the training split only and are reused for the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the random forest
print("Training Random Forest classifier...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Hard labels and positive-class probabilities on the held-out split
y_pred_rf = rf_model.predict(X_test_scaled)
y_prob_rf = rf_model.predict_proba(X_test_scaled)[:, 1]

# Headline metrics
print("\nRandom Forest Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}")
print("\nClassification Report:", classification_report(y_test, y_pred_rf), sep="\n")

# Confusion matrix heatmap
cm = confusion_matrix(y_test, y_pred_rf)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Random Forest')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Feature importances, most important first
feature_importance = (
    pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_})
    .sort_values('Importance', ascending=False)
)

plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance - Random Forest')
plt.tight_layout()
plt.show()

In [None]:
# Gradient boosting on the same scaled splits, for comparison with the random forest
print("Training Gradient Boosting classifier...")
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Hard labels and positive-class probabilities on the held-out split
y_pred_gb = gb_model.predict(X_test_scaled)
y_prob_gb = gb_model.predict_proba(X_test_scaled)[:, 1]

# Headline metrics
print("\nGradient Boosting Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb)}")
print("\nClassification Report:", classification_report(y_test, y_pred_gb), sep="\n")

# Confusion matrix heatmap
cm = confusion_matrix(y_test, y_pred_gb)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Gradient Boosting')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

# Create a neural network model for investment recommendations
def build_nn_recommender():
    """Build and compile the two-branch neural recommendation model.

    The model takes two 5-feature inputs (``user_features`` and
    ``product_features``), encodes each through its own Dense(16) -> Dropout(0.2)
    -> Dense(8) branch, concatenates the encodings, and passes them through a
    joint Dense(16) -> Dropout(0.3) -> Dense(8) stack to a single sigmoid unit.
    Compiled with Adam, binary cross-entropy and an accuracy metric.
    """
    def _encode(features):
        # Per-input encoder: 16 units, dropout, then 8 units
        hidden = Dense(16, activation='relu')(features)
        hidden = Dropout(0.2)(hidden)
        return Dense(8, activation='relu')(hidden)

    # Inputs: user = age, income, literacy, risk_tolerance, horizon;
    # product = risk, return, min_investment, liquidity, horizon
    user_input = Input(shape=(5,), name='user_features')
    product_input = Input(shape=(5,), name='product_features')

    # Branches are built user-first, then product
    user_encoding = _encode(user_input)
    product_encoding = _encode(product_input)

    # Joint network on the concatenated encodings
    joint = Concatenate()([user_encoding, product_encoding])
    joint = Dense(16, activation='relu')(joint)
    joint = Dropout(0.3)(joint)
    joint = Dense(8, activation='relu')(joint)

    # Output (recommendation score)
    recommendation = Dense(1, activation='sigmoid', name='recommendation')(joint)

    model = Model(inputs=[user_input, product_input], outputs=recommendation)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Prepare data for neural network
def prepare_nn_data(features_df):
    """Split the joined feature frame into the arrays the neural network consumes.

    Returns:
        Tuple ``(user_features, product_features, target)`` of NumPy arrays:
        five user columns, five product columns, and the
        ``good_recommendation`` labels, all in the frame's row order.
    """
    user_columns = [
        'age_numeric', 'income_numeric', 'financial_literacy',
        'risk_tolerance', 'horizon_numeric_user',
    ]
    product_columns = [
        'risk_level', 'avg_annual_return', 'min_investment',
        'liquidity_numeric_product', 'horizon_numeric_product',
    ]

    return (
        features_df[user_columns].to_numpy(),
        features_df[product_columns].to_numpy(),
        features_df['good_recommendation'].to_numpy(),
    )

# Scale features for neural network: one scaler per input branch
user_scaler = StandardScaler()
product_scaler = StandardScaler()

# features_df is the joined frame returned by prepare_features in an earlier cell
user_features, product_features, target = prepare_nn_data(features_df)

# Split by row index so the user and product arrays stay aligned; stratified on the target
indices = np.arange(len(target))
train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42, stratify=target)

# Scale features: fit on the training rows only, then apply to the test rows
user_features_train = user_scaler.fit_transform(user_features[train_idx])
user_features_test = user_scaler.transform(user_features[test_idx])

product_features_train = product_scaler.fit_transform(product_features[train_idx])
product_features_test = product_scaler.transform(product_features[test_idx])

y_train_nn = target[train_idx]
y_test_nn = target[test_idx]

# Build and train the neural network model
nn_model = build_nn_recommender()
# summary() prints the layer table itself and returns None; wrapping it in
# print() emitted a stray "None" line after the table.
nn_model.summary()

# Train the model
# NOTE(review): the held-out test split doubles as validation data here, so the
# validation curves and the final evaluation below describe the same rows.
history = nn_model.fit(
    [user_features_train, product_features_train], y_train_nn,
    validation_data=([user_features_test, product_features_test], y_test_nn),
    epochs=20,
    batch_size=32,
    verbose=1
)

# Evaluate the model; evaluate() returns [loss, accuracy] for the compiled metrics
nn_evaluation = nn_model.evaluate([user_features_test, product_features_test], y_test_nn)
print(f"Neural Network - Loss: {nn_evaluation[0]:.4f}, Accuracy: {nn_evaluation[1]:.4f}")

# Plot training history: loss on the left, accuracy on the right
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper right')

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(loc='lower right')

plt.tight_layout()
plt.show()

In [None]:
# Prepare data for product clustering
product_features_for_clustering = investment_products_processed[[
    'risk_level', 'avg_annual_return', 'liquidity_numeric', 'horizon_numeric'
]]

# Scale features so no single attribute dominates the distance metric
product_cluster_scaler = StandardScaler()
product_features_scaled = product_cluster_scaler.fit_transform(product_features_for_clustering)

# Determine optimal number of clusters using the elbow method.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the curve and the final clustering reproducible.
inertia = []
k_range = range(1, 10)
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(product_features_scaled)
    inertia.append(kmeans.inertia_)

# Plot elbow curve
plt.figure(figsize=(10, 6))
plt.plot(k_range, inertia, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.xticks(k_range)
plt.grid(True, alpha=0.3)
plt.show()

# Based on the elbow curve, choose optimal k and perform clustering
optimal_k = 5  # This should be chosen based on the elbow plot
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(product_features_scaled)

# Add cluster labels to the investment products
investment_products_processed['cluster'] = clusters

# Visualize clusters using PCA (projection to two components, for plotting only)
pca = PCA(n_components=2)
product_features_2d = pca.fit_transform(product_features_scaled)

# Plot clusters
plt.figure(figsize=(12, 8))
scatter = plt.scatter(product_features_2d[:, 0], product_features_2d[:, 1],
                      c=clusters, cmap='viridis', s=100, alpha=0.8)
plt.colorbar(scatter, label='Cluster')

# Add product names as annotations.
# The loop variables are deliberately NOT named x / y: the notebook-level `y`
# holds the model target, and reusing the name here overwrote it with a float.
for row_pos, (pc1, pc2) in enumerate(product_features_2d):
    plt.annotate(investment_products_processed.iloc[row_pos]['name'],
                 (pc1, pc2), fontsize=8, ha='center', va='center',
                 bbox=dict(boxstyle='round,pad=0.3', fc='white', alpha=0.7))

plt.title('Investment Product Clusters (PCA Visualization)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Analyze clusters: mean of each attribute per cluster
cluster_analysis = investment_products_processed.groupby('cluster').agg({
    'risk_level': 'mean',
    'avg_annual_return': 'mean',
    'min_investment': 'mean',
    'liquidity_numeric': 'mean',
    'horizon_numeric': 'mean'
}).reset_index()

print("Cluster Analysis:")
print(cluster_analysis)

# Print products in each cluster
for cluster_id in range(optimal_k):
    print(f"\nCluster {cluster_id} Products:")
    cluster_products = investment_products_processed[investment_products_processed['cluster'] == cluster_id]
    for _, product in cluster_products.iterrows():
        print(f"  - {product['name']} (Risk: {product['risk_level']}, Return: {product['avg_annual_return']:.2f})")

In [None]:
class InvestmentRecommender:
    """Investment recommendation engine for PesaGuru chatbot.

    Every product in ``products_df`` is scored for a user by blending the
    Random Forest positive-class probability with the neural-network output
    (weights ``RF_WEIGHT`` / ``NN_WEIGHT``); the best-scoring products are
    returned, optionally at most one per product cluster.
    """

    # Blend weights for the two model scores
    RF_WEIGHT = 0.6
    NN_WEIGHT = 0.4

    # Placeholder values for the two interaction-level RF features that are
    # unknown at recommendation time
    DUMMY_INVESTMENT_AMOUNT = 50000
    DUMMY_DURATION_MONTHS = 12

    def __init__(self, rf_model, nn_model, product_clusters, products_df,
                 user_scaler=None, product_scaler=None, feature_scaler=None):
        """Initialize with trained models.

        Args:
            rf_model: Classifier exposing ``predict_proba`` over the 15
                combined user/product features.
            nn_model: Model exposing ``predict([user_features, product_features])``.
            product_clusters: Fitted clustering model (stored, not used for scoring).
            products_df: Product table; must contain ``product_id``, ``name``,
                ``category``, ``risk_level``, ``avg_annual_return``,
                ``min_investment``, ``liquidity_numeric``, ``horizon_numeric``
                and ``cluster``.
            user_scaler: Scaler for the 5 user features. Falls back to the
                module-level ``user_scaler`` when omitted.
            product_scaler: Scaler for the 5 product features. Falls back to
                the module-level ``product_scaler`` when omitted.
            feature_scaler: Scaler for the 15 combined features. Falls back to
                the module-level ``scaler`` when omitted.

        Raises:
            NameError: If a scaler is omitted and the fallback global is missing.
        """
        self.rf_model = rf_model
        self.nn_model = nn_model
        self.product_clusters = product_clusters
        self.products_df = products_df
        # Explicit arguments win; the global fallback keeps existing
        # four-argument call sites working.
        self.user_scaler = self._resolve_scaler(user_scaler, 'user_scaler')
        self.product_scaler = self._resolve_scaler(product_scaler, 'product_scaler')
        self.feature_scaler = self._resolve_scaler(feature_scaler, 'scaler')

    @staticmethod
    def _resolve_scaler(explicit, global_name):
        """Return ``explicit`` if given, else the module-level object ``global_name``."""
        if explicit is not None:
            return explicit
        try:
            return globals()[global_name]
        except KeyError:
            raise NameError(
                f"name '{global_name}' is not defined; pass the scaler explicitly"
            ) from None

    def recommend_products(self, user_profile, top_n=5, diversify=True):
        """Generate personalized investment recommendations.

        Args:
            user_profile: Mapping with ``age_numeric``, ``income_numeric``,
                ``financial_literacy``, ``risk_tolerance`` and ``horizon_numeric``.
            top_n: Maximum number of recommendations to return.
            diversify: When True, keep only the best product of each cluster
                before taking the top ``top_n`` (so the result can be shorter
                than ``top_n`` if there are fewer clusters).

        Returns:
            List of product dicts (``product_id``, ``name``, ``category``,
            ``risk_level``, ``avg_annual_return``, ``min_investment``,
            ``horizon_numeric``, ``cluster``, ``score``) sorted by descending score.
        """
        # Nothing to score: avoid handing empty arrays to the scalers/models
        if len(self.products_df) == 0:
            return []

        # Extract and scale key user features
        user_features = np.array([[
            user_profile['age_numeric'],
            user_profile['income_numeric'],
            user_profile['financial_literacy'],
            user_profile['risk_tolerance'],
            user_profile['horizon_numeric']
        ]])
        user_features_scaled = np.asarray(self.user_scaler.transform(user_features))

        # Build one feature row per product; models are called once on the
        # whole batch below instead of once per product
        product_rows = []
        combined_rows = []
        candidates = []
        for _, product in self.products_df.iterrows():
            product_rows.append([
                product['risk_level'],
                product['avg_annual_return'],
                product['min_investment'],
                product['liquidity_numeric'],
                product['horizon_numeric']
            ])

            # Column order must match the RF model's training features
            combined_rows.append([
                user_profile['risk_tolerance'],
                product['risk_level'],
                abs(user_profile['risk_tolerance'] - product['risk_level']),  # risk_diff
                user_profile['horizon_numeric'],
                product['horizon_numeric'],
                abs(user_profile['horizon_numeric'] - product['horizon_numeric']),  # horizon_diff
                user_profile['age_numeric'],
                user_profile['income_numeric'],
                user_profile['financial_literacy'],
                product['avg_annual_return'],
                product['liquidity_numeric'],
                product['risk_level'] / max(0.01, product['avg_annual_return']),  # risk_return_ratio
                product['min_investment'] / max(1, user_profile['income_numeric']),  # affordability_ratio
                self.DUMMY_INVESTMENT_AMOUNT,  # dummy investment_amount
                self.DUMMY_DURATION_MONTHS     # dummy duration_months
            ])

            candidates.append({
                'product_id': product['product_id'],
                'name': product['name'],
                'category': product['category'],
                'risk_level': product['risk_level'],
                'avg_annual_return': product['avg_annual_return'],
                'min_investment': product['min_investment'],
                # Needed by explain_recommendation; without it every product
                # was described with the default "Medium-term" horizon
                'horizon_numeric': product['horizon_numeric'],
                'cluster': product['cluster']
            })

        product_features_scaled = self.product_scaler.transform(np.array(product_rows))
        combined_features_scaled = self.feature_scaler.transform(np.array(combined_rows))
        # The NN takes paired (user, product) rows, so repeat the user row
        user_batch = np.repeat(user_features_scaled, len(candidates), axis=0)

        # Get predictions from both models for all products at once
        rf_probs = self.rf_model.predict_proba(combined_features_scaled)[:, 1]
        nn_probs = np.asarray(
            self.nn_model.predict([user_batch, product_features_scaled])
        )[:, 0]

        # Combine scores (weighted average), keyed by product_id
        product_scores = {}
        for info, rf_prob, nn_prob in zip(candidates, rf_probs, nn_probs):
            info['score'] = (rf_prob * self.RF_WEIGHT) + (nn_prob * self.NN_WEIGHT)
            product_scores[info['product_id']] = info

        if diversify:
            # Group by cluster
            cluster_products = {}
            for info in product_scores.values():
                cluster_products.setdefault(info['cluster'], []).append(info)

            # Keep the top product from each cluster
            recommendations = []
            for cluster in sorted(cluster_products.keys()):
                best_first = sorted(cluster_products[cluster], key=lambda x: x['score'], reverse=True)
                recommendations.append(best_first[0])

            # Sort final recommendations by score
            recommendations = sorted(recommendations, key=lambda x: x['score'], reverse=True)[:top_n]
        else:
            # Simply get top N products by score
            recommendations = sorted(
                product_scores.values(),
                key=lambda x: x['score'],
                reverse=True
            )[:top_n]

        return recommendations

    def explain_recommendation(self, recommendation, user_profile):
        """Generate explanation for why a product was recommended.

        Args:
            recommendation: One dict returned by ``recommend_products``.
            user_profile: The profile the recommendation was generated for.

        Returns:
            List of human-readable sentences covering risk alignment, return,
            investment horizon and minimum investment.
        """
        explanations = []

        # Risk tolerance alignment (a gap of at most one level counts as aligned)
        risk_diff = abs(user_profile['risk_tolerance'] - recommendation['risk_level'])
        if risk_diff <= 1:
            explanations.append(f"This investment aligns well with your risk tolerance level of {user_profile['risk_tolerance']}/5.")
        else:
            risk_comparison = "higher" if recommendation['risk_level'] > user_profile['risk_tolerance'] else "lower"
            explanations.append(f"This investment has a {risk_comparison} risk level ({recommendation['risk_level']}/5) "
                               f"compared to your risk tolerance ({user_profile['risk_tolerance']}/5).")

        # Return potential (stored as a fraction, shown as a percentage)
        returns_percent = recommendation['avg_annual_return'] * 100
        explanations.append(f"It has an average annual return of {returns_percent:.1f}%.")

        # Investment horizon; unknown or missing codes fall back to "Medium-term"
        horizon_map = {1: 'Short-term', 2: 'Medium-term', 3: 'Long-term'}
        user_horizon = horizon_map.get(user_profile['horizon_numeric'], 'Medium-term')
        product_horizon = horizon_map.get(recommendation.get('horizon_numeric', 2), 'Medium-term')

        if user_horizon == product_horizon:
            explanations.append(f"This aligns with your {user_horizon.lower()} investment horizon.")
        else:
            explanations.append(f"While you indicated a {user_horizon.lower()} investment horizon, "
                              f"this is a {product_horizon.lower()} investment.")

        # Affordability
        min_investment = recommendation['min_investment']
        explanations.append(f"This requires a minimum investment of KES {min_investment:,.0f}.")

        return explanations

# Create the recommendation engine from the trained models and the clustered
# product table. The scalers are not passed here: the constructor reads
# `user_scaler`, `product_scaler` and `scaler` from the notebook's global
# namespace, so the cells that fit them must have been run first.
recommender = InvestmentRecommender(
    rf_model=rf_model,
    nn_model=nn_model,
    product_clusters=kmeans,
    products_df=investment_products_processed
)


In [None]:
# Test the recommendation engine with a few sample user profiles

def test_recommender_with_sample_profiles():
    """Test the recommendation engine with sample user profiles"""
    # Sample profiles
    test_profiles = [
        {
            "profile_name": "Young Professional with High Risk Tolerance",
            "age_numeric": 2,  # 25-34
            "income_numeric": 3,  # 30,001-50,000
            "financial_literacy": 7,
            "risk_tolerance": 4,
            "horizon_numeric": 3  # Long-term
        },
        {
            "profile_name": "Older Conservative Investor",
            "age_numeric": 5,  # 55-64
            "income_numeric": 4,  # 50,001-100,000
            "financial_literacy": 8,
            "risk_tolerance": 2,
            "horizon_numeric": 1  # Short-term
        },
        {
            "profile_name": "Middle-Age Balanced Investor",
            "age_numeric": 3,  # 35-44
            "income_numeric": 5,  # >100,000
            "financial_literacy": 9,
            "risk_tolerance": 3,
            "horizon_numeric": 2  # Medium-term
        },
        {
            "profile_name": "Young Low-Income Cautious Investor",
            "age_numeric": 1,  # 18-24
            "income_numeric": 1,  # <15,000
            "financial_literacy": 4,
            "risk_tolerance": 2,
            "horizon_numeric": 2  # Medium-term
        }
    ]
    
    # Generate recommendations for each profile
    for profile in test_profiles:
        print(f"\n\n{'=' * 50}")
        print(f"Recommendations for: {profile['profile_name']}")
        print(f"{'=' * 50}")
        print(f"Age Group: {profile['age_numeric']}")
        print(f"Income Level: {profile['income_numeric']}")
        print(f"Financial Literacy: {profile['financial_literacy']}/10")
        print(f"Risk Tolerance: {profile['risk_tolerance']}/5")
        print(f"Investment Horizon: {profile['horizon_numeric']} (1=Short, 2=Medium, 3=Long)")
        print("\nTop Recommendations:")
        print("------------------")
        
        # Get diverse recommendations
        recommendations = recommender.recommend_products(profile, top_n=3, diversify=True)
        
        for i, rec in enumerate(recommendations, 1):
            print(f"\n{i}. {rec['name']} ({rec['category']})")
            print(f"   Risk Level: {rec['risk_level']}/5")
            print(f"   Avg. Annual Return: {rec['avg_annual_return']*100:.1f}%")
            print(f"   Min. Investment: KES {rec['min_investment']:,.0f}")
            print(f"   Recommendation Score: {rec['score']:.2f}")
            print("\n   Why this recommendation:")
            for explanation in recommender.explain_recommendation(rec, profile):
                print(f"   - {explanation}")

# Test the recommender: prints the top-3 diversified recommendations, with
# explanations, for each built-in sample profile (uses the global `recommender`)
test_recommender_with_sample_profiles()

In [None]:
# Evaluate recommendation quality using historical data
def evaluate_recommendations(recommender, test_data, user_profiles_df, top_n=5):
    """Evaluate recommendation quality against historical data.

    Users without any positive (``good_recommendation == 1``) row in
    ``test_data`` are skipped, since precision/recall are undefined for them.

    Args:
        recommender: Object exposing ``recommend_products(user_profile, top_n=...)``
            that returns dicts carrying a ``product_id``.
        test_data: DataFrame with ``user_id``, ``product_id`` and
            ``good_recommendation`` columns.
        user_profiles_df: DataFrame with a ``user_id`` column; the first
            matching row is passed to the recommender as a dict.
        top_n: Number of recommendations requested per user.

    Returns:
        Dict with ``per_user_metrics`` (``avg_precision``, ``avg_recall``,
        ``f1_score``) and ``global_metrics`` (``precision``, ``recall``,
        ``f1_score``). All metrics are 0.0 when no user could be evaluated.

    Raises:
        IndexError: If an evaluated user has no row in ``user_profiles_df``.
    """
    # Index the positive examples once instead of re-filtering test_data per user
    positives = test_data[test_data['good_recommendation'] == 1]
    good_products_by_user = {}
    for uid, pid in zip(positives['user_id'], positives['product_id']):
        good_products_by_user.setdefault(uid, set()).add(pid)

    # Create actual good recommendations set as (user_id, product_id) pairs
    actual_good_recs = {
        (uid, pid)
        for uid, pids in good_products_by_user.items()
        for pid in pids
    }

    predicted_good_recs = set()
    # Only users with at least one positive example are evaluated
    total_users = len(good_products_by_user)
    processed_users = 0

    # Track metrics
    precision_at_k = []
    recall_at_k = []

    # Evaluate for each user, in order of first appearance in test_data
    for user_id in test_data['user_id'].unique():
        # Skip users with no positive examples
        user_actual_good_products = good_products_by_user.get(user_id)
        if not user_actual_good_products:
            continue

        # Get user profile
        profile_rows = user_profiles_df[user_profiles_df['user_id'] == user_id]
        if profile_rows.empty:
            raise IndexError(f"No profile found in user_profiles_df for user_id {user_id!r}")
        user_profile = profile_rows.iloc[0].to_dict()

        # Get recommendations for this user
        recs = recommender.recommend_products(user_profile, top_n=top_n)
        recommended_products = {rec['product_id'] for rec in recs}

        # Add to predicted set
        for product_id in recommended_products:
            predicted_good_recs.add((user_id, product_id))

        # Calculate precision and recall for this user
        relevant_and_recommended = len(user_actual_good_products.intersection(recommended_products))
        precision = relevant_and_recommended / len(recommended_products) if recommended_products else 0
        recall = relevant_and_recommended / len(user_actual_good_products)

        precision_at_k.append(precision)
        recall_at_k.append(recall)

        processed_users += 1
        if processed_users % 50 == 0:
            print(f"Processed {processed_users}/{total_users} users...")

    # Calculate average precision and recall; np.mean([]) would yield NaN (and
    # a warning), so report 0.0 when nobody was evaluated
    avg_precision = float(np.mean(precision_at_k)) if precision_at_k else 0.0
    avg_recall = float(np.mean(recall_at_k)) if recall_at_k else 0.0
    f1_score = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall) if (avg_precision + avg_recall) > 0 else 0

    # Global precision and recall over all (user, product) pairs
    true_positives = len(actual_good_recs.intersection(predicted_good_recs))
    global_precision = true_positives / len(predicted_good_recs) if predicted_good_recs else 0
    global_recall = true_positives / len(actual_good_recs) if actual_good_recs else 0
    global_f1 = 2 * (global_precision * global_recall) / (global_precision + global_recall) if (global_precision + global_recall) > 0 else 0

    return {
        'per_user_metrics': {
            'avg_precision': avg_precision,
            'avg_recall': avg_recall,
            'f1_score': f1_score
        },
        'global_metrics': {
            'precision': global_precision,
            'recall': global_recall,
            'f1_score': global_f1
        }
    }

# Run the evaluation on the held-out interaction rows and report both the
# per-user averages and the pooled (global) metrics
print("Evaluating recommendation engine...")
held_out_interactions = features_df.iloc[test_idx]
eval_results = evaluate_recommendations(
    recommender,
    test_data=held_out_interactions,
    user_profiles_df=user_profiles_processed,
    top_n=5,
)

per_user_report = (
    ("Average Precision@5", 'avg_precision'),
    ("Average Recall@5", 'avg_recall'),
    ("F1 Score", 'f1_score'),
)
print("\nPer-user Metrics:")
for label, metric_key in per_user_report:
    print(f"{label}: {eval_results['per_user_metrics'][metric_key]:.4f}")

global_report = (
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1 Score", 'f1_score'),
)
print("\nGlobal Metrics:")
for label, metric_key in global_report:
    print(f"{label}: {eval_results['global_metrics'][metric_key]:.4f}")

In [None]:
# Directory that receives every generated artifact (created if missing)
output_dir = '../../ai/models/generated/'
os.makedirs(output_dir, exist_ok=True)


def _dump_pickle(obj, path):
    """Serialize ``obj`` to ``path`` with pickle."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


# Random Forest model
rf_model_path = os.path.join(output_dir, 'rf_recommendation_model.pkl')
_dump_pickle(rf_model, rf_model_path)
print(f"Saved Random Forest model to {rf_model_path}")

# Neural Network model
# NOTE(review): the path has no extension; newer Keras releases may require
# `.keras`/`.h5` here — confirm against the installed TensorFlow version.
nn_model_path = os.path.join(output_dir, 'nn_recommendation_model')
nn_model.save(nn_model_path)
print(f"Saved Neural Network model to {nn_model_path}")

# KMeans model
kmeans_model_path = os.path.join(output_dir, 'kmeans_product_clusters.pkl')
_dump_pickle(kmeans, kmeans_model_path)
print(f"Saved KMeans model to {kmeans_model_path}")

# All fitted scalers, bundled in a single pickle keyed by role
scalers = dict(
    feature_scaler=scaler,
    user_scaler=user_scaler,
    product_scaler=product_scaler,
    product_cluster_scaler=product_cluster_scaler,
)
scalers_path = os.path.join(output_dir, 'recommendation_scalers.pkl')
_dump_pickle(scalers, scalers_path)
print(f"Saved scalers to {scalers_path}")

# Processed product table (includes the cluster labels)
products_path = os.path.join(output_dir, 'investment_products.csv')
investment_products_processed.to_csv(products_path, index=False)
print(f"Saved processed investment products to {products_path}")

# Assemble the metadata document section by section, then write it as JSON
rf_report = classification_report(y_test, y_pred_rf, output_dict=True)
positive_class_report = rf_report['1']

performance_summary = {
    'rf_accuracy': accuracy_score(y_test, y_pred_rf),
    'rf_precision': positive_class_report['precision'],
    'rf_recall': positive_class_report['recall'],
    'nn_accuracy': nn_evaluation[1],
    'recommendation_precision': eval_results['global_metrics']['precision'],
    'recommendation_recall': eval_results['global_metrics']['recall'],
    'recommendation_f1': eval_results['global_metrics']['f1_score']
}

# Feature name -> Random Forest importance
feature_importance = dict(zip(X.columns, rf_model.feature_importances_))

# Cluster id (as a string key) -> names of the products assigned to it
product_clusters_by_id = {}
for cluster_id in range(optimal_k):
    members = investment_products_processed[investment_products_processed['cluster'] == cluster_id]
    product_clusters_by_id[str(cluster_id)] = list(members['name'])

# File names only: the artifacts are expected to sit next to the metadata file
artifact_files = {
    'rf_model': os.path.basename(rf_model_path),
    'nn_model': os.path.basename(nn_model_path),
    'kmeans_model': os.path.basename(kmeans_model_path),
    'scalers': os.path.basename(scalers_path),
    'investment_products': os.path.basename(products_path)
}

model_metadata = {
    'version': '1.0.0',
    'created_at': datetime.now().isoformat(),
    'performance': performance_summary,
    'feature_importance': feature_importance,
    'product_clusters': product_clusters_by_id,
    'files': artifact_files
}

# Save model metadata
metadata_path = os.path.join(output_dir, 'recommendation_model_metadata.json')
with open(metadata_path, 'w') as metadata_file:
    json.dump(model_metadata, metadata_file, indent=4)
print(f"Saved model metadata to {metadata_path}")

In [None]:
# Create a Python module for loading and using the recommendation model.
#
# Only the short header is an f-string (it needs the timestamp). The module
# body is a plain string on purpose: inside an f-string every `{...}` of the
# embedded source (`product_scores = {}`, dict literals, nested f-strings)
# is parsed as a replacement field, which makes this cell fail to compile.
_generated_on = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
recommender_module = f'''
# investment_recommender.py
# Recommendation engine for PesaGuru financial advisory chatbot
# Generated on: {_generated_on}
''' + '''
import os
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import load_model

class InvestmentRecommender:
    """Investment recommendation engine for PesaGuru chatbot"""

    # Blend weights for the two model scores
    RF_WEIGHT = 0.6
    NN_WEIGHT = 0.4

    def __init__(self, models_dir):
        """Initialize with trained models stored in models_dir.

        SECURITY: the .pkl artifacts are loaded with pickle, which can execute
        arbitrary code. Only point models_dir at artifacts you generated yourself.
        """
        # Load models
        with open(os.path.join(models_dir, 'rf_recommendation_model.pkl'), 'rb') as f:
            self.rf_model = pickle.load(f)

        self.nn_model = load_model(os.path.join(models_dir, 'nn_recommendation_model'))

        with open(os.path.join(models_dir, 'kmeans_product_clusters.pkl'), 'rb') as f:
            self.product_clusters = pickle.load(f)

        # Load scalers
        with open(os.path.join(models_dir, 'recommendation_scalers.pkl'), 'rb') as f:
            scalers = pickle.load(f)
            self.feature_scaler = scalers['feature_scaler']
            self.user_scaler = scalers['user_scaler']
            self.product_scaler = scalers['product_scaler']

        # Load products
        self.products_df = pd.read_csv(os.path.join(models_dir, 'investment_products.csv'))

    def recommend_products(self, user_profile, top_n=5, diversify=True):
        """Generate personalized investment recommendations"""
        # Nothing to score
        if len(self.products_df) == 0:
            return []

        # Extract key user features
        user_features = np.array([[
            user_profile['age_numeric'],
            user_profile['income_numeric'],
            user_profile['financial_literacy'],
            user_profile['risk_tolerance'],
            user_profile['horizon_numeric']
        ]])

        # Scale user features
        user_features_scaled = np.asarray(self.user_scaler.transform(user_features))

        # Build one feature row per product; the models are called once on
        # the whole batch below instead of once per product
        product_rows = []
        combined_rows = []
        candidates = []
        for _, product in self.products_df.iterrows():
            # Product features
            product_rows.append([
                product['risk_level'],
                product['avg_annual_return'],
                product['min_investment'],
                product['liquidity_numeric'],
                product['horizon_numeric']
            ])

            # Combined features for RF model
            combined_rows.append([
                user_profile['risk_tolerance'],
                product['risk_level'],
                abs(user_profile['risk_tolerance'] - product['risk_level']),  # risk_diff
                user_profile['horizon_numeric'],
                product['horizon_numeric'],
                abs(user_profile['horizon_numeric'] - product['horizon_numeric']),  # horizon_diff
                user_profile['age_numeric'],
                user_profile['income_numeric'],
                user_profile['financial_literacy'],
                product['avg_annual_return'],
                product['liquidity_numeric'],
                product['risk_level'] / max(0.01, product['avg_annual_return']),  # risk_return_ratio
                product['min_investment'] / max(1, user_profile['income_numeric']),  # affordability_ratio
                50000,  # dummy investment_amount
                12      # dummy duration_months
            ])

            candidates.append({
                'product_id': product['product_id'],
                'name': product['name'],
                'category': product['category'],
                'risk_level': product['risk_level'],
                'avg_annual_return': product['avg_annual_return'],
                'min_investment': product['min_investment'],
                # Needed by explain_recommendation to describe the product horizon
                'horizon_numeric': product['horizon_numeric'],
                'cluster': product['cluster']
            })

        # Scale product and combined features
        product_features_scaled = self.product_scaler.transform(np.array(product_rows))
        combined_features_scaled = self.feature_scaler.transform(np.array(combined_rows))
        user_batch = np.repeat(user_features_scaled, len(candidates), axis=0)

        # Get predictions from both models
        rf_probs = self.rf_model.predict_proba(combined_features_scaled)[:, 1]
        nn_probs = np.asarray(
            self.nn_model.predict([user_batch, product_features_scaled])
        )[:, 0]

        # Combine scores (weighted average) and store them by product id
        product_scores = {}
        for info, rf_prob, nn_prob in zip(candidates, rf_probs, nn_probs):
            info['score'] = (rf_prob * self.RF_WEIGHT) + (nn_prob * self.NN_WEIGHT)
            product_scores[info['product_id']] = info

        # Get top recommendations
        if diversify:
            # Group by cluster
            cluster_products = {}
            for product_id, info in product_scores.items():
                cluster = info['cluster']
                if cluster not in cluster_products:
                    cluster_products[cluster] = []
                cluster_products[cluster].append(info)

            # Get top product from each cluster
            recommendations = []
            for cluster in sorted(cluster_products.keys()):
                sorted_cluster_products = sorted(cluster_products[cluster], key=lambda x: x['score'], reverse=True)
                if sorted_cluster_products:
                    recommendations.append(sorted_cluster_products[0])

            # Sort final recommendations by score
            recommendations = sorted(recommendations, key=lambda x: x['score'], reverse=True)[:top_n]
        else:
            # Simply get top N products by score
            recommendations = sorted(
                [info for _, info in product_scores.items()],
                key=lambda x: x['score'],
                reverse=True
            )[:top_n]

        return recommendations

    def explain_recommendation(self, recommendation, user_profile):
        """Generate explanation for why a product was recommended"""
        explanations = []

        # Risk tolerance alignment
        risk_diff = abs(user_profile['risk_tolerance'] - recommendation['risk_level'])
        if risk_diff <= 1:
            explanations.append(f"This investment aligns well with your risk tolerance level of {user_profile['risk_tolerance']}/5.")
        else:
            risk_comparison = "higher" if recommendation['risk_level'] > user_profile['risk_tolerance'] else "lower"
            explanations.append(f"This investment has a {risk_comparison} risk level ({recommendation['risk_level']}/5) "
                               f"compared to your risk tolerance ({user_profile['risk_tolerance']}/5).")

        # Return potential
        returns_percent = recommendation['avg_annual_return'] * 100
        explanations.append(f"It has an average annual return of {returns_percent:.1f}%.")

        # Investment horizon
        horizon_map = {1: 'Short-term', 2: 'Medium-term', 3: 'Long-term'}
        user_horizon = horizon_map.get(user_profile['horizon_numeric'], 'Medium-term')
        product_horizon = horizon_map.get(recommendation.get('horizon_numeric', 2), 'Medium-term')

        if user_horizon == product_horizon:
            explanations.append(f"This aligns with your {user_horizon.lower()} investment horizon.")
        else:
            explanations.append(f"While you indicated a {user_horizon.lower()} investment horizon, "
                              f"this is a {product_horizon.lower()} investment.")

        # Affordability
        income_map = {1: '<15K', 2: '15K-30K', 3: '30K-50K', 4: '50K-100K', 5: '>100K'}
        user_income = income_map.get(user_profile['income_numeric'], 'Medium')

        min_investment = recommendation['min_investment']
        if min_investment <= 5000:
            explanations.append(f"With a minimum investment of KES {min_investment:,.0f}, this is accessible "
                              f"for your income level ({user_income}).")
        elif min_investment > 50000 and user_profile['income_numeric'] < 4:
            explanations.append(f"The minimum investment of KES {min_investment:,.0f} may be significant "
                              f"for your income level ({user_income}).")
        else:
            explanations.append(f"This requires a minimum investment of KES {min_investment:,.0f}.")

        # Category specific explanation
        category = recommendation['category']
        if category == 'Equity':
            explanations.append("Stocks provide opportunity for growth through capital appreciation and dividends.")
        elif category == 'Government Security':
            explanations.append("Government securities offer stable returns with minimal risk.")
        elif category == 'Money Market':
            explanations.append("Money market funds provide liquidity and stable short-term returns.")
        elif category == 'SACCO':
            explanations.append("SACCOs offer competitive returns and may provide access to loans.")
        elif category == 'Real Estate':
            explanations.append("Real estate investments can provide both rental income and capital appreciation.")

        return explanations

# Example usage:
# recommender = InvestmentRecommender('/path/to/models')
# user_profile = {
#     'age_numeric': 2,
#     'income_numeric': 3,
#     'financial_literacy': 7,
#     'risk_tolerance': 4,
#     'horizon_numeric': 3
# }
# recommendations = recommender.recommend_products(user_profile)
'''

# Write the generated recommender module next to the model artifacts
module_path = os.path.join(output_dir, 'investment_recommender.py')
with open(module_path, 'w') as module_file:
    module_file.write(recommender_module)
print(f"Saved recommender module to {module_path}")

# Pinned minimum versions for the recommendation model; the text starts with
# a blank line and ends with a trailing newline
requirements = "\n".join([
    "",
    "# Requirements for PesaGuru Investment Recommendation Model",
    "numpy>=1.19.5",
    "pandas>=1.3.0",
    "scikit-learn>=1.0.0",
    "tensorflow>=2.5.0",
    "matplotlib>=3.4.0",
    "seaborn>=0.11.0",
    "python-dotenv>=0.19.0",
    "requests>=2.25.0",
    "",
])

requirements_path = os.path.join(output_dir, 'requirements.txt')
with open(requirements_path, 'w') as requirements_file:
    requirements_file.write(requirements)
print(f"Saved requirements file to {requirements_path}")

# Create a simple README.md file for the model
readme = f"""# PesaGuru Investment Recommendation Model

Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Overview
This recommendation model provides personalized investment advice for users in Kenya based on their risk profile, financial goals, and market conditions.

## Models
- **Random Forest Classifier**: Predicts if an investment product is suitable for a given user
- **Neural Network**: Scores the compatibility between user profiles and investment products
- **KMeans Clustering**: Groups similar investment products for portfolio diversification

## Performance Metrics
- Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}
- Precision: {eval_results['global_metrics']['precision']:.4f}
- Recall: {eval_results['global_metrics']['recall']:.4f}
- F1 Score: {eval_results['global_metrics']['f1_score']:.4f}

## Usage
```python
from investment_recommender import InvestmentRecommender

# Initialize recommender
recommender = InvestmentRecommender('path/to/models')

# User profile
user_profile = {
    'age_numeric': 2,
    'income_numeric': 3,
    'financial_literacy': 7,
    'risk_tolerance': 4,
    'horizon_numeric': 3
}

# Get recommendations
recommendations = recommender.recommend_products(user_profile, top_n=3, diversify=True)