In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import string
import glob
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# NLP Libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

# ML Libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Visualization styling
# NOTE(review): sns.set() is called after plt.style.use(); it presumably resets
# some of the rcParams chosen by 'fivethirtyeight' -- confirm the intended look.
plt.style.use('fivethirtyeight')
sns.set(style='whitegrid')

# Define paths (all relative to the notebook's working directory)
news_data_dir = "../data/external/news_data/"
kenya_finance_dir = os.path.join(news_data_dir, "kenya_finance_data")
# The next three directories are defined here but are not read anywhere in the
# code visible in this notebook export.
international_finance_dir = os.path.join(news_data_dir, "international_finance_data")
crypto_finance_dir = os.path.join(news_data_dir, "crypto_finance_data")
forex_finance_dir = os.path.join(news_data_dir, "forex_finance_data")
# Every figure, pickle and text file produced below is written here
output_dir = "../outputs/sentiment_analysis"

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Function to load and combine headline data from different sources
def load_news_headlines():
    """Return one DataFrame holding every headline CSV that could be read.

    Each successfully read file gets a ``source`` column set to its file name
    without the ``.csv`` suffix.  Files that are missing are skipped silently;
    files that fail to parse are reported with ``print`` and skipped.  When
    nothing could be loaded, a built-in 100-row sample (10 headlines repeated
    10 times, one per day from 2025-01-01) is returned instead.
    """
    candidate_paths = [
        os.path.join(kenya_finance_dir, "financial_news_headlines.csv"),
        # Add other headline files if they exist
    ]

    loaded_frames = []
    for csv_path in candidate_paths:
        if not os.path.exists(csv_path):
            continue

        print(f"Loading headlines from {csv_path}")
        try:
            frame = pd.read_csv(csv_path)
            # Add source column to track origin
            frame['source'] = os.path.basename(csv_path).replace('.csv', '')
            loaded_frames.append(frame)
        except Exception as e:
            print(f"Error loading {csv_path}: {e}")

    if loaded_frames:
        return pd.concat(loaded_frames, ignore_index=True)

    print("No headline data found. Using sample data for demonstration.")

    # Ten hand-written headlines and their categories, tiled to 100 rows
    sample_headlines = [
        "Kenya's economy grows 5.2% in Q1",
        "NSE 20-share index gains 3% amid positive investor sentiment",
        "Central Bank of Kenya maintains base rate at 9.5%",
        "Safaricom reports 10% profit increase in annual results",
        "Equity Bank expands into South Sudan market",
        "KCB Group completes acquisition of DRC bank",
        "Rising inflation concerns impact Kenyan shilling",
        "NSE records worst day in 2 years as global markets tumble",
        "Kenya's debt burden increase raises economic concerns",
        "Crypto adoption surges in East Africa despite regulatory concerns"
    ]
    sample_categories = [
        'general', 'stocks', 'economy', 'stocks', 'banking',
        'banking', 'forex', 'stocks', 'economy', 'crypto'
    ]
    return pd.DataFrame({
        'date': pd.date_range(start='2025-01-01', periods=100),
        'headline': sample_headlines * 10,
        'category': sample_categories * 10,
        'source': ['sample_data'] * 100
    })

# Load headline data (falls back to the built-in sample rows when no CSV is found)
headlines_df = load_news_headlines()

# Display basic information
print(f"Loaded {len(headlines_df)} headlines")
# Last expression of the cell: the notebook renders the first five rows
headlines_df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


No headline data found. Using sample data for demonstration.
Loaded 100 headlines


Unnamed: 0,date,headline,category,source
0,2025-01-01,Kenya's economy grows 5.2% in Q1,general,sample_data
1,2025-01-02,NSE 20-share index gains 3% amid positive inve...,stocks,sample_data
2,2025-01-03,Central Bank of Kenya maintains base rate at 9.5%,economy,sample_data
3,2025-01-04,Safaricom reports 10% profit increase in annua...,stocks,sample_data
4,2025-01-05,Equity Bank expands into South Sudan market,banking,sample_data


In [2]:
# Normalise the date column: parse it, discard rows whose date could not be
# parsed (they become NaT under errors='coerce'), then order chronologically.
if 'date' in headlines_df.columns:
    headlines_df['date'] = pd.to_datetime(headlines_df['date'], errors='coerce')
    headlines_df = headlines_df[headlines_df['date'].notna()].sort_values('date')

# Report how many values are missing in each column
print("\nMissing values per column:")
print(headlines_df.isna().sum())

# Headlines without a category are treated as 'general'
if 'category' in headlines_df.columns:
    if headlines_df['category'].isna().any():
        headlines_df['category'] = headlines_df['category'].fillna('general')

# Function to clean and preprocess text
def preprocess_text(text):
    """Normalise a headline for bag-of-words processing.

    The text is lower-cased, every ``string.punctuation`` character and every
    digit run is deleted (not replaced by a space, so ``"20-share"`` becomes
    ``"share"``), and whitespace runs are collapsed to single spaces.

    Returns an empty string for any non-string input.
    """
    if not isinstance(text, str):
        return ""

    punctuation_remover = str.maketrans('', '', string.punctuation)

    lowered = text.lower()
    without_punctuation = lowered.translate(punctuation_remover)
    without_digits = re.sub(r'\d+', '', without_punctuation)

    # Deleting characters can leave double spaces and ragged edges
    return re.sub(r'\s+', ' ', without_digits).strip()

# Cleaned (lower-cased, punctuation- and digit-free) copy of every headline
headlines_df['clean_headline'] = headlines_df['headline'].apply(preprocess_text)

# Shared NLTK resources used by tokenize_and_lemmatize
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def tokenize_and_lemmatize(text):
    """Split *text* into word tokens, drop English stop words, lemmatize the rest.

    The stop-word test is applied to the raw token, before lemmatization.
    Returns the surviving lemmas as a list, in their original order.
    """
    lemmas = []
    for word in word_tokenize(text):
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# Token lists, plus the same tokens re-joined into a single string per headline
headlines_df['tokens'] = headlines_df['clean_headline'].apply(tokenize_and_lemmatize)
headlines_df['processed_headline'] = headlines_df['tokens'].apply(lambda words: ' '.join(words))

# Reporting boilerplate that carries no sentiment in financial headlines
financial_specific_stopwords = [
    'said', 'says', 'report', 'reported', 'according',
    'announces', 'announced', 'statement', 'press', 'release'
]

def enhance_financial_preprocessing(tokens):
    """Return *tokens* without any word listed in ``financial_specific_stopwords``.

    Order and duplicates of the remaining tokens are preserved; the input is
    not modified.
    """
    kept_tokens = []
    for word in tokens:
        if word not in financial_specific_stopwords:
            kept_tokens.append(word)
    return kept_tokens

# Token lists with the financial boilerplate words removed, plus their
# space-joined form (the word clouds further down read 'enhanced_headline')
headlines_df['enhanced_tokens'] = headlines_df['tokens'].apply(enhance_financial_preprocessing)
headlines_df['enhanced_headline'] = headlines_df['enhanced_tokens'].apply(lambda x: ' '.join(x))

# Display processed headlines (raw text next to both processed variants)
headlines_df[['headline', 'processed_headline', 'enhanced_headline']].head()


Missing values per column:
date        0
headline    0
category    0
source      0
dtype: int64


Unnamed: 0,headline,processed_headline,enhanced_headline
0,Kenya's economy grows 5.2% in Q1,kenya economy grows q,kenya economy grows q
1,NSE 20-share index gains 3% amid positive inve...,nse share index gain amid positive investor se...,nse share index gain amid positive investor se...
2,Central Bank of Kenya maintains base rate at 9.5%,central bank kenya maintains base rate,central bank kenya maintains base rate
3,Safaricom reports 10% profit increase in annua...,safaricom report profit increase annual result,safaricom profit increase annual result
4,Equity Bank expands into South Sudan market,equity bank expands south sudan market,equity bank expands south sudan market


In [3]:
# Initialize VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Function to get VADER sentiment scores
def get_vader_sentiment(text):
    """Return VADER polarity scores for *text*.

    The result is whatever ``sid.polarity_scores`` returns; the code below
    reads its 'compound', 'pos', 'neu' and 'neg' entries.  Non-string or empty
    input short-circuits to an all-zero dict with those four keys, without
    calling the analyzer.
    """
    if isinstance(text, str) and text != "":
        return sid.polarity_scores(text)

    return {'compound': 0, 'pos': 0, 'neu': 0, 'neg': 0}

# Apply VADER to original headlines (better for short texts like headlines).
# The raw 'headline' column is scored, not the cleaned/lemmatized text.
headlines_df['vader_scores'] = headlines_df['headline'].apply(get_vader_sentiment)

# Extract individual sentiment scores from the per-row score dict into
# separate columns
headlines_df['vader_compound'] = headlines_df['vader_scores'].apply(lambda x: x['compound'])
headlines_df['vader_positive'] = headlines_df['vader_scores'].apply(lambda x: x['pos'])
headlines_df['vader_neutral'] = headlines_df['vader_scores'].apply(lambda x: x['neu'])
headlines_df['vader_negative'] = headlines_df['vader_scores'].apply(lambda x: x['neg'])

# Create a simple sentiment label
def get_sentiment_label(compound_score):
    """Map a VADER compound score to 'positive', 'negative' or 'neutral'.

    Both cut-offs are inclusive: ``>= 0.05`` is positive and ``<= -0.05`` is
    negative; everything in between is neutral.
    """
    if compound_score >= 0.05:
        return 'positive'
    if compound_score <= -0.05:
        return 'negative'
    return 'neutral'

headlines_df['sentiment'] = headlines_df['vader_compound'].apply(get_sentiment_label)

# ### 3.2 Financial-Specific Lexicon (Loughran-McDonald)

# Load the Loughran-McDonald financial sentiment dictionary
# Note: In a real implementation, you would download and incorporate this lexicon
# For demonstration purposes, we'll create a simplified version

# Simplified positive word list (a hand-picked stand-in, not the full lexicon)
lm_positive = [
    'up', 'upward', 'climb', 'gain', 'increase', 'grow', 'growth', 'improved', 'rises', 'rising',
    'positive', 'profit', 'profitable', 'success', 'successful', 'good', 'strong', 'stronger',
    'highest', 'record', 'opportunity', 'opportunities', 'outperform', 'exceeded', 'beat',
    'dividend', 'upgrade', 'recommended', 'buy', 'advantage', 'optimistic', 'optimism'
]

# Simplified negative word list (same caveat as above)
lm_negative = [
    'down', 'downward', 'fall', 'fell', 'decline', 'drop', 'decrease', 'shrink', 'shrinking',
    'negative', 'loss', 'losses', 'fail', 'failed', 'weak', 'weaker', 'lowest', 'poor',
    'underperform', 'miss', 'missed', 'disappointing', 'disappointed', 'warning', 'risk',
    'risks', 'risky', 'concern', 'concerns', 'sell', 'downgrade', 'cautious', 'caution', 'recession',
    'debt', 'inflation', 'deficit', 'crisis', 'problem', 'lawsuit', 'litigation', 'scandal'
]

def get_lm_sentiment(tokens):
    """Score a token list against the ``lm_positive`` / ``lm_negative`` word lists.

    Returns a dict with:
      'positive' -- share of tokens found in ``lm_positive``
      'negative' -- share of tokens found in ``lm_negative``
      'net'      -- positive share minus negative share

    An empty (falsy) token list yields integer zeros for all three entries.
    """
    if not tokens:
        return {'positive': 0, 'negative': 0, 'net': 0}

    token_total = len(tokens)
    positive_hits = len([word for word in tokens if word in lm_positive])
    negative_hits = len([word for word in tokens if word in lm_negative])

    positive_share = positive_hits / token_total
    negative_share = negative_hits / token_total

    return {
        'positive': positive_share,
        'negative': negative_share,
        'net': positive_share - negative_share
    }

# Apply Loughran-McDonald sentiment analysis.
# Scores are computed on the lemmatized 'tokens' column, so only lexicon
# entries that match a lemmatized token form can ever be counted.
headlines_df['lm_sentiment'] = headlines_df['tokens'].apply(get_lm_sentiment)
# Unpack the per-row score dict into separate columns
headlines_df['lm_positive'] = headlines_df['lm_sentiment'].apply(lambda x: x['positive'])
headlines_df['lm_negative'] = headlines_df['lm_sentiment'].apply(lambda x: x['negative'])
headlines_df['lm_net'] = headlines_df['lm_sentiment'].apply(lambda x: x['net'])

# Create a simple LM sentiment label
def get_lm_sentiment_label(net_score):
    """Map a lexicon net score to 'positive', 'negative' or 'neutral'.

    The cut-offs are exclusive: strictly above 0.05 is positive, strictly
    below -0.05 is negative, and the closed interval between is neutral.
    """
    if net_score > 0.05:
        return 'positive'
    if net_score < -0.05:
        return 'negative'
    return 'neutral'

# Categorical label derived from the lexicon net score
headlines_df['lm_sentiment_label'] = headlines_df['lm_net'].apply(get_lm_sentiment_label)

# Compare VADER and Loughran-McDonald sentiment (shares of each label)
print("\nSentiment Distribution (VADER):")
print(headlines_df['sentiment'].value_counts(normalize=True))

print("\nSentiment Distribution (Loughran-McDonald):")
print(headlines_df['lm_sentiment_label'].value_counts(normalize=True))

# Visualize agreement between the two approaches
plt.figure(figsize=(10, 6))
# normalize='index' makes every row (one VADER label) sum to 1, so each cell is
# the share of that VADER class that received the given lexicon label
agreement_df = pd.crosstab(headlines_df['sentiment'], headlines_df['lm_sentiment_label'], normalize='index')
sns.heatmap(agreement_df, annot=True, cmap='Blues', fmt='.2f')
plt.title('Agreement Between VADER and Loughran-McDonald Sentiment')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'sentiment_agreement.png'))
# Close the figure so it is not also rendered inline / kept in memory
plt.close()


Sentiment Distribution (VADER):
sentiment
neutral     0.5
positive    0.3
negative    0.2
Name: proportion, dtype: float64

Sentiment Distribution (Loughran-McDonald):
lm_sentiment_label
neutral     0.4
positive    0.3
negative    0.3
Name: proportion, dtype: float64


In [4]:
# Prepare features using TF-IDF.
# The vectorizer is fitted on 'processed_headline' (tokenized, stop-word-free,
# lemmatized text); anything scored with it later must be prepared the same way.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, min_df=5, max_df=0.7)
X = tfidf_vectorizer.fit_transform(headlines_df['processed_headline'])
# NOTE: the target is the VADER-derived label, so the classifier learns to
# imitate VADER rather than an independent ground truth.
y = headlines_df['sentiment']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Evaluate the model.
# NOTE: with the built-in sample data (10 headlines repeated 10 times) the
# test rows are duplicates of training rows, so the reported accuracy is not
# a meaningful estimate of generalization.
y_pred = lr_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nLogistic Regression Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Create a confusion matrix visualization.
# The class order is passed explicitly so the matrix is always 3x3 and its
# rows/columns line up with the tick labels, even when a class happens to be
# absent from the test split.
sentiment_labels = ['negative', 'neutral', 'positive']
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred, labels=sentiment_labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=sentiment_labels,
            yticklabels=sentiment_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
plt.close()

# ## 4. Sentiment Aggregation and Trends

# Group by date and calculate daily sentiment.
# NOTE: daily_sentiment is only created inside this branch; later cells that
# use it depend on headlines_df having a 'date' column.
if 'date' in headlines_df.columns:
    # One row per calendar date: mean VADER compound, mean lexicon net score
    # and the number of headlines published that day
    daily_sentiment = headlines_df.groupby(headlines_df['date'].dt.date).agg({
        'vader_compound': 'mean',
        'lm_net': 'mean',
        'headline': 'count'
    }).reset_index()

    daily_sentiment.rename(columns={'headline': 'headline_count'}, inplace=True)

    # Calculate rolling averages (7-row window).
    # rolling(7) counts rows, not calendar days: it equals a 7-day window only
    # when every date is present. The first six rows are NaN.
    daily_sentiment['vader_7d_rolling'] = daily_sentiment['vader_compound'].rolling(7).mean()
    daily_sentiment['lm_7d_rolling'] = daily_sentiment['lm_net'].rolling(7).mean()

    # Plot daily sentiment trends: faint lines are raw daily means, bold lines
    # are the rolling averages (blue = VADER, red = lexicon)
    plt.figure(figsize=(12, 6))
    plt.plot(daily_sentiment['date'], daily_sentiment['vader_compound'], 'b-', alpha=0.3, label='Daily VADER')
    plt.plot(daily_sentiment['date'], daily_sentiment['vader_7d_rolling'], 'b-', linewidth=2, label='7-day VADER Rolling Avg')
    plt.plot(daily_sentiment['date'], daily_sentiment['lm_net'], 'r-', alpha=0.3, label='Daily LM')
    plt.plot(daily_sentiment['date'], daily_sentiment['lm_7d_rolling'], 'r-', linewidth=2, label='7-day LM Rolling Avg')
    # Zero reference line separating net-positive from net-negative days
    plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    plt.title('Financial News Headlines Sentiment Trend')
    plt.xlabel('Date')
    plt.ylabel('Sentiment Score')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment_trend.png'))
    plt.close()

# Analyze sentiment by news category
if 'category' in headlines_df.columns:
    # Per-category mean VADER score, headline count and mean lexicon net score
    category_sentiment = headlines_df.groupby('category').agg({
        'vader_compound': ['mean', 'count'],
        'lm_net': 'mean'
    })

    # Flatten the two-level column index produced by agg()
    category_sentiment.columns = ['vader_mean', 'count', 'lm_mean']
    category_sentiment = category_sentiment.sort_values('count', ascending=False).reset_index()

    # Plot sentiment by category; bar colour follows the same +/-0.05 band
    # used for the positive/negative/neutral labels
    plt.figure(figsize=(12, 8))
    bars = plt.barh(category_sentiment['category'], category_sentiment['vader_mean'],
             color=[
                 'g' if x > 0.05 else 'r' if x < -0.05 else 'gray'
                 for x in category_sentiment['vader_mean']
             ])

    # Add data labels just beyond the tip of each bar. Negative bars extend to
    # the left, so their label is anchored on the left side of the tip instead
    # of being drawn on top of the bar.
    for bar in bars:
        width = bar.get_width()
        if width < 0:
            label_x, label_align = width - 0.01, 'right'
        else:
            label_x, label_align = width + 0.01, 'left'
        plt.text(label_x, bar.get_y() + bar.get_height()/2, f'{width:.3f}',
                 ha=label_align, va='center')

    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    plt.title('Average Sentiment by News Category (VADER)')
    plt.xlabel('Average Sentiment Score')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'category_sentiment.png'))
    plt.close()


Logistic Regression Accuracy: 1.000

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         2
     neutral       1.00      1.00      1.00        11
    positive       1.00      1.00      1.00         7

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



In [5]:
# Combine all positive / negative headlines into one text blob each
positive_headlines = ' '.join(headlines_df[headlines_df['sentiment'] == 'positive']['enhanced_headline'])
negative_headlines = ' '.join(headlines_df[headlines_df['sentiment'] == 'negative']['enhanced_headline'])


def _save_sentiment_wordcloud(text, title, contour_color, filename):
    """Render *text* as a word cloud and save it as *filename* in output_dir.

    Returns the generated WordCloud, or None when no image was written
    because *text* contains no usable words.
    """
    # Joining empty headlines yields a string of spaces, which is truthy but
    # contains nothing to draw.
    if not text.strip():
        return None

    cloud = WordCloud(width=800, height=400, background_color='white',
                      max_words=100, contour_width=3, contour_color=contour_color)
    try:
        cloud.generate(text)
    except ValueError as e:
        # Presumably raised when every word is filtered out by WordCloud's own
        # stop-word list; skip the figure instead of aborting the cell.
        print(f"Skipping word cloud {filename}: {e}")
        return None

    plt.figure(figsize=(10, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, filename))
    plt.close()
    return cloud


# Generate and plot positive word cloud
wordcloud_positive = _save_sentiment_wordcloud(
    positive_headlines,
    'Positive Financial Headlines - Key Terms',
    'steelblue',
    'positive_wordcloud.png',
)

# Generate and plot negative word cloud
wordcloud_negative = _save_sentiment_wordcloud(
    negative_headlines,
    'Negative Financial Headlines - Key Terms',
    'firebrick',
    'negative_wordcloud.png',
)

# ### 5.2 Sentiment Distribution Pie Chart

plt.figure(figsize=(10, 6))
# value_counts() orders labels by frequency, not by name, so colours and
# wedge offsets are looked up per label instead of being listed positionally.
sentiment_counts = headlines_df['sentiment'].value_counts()
sentiment_colors = {'positive': 'lightgreen', 'neutral': 'gray', 'negative': 'lightcoral'}
sentiment_explode = {'positive': 0.1, 'neutral': 0, 'negative': 0.1}
# Built from the labels actually present, so the lengths always match the
# number of wedges even when a sentiment class does not occur.
colors = [sentiment_colors.get(label, 'lightgray') for label in sentiment_counts.index]
explode = [sentiment_explode.get(label, 0) for label in sentiment_counts.index]
plt.pie(sentiment_counts, explode=explode, labels=sentiment_counts.index, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.axis('equal')
plt.title('Distribution of Headline Sentiment')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'sentiment_distribution.png'))
plt.close()

In [6]:
# For this demonstration, we'll attempt to load NSE data if available.
# nse_data stays None when no file matches or loading fails; the correlation
# cell below checks for that.
nse_data = None
try:
    # Try to load NSE data files.
    # NOTE(review): this pattern is resolved against the current working
    # directory, unlike the headline data which lives under news_data_dir --
    # confirm the files are expected next to the notebook.
    nse_files = glob.glob("NSE_data_all_stocks_*.csv")

    if nse_files:
        # Sort files by name (which should sort by year); this is a plain
        # lexicographic sort of the file names
        nse_files.sort()

        # Load most recent file (the last name after sorting)
        latest_nse_file = nse_files[-1]
        print(f"Loading NSE data from {latest_nse_file}")
        nse_data = pd.read_csv(latest_nse_file)

        # Standardize column names: lower-case, spaces replaced by underscores
        nse_data.columns = [col.lower().replace(' ', '_') for col in nse_data.columns]

        # Convert date to datetime; unparseable values become NaT
        if 'date' in nse_data.columns:
            nse_data['date'] = pd.to_datetime(nse_data['date'], errors='coerce')
except Exception as e:
    # Best effort: any failure is reported and the market analysis is skipped
    print(f"Error loading NSE data: {e}")

# If NSE data is available, analyze correlation with sentiment.
# daily_sentiment is only built when headlines_df has a 'date' column, so the
# same condition is required here to avoid a NameError.
if (nse_data is not None and 'date' in nse_data.columns
        and 'date' in headlines_df.columns):
    print("Analyzing correlation between sentiment and market performance...")

    # Aggregate NSE data by date (market-wide average of the per-stock change)
    # NOTE(review): assumes a numeric 'change%' column -- confirm against the
    # actual NSE file schema.
    nse_daily = nse_data.groupby(nse_data['date'].dt.date).agg({
        'change%': 'mean'
    }).reset_index()

    # Merge with sentiment data; only dates present in both sets are kept
    market_sentiment = pd.merge(
        daily_sentiment,
        nse_daily,
        on='date',
        how='inner'
    )

    # Calculate correlation
    correlation = market_sentiment['vader_compound'].corr(market_sentiment['change%'])
    print(f"Correlation between daily sentiment and market change: {correlation:.3f}")

    # Plot scatter
    plt.figure(figsize=(10, 6))
    plt.scatter(market_sentiment['vader_compound'], market_sentiment['change%'], alpha=0.5)
    plt.title(f'Headline Sentiment vs. Market Performance (Correlation: {correlation:.3f})')
    plt.xlabel('Daily Sentiment Score (VADER)')
    plt.ylabel('Market Daily Change %')

    # Add regression line. A straight-line fit needs at least two complete
    # points, and missing values would poison the fit, so incomplete rows are
    # dropped first and the line is skipped when too little data remains.
    fit_points = market_sentiment[['vader_compound', 'change%']].dropna()
    if len(fit_points) >= 2:
        m, b = np.polyfit(fit_points['vader_compound'], fit_points['change%'], 1)
        plt.plot(fit_points['vader_compound'], m*fit_points['vader_compound'] + b, 'r-')

    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'sentiment_market_correlation.png'))
    plt.close()

    # Lag analysis: Does sentiment predict next day market movement?
    # shift(-1) pairs each row with the following row, i.e. the next date that
    # survived the merge (not necessarily the next calendar day).
    market_sentiment['next_day_change'] = market_sentiment['change%'].shift(-1)
    lag_correlation = market_sentiment['vader_compound'].corr(market_sentiment['next_day_change'])
    print(f"Correlation between sentiment and next day market change: {lag_correlation:.3f}")

    # Plot time series of both sentiment and market movement on two y-axes.
    # plt.subplots creates the only figure needed here; it is saved and closed
    # explicitly so no unused figure is left open.
    fig, ax1 = plt.subplots(figsize=(12, 6))
    ax2 = ax1.twinx()

    # Plot sentiment on first axis
    ax1.plot(market_sentiment['date'], market_sentiment['vader_7d_rolling'], 'b-', label='7-day Sentiment')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Sentiment Score', color='b')
    ax1.tick_params(axis='y', labelcolor='b')

    # Plot market performance on second axis
    ax2.plot(market_sentiment['date'], market_sentiment['change%'].rolling(7).mean(), 'r-', label='7-day Market Change%')
    ax2.set_ylabel('Market Change %', color='r')
    ax2.tick_params(axis='y', labelcolor='r')

    plt.title('Financial News Sentiment vs. Market Performance')
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, 'sentiment_market_timeseries.png'))
    plt.close(fig)


In [7]:
# Function to extract impactful words
def extract_impactful_words(headlines_df, n=20):
    """Return the most frequent tokens in strongly positive and negative headlines.

    A headline counts as strongly positive when ``vader_compound`` is above
    0.5 and strongly negative when it is below -0.5 (both exclusive).

    Args:
        headlines_df: frame with a numeric 'vader_compound' column and a
            'tokens' column holding one list of words per row.
        n: maximum number of words to keep for each polarity.

    Returns:
        ``(pos_word_freq, neg_word_freq)`` -- two Series of word counts, each
        sorted by descending frequency and limited to *n* entries.
    """
    def top_words(row_mask):
        # Flatten the selected token lists into one list, then count
        flattened = []
        for token_list in headlines_df[row_mask]['tokens']:
            flattened.extend(token_list)
        return pd.Series(flattened).value_counts().head(n)

    compound = headlines_df['vader_compound']
    pos_word_freq = top_words(compound > 0.5)
    neg_word_freq = top_words(compound < -0.5)
    return pos_word_freq, neg_word_freq

# Top-20 word frequencies for strongly positive / strongly negative headlines
pos_words, neg_words = extract_impactful_words(headlines_df)

# Plot most impactful positive words.
# The figure is created before the emptiness check and closed after it, so no
# file is written when there are no strongly positive headlines.
plt.figure(figsize=(12, 6))
if not pos_words.empty:
    ax = pos_words.plot(kind='bar', color='green')
    plt.title('Most Common Words in Strongly Positive Financial Headlines')
    plt.ylabel('Frequency')
    plt.xlabel('Word')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'positive_keywords.png'))
plt.close()

# Plot most impactful negative words (same structure as above)
plt.figure(figsize=(12, 6))
if not neg_words.empty:
    ax = neg_words.plot(kind='bar', color='red')
    plt.title('Most Common Words in Strongly Negative Financial Headlines')
    plt.ylabel('Frequency')
    plt.xlabel('Word')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'negative_keywords.png'))
plt.close()

# ### 7.2 Key Insights for PesaGuru Chatbot

# Key insights for the chatbot (hand-written summary text; these strings are static and are not computed from the results above)
insights = [
    "Headlines containing terms like 'growth', 'increase', and 'profit' are generally associated with positive market sentiment.",
    "Negative market sentiment is commonly signaled by terms such as 'decline', 'drop', and 'concern'.",
    "Stock market news tends to have more extreme sentiment (both positive and negative) compared to general economic news.",
    "Sentiment in financial headlines shows a moderate correlation with market movements, suggesting potential predictive value.",
    "Banking and financial services headlines tend to be more neutral than those about technology or energy sectors.",
    "Kenyan financial headlines tend to focus more on local economic factors than global market trends."
]

# Define recommendations for the PesaGuru chatbot
recommendations = [
    "Implement sentiment analysis in the chatbot to alert users about significant shifts in market sentiment.",
    "Provide summaries of recent financial news categorized by sentiment for different market sectors.",
    "Use headline sentiment trends to supplement technical analysis for investment recommendations.",
    "Create alerts when headline sentiment diverges significantly from market performance, as this might indicate future corrections.",
    "Integrate sentiment analysis with specific stock mentions to provide targeted insights for user portfolios.",
    "Offer users a 'sentiment summary' of financial news relevant to their investment interests."
]

# Number both lists once so the console output and the saved file share the
# exact same "N. text" lines
numbered_insights = [f"{i}. {insight}" for i, insight in enumerate(insights, 1)]
numbered_recommendations = [
    f"{i}. {recommendation}" for i, recommendation in enumerate(recommendations, 1)
]

# Print insights and recommendations
print("\n--- Key Insights for PesaGuru Chatbot ---")
for line in numbered_insights:
    print(line)

print("\n--- Implementation Recommendations ---")
for line in numbered_recommendations:
    print(line)

# Save insights and recommendations to a text file
with open(os.path.join(output_dir, 'chatbot_insights.txt'), 'w') as f:
    f.write("--- Key Insights for PesaGuru Chatbot ---\n\n")
    f.writelines(f"{line}\n" for line in numbered_insights)

    f.write("\n\n--- Implementation Recommendations ---\n\n")
    f.writelines(f"{line}\n" for line in numbered_recommendations)


--- Key Insights for PesaGuru Chatbot ---
1. Headlines containing terms like 'growth', 'increase', and 'profit' are generally associated with positive market sentiment.
2. Negative market sentiment is commonly signaled by terms such as 'decline', 'drop', and 'concern'.
3. Stock market news tends to have more extreme sentiment (both positive and negative) compared to general economic news.
4. Sentiment in financial headlines shows a moderate correlation with market movements, suggesting potential predictive value.
5. Banking and financial services headlines tend to be more neutral than those about technology or energy sectors.
6. Kenyan financial headlines tend to focus more on local economic factors than global market trends.

--- Implementation Recommendations ---
1. Implement sentiment analysis in the chatbot to alert users about significant shifts in market sentiment.
2. Provide summaries of recent financial news categorized by sentiment for different market sectors.
3. Use headlin

In [9]:
# Save the sentiment analysis model for use in the PesaGuru chatbot.
# Both artifacts are needed together: the classifier only understands feature
# vectors produced by this exact fitted vectorizer.
import pickle

# Export the TF-IDF vectorizer
with open(os.path.join(output_dir, 'tfidf_vectorizer.pkl'), 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

# Export the Logistic Regression model
with open(os.path.join(output_dir, 'sentiment_model.pkl'), 'wb') as f:
    pickle.dump(lr_model, f)

# Create a simple sentiment analyzer class that can be imported by the chatbot
class FinancialSentimentAnalyzer:
    """Score a financial headline with VADER and with the pickled classifier.

    The class relies on the notebook-level helpers ``preprocess_text``,
    ``tokenize_and_lemmatize`` and ``get_sentiment_label`` and on
    ``SentimentIntensityAnalyzer`` being in scope where it is used.
    """

    def __init__(self, model_path, vectorizer_path):
        """Load the classifier and vectorizer and create a VADER analyzer.

        Args:
            model_path: path to the pickled classifier.
            vectorizer_path: path to the pickled, already fitted vectorizer.
        """
        # SECURITY: pickle.load can execute arbitrary code contained in the
        # file. Only pass paths to artifacts produced by a trusted source.
        with open(model_path, 'rb') as f:
            self.model = pickle.load(f)

        with open(vectorizer_path, 'rb') as f:
            self.vectorizer = pickle.load(f)

        self.sid = SentimentIntensityAnalyzer()

    def analyze(self, headline):
        """Analyze the sentiment of a financial headline.

        Returns a dict with the original 'headline', the VADER compound score
        ('vader_score') and its label ('vader_sentiment'), the classifier's
        predicted label ('ml_sentiment') and a 'recommendation' sentence
        derived from the VADER score.
        """
        # VADER sentiment (better for headlines); scored on the raw text
        vader_sentiment = self.sid.polarity_scores(headline)
        compound = vader_sentiment['compound']

        # ML-based sentiment. The vectorizer was fitted on text that had been
        # cleaned, tokenized, stripped of stop words and lemmatized, so the
        # same pipeline is applied here; otherwise inflected forms in the
        # input would not match the learned vocabulary.
        processed_headline = ' '.join(tokenize_and_lemmatize(preprocess_text(headline)))
        X = self.vectorizer.transform([processed_headline])
        ml_sentiment = self.model.predict(X)[0]

        # Return combined results
        return {
            'headline': headline,
            'vader_score': compound,
            'vader_sentiment': get_sentiment_label(compound),
            'ml_sentiment': ml_sentiment,
            'recommendation': self._get_recommendation(compound)
        }

    def _get_recommendation(self, score):
        """Generate a simple recommendation sentence from a sentiment score.

        Bands (all bounds exclusive): above 0.5 very positive, above 0.1
        mildly positive, below -0.5 very negative, below -0.1 mildly negative,
        otherwise neutral. These bands are wider than the +/-0.05 cut-offs of
        get_sentiment_label, so a score such as 0.07 is labelled 'positive'
        there but described as neutral here.
        """
        if score > 0.5:
            return "This headline suggests very positive market sentiment."
        elif score > 0.1:
            return "This headline indicates mildly positive market sentiment."
        elif score < -0.5:
            return "This headline suggests very negative market sentiment. Exercise caution."
        elif score < -0.1:
            return "This headline indicates mildly negative market sentiment."
        else:
            return "This headline has neutral market sentiment."

# Export the sentiment analyzer class
# NOTE(review): `f.write` below is only an attribute reference -- it is never
# called, so this creates/truncates the file and leaves it EMPTY. The text
# that was meant to be written is not present in this export; restore it as
# f.write("...").
with open(os.path.join(output_dir, 'financial_sentiment_analyzer.py'), 'w') as f:
    f.write

# Create a simple example of how to use the sentiment analyzer in the chatbot
# NOTE(review): same problem as above -- the file is created but nothing is
# written to it.
with open(os.path.join(output_dir, 'chatbot_integration_example.py'), 'w') as f:
    f.write