In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import os
import json
import requests
from datetime import datetime, timedelta
import time
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
from textblob import TextBlob
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer

# Download required NLTK resources
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (it superseded the pickled 'punkt' models). Fetching both keeps the
# notebook runnable from a fresh kernel on old and new NLTK versions alike.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(_nltk_resource)

# Load spaCy model for English
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Catching only that keeps unrelated failures (e.g. a broken spaCy
    # install) visible instead of masking them behind a download attempt.
    import subprocess
    import sys

    # Use the kernel's own interpreter so the model is installed into the
    # environment this notebook runs in; check=True surfaces a failed download.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load('en_core_web_sm')

# Set plot style
# NOTE(review): sns.set() runs after plt.style.use(); presumably it overrides
# some of the rcParams set by 'fivethirtyeight' — confirm the resulting look
# is the intended one.
plt.style.use('fivethirtyeight')
sns.set(style='whitegrid')

# For saving outputs
# OUTPUT_DIR is a relative path; the directory is created up front and
# exist_ok=True makes re-running this cell harmless.
OUTPUT_DIR = "../../../outputs/sentiment_analysis"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# API configuration
# The RapidAPI key is read from the RAPIDAPI_KEY environment variable so that
# no secret is stored in the notebook. All three services share the same key.
# NOTE(review): any key that was previously committed in this cell should be
# revoked and re-issued.
_RAPIDAPI_KEY = os.environ.get("RAPIDAPI_KEY", "")
if not _RAPIDAPI_KEY:
    print("RAPIDAPI_KEY is not set; RapidAPI requests will be rejected.")

API_CONFIG = {
    "twitter": {
        "rapidapi_key": _RAPIDAPI_KEY
    },
    "reddit": {
        "rapidapi_key": _RAPIDAPI_KEY
    },
    "news": {
        "rapidapi_key": _RAPIDAPI_KEY
    }
}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
def get_twitter_data(query, count=100, lang="en"):
    """
    Fetch tweets related to a specific query using RapidAPI

    Args:
        query (str): Search query (e.g., "#SCOM", "Safaricom stock")
        count (int): Number of tweets to retrieve
        lang (str): Language code

    Returns:
        pandas.DataFrame: One row per tweet with the columns id, text,
        created_at, user, likes, retweets and source ('twitter'). An empty
        DataFrame is returned when the request fails or yields no tweets.
    """
    try:
        url = "https://twitter154.p.rapidapi.com/search/search"

        querystring = {
            "query": query,
            "section": "top",
            "min_retweets": "5",
            "min_likes": "5",
            "limit": str(count),
            "language": lang
        }

        headers = {
            "X-RapidAPI-Key": API_CONFIG["twitter"]["rapidapi_key"],
            "X-RapidAPI-Host": "twitter154.p.rapidapi.com"
        }

        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, headers=headers, params=querystring, timeout=30)

        if response.status_code != 200:
            print(f"Error fetching Twitter data: {response.status_code}")
            return pd.DataFrame()

        data = response.json()

        # Extract relevant fields
        tweets = []
        for result in data.get('results') or []:
            # 'user' may be present but null; treat that like a missing user.
            user = result.get('user') or {}
            tweets.append({
                'id': result.get('tweet_id'),
                'text': result.get('text'),
                'created_at': result.get('creation_date'),
                'user': user.get('username'),
                'likes': result.get('favorite_count'),
                'retweets': result.get('retweet_count')
            })

        # With no rows the frame has no 'created_at' column, so the datetime
        # conversion below would raise; report the empty result explicitly.
        if not tweets:
            print(f"Retrieved 0 tweets for query: '{query}'")
            return pd.DataFrame()

        df = pd.DataFrame(tweets)
        df['source'] = 'twitter'
        df['created_at'] = pd.to_datetime(df['created_at'])

        print(f"Retrieved {len(df)} tweets for query: '{query}'")
        return df

    except Exception as e:
        # Best-effort fetch: any failure is reported and yields an empty frame.
        print(f"Error in Twitter API call: {str(e)}")
        return pd.DataFrame()

def get_reddit_data(subreddit, time_filter="week", limit=100):
    """
    Fetch posts from a specific subreddit

    Args:
        subreddit (str): Subreddit name (e.g., "investing", "stocks")
        time_filter (str): Time filter for posts (hour, day, week, month, year, all)
        limit (int): Maximum number of posts to retrieve

    Returns:
        pandas.DataFrame: One row per post with the columns id, title, text,
        created_at, score, url, num_comments, full_text and source ('reddit').
        created_at is derived from the 'created_utc' epoch and is expressed in
        UTC (timezone-naive); posts without a timestamp get NaT. An empty
        DataFrame is returned when the request fails or yields no posts.
    """
    try:
        url = "https://reddit-api-five.vercel.app/api/search"
        # Let requests build the query string so the values are URL-escaped.
        params = {
            "q": "",
            "subreddit": subreddit,
            "sort": "top",
            "time": time_filter,
            "limit": limit
        }
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, params=params, timeout=30)

        if response.status_code != 200:
            print(f"Error fetching Reddit data: {response.status_code}")
            return pd.DataFrame()

        data = response.json()
        posts = data.get('posts') or []

        # Extract relevant fields
        reddit_posts = []
        for post in posts:
            # Title/selftext can be null; coerce to '' so concatenation is safe.
            title = post.get('title') or ''
            text = post.get('selftext') or ''
            reddit_posts.append({
                'id': post.get('id'),
                'title': title,
                'text': text,
                'created_at': post.get('created_utc'),
                'score': post.get('score'),
                'url': post.get('url'),
                'num_comments': post.get('num_comments'),
                # Combine title and text for sentiment analysis
                'full_text': title + ' ' + text
            })

        if not reddit_posts:
            print(f"Retrieved 0 posts from r/{subreddit}")
            return pd.DataFrame()

        df = pd.DataFrame(reddit_posts)
        # Interpret the epoch seconds as UTC instead of the machine's local
        # timezone, so results do not depend on where the notebook is run.
        df['created_at'] = pd.to_datetime(df['created_at'], unit='s', errors='coerce')
        df['source'] = 'reddit'

        print(f"Retrieved {len(df)} posts from r/{subreddit}")
        return df

    except Exception as e:
        # Best-effort fetch: any failure is reported and yields an empty frame.
        print(f"Error in Reddit API call: {str(e)}")
        return pd.DataFrame()

def get_financial_news(query, limit=10):
    """
    Fetch financial news articles about a specific topic

    Args:
        query (str): Search query (e.g., "Safaricom", "Kenya banking sector")
        limit (int): Maximum number of articles to retrieve

    Returns:
        pandas.DataFrame: One row per article with the columns title,
        description, content, url, source ('news'), publisher, published_at,
        text and created_at. 'publisher' holds the outlet name reported by the
        API. An empty DataFrame is returned when the request fails or yields
        no articles.
    """
    try:
        url = "https://news-api14.p.rapidapi.com/top-headlines"

        querystring = {
            "country": "us,gb,ke",
            "language": "en",
            "pageSize": str(limit),
            "category": "business",
            "q": query
        }

        headers = {
            "X-RapidAPI-Key": API_CONFIG["news"]["rapidapi_key"],
            "X-RapidAPI-Host": "news-api14.p.rapidapi.com"
        }

        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, headers=headers, params=querystring, timeout=30)

        if response.status_code != 200:
            print(f"Error fetching News data: {response.status_code}")
            return pd.DataFrame()

        data = response.json()
        articles = data.get('articles') or []

        # Extract relevant fields
        news_articles = []
        for article in articles:
            # Any of these may be null in the payload; '' keeps the join safe.
            title = article.get('title') or ''
            description = article.get('description') or ''
            content = article.get('content') or ''

            source_info = article.get('source') or {}
            if isinstance(source_info, dict):
                publisher = source_info.get('name')
            else:
                publisher = source_info

            news_articles.append({
                'title': title,
                'description': description,
                'content': content,
                'url': article.get('url'),
                # 'source' identifies the data channel (as in the other
                # fetchers); the outlet name is kept separately so it is not
                # overwritten.
                'source': 'news',
                'publisher': publisher,
                'published_at': article.get('publishedAt'),
                # Combine title, description and content for sentiment analysis
                'text': ' '.join(part for part in (title, description, content) if part)
            })

        # With no rows the frame has no 'published_at' column, so the datetime
        # conversion below would raise; report the empty result explicitly.
        if not news_articles:
            print(f"Retrieved 0 news articles for query: '{query}'")
            return pd.DataFrame()

        df = pd.DataFrame(news_articles)
        df['created_at'] = pd.to_datetime(df['published_at'])

        print(f"Retrieved {len(df)} news articles for query: '{query}'")
        return df

    except Exception as e:
        # Best-effort fetch: any failure is reported and yields an empty frame.
        print(f"Error in News API call: {str(e)}")
        return pd.DataFrame()


In [4]:

def clean_text(text):
    """Normalise raw text into lowercase words ready for sentiment analysis.

    Missing values yield an empty string. Otherwise the input is converted to
    a string and the substitutions below are applied in order.
    """
    if pd.isna(text):
        return ""

    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'http\S+|www\S+|https\S+', ''),  # URLs
        (r'@\w+', ''),                     # user mentions
        (r'#(\w+)', r'\1'),                # hashtags: keep the word, drop '#'
        (r'[^\w\s]', ''),                  # punctuation / special characters
        (r'\d+', ''),                      # digits
        (r'\s+', ' '),                     # collapse whitespace runs
    )

    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip().lower()

def remove_stopwords(text, additional_stopwords=None):
    """Drop English, finance-specific and caller-supplied stopwords from text.

    The text is tokenised with word_tokenize; the surviving tokens are joined
    back together with single spaces.
    """
    # Domain words that appear in nearly every post and carry no sentiment.
    finance_terms = {
        'stock', 'stocks', 'market', 'markets', 'price', 'prices',
        'trade', 'trading', 'investor', 'investors', 'investment',
        'investments', 'share', 'shares'
    }

    excluded = set(stopwords.words('english')) | finance_terms
    if additional_stopwords:
        excluded |= set(additional_stopwords)

    kept_tokens = (token for token in word_tokenize(text) if token not in excluded)
    return ' '.join(kept_tokens)

def lemmatize_text(text):
    """Tokenise text and replace every token with its WordNet lemma."""
    to_lemma = WordNetLemmatizer().lemmatize
    return ' '.join(to_lemma(token) for token in word_tokenize(text))

def preprocess_text(text, remove_stop=True, lemmatize=True):
    """Run the full cleaning pipeline on a single piece of text.

    Missing or empty input returns "". Otherwise the text is cleaned, then
    optionally stripped of stopwords and lemmatized, in that order.
    """
    if pd.isna(text) or not text:
        return ""

    result = clean_text(text)

    # Optional stages, each gated by its flag.
    optional_stages = (
        (remove_stop, remove_stopwords),
        (lemmatize, lemmatize_text),
    )
    for enabled, stage in optional_stages:
        if enabled:
            result = stage(result)

    return result

def preprocess_dataframe(df, text_column):
    """Add a 'clean_text' column holding the preprocessed text_column.

    The column is written onto the DataFrame that was passed in, and that
    same DataFrame is returned.
    """
    processed = df[text_column].apply(preprocess_text)
    df['clean_text'] = processed
    return df

In [5]:
def get_vader_sentiment(text):
    """
    Get sentiment scores using VADER (Valence Aware Dictionary and Sentiment Reasoner)
    VADER is particularly useful for social media text

    Args:
        text (str): Text to score.

    Returns:
        dict: Scores under the keys 'compound', 'pos', 'neu' and 'neg'.
        Missing or empty text yields all zeros.
    """
    if pd.isna(text) or not text:
        return {'compound': 0, 'pos': 0, 'neu': 0, 'neg': 0}

    # Build the analyzer once and reuse it: this function is called once per
    # row, and constructing a new analyzer each time repeats its setup work.
    analyzer = getattr(get_vader_sentiment, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        get_vader_sentiment._analyzer = analyzer

    return analyzer.polarity_scores(text)

def get_textblob_sentiment(text):
    """Score text with TextBlob, returning its polarity and subjectivity.

    Missing or empty text yields zeros for both values.
    """
    if pd.isna(text) or not text:
        return {'polarity': 0, 'subjectivity': 0}

    sentiment = TextBlob(text).sentiment
    return {
        'polarity': sentiment.polarity,
        'subjectivity': sentiment.subjectivity
    }

def classify_sentiment(compound_score):
    """Map a compound score to 'positive', 'negative' or 'neutral'.

    Scores of 0.05 and above are positive, -0.05 and below are negative, and
    anything else is neutral.
    """
    if compound_score >= 0.05:
        return 'positive'
    return 'negative' if compound_score <= -0.05 else 'neutral'

def analyze_sentiment(df, text_column='clean_text'):
    """
    Analyze sentiment of text in a DataFrame column using multiple methods

    Args:
        df (pandas.DataFrame): DataFrame containing text data
        text_column (str): Column containing preprocessed text

    Returns:
        pandas.DataFrame: Copy of df with the added columns vader_compound,
        vader_pos, vader_neu, vader_neg, textblob_polarity,
        textblob_subjectivity, sentiment_label and sentiment_score.
    """
    # Make a copy to avoid modifying the original
    result_df = df.copy()

    # A frame with no rows may not even have the text column; score nothing.
    texts = result_df[text_column].tolist() if len(result_df) > 0 else []

    # Score every text once, keeping results in row order.
    vader_scores = []
    textblob_scores = []
    for text in tqdm(texts, total=len(texts), desc="Analyzing sentiment"):
        vader_scores.append(get_vader_sentiment(text))
        textblob_scores.append(get_textblob_sentiment(text))

    def _as_column(scores, key):
        """Collect one score key across all rows as a float array."""
        return np.array([score[key] for score in scores], dtype=float)

    # Columns are assigned positionally (arrays/lists, not label lookups), so
    # results stay correct even when the index contains duplicate labels —
    # e.g. after concatenating several sources without resetting the index.
    result_df['vader_compound'] = _as_column(vader_scores, 'compound')
    result_df['vader_pos'] = _as_column(vader_scores, 'pos')
    result_df['vader_neu'] = _as_column(vader_scores, 'neu')
    result_df['vader_neg'] = _as_column(vader_scores, 'neg')
    result_df['textblob_polarity'] = _as_column(textblob_scores, 'polarity')
    result_df['textblob_subjectivity'] = _as_column(textblob_scores, 'subjectivity')

    # Classify sentiment based on VADER compound score
    result_df['sentiment_label'] = [
        classify_sentiment(score['compound']) for score in vader_scores
    ]

    # Combined score: plain average of the VADER compound score and the
    # TextBlob polarity.
    result_df['sentiment_score'] = (result_df['vader_compound'] + result_df['textblob_polarity']) / 2

    return result_df

def calculate_sentiment_stats(df):
    """Summarise label counts, label percentages and mean scores for df.

    Percentages are 0 when df has no rows.
    """
    total = len(df)
    labels = df['sentiment_label']

    positive = sum(labels == 'positive')
    neutral = sum(labels == 'neutral')
    negative = sum(labels == 'negative')

    def share(count):
        # Guard against dividing by zero on an empty frame.
        return (count / total) * 100 if total > 0 else 0

    return {
        'total_count': total,
        'positive_count': positive,
        'neutral_count': neutral,
        'negative_count': negative,
        'positive_percentage': share(positive),
        'neutral_percentage': share(neutral),
        'negative_percentage': share(negative),
        'average_compound': df['vader_compound'].mean(),
        'average_polarity': df['textblob_polarity'].mean(),
        'average_sentiment_score': df['sentiment_score'].mean()
    }

In [6]:
def plot_sentiment_distribution(df, title="Sentiment Distribution", figsize=(12, 6)):
    """Plot the distribution of sentiment labels

    Args:
        df (pandas.DataFrame): Data with a 'sentiment_label' column.
        title (str): Figure title.
        figsize (tuple): Figure size passed to matplotlib.

    Returns:
        module: matplotlib.pyplot, with the bar chart as the current figure.
    """
    # Colour is tied to the label itself, not to its position in the
    # frequency-sorted counts, so "negative" is red even when most common.
    label_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}

    plt.figure(figsize=figsize)
    ax = plt.gca()

    # Count the sentiment labels
    sentiment_counts = df['sentiment_label'].value_counts()

    if sentiment_counts.empty:
        # Nothing to draw; show a placeholder instead of failing.
        ax.text(0.5, 0.5, "No sentiment data to plot",
                ha='center', va='center', transform=ax.transAxes, fontsize=12)
    else:
        # Known labels first in a fixed order, then any unexpected labels.
        ordered_labels = [label for label in label_colors if label in sentiment_counts.index]
        ordered_labels += [label for label in sentiment_counts.index if label not in label_colors]
        sentiment_counts = sentiment_counts.reindex(ordered_labels)
        bar_colors = [label_colors.get(label, 'steelblue') for label in sentiment_counts.index]

        # Create a bar chart
        sentiment_counts.plot(kind='bar', color=bar_colors, ax=ax)

        # Add percentage labels on top of bars
        label_offset = sentiment_counts.max() * 0.02
        for i, count in enumerate(sentiment_counts):
            percentage = (count / len(df)) * 100
            ax.text(i, count + label_offset, f"{percentage:.1f}%",
                    ha='center', va='bottom', fontsize=12)

    plt.title(title, fontsize=16)
    plt.xlabel('Sentiment', fontsize=14)
    plt.ylabel('Count', fontsize=14)
    plt.xticks(rotation=0, fontsize=12)
    plt.yticks(fontsize=12)
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()

    return plt

def plot_sentiment_timeline(df, date_column='created_at', title="Sentiment Over Time"):
    """Plot the daily share of each sentiment label as a line chart.

    Note that df is modified in place: date_column is converted to datetime
    and a 'date' column is added. Returns matplotlib.pyplot.
    """
    # (label, line format, legend text) for each series that may be drawn.
    series_specs = (
        ('positive', 'g-', 'Positive'),
        ('neutral', 'b-', 'Neutral'),
        ('negative', 'r-', 'Negative'),
    )

    # Normalise timestamps and derive the calendar day used for grouping.
    df[date_column] = pd.to_datetime(df[date_column])
    df['date'] = df[date_column].dt.date

    # One row per day, one column per label, missing combinations as 0.
    daily = df.groupby(['date', 'sentiment_label']).size().unstack().fillna(0)
    daily['total'] = daily.sum(axis=1)

    # Percentage columns only for labels that actually occur.
    for label, _, _ in series_specs:
        if label in daily.columns:
            daily[f'{label}_pct'] = (daily[label] / daily['total']) * 100

    plt.figure(figsize=(14, 8))

    for label, line_format, legend_text in series_specs:
        pct_column = f'{label}_pct'
        if pct_column in daily.columns:
            plt.plot(daily.index, daily[pct_column], line_format, label=legend_text)

    plt.title(title, fontsize=16)
    plt.xlabel('Date', fontsize=14)
    plt.ylabel('Percentage (%)', fontsize=14)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.xticks(rotation=45)
    plt.tight_layout()

    return plt

def generate_wordcloud(df, column='clean_text', title="Word Cloud", figsize=(12, 8)):
    """Generate a word cloud from text data

    Args:
        df (pandas.DataFrame): Data containing the text column.
        column (str): Column whose text is combined into the cloud.
        title (str): Figure title.
        figsize (tuple): Figure size passed to matplotlib.

    Returns:
        module or None: matplotlib.pyplot with the word cloud as the current
        figure, or None when there are no words to draw.
    """
    # Combine all text into a single string (non-string values are coerced).
    text = ' '.join(df[column].dropna().astype(str))

    # An empty selection (e.g. no posts with a given sentiment) has nothing
    # to draw; skip instead of letting WordCloud raise.
    if not text.strip():
        print(f"No text available for '{title}', skipping word cloud.")
        return None

    # Generate the word cloud
    try:
        wordcloud = WordCloud(
            width=800,
            height=500,
            background_color='white',
            max_words=150,
            colormap='viridis',
            collocations=False
        ).generate(text)
    except ValueError as e:
        # Raised when no usable words remain after WordCloud's own filtering.
        print(f"Could not build word cloud for '{title}': {e}")
        return None

    # Plot the word cloud
    plt.figure(figsize=figsize)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16)
    plt.tight_layout()

    return plt

def generate_sentiment_wordcloud(df, sentiment_type='positive', figsize=(12, 8)):
    """Build a word cloud from the rows labelled with sentiment_type.

    Delegates to generate_wordcloud on the 'clean_text' column and returns
    whatever that call returns.
    """
    matching_rows = df[df['sentiment_label'] == sentiment_type]

    return generate_wordcloud(
        matching_rows,
        column='clean_text',
        title=f"Most Common Words in {sentiment_type.capitalize()} Posts",
        figsize=figsize,
    )

def plot_source_sentiment(df, title="Sentiment Distribution by Source", figsize=(14, 8)):
    """Plot sentiment distribution across different data sources

    Args:
        df (pandas.DataFrame): Data with 'source' and 'sentiment_label' columns.
        title (str): Figure title.
        figsize (tuple): Figure size passed to matplotlib.

    Returns:
        module or None: matplotlib.pyplot with the stacked bar chart as the
        current figure, or None when only one source is present.
    """
    # Check if there's more than one source
    if len(df['source'].unique()) <= 1:
        print("Only one source present in data, skipping source comparison plot.")
        return None

    # Fixed column order with a colour per label, so colours stay correct
    # even when one of the sentiments does not occur in the data.
    label_colors = {'negative': 'red', 'neutral': 'gray', 'positive': 'green'}

    # Group by source and sentiment
    source_sentiment = (
        df.groupby(['source', 'sentiment_label'])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=list(label_colors), fill_value=0)
    )

    # Calculate percentages
    source_sentiment_pct = (
        source_sentiment.div(source_sentiment.sum(axis=1), axis=0) * 100
    ).fillna(0)

    # Plot on an explicit axes: DataFrame.plot would otherwise open its own
    # figure, leaving a blank one behind and ignoring figsize.
    fig, ax = plt.subplots(figsize=figsize)
    source_sentiment_pct.plot(kind='bar', stacked=True,
                              color=[label_colors[label] for label in source_sentiment_pct.columns],
                              ax=ax)

    # Add percentage labels
    for i, source in enumerate(source_sentiment_pct.index):
        previous_height = 0
        for sentiment in source_sentiment_pct.columns:
            height = source_sentiment_pct.loc[source, sentiment]
            if height > 5:  # Only show percentage if it's large enough
                ax.text(i, previous_height + (height/2), f"{height:.1f}%",
                        ha='center', va='center', fontsize=10, color='black')
            previous_height += height

    plt.title(title, fontsize=16)
    plt.xlabel('Source', fontsize=14)
    plt.ylabel('Percentage (%)', fontsize=14)
    plt.xticks(rotation=0, fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend(title='Sentiment')
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()

    return plt

def plot_top_keywords(df, column='clean_text', n=20, figsize=(12, 8)):
    """Chart the n highest-scoring TF-IDF terms found in a text column.

    Returns a tuple of (matplotlib.pyplot, DataFrame of keywords and their
    summed TF-IDF scores, sorted from highest to lowest).
    """
    # Vectorise the non-missing documents, keeping at most n terms.
    vectorizer = TfidfVectorizer(max_features=n, stop_words='english')
    weights = vectorizer.fit_transform(df[column].dropna())

    # One row per term with its score summed over all documents.
    keywords_df = pd.DataFrame({
        'keyword': vectorizer.get_feature_names_out(),
        'tfidf_score': weights.sum(axis=0).A1
    }).sort_values('tfidf_score', ascending=False)

    # Horizontal bar chart of the ranked terms.
    plt.figure(figsize=figsize)
    sns.barplot(x='tfidf_score', y='keyword', data=keywords_df, palette='viridis')
    plt.title(f"Top {n} Keywords by TF-IDF Score", fontsize=16)
    plt.xlabel('TF-IDF Score', fontsize=14)
    plt.ylabel('Keyword', fontsize=14)
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()

    return plt, keywords_df

def plot_sentiment_comparison(df1, df2, label1, label2, title="Sentiment Comparison", figsize=(12, 6)):
    """Draw grouped bars comparing the sentiment mix of two datasets.

    Each dataset contributes one bar per sentiment, showing the percentage of
    its rows with that label (0 for an empty dataset). Returns
    matplotlib.pyplot.
    """
    sentiment_keys = ('positive', 'neutral', 'negative')

    def percentages(frame):
        """Percentage of rows per sentiment, in sentiment_keys order."""
        size = len(frame)
        if size > 0:
            return [(sum(frame['sentiment_label'] == key) / size) * 100
                    for key in sentiment_keys]
        return [0 for _ in sentiment_keys]

    # Create a DataFrame for plotting
    comparison_df = pd.DataFrame({
        'Sentiment': ['Positive', 'Neutral', 'Negative'],
        label1: percentages(df1),
        label2: percentages(df2)
    })

    plt.figure(figsize=figsize)

    positions = np.arange(len(comparison_df['Sentiment']))
    width = 0.35

    # (column, horizontal shift, bar colour) for the two datasets.
    bar_specs = (
        (label1, -width/2, 'cornflowerblue'),
        (label2, width/2, 'lightcoral'),
    )

    for column, shift, bar_color in bar_specs:
        plt.bar(positions + shift, comparison_df[column], width, label=column, color=bar_color)

    # Percentage text just above each bar.
    for column, shift, _ in bar_specs:
        for i, v in enumerate(comparison_df[column]):
            plt.text(i + shift, v + 1, f"{v:.1f}%", ha='center', va='bottom')

    plt.title(title, fontsize=16)
    plt.xlabel('Sentiment', fontsize=14)
    plt.ylabel('Percentage (%)', fontsize=14)
    plt.xticks(positions, comparison_df['Sentiment'])
    plt.legend()
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()

    return plt

In [7]:
def detect_sentiment_shift(current_df, historical_df, threshold=15):
    """
    Detect significant shifts in sentiment compared to historical data

    Args:
        current_df (pandas.DataFrame): Current sentiment data
        historical_df (pandas.DataFrame): Historical sentiment data
        threshold (float): Change in percentage share (current minus
            historical) required to trigger an alert

    Returns:
        dict: Alert information ('type', 'change', 'message') if threshold
        exceeded, or None. The positive share is checked first; the negative
        share is only reported when the positive one stays below threshold.
        None is also returned when either dataset is empty.
    """
    # Without data on both sides there is no baseline to compare against;
    # comparing with an implicit 0% would produce a spurious alert.
    if len(current_df) == 0 or len(historical_df) == 0:
        return None

    current_stats = calculate_sentiment_stats(current_df)
    historical_stats = calculate_sentiment_stats(historical_df)

    # Check the positive share first, then the negative share.
    for polarity, display_name in (('positive', 'Positive'), ('negative', 'Negative')):
        key = f'{polarity}_percentage'
        change = current_stats[key] - historical_stats[key]

        if abs(change) >= threshold:
            direction = "increase" if change > 0 else "decrease"
            return {
                "type": f"{polarity}_sentiment_{direction}",
                "change": abs(change),
                "message": f"{display_name} sentiment has {direction}d by {abs(change):.2f}% "
                           f"(Current: {current_stats[key]:.2f}%, "
                           f"Historical: {historical_stats[key]:.2f}%)"
            }

    return None

def _top_tfidf_keywords(texts, top_n=5, max_features=10):
    """Return up to top_n terms with the highest summed TF-IDF score, or []."""
    documents = texts.dropna()
    if documents.empty:
        return []

    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    try:
        matrix = vectorizer.fit_transform(documents)
    except ValueError:
        # Raised when no vocabulary can be built, e.g. every document is
        # empty or contains only stop words.
        return []

    terms = vectorizer.get_feature_names_out()
    scores = np.asarray(matrix.sum(axis=0)).ravel()
    return [terms[i] for i in scores.argsort()[::-1][:top_n]]


def generate_sentiment_report(df, topic, time_period="last 7 days"):
    """
    Generate a summary report of sentiment analysis

    Args:
        df (pandas.DataFrame): DataFrame with sentiment analysis results;
            reads the columns sentiment_label, clean_text, source, created_at
            (datetime) and the score columns used by calculate_sentiment_stats
        topic (str): Topic or stock being analyzed
        time_period (str): Time period of the data

    Returns:
        str: Markdown-formatted report
    """
    # Calculate stats
    stats = calculate_sentiment_stats(df)

    # Keywords are only extracted when a sentiment has at least 5 items.
    positive_df = df[df['sentiment_label'] == 'positive']
    negative_df = df[df['sentiment_label'] == 'negative']

    top_positive_keywords = (
        _top_tfidf_keywords(positive_df['clean_text']) if len(positive_df) >= 5 else []
    )
    top_negative_keywords = (
        _top_tfidf_keywords(negative_df['clean_text']) if len(negative_df) >= 5 else []
    )

    # Missing sources/timestamps are ignored so they cannot break formatting.
    source_names = [str(source) for source in df['source'].dropna().unique()]
    sources = ', '.join(source_names)

    timestamps = df['created_at'].dropna()
    if timestamps.empty:
        time_range = "N/A"
    else:
        time_range = f"{timestamps.min().date()} to {timestamps.max().date()}"

    # Generate report
    report = f"""# Sentiment Analysis Report: {topic}
## Overview ({time_period})

- **Total Items Analyzed**: {stats['total_count']}
- **Data Sources**: {sources}
- **Time Range**: {time_range}

## Sentiment Distribution

- **Positive**: {stats['positive_count']} ({stats['positive_percentage']:.2f}%)
- **Neutral**: {stats['neutral_count']} ({stats['neutral_percentage']:.2f}%)
- **Negative**: {stats['negative_count']} ({stats['negative_percentage']:.2f}%)

## Average Sentiment Scores

- **VADER Compound Score**: {stats['average_compound']:.4f} (Range: -1 to +1)
- **TextBlob Polarity**: {stats['average_polarity']:.4f} (Range: -1 to +1)
- **Combined Sentiment Score**: {stats['average_sentiment_score']:.4f}

## Key Insights

"""

    # Add overall sentiment assessment
    if stats['average_sentiment_score'] >= 0.25:
        report += "- **Overall Sentiment**: Strongly Positive ✅✅\n"
    elif stats['average_sentiment_score'] >= 0.05:
        report += "- **Overall Sentiment**: Positive ✅\n"
    elif stats['average_sentiment_score'] <= -0.25:
        report += "- **Overall Sentiment**: Strongly Negative ❌❌\n"
    elif stats['average_sentiment_score'] <= -0.05:
        report += "- **Overall Sentiment**: Negative ❌\n"
    else:
        report += "- **Overall Sentiment**: Neutral ⚖️\n"

    # Add top keywords
    if top_positive_keywords:
        report += f"- **Top Positive Keywords**: {', '.join(top_positive_keywords)}\n"

    if top_negative_keywords:
        report += f"- **Top Negative Keywords**: {', '.join(top_negative_keywords)}\n"

    # Add source breakdown if multiple sources
    if len(source_names) > 1:
        report += "\n## Sentiment by Source\n\n"
        for source in df['source'].dropna().unique():
            source_df = df[df['source'] == source]
            source_stats = calculate_sentiment_stats(source_df)
            report += f"- **{str(source).capitalize()}**: "
            report += f"Positive: {source_stats['positive_percentage']:.2f}%, "
            report += f"Neutral: {source_stats['neutral_percentage']:.2f}%, "
            report += f"Negative: {source_stats['negative_percentage']:.2f}%\n"

    # Add sentiment over time insight if enough data points
    dates = df['created_at'].dt.date
    if not timestamps.empty and dates.dropna().nunique() >= 3:
        latest_date = timestamps.max().date()
        earliest_date = timestamps.min().date()

        # Compare earliest vs latest day
        early_df = df[dates == earliest_date]
        late_df = df[dates == latest_date]

        if len(early_df) > 0 and len(late_df) > 0:
            early_stats = calculate_sentiment_stats(early_df)
            late_stats = calculate_sentiment_stats(late_df)

            positive_change = late_stats['positive_percentage'] - early_stats['positive_percentage']

            # The section header is written only when there is a trend to
            # report, so the report never contains an empty section.
            if abs(positive_change) >= 10:
                direction = "increased" if positive_change > 0 else "decreased"
                report += "\n## Sentiment Trends\n\n"
                report += f"- Positive sentiment has **{direction}** by {abs(positive_change):.2f}% "
                report += f"from {earliest_date} to {latest_date}.\n"

    return report

def save_sentiment_data(df, filename="sentiment_data", format="csv"):
    """Save sentiment analysis results to a file

    Args:
        df (pandas.DataFrame): Data to save.
        filename (str): Base name; a timestamp and extension are appended.
        format (str): One of "csv", "json" or "excel" (case-insensitive).

    Returns:
        str: Path of the written file inside OUTPUT_DIR.

    Raises:
        ValueError: If format is not one of the supported values.
    """
    # Map each supported format to its real file extension. "excel" must be
    # written as .xlsx: pandas picks the Excel writer engine from the file
    # extension and has none for ".excel".
    extensions = {"csv": "csv", "json": "json", "excel": "xlsx"}

    # Validate before building any path so a bad format fails fast.
    normalized = format.lower()
    if normalized not in extensions:
        raise ValueError(f"Unsupported format: {format}")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_path = os.path.join(OUTPUT_DIR, f"{filename}_{timestamp}.{extensions[normalized]}")

    if normalized == "csv":
        df.to_csv(file_path, index=False)
    elif normalized == "json":
        df.to_json(file_path, orient="records", date_format="iso")
    else:
        df.to_excel(file_path, index=False)

    print(f"Saved sentiment data to {file_path}")
    return file_path

In [8]:
def analyze_stock_sentiment(stock_code, stock_name=None, days=7):
    """
    Analyze sentiment for a specific stock across multiple data sources

    Twitter, Reddit and news items are fetched, merged into one frame with a
    common ``text_to_analyze`` column, preprocessed, scored, and finally
    restricted to the requested look-back window.

    Args:
        stock_code (str): Stock code (e.g., "SCOM" for Safaricom)
        stock_name (str, optional): Stock name for better search results.
            Falls back to ``stock_code`` when omitted
        days (int): Number of days to look back. Selects the Reddit time
            filter ("week" up to 7 days, otherwise "month") and the
            ``created_at`` cutoff; a value <= 0 disables the cutoff

    Returns:
        pandas.DataFrame: Combined sentiment analysis results with
        ``stock_code``, ``stock_name`` and ``analysis_date`` columns added,
        or an empty DataFrame when no source returned any data
    """
    if stock_name is None:
        stock_name = stock_code

    print(f"Analyzing sentiment for {stock_name} ({stock_code}) over the past {days} days...")

    # Build search queries
    twitter_query = f"${stock_code} OR #{stock_code} OR {stock_name} stock"
    reddit_subreddits = ["investing", "stocks", "StockMarket"]
    news_query = f"{stock_name} stock OR {stock_code}"

    # Collect data from different sources
    twitter_data = get_twitter_data(twitter_query, count=100)

    # Collect Reddit data from multiple subreddits
    reddit_time_filter = "week" if days <= 7 else "month"
    reddit_dfs = []
    for subreddit in reddit_subreddits:
        subreddit_df = get_reddit_data(subreddit, time_filter=reddit_time_filter)
        if subreddit_df.empty:
            continue

        # Keep only posts mentioning the stock code or name. The terms are
        # matched literally (regex=False) so codes/names containing regex
        # metacharacters such as "." or "(" behave as plain text, and posts
        # with missing text count as non-matches (na=False).
        full_text = subreddit_df['full_text']
        mentions_stock = (
            full_text.str.contains(stock_code, case=False, regex=False, na=False)
            | full_text.str.contains(stock_name, case=False, regex=False, na=False)
        )
        reddit_dfs.append(subreddit_df[mentions_stock])

    reddit_data = pd.concat(reddit_dfs, ignore_index=True) if reddit_dfs else pd.DataFrame()

    news_data = get_financial_news(news_query, limit=20)

    # Combine data from all sources, pointing each one at its text column
    dfs = []
    if not twitter_data.empty:
        twitter_data['text_to_analyze'] = twitter_data['text']
        dfs.append(twitter_data)

    if not reddit_data.empty:
        reddit_data['text_to_analyze'] = reddit_data['full_text']
        dfs.append(reddit_data)

    if not news_data.empty:
        news_data['text_to_analyze'] = news_data['text']
        dfs.append(news_data)

    # If no data was collected, return empty DataFrame
    if not dfs:
        print(f"No data found for {stock_name} ({stock_code})")
        return pd.DataFrame()

    # Combine all data
    combined_df = pd.concat(dfs, ignore_index=True)

    # Preprocess text
    combined_df = preprocess_dataframe(combined_df, 'text_to_analyze')

    # Analyze sentiment
    sentiment_df = analyze_sentiment(combined_df)

    # Filter by date if requested
    if days > 0:
        # NOTE(review): datetime.now() is timezone-naive, so this comparison
        # presumably expects naive 'created_at' values - confirm against the fetchers.
        cutoff_date = datetime.now() - timedelta(days=days)
        # Copy the slice so the metadata columns below are written to an
        # independent frame rather than to a view of the unfiltered data.
        sentiment_df = sentiment_df[sentiment_df['created_at'] >= cutoff_date].copy()

    # Add metadata
    sentiment_df['stock_code'] = stock_code
    sentiment_df['stock_name'] = stock_name
    sentiment_df['analysis_date'] = datetime.now()

    print(f"Sentiment analysis complete for {stock_name} ({stock_code})")
    print(f"Total items analyzed: {len(sentiment_df)}")

    return sentiment_df

def analyze_multiple_stocks(stock_list, days=7):
    """
    Analyze sentiment for multiple stocks

    Args:
        stock_list (list): List of dictionaries with a 'code' key and an
            optional 'name' key; the code is used as the name when 'name'
            is missing or empty
        days (int): Number of days to look back, forwarded to
            analyze_stock_sentiment for every stock

    Returns:
        dict: Dictionary of DataFrames with sentiment analysis results for each stock,
        keyed by stock code. Stocks whose analysis came back empty are omitted
    """
    results = {}

    for stock in stock_list:
        stock_code = stock['code']
        # Tolerate entries without a name instead of failing with KeyError
        stock_name = stock.get('name') or stock_code

        print(f"\nAnalyzing {stock_name} ({stock_code})...")

        # Analyze sentiment for this stock over the requested window
        sentiment_df = analyze_stock_sentiment(stock_code, stock_name, days=days)

        # Store results if we got data
        if not sentiment_df.empty:
            results[stock_code] = sentiment_df

    return results

def compare_stock_sentiment(stock1, stock2):
    """
    Compare sentiment between two stocks

    Both stocks are analyzed with analyze_stock_sentiment. When both analyses
    return data, a comparison plot is saved to OUTPUT_DIR as
    ``comparison_<code1>_<code2>.png`` and the average sentiment scores are
    printed together with which stock (if either) scores higher.

    Args:
        stock1 (dict): Dictionary with 'code' and 'name' for first stock
        stock2 (dict): Dictionary with 'code' and 'name' for second stock

    Returns:
        tuple: DataFrames with sentiment analysis results for both stocks
    """
    # Analyze sentiment for both stocks
    stock1_df = analyze_stock_sentiment(stock1['code'], stock1['name'])
    stock2_df = analyze_stock_sentiment(stock2['code'], stock2['name'])

    # Nothing to compare unless both analyses produced data
    if stock1_df.empty or stock2_df.empty:
        return stock1_df, stock2_df

    # Generate comparison visualization. The returned handle is kept under its
    # own name so the module-level matplotlib alias is not shadowed.
    comparison_plot = plot_sentiment_comparison(
        stock1_df, stock2_df,
        stock1['name'], stock2['name'],
        title=f"Sentiment Comparison: {stock1['name']} vs {stock2['name']}"
    )
    comparison_plot.savefig(os.path.join(OUTPUT_DIR, f"comparison_{stock1['code']}_{stock2['code']}.png"))
    comparison_plot.close()

    # Generate comparison report
    stock1_score = calculate_sentiment_stats(stock1_df)['average_sentiment_score']
    stock2_score = calculate_sentiment_stats(stock2_df)['average_sentiment_score']

    print("\nComparison Results:")
    print(f"{stock1['name']} Sentiment Score: {stock1_score:.4f}")
    print(f"{stock2['name']} Sentiment Score: {stock2_score:.4f}")

    if stock1_score > stock2_score:
        print(f"{stock1['name']} has more positive sentiment")
    elif stock2_score > stock1_score:
        print(f"{stock2['name']} has more positive sentiment")
    else:
        # Equal (or non-comparable, e.g. NaN) scores: do not declare a winner
        print("Neither stock has more positive sentiment")

    return stock1_df, stock2_df

def analyze_market_sentiment(market_name="NSE", days=7):
    """
    Analyze overall market sentiment

    Twitter, Reddit and news items about the market are fetched, merged into
    one frame with a common ``text_to_analyze`` column, preprocessed, scored,
    and finally restricted to the requested look-back window.

    Args:
        market_name (str): Market name (e.g., "NSE" for Nairobi Stock Exchange)
        days (int): Number of days to look back. Selects the Reddit time
            filter ("week" up to 7 days, otherwise "month") and the
            ``created_at`` cutoff; a value <= 0 disables the cutoff

    Returns:
        pandas.DataFrame: Market sentiment analysis results with ``market``
        and ``analysis_date`` columns added, or an empty DataFrame when no
        source returned any data
    """
    print(f"Analyzing overall {market_name} market sentiment...")

    # Build search queries
    twitter_query = f"{market_name} stock market OR {market_name} index"
    news_query = f"{market_name} stock market"

    # Collect data
    twitter_data = get_twitter_data(twitter_query, count=100)

    reddit_time_filter = "week" if days <= 7 else "month"
    reddit_dfs = []
    for subreddit in ["investing", "stocks", "finance"]:
        subreddit_df = get_reddit_data(subreddit, time_filter=reddit_time_filter)
        if subreddit_df.empty:
            continue

        # Literal, NaN-safe match: posts with missing text would otherwise
        # leave NaN in the mask (which boolean indexing rejects), and regex
        # metacharacters in the market name would be interpreted as a pattern.
        # NOTE(review): this is a substring match, so a short name such as
        # "NSE" also matches words like "sense" - confirm that is acceptable.
        mentions_market = subreddit_df['full_text'].str.contains(
            market_name, case=False, regex=False, na=False
        )
        reddit_dfs.append(subreddit_df[mentions_market])

    reddit_data = pd.concat(reddit_dfs, ignore_index=True) if reddit_dfs else pd.DataFrame()

    news_data = get_financial_news(news_query, limit=30)

    # Combine data, pointing each source at its text column
    dfs = []
    if not twitter_data.empty:
        twitter_data['text_to_analyze'] = twitter_data['text']
        dfs.append(twitter_data)

    if not reddit_data.empty:
        reddit_data['text_to_analyze'] = reddit_data['full_text']
        dfs.append(reddit_data)

    if not news_data.empty:
        news_data['text_to_analyze'] = news_data['text']
        dfs.append(news_data)

    if not dfs:
        print(f"No data found for {market_name} market")
        return pd.DataFrame()

    # Combine all data
    combined_df = pd.concat(dfs, ignore_index=True)

    # Preprocess text
    combined_df = preprocess_dataframe(combined_df, 'text_to_analyze')

    # Analyze sentiment
    sentiment_df = analyze_sentiment(combined_df)

    # Filter by date if requested
    if days > 0:
        # NOTE(review): datetime.now() is timezone-naive, so this comparison
        # presumably expects naive 'created_at' values - confirm against the fetchers.
        cutoff_date = datetime.now() - timedelta(days=days)
        # Copy the slice so the metadata columns below are written to an
        # independent frame rather than to a view of the unfiltered data.
        sentiment_df = sentiment_df[sentiment_df['created_at'] >= cutoff_date].copy()

    # Add metadata
    sentiment_df['market'] = market_name
    sentiment_df['analysis_date'] = datetime.now()

    print(f"Market sentiment analysis complete for {market_name}")
    print(f"Total items analyzed: {len(sentiment_df)}")

    return sentiment_df

def _write_sentiment_report(report_text, file_prefix):
    """Write a Markdown report to OUTPUT_DIR as UTF-8 and return its path"""
    report_path = os.path.join(OUTPUT_DIR, f"{file_prefix}_{datetime.now().strftime('%Y%m%d')}.md")
    # Explicit UTF-8: the platform default encoding (e.g. a Windows code page)
    # may be unable to encode non-ASCII characters in the report text.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report_text)
    return report_path


def process_nse_data(stock_codes=None):
    """
    Process Nairobi Stock Exchange data and analyze sentiment for key stocks

    Runs the market-level analysis first, then the per-stock analyses. For
    every non-empty result the sentiment data and a Markdown report are
    written to OUTPUT_DIR; stocks additionally get distribution, timeline
    (only when more than one distinct date is present) and word-cloud images.

    Args:
        stock_codes (list, optional): List of dictionaries with 'code' and
            'name' keys describing the stocks to analyze
            If None, analyzes a default set of top NSE stocks

    Returns:
        dict: Dictionary of sentiment analysis results with the market frame
        under "market" and the per-stock frames (keyed by code) under "stocks"
    """
    # Default list of key NSE stocks if none provided
    if stock_codes is None:
        stock_codes = [
            {"code": "SCOM", "name": "Safaricom"},
            {"code": "EQTY", "name": "Equity Group"},
            {"code": "KCB", "name": "KCB Group"},
            {"code": "COOP", "name": "Co-operative Bank"},
            {"code": "EABL", "name": "East African Breweries"}
        ]

    # Analyze market sentiment first
    market_df = analyze_market_sentiment("NSE")

    # Save market sentiment results and report
    if not market_df.empty:
        save_sentiment_data(market_df, filename="nse_market_sentiment")

        market_report = generate_sentiment_report(market_df, "NSE Market")
        report_path = _write_sentiment_report(market_report, "nse_market_report")

        print(f"Saved market report to {report_path}")

    # Analyze individual stocks
    stock_results = analyze_multiple_stocks(stock_codes)

    # Save results and generate visualizations for each stock
    for stock_code, sentiment_df in stock_results.items():
        if sentiment_df.empty:
            continue

        # Save sentiment data
        save_sentiment_data(sentiment_df, filename=f"{stock_code}_sentiment")

        stock_name = sentiment_df['stock_name'].iloc[0]

        # Sentiment distribution. Plot handles get their own local name so
        # the module-level matplotlib alias is not shadowed.
        plot_handle = plot_sentiment_distribution(sentiment_df, title=f"Sentiment Distribution for {stock_name}")
        plot_handle.savefig(os.path.join(OUTPUT_DIR, f"{stock_code}_sentiment_distribution.png"))
        plot_handle.close()

        # Sentiment timeline if enough data
        if len(sentiment_df['created_at'].dt.date.unique()) > 1:
            plot_handle = plot_sentiment_timeline(sentiment_df, title=f"Sentiment Trend for {stock_name}")
            plot_handle.savefig(os.path.join(OUTPUT_DIR, f"{stock_code}_sentiment_timeline.png"))
            plot_handle.close()

        # Word cloud
        plot_handle = generate_wordcloud(sentiment_df, title=f"Word Cloud for {stock_name}")
        plot_handle.savefig(os.path.join(OUTPUT_DIR, f"{stock_code}_wordcloud.png"))
        plot_handle.close()

        # Generate and save report
        stock_report = generate_sentiment_report(sentiment_df, f"{stock_name} ({stock_code})")
        report_path = _write_sentiment_report(stock_report, f"{stock_code}_report")

        print(f"Saved stock report for {stock_name} to {report_path}")

    return {"market": market_df, "stocks": stock_results}