# Stock Market News Analysis
#### This notebook analyzes stock market news sentiment and correlates it with stock price movements.


# Configuration


In [None]:
# Central analysis settings. Later cells read their inputs from this mapping
# and add derived output paths to it.
CONFIG = dict(
    # Ticker symbol of the stock to analyze
    stock_symbol="NVDA",
    # Calendar year used for the yearly price analysis
    analysis_year=2019,
    # Date used for the single-day company/market lookup (format: "YYYY-MM-DD")
    company_details_date="2025-07-07",
    # Input CSV with the raw news data
    news_data_path="news_data.csv",
    # Output CSV for the news data after processing
    output_news_data_path="news_data_updated.csv",
)

# Import Package And Dependencies


In [2]:
# !pip install pip
# !pip3 install torch torchvision
# !pip install requests
# !pip install --upgrade yfinance
# !pip install pandas
# !pip install nltk
# !pip install spacy
# !pip install matplotlib
# !pip install wordcloud
# !pip install torch
# !pip install transformers
# !pip install mplfinance
# !pip install ipywidgets


In [3]:
# Standard library imports
import os
import re
import json
import traceback
import sys
from typing import Any, Optional, List, Union, Dict

# Data processing imports
import pandas as pd
import numpy as np

# NLP imports
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Visualization imports
import matplotlib.pyplot as plt
import mplfinance as mpf

# Financial data imports
import yfinance as yf


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# --- Create a directory for the output files ---
# The directory is named after the configured ticker. exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
output_dir = CONFIG["stock_symbol"]
os.makedirs(output_dir, exist_ok=True)

# --- Updated output file names to be inside the new directory ---
# These paths are fully resolved here (no format placeholders remain), so they
# always refer to the ticker and year configured at the time this cell runs.
CONFIG.update({
    "wordcloud_output": os.path.join(output_dir, "wordcloud_tickers.png"),
    "sentiment_trends_output": os.path.join(output_dir, "sentiment_trends_grouped.png"),
    "company_sentiment_output": os.path.join(output_dir, f"{CONFIG['stock_symbol']}_sentiment_analysis.png"),
    "stock_candlestick_output": os.path.join(output_dir, f"{CONFIG['stock_symbol']}_{CONFIG['analysis_year']}_candlestick.png"),
    "stock_closing_price_output": os.path.join(output_dir, f"{CONFIG['stock_symbol']}_{CONFIG['analysis_year']}_closing_price.png"),
    "sentiment_with_stock_price_output": os.path.join(output_dir, f"{CONFIG['stock_symbol']}_{CONFIG['analysis_year']}_sentiment_with_stock_price.png")
})


NameError: name 'CONFIG' is not defined

In [None]:
# Lift pandas' display limits so printed frames are neither truncated nor wrapped.
for _option_name, _option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', None),
    ('display.expand_frame_repr', False),
):
    pd.set_option(_option_name, _option_value)


# Load DATASET into Pandas Dataframe


In [None]:
# Dataset source: https://www.kaggle.com/datasets/miguelaenlle/massive-stock-news-analysis-db-for-nlpbacktests/

In [None]:
def save_news_dataframe(df: pd.DataFrame, file_path: str = CONFIG["output_news_data_path"]) -> bool:
    """
    Write the news DataFrame to a CSV file, omitting the index column.

    Args:
        df (pd.DataFrame): The DataFrame to persist.
        file_path (str): Destination CSV path. Defaults to the configured
            output path.

    Returns:
        bool: True when the file was written, False when writing failed
        (the error is printed rather than raised).
    """
    try:
        df.to_csv(file_path, index=False)
    except Exception as e:
        print(f"Error: Failed to save news DataFrame: {e}")
        return False

    print(f"News DataFrame saved to {file_path}")
    return True


In [None]:
def load_and_explore_data(file_path: str = CONFIG["news_data_path"]) -> Optional[pd.DataFrame]:
    """
    Load the news CSV, print a quick overview, and return a cleaned frame.

    The returned frame keeps only the 'title', 'date' and 'stock' columns,
    with duplicate rows and rows containing missing values removed, plus two
    derived columns: 'Date' (timezone-aware UTC datetime parsed from 'date')
    and 'Year' (the year of 'Date').

    Args:
        file_path (str): Path to the news CSV. Defaults to the configured
            input path.

    Returns:
        Optional[pd.DataFrame]: The cleaned DataFrame, or None when loading or
        preprocessing failed (the reason is printed rather than raised).
    """
    try:
        # Load the dataset
        df = pd.read_csv(file_path)
        print("Data loaded successfully.")

        # Display the first 5 rows
        print("\n--- First 5 rows ---")
        print(df.head())

        # Display the last 5 rows
        print("\n--- Last 5 rows ---")
        print(df.tail())

        # Display DataFrame info. info() writes its report to stdout and
        # returns None, so it must not be wrapped in print().
        print("\n\n--- DataFrame Info ---")
        df.info()

        # Keep only the columns the analysis uses; drop duplicates and gaps
        df = df[['title', 'date', 'stock']].drop_duplicates().dropna()

        # format='mixed' parses each value independently; utc=True normalizes
        # differing UTC offsets into one timezone-aware column
        df['Date'] = pd.to_datetime(df['date'], format='mixed', utc=True)
        df['Year'] = df['Date'].dt.year

        return df
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except pd.errors.EmptyDataError:
        print(f"Error: File at {file_path} is empty")
        return None
    except pd.errors.ParserError as e:
        print(f"Error: Failed to parse file at {file_path}: {e}")
        return None
    except KeyError as e:
        # Raised when one of the required columns is absent from the CSV
        print(f"Error: Required column missing in {file_path}: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None


In [None]:
# Load the dataset
news_df = load_and_explore_data()

# Stop here when loading failed. Raising halts "Run All" at this cell with a
# clear message; exit() would instead ask the notebook kernel to shut down.
if news_df is None:
    raise RuntimeError("News data could not be loaded; see the error printed above.")


In [None]:
# Sanity check: list the tickers present in the data and confirm that the
# configured symbol is among them before filtering on it.
print("\n--- Checking available stock tickers ---")
unique_tickers = news_df['stock'].unique()
print("Available tickers in the DataFrame are:")
print(unique_tickers)

# Compare case-insensitively and ignore surrounding whitespace, so the check
# reflects the configured symbol rather than a hard-coded ticker.
target_symbol_lower = CONFIG["stock_symbol"].strip().lower()
is_symbol_present_lower = target_symbol_lower in {str(t).strip().lower() for t in unique_tickers}
# Old variable name kept as an alias in case other cells still reference it
is_nvda_present_lower = is_symbol_present_lower
print(f"\nIs '{target_symbol_lower}' (lowercase) present? {is_symbol_present_lower}")
print("\n\n--- Stock to Filter ---")
print(CONFIG["stock_symbol"])

# Normalize whitespace in the ticker column, then keep only the configured stock
news_df['stock'] = news_df['stock'].str.strip()
news_df_filtered = news_df[news_df['stock'] == CONFIG["stock_symbol"]]

# Display the first 5 rows
print("\n--- First 5 rows ---")
print(news_df_filtered.head())

# Display DataFrame info. info() prints its own report and returns None, so
# wrapping it in print() would add a stray "None" line.
print("\n\n--- DataFrame Info ---")
news_df_filtered.info()

# news_df = news_df.sample(n=len(news_df))
# news_df = news_df.iloc[:1000,:]

In [None]:
def get_company_details(stock_symbol: str = CONFIG["stock_symbol"], 
                       date_str: str = CONFIG["company_details_date"]) -> None:
    """
    Fetches and prints company details and historical market data for a given
    stock symbol and date using the yfinance library.

    Errors are printed rather than raised, so a failed lookup does not stop
    the notebook.

    Args:
        stock_symbol (str): The stock ticker symbol (e.g., 'MSFT', 'GOOGL').
        date_str (str): The date in 'YYYY-MM-DD' format.

    Returns:
        None: All results are printed.
    """
    print(f"\n--- Fetching details for {stock_symbol} on {date_str} ---")
    try:
        # Create a Ticker object for the stock symbol
        ticker = yf.Ticker(stock_symbol)

        # 1. Get the general company information (a dictionary)
        info = ticker.info

        # Treat a missing or empty 'longName' as "symbol not found"; this is
        # the same check the yearly stock functions use.
        if info.get('longName') is None:
            print(f"Could not find company details for symbol: {stock_symbol}")
            return

        # default=str keeps one non-JSON-serializable value in the info
        # dictionary from aborting the rest of the report.
        print(f"Company Info: {json.dumps(info, default=str)}")
        print(f"Company Name: {info.get('longName', 'N/A')}")
        print(f"Sector: {info.get('sector', 'N/A')}")
        print(f"Industry: {info.get('industry', 'N/A')}")
        print(f"Website: {info.get('website', 'N/A')}")

        # 2. Get historical market data for the specific date
        # To get a single day, set the end date to the next day
        start_date = date_str
        end_date = (pd.to_datetime(date_str) + pd.Timedelta(days=1)).strftime('%Y-%m-%d')

        hist_data = ticker.history(start=start_date, end=end_date)

        if hist_data.empty:
            print(f"\nNo market data found for {date_str}. It might be a weekend or holiday.")
        else:
            # Extract the data for our specific date
            day_data = hist_data.iloc[0]
            print("\nMarket Data for the Day:")
            print(f"  Open:   ${day_data['Open']:.2f}")
            print(f"  High:   ${day_data['High']:.2f}")
            print(f"  Low:    ${day_data['Low']:.2f}")
            print(f"  Close:  ${day_data['Close']:.2f}")
            # A row taken from a mixed float/int frame holds floats; cast so
            # the share count prints as a whole number.
            print(f"  Volume: {int(day_data['Volume']):,}")

    except Exception as e:
        print(f"An error occurred while fetching data: {e}")

# Get details for the configured stock on the specified date
get_company_details()


# Initialize NLP Components


In [None]:
# Download NLTK resources.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# word_tokenize; 'punkt' is kept for older releases. Requesting a resource an
# installed NLTK version does not know only prints a message.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Initialize FinBERT model and tokenizer (fetched from the Hugging Face hub
# on first use)
finbert_model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(finbert_model_name)
model = AutoModelForSequenceClassification.from_pretrained(finbert_model_name)


In [None]:
# Download the English language model for spaCy.
# This is an IPython shell escape: it runs the spaCy downloader with the same
# interpreter as the kernel (sys.executable), and it runs every time the cell
# is executed.
!{sys.executable} -m spacy download en_core_web_sm

# Initialize spaCy model for company name extraction; the loaded pipeline is
# kept in the module-level name `nlp`.
nlp = spacy.load('en_core_web_sm')


# Text Processing Operations


In [None]:
def clean_text(text: Union[str, Any]) -> str:
    """
    Cleans the text by converting to lowercase and removing special characters and digits.

    This function performs the following operations, in this order:
    1. Converts input to string (None becomes an empty string)
    2. Removes possessive 's (straight or curly apostrophe)
    3. Removes punctuation except periods
    4. Converts to lowercase
    5. Replaces periods with underscores (to keep ticker symbols as single words)
    6. Removes special characters and digits, but keeps underscores
    7. Collapses runs of whitespace and trims the ends

    Args:
        text (Union[str, Any]): The text to clean. Can be any type that can be converted to string.

    Returns:
        str: The cleaned text.
    """
    if text is None:
        return ""

    # Convert to string if not already
    text = str(text)

    # Remove possessive 's. This has to happen BEFORE punctuation is stripped:
    # once the apostrophe is gone, "NVIDIA's" has already become "NVIDIAs"
    # and the possessive can no longer be recognized.
    text = re.sub(r"['\u2019]s\b", '', text, flags=re.IGNORECASE)

    # Remove punctuation except periods
    text = re.sub(r'[^\w\s.]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Replace periods with underscores to keep ticker symbols as single words
    text = text.replace('.', '_')

    # Remove special characters and digits, but keep underscores
    text = re.sub(r'[^a-zA-Z\s_]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


In [None]:
def analyze_sentiment(description: Optional[str]) -> str:
    """
    Analyzes the sentiment of the given text using FinBERT.

    The text is cleaned with clean_text, tokenized (truncated to 512 tokens)
    and classified by the module-level FinBERT `model`. The predicted class
    index is translated to a label through the model's own `config.id2label`
    mapping, so the result does not depend on an assumed class order.

    Args:
        description (Optional[str]): The text to analyze. Can be None.

    Returns:
        str: The sentiment label ('Positive', 'Negative', 'Neutral', or 'Unknown'
        when the input is empty, the model reports an unexpected label, or an
        error occurs).
    """
    if not description:
        return 'Unknown'

    try:
        # Clean the text before analysis
        cleaned_description = clean_text(description)

        # Skip empty strings after cleaning
        if not cleaned_description:
            return 'Unknown'

        # Tokenize the text for FinBERT
        inputs = tokenizer(cleaned_description, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Get model predictions. argmax over the raw logits picks the same
        # class as argmax over their softmax, so no softmax is needed.
        with torch.no_grad():
            outputs = model(**inputs)
        predicted_class = torch.argmax(outputs.logits, dim=-1).item()

        # Translate the class index using the checkpoint's label mapping. A
        # hard-coded {0: negative, 1: neutral, 2: positive} table does not
        # match the order published for ProsusAI/finbert and mislabels rows.
        raw_label = str(model.config.id2label[predicted_class]).lower()
        known_labels = {'positive': 'Positive', 'negative': 'Negative', 'neutral': 'Neutral'}
        return known_labels.get(raw_label, 'Unknown')

    except Exception as e:
        # str() keeps the preview safe when a non-string value was passed in
        print(f"Error analyzing sentiment for text '{str(description)[:50]}...': {e}")
        return 'Unknown'


In [None]:
# Apply sentiment analysis to the news titles: analyze_sentiment is called once
# per row and the labels are stored in a new 'Sentiment' column of news_df.
# NOTE(review): this runs over all of news_df, not news_df_filtered - confirm
# that classifying every ticker's headlines is intended.
news_df['Sentiment'] = news_df['title'].apply(analyze_sentiment)


# Finding the Companies Mentioned Most Often in the News Using a Word Cloud


In [None]:
def generate_word_cloud(company_names: Union[List, np.ndarray, pd.Series, str], 
                     output_file: str = CONFIG["wordcloud_output"],
                     apply_nlp_processing: bool = False,
                     deduplicate: bool = False) -> None:
    """
    Generates a word cloud from company names or text.

    This function takes a collection of company names or text, processes it,
    and generates a word cloud visualization that highlights the most frequent terms.
    Repeated names are kept by default, because the repetition is what gives
    frequent names more weight; collapsing them would give every name the
    same weight.

    Args:
        company_names (Union[List, np.ndarray, pd.Series, str]): 
            The company names or text to visualize. Can be a list, numpy array, pandas Series, or string.
            Missing entries (None, NaN, pd.NA) are skipped.
        output_file (str): 
            The name of the file to save the word cloud to.
        apply_nlp_processing (bool):
            Whether to apply NLP processing (tokenization, stopword removal, lemmatization).
            Default is False.
        deduplicate (bool):
            When True, each distinct name is used once (in sorted order), so
            the cloud shows which names occur rather than how often.
            Default is False.

    Returns:
        None: The function saves the word cloud to a file and displays it.
    """
    print(f"Input type: {type(company_names)}")

    # Process the input based on its type
    if isinstance(company_names, (list, np.ndarray, pd.Series)):
        values = company_names.tolist() if isinstance(company_names, pd.Series) else company_names
        # Skip missing entries so they do not show up as the words "None"/"nan"
        names_list = [
            str(name) for name in values
            if not (name is None or name is pd.NA or (isinstance(name, float) and name != name))
        ]
        if deduplicate:
            # sorted() keeps the result independent of set iteration order
            names_list = sorted(set(names_list))
        text = ' '.join(names_list)
    else:
        # If it's already a string or another type, convert to string
        text = str(company_names)

    # Apply NLP processing if requested
    if apply_nlp_processing:
        # Tokenize text
        tokens = word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token.lower() not in stop_words]

        # Lemmatize tokens
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

        # Join tokens back into a string
        text = ' '.join(tokens)

    # Print a preview of the processed text
    preview = text[:100] + "..." if len(text) > 100 else text
    print("Processed text for wordcloud:", preview)

    # Check if text is empty
    if not text.strip():
        print("Warning: No text to generate word cloud from.")
        return

    try:
        # Create a word cloud
        wordcloud = WordCloud(
            width=1000,
            height=500,
            max_font_size=100,
            max_words=200,
            background_color='white',
            colormap='viridis',
            collocations=False  # Avoid repeating word pairs
        ).generate(text)

        # Plot the word cloud
        plt.figure(figsize=(10, 8))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Company Names', fontsize=15)
        plt.tight_layout(pad=0)
        plt.show()
        plt.close()

        # Save the word cloud to a file
        wordcloud.to_file(output_file)
        print(f"\nWord cloud saved to {output_file}")

        # Print the top 10 words in the word cloud
        print("Top 10 words in the word cloud:")
        sorted_words = sorted(wordcloud.words_.items(), key=lambda x: x[1], reverse=True)[:10]
        for word, frequency in sorted_words:
            print(f"{word}: {frequency:.4f}")

    except Exception as e:
        print(f"Error generating word cloud: {e}")


In [None]:
# Generate and save the word cloud from the ticker column, passed as a NumPy
# array; the image goes to the function's default output path.
generate_word_cloud(news_df['stock'].values)


In [None]:
# Display the most frequent stock symbols in the news: value_counts() lists
# every ticker with its row count, most frequent first.
print(news_df['stock'].value_counts())


In [None]:
# Pick the tickers with the most news rows for the per-company analysis.
top_n = 1  # how many of the most-mentioned tickers to keep
ticker_counts = news_df['stock'].value_counts()  # sorted most frequent first
top_stocks = ticker_counts.head(top_n).index.tolist()

print(f"Top {top_n} stocks: {top_stocks}")


# Year-wise Sentiment Analysis Visualization


In [None]:
def analyze_and_visualize_sentiment_by_year(news_df: pd.DataFrame, 
                                    output_file: str = CONFIG["sentiment_trends_output"]) -> None:
    """
    Analyzes sentiment trends over time (by year) and visualizes the results
    using a grouped bar chart.

    The input frame is copied, its 'Date' column is parsed if it is not
    already a datetime column, and the number of articles per year and
    sentiment is plotted as one group of bars per year.

    Args:
        news_df (pd.DataFrame): The DataFrame containing news data. The
            'Date' and 'Sentiment' columns are read.
        output_file (str): The path to save the visualization image.

    Returns:
        None: The function saves the visualization to a file and displays it.
        Errors are printed rather than raised.
    """
    fig = None
    try:
        # Make a copy to avoid modifying the original DataFrame
        df = news_df.copy()

        # Convert the Date column to datetime if it's not already
        if not pd.api.types.is_datetime64_any_dtype(df['Date']):
            df['Date'] = pd.to_datetime(df['Date'], format='mixed', errors='coerce')
            # Drop rows with invalid dates
            df = df.dropna(subset=['Date'])

        if df.empty:
            print("Error: No valid data after date conversion.")
            return

        # Only the year is needed for the yearly chart
        df['Year'] = df['Date'].dt.year

        # Rows are years, columns are sentiments, values are article counts
        df_pivot_yearly = df.groupby('Year')['Sentiment'].value_counts().unstack().fillna(0)

        # Check if we have data to plot
        if df_pivot_yearly.empty:
            print("Error: No data to plot after grouping.")
            return

        # Define colors for different sentiments with better contrast
        colors = {
            'Negative': '#E53935',  # Bright red
            'Neutral': '#1E88E5',   # Bright blue
            'Positive': '#43A047'   # Bright green
        }

        # Get the available sentiment columns and their corresponding colors
        # (grey for any label without a dedicated color)
        available_sentiments = df_pivot_yearly.columns.tolist()
        plot_colors = [colors.get(sentiment, '#9E9E9E') for sentiment in available_sentiments]

        # Draw on one explicit figure. Calling plt.figure() and then
        # DataFrame.plot() without ax= would create two figures and leave the
        # first one empty.
        fig, ax = plt.subplots(figsize=(15, 8))
        df_pivot_yearly.plot(
            kind='bar',
            ax=ax,
            color=plot_colors,
            width=0.8
        )

        # Add data labels on top of each bar
        for container in ax.containers:
            ax.bar_label(container, fmt='%d', fontsize=10)

        # Add a grid for better readability
        ax.grid(axis='y', linestyle='--', alpha=0.7)

        # Add labels and title. The year range is computed up front: reusing
        # the f-string's own quote character inside a replacement field is a
        # SyntaxError on Python versions before 3.12.
        first_year = df['Year'].min()
        last_year = df['Year'].max()
        ax.set_xlabel('Year', fontsize=12)
        ax.set_ylabel('Number of News Articles', fontsize=12)
        ax.set_title(f'Sentiment Analysis: {len(df.index)} News Articles ({first_year}-{last_year})',
                     fontsize=14, fontweight='bold')

        # Improve legend
        ax.legend(title='Sentiment', fontsize=10, title_fontsize=12)

        # Rotate x-axis labels for better readability
        ax.tick_params(axis='x', labelrotation=45)

        # Adjust layout
        fig.tight_layout()

        # Save the figure
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
        print(f"Visualization saved to {output_file}")

        # Show the plot
        plt.show()

        # Print summary statistics
        print("\nSentiment Distribution by Year:")
        print(df_pivot_yearly)

    except Exception as e:
        print(f"Error in sentiment analysis visualization: {e}")
    finally:
        # Release the figure even when plotting or saving failed
        if fig is not None:
            plt.close(fig)


In [None]:
# Analyze and visualize sentiment trends by year for all rows of news_df; the
# chart is saved to the function's default output path.
analyze_and_visualize_sentiment_by_year(news_df)


# Company Specific Sentiment Analysis


In [None]:
def analyze_and_visualize_company_sentiment(company_name: str, news_df: pd.DataFrame, 
                                    output_prefix: str = None) -> None:
    """
    Filters the dataset for a specific company, analyzes sentiment, and visualizes
    the results using multiple charts.

    Two charts are drawn side by side: a pie chart of the overall
    Positive/Negative/Neutral split and a line chart of yearly counts per
    sentiment. The figure is saved as '<output_prefix>_sentiment_analysis.png'
    in the directory of the configured company sentiment output path.

    Args:
        company_name (str): The ticker symbol of the company to analyze; rows
            are selected where the 'stock' column equals this value.
        news_df (pd.DataFrame): The DataFrame containing news data. The
            'stock', 'Date' and 'Sentiment' columns are read.
        output_prefix (str, optional): Prefix for the output file name. If None, uses company_name.

    Returns:
        None: The function displays visualizations and saves them to files.
        Errors are printed rather than raised.
    """
    if output_prefix is None:
        output_prefix = company_name

    fig = None
    try:
        # Filter the dataset for the specific company
        company_df = news_df[news_df['stock'] == company_name].copy()

        if company_df.empty:
            print(f"No data found for company with ticker symbol '{company_name}'")
            return

        # Convert the Date column to datetime if it's not already
        if not pd.api.types.is_datetime64_any_dtype(company_df['Date']):
            company_df['Date'] = pd.to_datetime(company_df['Date'], format='mixed', errors='coerce')
            # Drop rows with invalid dates
            company_df = company_df.dropna(subset=['Date'])

        if company_df.empty:
            print(f"No valid data after date conversion for company '{company_name}'")
            return

        # Extract the year used for the trend chart
        company_df['Year'] = company_df['Date'].dt.year

        # Count the number of positive, negative, and neutral sentiments for the company
        sentiment_counts = company_df['Sentiment'].value_counts()
        positive_count = sentiment_counts.get('Positive', 0)
        negative_count = sentiment_counts.get('Negative', 0)
        neutral_count = sentiment_counts.get('Neutral', 0)
        total_count = positive_count + negative_count + neutral_count

        # Nothing to chart when no row carries one of the three labels
        # (for example when every row is 'Unknown')
        if total_count == 0:
            print(f"Error: No sentiment data available for {company_name}")
            return

        # Print the counts and percentages
        print(f"\n--- Sentiment Analysis for {company_name} ---")
        print(f"Total articles: {total_count}")
        print(f"Positive: {positive_count} ({positive_count/total_count*100:.1f}%)")
        print(f"Negative: {negative_count} ({negative_count/total_count*100:.1f}%)")
        print(f"Neutral: {neutral_count} ({neutral_count/total_count*100:.1f}%)")

        # Create a figure with 2 subplots: pie chart and line chart
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

        # 1. Pie Chart - Sentiment Distribution
        labels = ['Positive', 'Negative', 'Neutral']
        sizes = [positive_count, negative_count, neutral_count]
        colors = ['#43A047', '#E53935', '#1E88E5']  # Green, Red, Blue
        explode = (0.1, 0.1, 0.1)  # Explode all slices for better visibility

        ax1.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%',
                shadow=True, startangle=90, textprops={'fontsize': 12})
        ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
        ax1.set_title(f'Sentiment Distribution for {company_name}\n({total_count} Articles)', 
                     fontsize=14, fontweight='bold')

        # 2. Line Chart - Sentiment Trends Over Time
        # Group by year and sentiment, then count
        yearly_sentiment = company_df.groupby(['Year', 'Sentiment']).size().unstack().fillna(0)

        if not yearly_sentiment.empty:
            # One line per sentiment, with a label on every non-zero point
            for sentiment, color in zip(labels, colors):
                if sentiment not in yearly_sentiment.columns:
                    continue
                ax2.plot(yearly_sentiment.index, yearly_sentiment[sentiment], 
                        marker='o', linewidth=2, label=sentiment, color=color)
                for x, y in zip(yearly_sentiment.index, yearly_sentiment[sentiment]):
                    if y > 0:  # Only label non-zero values
                        ax2.annotate(f'{int(y)}', 
                                    (x, y), 
                                    textcoords="offset points",
                                    xytext=(0, 5), 
                                    ha='center',
                                    fontsize=9)

            ax2.set_xlabel('Year', fontsize=12)
            ax2.set_ylabel('Number of Articles', fontsize=12)
            ax2.set_title(f'Sentiment Trends for {company_name} Over Time', 
                         fontsize=14, fontweight='bold')

            # Set x-ticks to years only
            ax2.set_xticks(yearly_sentiment.index)
            ax2.set_xticklabels(yearly_sentiment.index, rotation=45)

            # Add grid for better readability
            ax2.grid(True, linestyle='--', alpha=0.7)

            # Add legend
            ax2.legend(title='Sentiment', fontsize=10, title_fontsize=12)
        else:
            ax2.text(0.5, 0.5, 'No yearly data available', 
                    horizontalalignment='center', verticalalignment='center',
                    transform=ax2.transAxes, fontsize=12)

        # Adjust layout
        fig.tight_layout()

        # Save the figure. The configured path is already fully resolved for
        # the configured ticker, so calling .format() on it changes nothing;
        # the file name is built from output_prefix so each company gets its
        # own file instead of overwriting the configured ticker's.
        output_file = os.path.join(
            os.path.dirname(CONFIG["company_sentiment_output"]),
            f"{output_prefix}_sentiment_analysis.png",
        )
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
        print(f"Visualization saved to {output_file}")

        # Show the plot
        plt.show()

    except KeyError as e:
        print(f"Error: Column not found - {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        traceback.print_exc()
    finally:
        # Release the figure even when plotting or saving failed
        if fig is not None:
            plt.close(fig)


In [None]:
# Analyze sentiment for each top stock: one pie-and-trend figure per ticker
# in top_stocks, built from the rows of news_df for that ticker.
for company in top_stocks:
    analyze_and_visualize_company_sentiment(company, news_df)


In [None]:
# Save the processed news dataframe to the function's default CSV path; the
# True/False result of the call is not checked here.
save_news_dataframe(news_df)


In [None]:
def company_news_count(company_name: str, news_df: pd.DataFrame,
                      output_prefix: str = None) -> Dict[int, Dict[str, int]]:
    """
    Counts a company's news articles per year and sentiment.

    The yearly table and the year with the most articles are printed, and
    the table is also returned as a nested dictionary.

    Args:
        company_name (str): Ticker symbol to select; rows are kept where the
            'stock' column equals this value.
        news_df (pd.DataFrame): News data. The 'stock', 'Date' and
            'Sentiment' columns are read.
        output_prefix (str, optional): Defaults to company_name when None;
            it is not used for anything else in this function.

    Returns:
        Dict[int, Dict[str, int]]: Maps each year to its 'Positive',
        'Negative', 'Neutral' and 'Total' counts. An empty dictionary is
        returned when no rows match, no valid dates remain, or an error
        occurs (the error is printed).
    """
    sentiment_labels = ['Positive', 'Negative', 'Neutral']

    if output_prefix is None:
        output_prefix = company_name

    try:
        # Rows belonging to the requested company
        rows = news_df.loc[news_df['stock'] == company_name].copy()
        if rows.empty:
            print(f"No data found for company with ticker symbol '{company_name}'")
            return {}

        # Parse dates only when the column is not a datetime column yet
        already_datetime = pd.api.types.is_datetime64_any_dtype(rows['Date'])
        if not already_datetime:
            rows['Date'] = pd.to_datetime(rows['Date'], format='mixed', errors='coerce')
            rows = rows.dropna(subset=['Date'])

        if rows.empty:
            print(f"No valid data after date conversion for company '{company_name}'")
            return {}

        rows['Year'] = rows['Date'].dt.year

        # Years as rows, the three sentiments as columns in a fixed order;
        # sentiments that never occur become all-zero columns and any other
        # label is left out
        counts = (
            rows.groupby(['Year', 'Sentiment'])
            .size()
            .unstack(fill_value=0)
            .reindex(columns=sentiment_labels, fill_value=0)
        )
        counts['Total'] = counts[sentiment_labels].sum(axis=1)

        print(counts)

        # Index label (the year) of the row with the largest total
        busiest_year = counts['Total'].idxmax()
        print(f"\n--- Year with most news: {busiest_year} ---")

        # Nested plain-int dictionary, one entry per year
        return {
            year: {column: int(row[column]) for column in sentiment_labels + ['Total']}
            for year, row in counts.iterrows()
        }

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return {}

# Yearly sentiment counts for every top stock, keyed by ticker
news_counts = {
    company: company_news_count(company, news_df)
    for company in top_stocks
}


In [None]:
def get_stock_details_for_year(stock_symbol: str = CONFIG["stock_symbol"], 
                              year: int = CONFIG["analysis_year"]) -> None:
    """
    Fetches company info, summarizes stock market performance for a given year,
    and plots a line chart of the closing price.

    The chart is saved as '<stock_symbol>_<year>_closing_price.png' in the
    directory of the configured closing-price output path. Errors are printed
    rather than raised.

    Args:
        stock_symbol (str): The stock ticker symbol (e.g., 'AAPL', 'TSLA').
        year (int): The year for which to retrieve data (e.g., 2023).

    Returns:
        None: Results are printed and the chart is saved and displayed.
    """
    print(f"\n--- Fetching details for {stock_symbol} for the year {year} ---")
    fig = None
    try:
        ticker = yf.Ticker(stock_symbol)
        info = ticker.info

        # A missing or empty 'longName' is treated as "symbol not found"
        if info.get('longName') is None:
            print(f"Could not find company details for symbol: {stock_symbol}")
            return

        print(f"Company Name: {info.get('longName', 'N/A')}")
        print(f"Sector: {info.get('sector', 'N/A')}")
        print(f"Industry: {info.get('industry', 'N/A')}")

        # Request January 1 of the year up to January 1 of the following year
        start_date = f"{year}-01-01"
        end_date = f"{year + 1}-01-01"
        hist_data = ticker.history(start=start_date, end=end_date)

        if hist_data.empty:
            print(f"\nNo market data found for the year {year}.")
            return

        # First open and last close of the returned rows; extremes and volume
        # over all returned rows
        yearly_open = hist_data['Open'].iloc[0]
        yearly_close = hist_data['Close'].iloc[-1]
        yearly_high = hist_data['High'].max()
        yearly_low = hist_data['Low'].min()
        total_volume = hist_data['Volume'].sum()
        percent_change = ((yearly_close - yearly_open) / yearly_open) * 100

        print("\n📈 Yearly Market Summary:")
        print(f"  Year Start Open: ${yearly_open:,.2f}")
        print(f"  Year End Close:  ${yearly_close:,.2f}")
        print(f"  Yearly High:     ${yearly_high:,.2f}")
        print(f"  Yearly Low:      ${yearly_low:,.2f}")
        print(f"  Total Volume:    {total_volume:,}")
        print(f"  Yearly Change:   {percent_change:.2f}%")

        # --- Plotting the line chart ---
        plt.style.use('seaborn-v0_8-whitegrid') # Sets a nice style for the plot
        fig = plt.figure(figsize=(12, 6)) # Create a figure with a specific size

        # Plot the closing price. The index is the Date.
        plt.plot(hist_data.index, hist_data['Close'], label=f'{stock_symbol} Close Price', color='b')

        # Add titles and labels for clarity
        plt.title(f'Closing Price for {stock_symbol} in {year}', fontsize=16)
        plt.xlabel('Date', fontsize=12)
        plt.ylabel('Close Price (USD)', fontsize=12)

        # Add a legend
        plt.legend()

        # Rotate date labels for better readability
        plt.xticks(rotation=45)

        # Ensure everything fits without overlapping
        plt.tight_layout()

        # Save the chart to a file. The configured path is already fully
        # resolved, so calling .format() on it changes nothing; the file name
        # is built from the actual arguments so a different symbol or year
        # does not overwrite the configured ticker's chart.
        output_filename = os.path.join(
            os.path.dirname(CONFIG["stock_closing_price_output"]),
            f"{stock_symbol}_{year}_closing_price.png",
        )
        plt.savefig(output_filename)
        print(f"\nChart saved to {output_filename}")

        # Display the chart
        plt.show()

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        # Close the figure to free up memory, also when plotting failed
        if fig is not None:
            plt.close(fig)

# Get stock details for the configured stock and year
get_stock_details_for_year()


In [None]:
def get_stock_candlestick_for_year(stock_symbol: str = CONFIG["stock_symbol"],
                                  year: int = CONFIG["analysis_year"]) -> None:
    """
    Print company info and a yearly market summary, then plot a candlestick chart.

    The chart (candles, volume panel, 20- and 50-period moving averages) is saved to
    the path built from CONFIG["stock_candlestick_output"] and then displayed.
    Maximizing the plot window is attempted only when the active matplotlib backend
    exposes a window with ``showMaximized``; on other backends the step is skipped
    instead of aborting the function.

    Args:
        stock_symbol (str): The stock ticker symbol (e.g., 'AAPL', 'TSLA').
        year (int): The year for which to retrieve data (e.g., 2023).

    Returns:
        None. Results are printed, saved to disk and shown. Any exception is caught
        and reported with a printed message rather than propagated.
    """
    print(f"\n--- Fetching details for {stock_symbol} for the year {year} ---")
    try:
        ticker = yf.Ticker(stock_symbol)
        info = ticker.info

        # .get() covers both a missing key and an explicit None value.
        if info.get('longName') is None:
            print(f"Could not find company details for symbol: {stock_symbol}")
            return

        print(f"Company Name: {info.get('longName', 'N/A')}")
        print(f"Sector: {info.get('sector', 'N/A')}")
        print(f"Industry: {info.get('industry', 'N/A')}")

        # The end date is the first day of the following year so the whole year is requested.
        start_date = f"{year}-01-01"
        end_date = f"{year + 1}-01-01"
        hist_data = ticker.history(start=start_date, end=end_date)

        if hist_data.empty:
            print(f"\nNo market data found for the year {year}.")
            return

        # Yearly summary: first open, last close, extremes and total traded volume.
        yearly_open = hist_data['Open'].iloc[0]
        yearly_close = hist_data['Close'].iloc[-1]
        yearly_high = hist_data['High'].max()
        yearly_low = hist_data['Low'].min()
        total_volume = hist_data['Volume'].sum()
        percent_change = ((yearly_close - yearly_open) / yearly_open) * 100

        print("\n📈 Yearly Market Summary:")
        print(f"  Year Start Open: ${yearly_open:,.2f}")
        print(f"  Year End Close:  ${yearly_close:,.2f}")
        print(f"  Yearly High:     ${yearly_high:,.2f}")
        print(f"  Yearly Low:      ${yearly_low:,.2f}")
        print(f"  Total Volume:    {total_volume:,}")
        print(f"  Yearly Change:   {percent_change:.2f}%")

        output_filename = CONFIG["stock_candlestick_output"].format(stock_symbol=stock_symbol, year=year)

        # returnfig=True hands back the figure so it can be saved and closed explicitly.
        fig, _ = mpf.plot(hist_data,
                          type='candle',
                          style='charles',
                          title=f'Candlestick Chart for {stock_symbol} in {year}',
                          ylabel='Price (USD)',
                          volume=True,
                          mav=(20, 50),
                          returnfig=True
                          )
        try:
            # Save the figure before showing
            fig.savefig(output_filename)
            print(f"\nChart saved to {output_filename}")

            # Best-effort maximize: the manager's `window` and its `showMaximized`
            # method are backend-specific, so probe for them instead of assuming
            # they exist (an unconditional call would raise AttributeError and
            # skip plt.show()).
            manager = getattr(fig.canvas, "manager", None)
            window = getattr(manager, "window", None)
            if hasattr(window, "showMaximized"):
                window.showMaximized()

            plt.show()
        finally:
            # Always release the figure, even if saving or showing failed.
            plt.close(fig)

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Render the candlestick chart. With no arguments, the defaults apply:
# CONFIG["stock_symbol"] and CONFIG["analysis_year"], as captured when the function was defined.
get_stock_candlestick_for_year()


In [None]:
def _synthetic_sentiment_rows(company_name: str, year: int) -> pd.DataFrame:
    """Build placeholder sentiment rows (one batch per month) for demonstration charts."""
    rows = []
    for month in range(1, 13):
        # Random counts per sentiment, biased towards positive.
        positive = np.random.randint(5, 15)
        negative = np.random.randint(2, 8)
        neutral = np.random.randint(3, 10)

        # Hand-tuned seasonal adjustments.
        if month in (4, 5, 6):
            positive += 3
        if month in (11, 12):
            positive += 2
            negative -= 1
        negative = max(0, negative)

        # Every synthetic article of a month is dated on the 15th.
        date = pd.Timestamp(year=year, month=month, day=15)
        for label, count in (('Positive', positive), ('Negative', negative), ('Neutral', neutral)):
            rows.extend(
                {
                    'Date': date,
                    'Year': year,
                    'Month': month,
                    'Sentiment': label,
                    'title': f"{label} news for {company_name} in {year}-{month}",
                }
                for _ in range(count)
            )
    return pd.DataFrame(rows)


def _monthly_mean_close(symbol: str, year: int) -> Optional[pd.Series]:
    """Return the mean closing price per calendar month (index = month number), or None if unavailable."""
    print(f"\n--- Fetching stock price data for {symbol} in {year} ---")
    try:
        stock_data = yf.Ticker(symbol).history(start=f"{year}-01-01", end=f"{year + 1}-01-01")
        if stock_data.empty:
            print(f"No stock data found for {symbol} in {year}")
            return None
        # Grouping on the index's month number avoids the 'ME' resample alias, which
        # older pandas 2.x releases reject, and yields the month-number index directly.
        # Months without any closing price are omitted.
        return stock_data['Close'].groupby(stock_data.index.month).mean().dropna()
    except Exception as e:
        print(f"Error fetching stock data: {e}")
        return None


def analyze_and_visualize_company_sentiment_for_year(
    company_name: str = CONFIG["stock_symbol"],
    year: int = CONFIG["analysis_year"],
    news_df: Optional[pd.DataFrame] = None,
    output_prefix: Optional[str] = None
) -> None:
    """
    Visualize one company's news sentiment for a single year with a stock price overlay.

    Rows of ``news_df`` whose 'stock' column equals ``company_name`` and whose 'Date'
    falls in ``year`` are counted per 'Sentiment' label ('Positive', 'Negative',
    'Neutral'). The figure has a pie chart of the yearly distribution and a line chart
    of monthly counts, overlaid (on a secondary y-axis) with the monthly mean closing
    price fetched through yfinance. The figure is saved to the path built from
    CONFIG["sentiment_with_stock_price_output"] and then displayed.

    If there are no rows for the requested year, randomly generated placeholder data
    is plotted instead; in that case the printed header and both chart titles are
    tagged "[SYNTHETIC DEMO DATA]" so the output cannot be mistaken for real results.

    Args:
        company_name (str): Ticker symbol; matched against the 'stock' column and
            passed to yfinance.
        year (int): The specific year to analyze.
        news_df (pd.DataFrame): News data; the code reads the 'stock', 'Date' and
            'Sentiment' columns.
        output_prefix (str, optional): Accepted for backward compatibility but
            currently has no effect; the output path always comes from CONFIG.

    Returns:
        None: The function prints a summary, displays the figure and saves it to a
        file. Errors are reported with printed messages rather than raised.
    """
    if news_df is None:
        print("Error: No news data provided")
        return

    fig = None
    try:
        # Filter the dataset for the specific company
        company_df = news_df[news_df['stock'] == company_name].copy()
        if company_df.empty:
            print(f"No data found for company with ticker symbol '{company_name}'")
            return

        # Convert the Date column to datetime if needed, dropping unparseable rows
        if not pd.api.types.is_datetime64_any_dtype(company_df['Date']):
            company_df['Date'] = pd.to_datetime(company_df['Date'], format='mixed', errors='coerce')
            company_df = company_df.dropna(subset=['Date'])

        if company_df.empty:
            print(f"No valid data after date conversion for company '{company_name}'")
            return

        company_df['Year'] = company_df['Date'].dt.year
        company_df['Month'] = company_df['Date'].dt.month

        year_df = company_df[company_df['Year'] == year].copy()

        is_synthetic = year_df.empty
        if is_synthetic:
            print(f"No data found for company '{company_name}' in year {year}")
            print(f"Generating synthetic sentiment data for {company_name} in {year} for demonstration purposes")
            year_df = _synthetic_sentiment_rows(company_name, year)
        # Tag appended to the header and titles whenever the plotted data is fabricated.
        data_note = " [SYNTHETIC DEMO DATA]" if is_synthetic else ""

        # Yearly totals per sentiment label
        sentiment_counts = year_df['Sentiment'].value_counts()
        positive_count = int(sentiment_counts.get('Positive', 0))
        negative_count = int(sentiment_counts.get('Negative', 0))
        neutral_count = int(sentiment_counts.get('Neutral', 0))
        total_count = positive_count + negative_count + neutral_count

        # Explicit guard: without it the percentages below would divide by zero.
        if total_count == 0:
            print(f"Error: No sentiment data available for {company_name} in {year}")
            return

        print(f"\n--- Sentiment Analysis for {company_name} in {year}{data_note} ---")
        print(f"Total articles: {total_count}")
        print(f"Positive: {positive_count} ({positive_count/total_count*100:.1f}%)")
        print(f"Negative: {negative_count} ({negative_count/total_count*100:.1f}%)")
        print(f"Neutral: {neutral_count} ({neutral_count/total_count*100:.1f}%)")

        monthly_stock_data = _monthly_mean_close(company_name, year)
        has_stock_data = monthly_stock_data is not None

        # Two panels: pie chart (narrow) and combined monthly line chart (wide)
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8), gridspec_kw={'width_ratios': [1, 2]})

        # 1. Pie Chart - Sentiment Distribution
        labels = ['Positive', 'Negative', 'Neutral']
        sizes = [positive_count, negative_count, neutral_count]
        colors = ['#43A047', '#E53935', '#1E88E5']  # Green, Red, Blue
        explode = (0.1, 0.1, 0.1)  # Explode all slices for better visibility

        ax1.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%',
                shadow=True, startangle=90, textprops={'fontsize': 12})
        ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
        ax1.set_title(f'Sentiment Distribution for {company_name} in {year}{data_note}\n({total_count} Articles)',
                      fontsize=14, fontweight='bold')

        # 2. Combined Line Chart - article counts per (month, sentiment), missing pairs as 0
        monthly_sentiment = year_df.groupby(['Month', 'Sentiment']).size().unstack(fill_value=0)

        if not monthly_sentiment.empty:
            # Secondary y-axis for the price, created only when there is something to draw on it.
            ax3 = ax2.twinx() if has_stock_data else None

            present = [(s, c) for s, c in zip(labels, colors) if s in monthly_sentiment.columns]

            for sentiment, color in present:
                ax2.plot(monthly_sentiment.index, monthly_sentiment[sentiment],
                         marker='o', linewidth=2, label=sentiment, color=color)

            ax2.set_xlabel('Month', fontsize=12)
            ax2.set_ylabel('Number of Articles', fontsize=12)
            ax2.set_title(f'Monthly Sentiment Trends and Stock Price for {company_name} in {year}{data_note}',
                          fontsize=14, fontweight='bold')

            # Set x-ticks to months only
            ax2.set_xticks(range(1, 13))
            ax2.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], rotation=45)

            ax2.grid(True, linestyle='--', alpha=0.7)
            ax2.legend(title='Sentiment', fontsize=10, title_fontsize=12, loc='upper left')

            # Label every non-zero sentiment data point with its count
            for sentiment, _ in present:
                for x, y in zip(monthly_sentiment.index, monthly_sentiment[sentiment]):
                    if y > 0:
                        ax2.annotate(f'{int(y)}',
                                     (x, y),
                                     textcoords="offset points",
                                     xytext=(0, 5),
                                     ha='center',
                                     fontsize=9)

            if has_stock_data:
                ax3.plot(monthly_stock_data.index, monthly_stock_data.values,
                         color='#FF9800', linewidth=3, linestyle='-', marker='s',
                         label=f'{company_name} Stock Price')
                ax3.set_ylabel('Stock Price (USD)', fontsize=12, color='#FF9800')
                ax3.tick_params(axis='y', labelcolor='#FF9800')
                ax3.legend(loc='upper right')

                # Label every price point below its marker
                for x, y in zip(monthly_stock_data.index, monthly_stock_data.values):
                    ax3.annotate(f'${y:.2f}',
                                 (x, y),
                                 textcoords="offset points",
                                 xytext=(0, -15),
                                 ha='center',
                                 fontsize=9,
                                 color='#FF9800')
        else:
            ax2.text(0.5, 0.5, 'No monthly data available',
                     horizontalalignment='center', verticalalignment='center',
                     transform=ax2.transAxes, fontsize=12)

        fig.tight_layout()

        # Save through the figure object rather than the implicit "current figure".
        output_file = CONFIG["sentiment_with_stock_price_output"].format(stock_symbol=company_name, year=year)
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
        print(f"Visualization saved to {output_file}")

        plt.show()

    except KeyError as e:
        print(f"Error: Column not found - {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        traceback.print_exc()
    finally:
        # Release the figure on every exit path, including failures mid-plot.
        if fig is not None:
            plt.close(fig)

# Analyze and visualize sentiment with the stock price overlay, using the function's
# CONFIG-derived defaults for company and year.
# NOTE(review): `news_df` is not defined in this cell; presumably an earlier cell creates it — confirm.
analyze_and_visualize_company_sentiment_for_year(news_df=news_df)
