In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import re
import nlt
import numpy as np
import re

In [None]:


# Set visualization style
sns.set_theme(style="whitegrid")

# --- DATA LOADING ---
# Default location of the raw CSV. The path is specific to one machine, so the
# ANALYST_RATINGS_CSV environment variable may override it without editing the
# notebook; when the variable is unset the original path is used unchanged.
DEFAULT_DATA_PATH = r'C:\Users\deres\OneDrive\Desktop\week1\week1\data\raw_analyst_ratings.csv'
absolute_path = os.environ.get('ANALYST_RATINGS_CSV', DEFAULT_DATA_PATH)

# Initialise df before the try block so the `df.empty` checks in this and later
# cells never raise NameError when loading fails.
df = pd.DataFrame()
try:
    df = pd.read_csv(absolute_path)
    print(f"Data loaded successfully. Shape: {df.shape}")
except Exception as e:
    # Deliberately broad: any failure leaves df empty and is reported, not raised.
    print(f"FATAL ERROR: Could not load data. Details: {e}")

# --- Proceed ONLY if df was created successfully ---
if not df.empty:
    # Parse 'date' with an explicit format. errors='coerce' turns every value
    # that does not match the format into NaT instead of raising.
    missing_before = int(df['date'].isna().sum())
    parsed_dates = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

    # Coercion is silent, so report how many values were lost to it; every
    # date-based analysis downstream ignores those rows.
    newly_unparsed = int(parsed_dates.isna().sum()) - missing_before
    if newly_unparsed > 0:
        print(f"WARNING: {newly_unparsed:,} of {len(df):,} 'date' values did not match "
              "'%Y-%m-%d %H:%M:%S' and were set to NaT.")
    df['date'] = parsed_dates

    # Display first few rows to confirm loading and structure
    print("\n--- Initial Data Head ---")
    print(df.head())
else:
    print("\nAborting further analysis: DataFrame (df) is empty.")

In [None]:


# Set visualization style
sns.set_theme(style="whitegrid")

# --- DATA LOADING ---
# Make sure `df` exists before the emptiness check below. Without this, a failed
# load on a fresh kernel raises NameError at `df.empty`. A frame loaded by an
# earlier cell is deliberately kept, so a failure here does not discard it.
if 'df' not in globals():
    df = pd.DataFrame()

try:
    # UPDATE THIS LINE with the path to the CSV (e.g., '../data/raw_analyst_ratings.csv')
    df = pd.read_csv('raw_analyst_ratings.csv')
    print(f"Data loaded successfully. Shape: {df.shape}")
except Exception as e:
    # Deliberately broad: the failure is reported and df keeps its previous value
    # (an empty frame if nothing was loaded before).
    print(f"FATAL ERROR: Could not load data. Details: {e}")

# --- Proceed ONLY if df was created successfully ---
if not df.empty:
    # Parse 'date' with an explicit format. errors='coerce' turns every value
    # that does not match the format into NaT instead of raising.
    missing_before = int(df['date'].isna().sum())
    parsed_dates = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

    # Coercion is silent, so report how many values were lost to it.
    newly_unparsed = int(parsed_dates.isna().sum()) - missing_before
    if newly_unparsed > 0:
        print(f"WARNING: {newly_unparsed:,} of {len(df):,} 'date' values did not match "
              "'%Y-%m-%d %H:%M:%S' and were set to NaT.")
    df['date'] = parsed_dates

    # Display first few rows to confirm loading and structure
    print("\n--- Initial Data Head ---")
    print(df.head())
else:
    print("\nAborting further analysis: DataFrame (df) is empty.")

In [None]:
# 1. Calculate Lengths
if not df.empty:
    headlines = df['headline']

    # Use the vectorised .str accessors: a missing headline yields NaN in both
    # columns. (`apply(len)` raises TypeError on NaN, and `str(x).split()`
    # counts a missing headline as the single word 'nan'.)
    df['headline_length'] = headlines.str.len()
    df['headline_word_count'] = headlines.str.split().str.len()

    # 2. Print Basic Statistics
    print("\n--- Headline Length Statistics (Characters) ---")
    print(df['headline_length'].describe())

    # 3. Visualize Distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(df['headline_length'], bins=50, kde=True, color='purple')
    plt.title('Distribution of Headline Lengths (Characters)')
    plt.xlabel('Character Length')
    plt.ylabel('Number of Articles')
    plt.show()

In [None]:
if not df.empty:
    # 1. Tally articles per publisher; value_counts orders them most active first
    publisher_counts = df['publisher'].value_counts()
    top_publishers = publisher_counts.head(10)

    print("\n--- Top 10 Most Active Publishers ---")
    print(top_publishers)

    # 2. Turn the ten leaders into a two-column frame for seaborn
    publisher_df = top_publishers.reset_index().set_axis(['Publisher', 'Count'], axis=1)

    # Horizontal bars: article count along x, one row per publisher
    plt.figure(figsize=(12, 7))
    sns.barplot(data=publisher_df, x='Count', y='Publisher', palette='viridis')
    plt.title('Top 10 Publishers by Article Count')
    plt.xlabel('Number of Articles')
    plt.ylabel('Publisher')
    plt.show()

In [None]:
if not df.empty:
    # 1. Analyze Trends by Day of the Week
    WEEKDAY_ORDER = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
    ]
    df['day_of_week'] = df['date'].dt.day_name()
    # fill_value=0 keeps a weekday with no articles as an integer 0 instead of
    # NaN (which would also turn the whole series into floats).
    weekly_counts = df['day_of_week'].value_counts().reindex(WEEKDAY_ORDER, fill_value=0)

    print("\n--- Article Counts by Day of the Week ---")
    print(weekly_counts)

    plt.figure(figsize=(8, 6))
    weekly_counts.plot(kind='bar', color='darkorange')
    plt.title('Article Frequency by Day of the Week')
    plt.ylabel('Article Count')
    plt.xticks(rotation=45)
    plt.show()

    # 2. Analyze Trends Over Time (Monthly)
    df['publication_month'] = df['date'].dt.to_period('M')
    # Drop missing periods BEFORE converting to str: astype(str) turns a missing
    # period into the literal 'NaT', which value_counts would then count and the
    # plot would show as a bogus extra month.
    monthly_counts = (
        df['publication_month']
        .dropna()
        .astype(str)
        .value_counts()
        .sort_index()
    )

    plt.figure(figsize=(15, 6))
    monthly_counts.plot(kind='line', marker='o', color='teal')
    plt.title('Article Frequency Over Time (Monthly)')
    plt.xlabel('Date (Month)')
    plt.ylabel('Article Count')
    plt.xticks(rotation=45)
    plt.grid(True, linestyle='--')
    plt.tight_layout()
    plt.show()

In [None]:

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

# Download necessary NLTK data (run this once)
# `nltk` is imported here because the notebook's top import cell does not bind
# that name (it imports `nlt`). A bare `except: pass` around the downloads used
# to swallow the resulting NameError, so the downloads silently never ran.
import nltk

for resource in ('wordnet', 'stopwords'):
    try:
        # nltk.download reports a failed fetch by returning False, not by raising.
        if not nltk.download(resource):
            print(f"WARNING: NLTK resource '{resource}' could not be downloaded; "
                  "a previously installed copy is required.")
    except Exception as e:
        # Best effort: report the problem and continue; the corpus may already
        # be installed locally.
        print(f"WARNING: NLTK download of '{resource}' failed: {e}")

# Initialize Lemmatizer and define custom stop words
lemmatizer = WordNetLemmatizer()
# Add common finance/news terms to the standard list of stop words
CUSTOM_STOPWORDS = set(stopwords.words('english') + [
    'stock', 'price', 'target', 'rating', 'analyst', 'maintains', 
    'upgrade', 'downgrade', 'buy', 'sell', 'hold', 'news', 'market', 
    'week', 'session', 'hit', 'high', 'low'
])

def preprocess(text):
    """Clean, tokenize, remove stopwords, and lemmatize text.

    Args:
        text: A headline. Missing values (as judged by ``pd.isna``) are accepted;
            any other value is converted with ``str``.

    Returns:
        list[str]: Lemmatized tokens of at least three characters (measured
        before lemmatization), none of which is in ``CUSTOM_STOPWORDS``.
        An empty list for missing input.
    """
    if pd.isna(text):
        return []

    # 1. Tokenize, drop short tokens, and drop stop words in their surface form
    tokens = [
        token
        for token in simple_preprocess(str(text), deacc=True)
        if len(token) >= 3 and token not in CUSTOM_STOPWORDS
    ]

    # 2. Lemmatization (reducing words to their base form)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]

    # 3. Filter stop words again AFTER lemmatization. An inflected form such as
    #    "stocks" passes the first filter but lemmatizes to the stop word
    #    "stock"; without this second pass it would be kept.
    return [lemma for lemma in lemmas if lemma not in CUSTOM_STOPWORDS]

# Tokenize every headline into a new 'processed_headline' column
if not df.empty:
    df['processed_headline'] = df['headline'].map(preprocess)

    # Show raw and processed headlines side by side as a quick sanity check
    print("Preprocessing complete. Sample:")
    sample_columns = ['headline', 'processed_headline']
    print(df[sample_columns].head())

In [None]:
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel


def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table with the dominant LDA topic of every document.

    Args:
        ldamodel: Trained model. Must support ``ldamodel[corpus]``, expose
            ``per_word_topics`` and provide ``print_topic(topic_id, topn)``.
        corpus: Bag-of-words documents passed to ``ldamodel[corpus]``.
        texts: Original texts, positionally aligned with ``corpus``.

    Returns:
        pandas.DataFrame: One row per document with the columns
        ``Dominant_Topic``, ``Perc_Contribution`` (rounded to 4 decimals),
        ``Topic_Keywords`` and the original text in a column labelled ``0``.
        A document for which the model returns no topics keeps its row, with
        missing values in the three topic columns.
    """
    keyword_cache = {}  # topic id -> keyword string, formatted once per topic
    rows = []

    for model_output in ldamodel[corpus]:
        # With per_word_topics enabled each item is a tuple whose first element
        # is the (topic_id, probability) list; otherwise the item is that list.
        # Read the flag from the model that was passed in, not from a global.
        topic_dist = model_output[0] if ldamodel.per_word_topics else model_output

        if not topic_dist:
            # Keep a placeholder so later rows stay aligned with `texts`;
            # skipping the row would shift every following topic by one.
            rows.append((None, None, None))
            continue

        # Highest-probability topic; on ties the first listed one wins.
        dominant_topic, perc_contribution = max(topic_dist, key=lambda pair: pair[1])
        dominant_topic = int(dominant_topic)

        if dominant_topic not in keyword_cache:
            keyword_cache[dominant_topic] = ldamodel.print_topic(dominant_topic, 5)

        rows.append((
            dominant_topic,
            round(float(perc_contribution), 4),
            keyword_cache[dominant_topic],
        ))

    # Build the frame once; growing a DataFrame with pd.concat inside the loop
    # is quadratic in the number of documents.
    sent_topics_df = pd.DataFrame(
        rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add the original headline (unnamed Series, so the column label is 0)
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


if not df.empty:
    # 1. Create a Dictionary and Corpus for LDA
    # Dictionary maps each unique word to an ID
    tokenized_docs = df['processed_headline'].tolist()
    dictionary = Dictionary(tokenized_docs)

    # Filter out tokens that appear in less than 15 documents or more than 50% of the documents
    dictionary.filter_extremes(no_below=15, no_above=0.5)

    # Corpus is a list of (token_id, token_count) tuples for each document
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

    # 2. Train the LDA Model
    NUM_TOPICS = 5 # A good starting point, can be optimized later

    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=NUM_TOPICS,
        random_state=42,
        chunksize=100,
        passes=10,
        per_word_topics=True
    )

    # 3. Print the Topics and Keywords
    print(f"\n--- LDA Model Topics (N={NUM_TOPICS}) ---")
    for idx, topic in lda_model.print_topics(num_words=5):
        print(f"Topic {idx}: {topic}")

    # 4. Assign the dominant topic back to each headline
    df_topic_keywords = format_topics_sentences(lda_model, corpus, df['headline'].tolist())

    # Show how the topics map to the headlines
    print("\n--- Headlines with Dominant Topic Assignment ---")
    print(df_topic_keywords.head())

In [None]:
# Requires df to be loaded with a datetime 'date' column
if not df.empty:
    print("--- Time Series Analysis: Publication Frequency ---")

    # Bucket every article by calendar month and by calendar day, then count
    # the articles per bucket in chronological order.
    df['publication_month'] = df['date'].dt.to_period('M')
    monthly_counts = df['publication_month'].value_counts().sort_index()
    df['publication_day'] = df['date'].dt.to_period('D')
    daily_counts = df['publication_day'].value_counts().sort_index()

    # One line chart per granularity: (counts, line styling, title, x-axis label).
    # The daily chart uses a thin unmarked line so individual spikes stay visible.
    trend_charts = (
        (
            monthly_counts,
            {'marker': 'o', 'color': 'teal', 'linewidth': 2},
            'Article Frequency Over Time (Monthly)',
            'Date (Month)',
        ),
        (
            daily_counts,
            {'color': 'darkred', 'linewidth': 1},
            'Article Frequency Over Time (Daily Spikes)',
            'Date (Day)',
        ),
    )

    for counts, line_style, chart_title, x_label in trend_charts:
        plt.figure(figsize=(15, 6))
        counts.plot(kind='line', **line_style)
        plt.title(chart_title)
        plt.xlabel(x_label)
        plt.ylabel('Article Count')
        plt.xticks(rotation=45)
        plt.grid(True, linestyle='--')
        plt.tight_layout()
        plt.show()

In [None]:
if not df.empty:
    print("--- Intraday Timing Analysis: Peak Hours ---")

    # Extract the hour of publication (NaN where the date is missing)
    df['publication_hour'] = df['date'].dt.hour

    # Count the frequency by hour. Rows without a date are dropped and the hours
    # cast to int, so labels read "10" rather than "10.0". Reindexing to all 24
    # hours gives every hour its own bar (0 if it has no articles); the bars are
    # drawn at categorical positions 0..23, which then match the hour values
    # used for the x ticks below.
    hourly_counts = (
        df['publication_hour']
        .dropna()
        .astype(int)
        .value_counts()
        .reindex(range(24), fill_value=0)
    )

    plt.figure(figsize=(10, 6))
    # Use a bar plot for clear hourly comparison
    sns.barplot(x=hourly_counts.index, y=hourly_counts.values, palette='viridis')

    plt.title('News Publication Frequency by Hour of the Day')
    plt.xlabel('Hour of Day (24h Clock)')
    plt.ylabel('Number of Articles')
    plt.xticks(range(0, 24)) # Ensure all 24 hours are represented on the x-axis
    plt.tight_layout()
    plt.show()

    # Identify and print the top 3 peak hours; hours without any article are
    # excluded so the zero-filled placeholders never appear as "peaks".
    top_hours = hourly_counts[hourly_counts > 0].nlargest(3)
    print("\nTop 3 Peak News Publication Hours:")
    for hour, count in top_hours.items():
        print(f"Hour {hour}:00 (e.g., {hour}:00 AM/PM) with {count:,} articles.")

In [None]:


if not df.empty:
    print("--- 1. Publisher Dominance Analysis ---")

    # 1.1 Article volume per publisher, using the names exactly as stored
    raw_publisher_counts = df['publisher'].value_counts()
    print("Top 10 Raw Publisher Names (by volume):")
    print(raw_publisher_counts.head(10))

    # -----------------------------------------------------

    print("\n--- 2. Domain Normalization (Identifying Organizations) ---")

    def extract_domain(publisher):
        """Return the lower-cased, stripped publisher, reduced to the text after its last '@' if any."""
        normalized = str(publisher).lower().strip()
        # rpartition yields the whole string as its last element when there is
        # no '@', so names without one pass through unchanged.
        return normalized.rpartition('@')[2]

    # Normalize every publisher, then count articles per normalized name
    df['publisher_domain'] = df['publisher'].apply(extract_domain)
    domain_counts = df['publisher_domain'].value_counts()
    print("Top 10 Normalized Publishers (by domain/volume):")
    print(domain_counts.head(10))

    # -----------------------------------------------------

    print("\n--- 3. Topic Difference by Dominant Publisher ---")

    # The five highest-volume normalized publishers are compared below
    top_domains = domain_counts.head(5).index.tolist()

    # This step needs the tokenized headlines from the preprocessing cell. It
    # compares the most frequent tokens per publisher rather than using the
    # LDA model.
    if 'processed_headline' not in df.columns:
        print("\nTopic analysis skipped: 'processed_headline' column not found. Please run Topic Modeling first.")
    else:
        print(f"Analyzing headline keywords for the top 5 domains: {top_domains}")

        for domain in top_domains:
            # Articles published under the current normalized name
            is_current_domain = df['publisher_domain'] == domain
            domain_df = df[is_current_domain]

            # Flatten the token lists into one whitespace-separated string ...
            all_text = ' '.join(' '.join(tokens) for tokens in domain_df['processed_headline'])

            # ... and count how often each word occurs in it
            word_counts = pd.Series(all_text.split()).value_counts()

            print(f"\n--- Top 5 Keywords for {domain} ({len(domain_df)} articles) ---")
            print(word_counts.head(5).to_string())

        print("\nInterpretation:")
        print("By comparing the top keywords across domains, you can infer differences in their reporting focus (e.g., one focuses on 'merger' and 'acquisition', another on 'economic' and 'report').")