In [None]:
import nltk
# Fetch the NLTK data packages used by this notebook, referenced by their
# standard NLTK resource IDs.
# NOTE(review): the resource-to-function mapping below is the usual NLTK pairing;
# confirm against the installed NLTK version.
nltk.download('punkt_tab')                       # tokenizer data (presumably for word_tokenize)
nltk.download('stopwords')                       # stop-word lists read via nltk.corpus.stopwords
nltk.download('wordnet')                         # WordNet data (presumably for WordNetLemmatizer)
nltk.download('averaged_perceptron_tagger_eng')  # English POS tagger model (presumably for pos_tag)


In [None]:
import json
import re
import html
import os
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import nltk
import pandas as pd

# Shared lemmatizer instance used by the token-level helpers below.
lemmatizer = WordNetLemmatizer()

# Root folder of the project on the mounted Google Drive.
project_root = '/content/drive/MyDrive/IS450 Project'

# Every input/output of this notebook lives under "<project_root>/Data/Historical News".
historical_news_root = os.path.join(project_root, 'Data', 'Historical News')

# NOTE: despite its name, `filtered_news_dir` is the path of the input Excel *file*.
filtered_news_dir = os.path.join(historical_news_root, 'Supplementary', 'unique_financial_news.xlsx')
lda_nmf_data_dir = os.path.join(historical_news_root, 'LDA_NMF_Data')
bertopic_data_dir = os.path.join(historical_news_root, 'BERTopic_Data')

# Create the two output folders when they are not there yet.
for output_dir in (lda_nmf_data_dir, bertopic_data_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

print(f"Filtered posts directory: {filtered_news_dir}")
print(f"LDA/NMF Data directory: {lda_nmf_data_dir}")
print(f"BERTopic Data directory: {bertopic_data_dir}")


In [None]:
# Default stop-word set: NLTK's English stop words, held as a set for fast membership tests.
default_stopwords = set(stopwords.words('english'))

# Domain-specific stop words are currently DISABLED. The commented-out lines below
# show how a custom set could be built on top of the default one; they have no
# effect until uncommented.
# additional_stopwords = {"stock", "company", "inc", "said", "year", "new", "q", "u", "today", "like"}
# additional_stopwords = {}
# custom_stopwords = default_stopwords.copy()
# custom_stopwords.update(additional_stopwords)

def initial_clean(text):
    """Normalize raw article text into lowercase, letters-only, single-spaced text.

    Steps, in order: strip HTML tags, unescape HTML entities (twice), turn
    literal backslash-n sequences into spaces, drop URLs, lowercase, expand
    contractions (best effort), drop possessive ``'s``, delete every character
    that is not an ASCII letter or whitespace, and collapse whitespace.

    Args:
        text (str): Raw text (e.g. title + description of a news item).

    Returns:
        str: Cleaned text containing only ``a-z`` words separated by single spaces.
    """
    # Remove HTML tags (e.g., <b>, <i>) BEFORE unescaping, so that entity-encoded
    # angle brackets (&lt; ... &gt;) are not mistaken for markup afterwards.
    text = re.sub(r'<.*?>', '', text)

    # Unescape HTML entities (&amp; -> &); done twice to cover doubly encoded entities.
    text = html.unescape(html.unescape(text))

    # Replace literal "\n" sequences (backslash followed by 'n') with a space.
    # This has to happen before the non-letter strip below: once the backslash is
    # deleted, the leftover 'n' glues the neighbouring words together
    # ("foo\nbar" -> "foonbar"). It also precedes URL removal so that a URL
    # directly followed by a literal "\n" does not swallow the next word.
    # The group repeats the whole "\n" pair, so a following word starting with 'n'
    # keeps its first letter.
    text = re.sub(r'(?:\\n)+', ' ', text)

    text = re.sub(r'http\S+|www\.\S+', '', text)             # Remove URLs (http/https or www.)
    text = text.lower()
    try:
        text = contractions.fix(text)                       # Expand contractions (e.g., "don't" -> "do not")
    except Exception:
        # Deliberately best-effort: if contraction expansion fails on an edge
        # case, keep going with the unexpanded text.
        pass

    text = re.sub(r"\b(\w+)'s\b", r"\1", text)              # e.g., "amazon's" -> "amazon"
    text = re.sub(r'[^a-zA-Z\s]', '', text)                 # Remove all non-letter, non-space characters

    # Real newlines are whitespace, so they are folded into single spaces here too.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# === Helper functions for truncation detection ===
def detect_truncated_title(title):
    """Heuristically flag a title as truncated.

    A string title counts as truncated when, after trimming surrounding
    whitespace, it ends with "..." or is shorter than 5 characters.
    Non-string values (e.g. missing cells) are never flagged.
    """
    if not isinstance(title, str):
        return False
    trimmed = title.strip()
    if trimmed.endswith("..."):
        return True
    return len(trimmed) < 5

def detect_truncated_description(description):
    """Heuristically flag a description as truncated.

    A string description counts as truncated when, after trimming surrounding
    whitespace, it ends with "..." or is shorter than 10 characters.
    Non-string values (e.g. missing cells) are never flagged.
    """
    if not isinstance(description, str):
        return False
    trimmed = description.strip()
    if trimmed.endswith("..."):
        return True
    return len(trimmed) < 10


In [None]:
# Map Treebank POS tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Only the first letter of the tag matters: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN  # Default to noun

def resolve_us_token(token, pos_tag):
    """Tell 'us' the pronoun apart from 'US' the country.

    Returns "united_states" when the token is "us" in any casing AND its POS tag
    is exactly "NNP" (proper noun, singular); otherwise returns the token unchanged.
    """
    lowered = token.lower()
    return "united_states" if (lowered == "us" and pos_tag == "NNP") else token

def tokenize_lemmatize_lowercase(text):
    """Tokenize, POS-tag, lemmatize and lowercase ``text``.

    For each tagged token, "us" tagged as a proper noun (NNP) is mapped to
    "united_states" via ``resolve_us_token``; every other token is lemmatized
    with the WordNet POS derived from its Treebank tag. Tokens are then
    lowercased, leading apostrophes are stripped (leading single quotes only),
    and tokens left empty by that stripping are dropped.

    Args:
        text (str): Text to tokenize.

    Returns:
        list[str]: Lowercased, lemmatized tokens.
    """
    tagged = pos_tag(word_tokenize(text))

    normalized = []
    for token, pos in tagged:
        # Resolve the US/us ambiguity on the RAW token, before lemmatizing.
        # The WordNet lemmatizer reduces the noun "us" to "u", so checking the
        # lemmatized token against "us" could never match.
        # NOTE(review): if callers lowercase the text first, the tagger has less
        # signal for NNP, so this branch may fire rarely - confirm on real data.
        resolved = resolve_us_token(token, pos)
        if resolved != token:
            normalized.append(resolved)
        else:
            normalized.append(lemmatizer.lemmatize(token, get_wordnet_pos(pos)))

    # Lowercase AFTER lemmatization and US resolution.
    lowered = (token.lower() for token in normalized)

    # Strip leading apostrophes left by the tokenizer (e.g. "'s" -> "s").
    stripped = (token.lstrip("'") for token in lowered)

    # Drop tokens that became empty (e.g. a token that was just "'").
    return [token for token in stripped if token]


In [None]:
def remove_stopwords_topic_model(tokens, stopwords_set):
    """Filter a token list down to content tokens for LDA/NMF.

    For every token: one leading non-alphanumeric character (if any) is cut off,
    then the token is discarded when it is a stop word, a known punctuation
    artifact, at most one character long, or has no word character at all.
    Digit-only tokens longer than one character are NOT removed by this function.

    Args:
        tokens: Iterable of string tokens.
        stopwords_set: Collection of stop words to exclude.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    # Tokenizer leftovers that should never reach the topic model.
    punctuation_artifacts = {"''", "'s", "``", "--", "-", "\\\\-", "...", "`", "p."}

    kept = []
    for raw_token in tokens:
        # Cut off a single leading symbol such as a quote or a dash.
        if raw_token and not raw_token[0].isalnum():
            candidate = raw_token[1:]
        else:
            candidate = raw_token

        if candidate in stopwords_set or candidate in punctuation_artifacts:
            continue

        # Require more than one character and at least one word character.
        if len(candidate) <= 1 or re.search(r'\w', candidate) is None:
            continue

        kept.append(candidate)

    return kept


In [None]:
def preprocess_for_lda_nmf(text, stopwords_set):
    """Run the full LDA/NMF pipeline on raw text and return its token list.

    Chains ``initial_clean`` -> ``tokenize_lemmatize_lowercase`` ->
    ``remove_stopwords_topic_model`` (using ``stopwords_set``).
    """
    return remove_stopwords_topic_model(
        tokenize_lemmatize_lowercase(initial_clean(text)),
        stopwords_set,
    )


In [None]:
def preprocess_for_bertopic(text):
    """Produce a cleaned text string for BERTopic.

    Only ``initial_clean`` is applied, followed by a light tidy-up; the text is
    deliberately not tokenized or lemmatized here.
    """
    # Replace slashes with spaces, then re-normalize whitespace.
    # NOTE(review): initial_clean as defined in this notebook already deletes
    # non-letter characters, so the slash replacement may have nothing to do.
    without_slashes = re.sub(r'[/]', ' ', initial_clean(text))
    return re.sub(r'\s+', ' ', without_slashes).strip()


In [None]:
# === Read input file ===
df = pd.read_excel(filtered_news_dir)


def _cell_to_text(value):
    """Return a spreadsheet cell as text, mapping missing values (None/NaN) to ''."""
    if isinstance(value, str):
        return value
    if value is None or pd.isna(value):
        return ""
    return str(value)


# === Process and store results ===
lda_news = []
bertopic_output_data = []

for idx, row in df.iterrows():
    title = row.get("title", "")
    description = row.get("description", "")

    # Truncation flags are computed on the raw cell values; the detectors
    # return False for non-string (missing) cells.
    truncated_title = detect_truncated_title(title)
    truncated_description = detect_truncated_description(description)

    # Combine title + description before preprocessing. Missing cells become
    # empty strings so that NaN is not formatted into the text as the word "nan".
    combined_text = f"{_cell_to_text(title)} {_cell_to_text(description)}"

    # --- Run the BERTopic Pipeline ---
    text_for_bertopic = preprocess_for_bertopic(combined_text)
    bertopic_output_data.append({
        "id": idx,
        "processed_text_bertopic": text_for_bertopic,
        "truncated_title": truncated_title,
        "truncated_description": truncated_description
    })

    # --- Run the LDA/NMF Pipeline ---
    processed_tokens = preprocess_for_lda_nmf(combined_text, default_stopwords)
    lda_news.append({
        "id": idx,
        "processed_text_lda": processed_tokens,
        "truncated_title": truncated_title,
        "truncated_description": truncated_description
    })

# === Save to JSONL files ===
# Use dedicated names for the output FILES and leave the *_dir variables alone:
# overwriting lda_nmf_data_dir with a file path made the BERTopic path point
# inside "lda_news.jsonl", and made this cell fail when re-run.
lda_nmf_output_path = os.path.join(lda_nmf_data_dir, "lda_news.jsonl")
bertopic_output_path = os.path.join(bertopic_data_dir, "BERTopic_Data.jsonl")

with open(lda_nmf_output_path, "w", encoding="utf-8") as f_out:
    for record in lda_news:
        json.dump(record, f_out)
        f_out.write("\n")  # One JSON document per line (JSONL)

with open(bertopic_output_path, "w", encoding="utf-8") as f_out:
    for record in bertopic_output_data:
        json.dump(record, f_out)
        f_out.write("\n")  # One JSON document per line (JSONL)

print(f"✅ Preprocessed data saved to: {lda_nmf_output_path}")
print(f"✅ Preprocessed data saved to: {bertopic_output_path}")


In [None]:
########## Predict Topics for New News Article ################
# This cell repeats the imports and helper definitions of the cells above,
# presumably so that it can be run on its own in a fresh Colab session.

# Mount Google Drive; the model files loaded further down are read from
# /content/drive/MyDrive/...
from google.colab import drive
drive.mount('/content/drive')

# Install the third-party packages this cell imports.
# NOTE(review): versions are not pinned, so results may vary between sessions.
!pip install nltk
!pip install contractions
!pip install gensim

# Fetch the NLTK data packages needed for tokenizing, stop-word removal,
# lemmatizing and POS tagging.
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
import json
import re
import html
import os
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import nltk
import pandas as pd
from gensim.models import Nmf
from gensim.models import TfidfModel
from gensim.corpora.dictionary import Dictionary

# --- 0. Methods Needed: ---
# Stop-word set passed to preprocess_for_lda_nmf below: NLTK's English stop
# words, held as a set for fast membership tests.
default_stopwords = set(stopwords.words('english'))

def initial_clean(text):
    """Normalize raw article text into lowercase, letters-only, single-spaced text.

    Steps, in order: strip HTML tags, unescape HTML entities (twice), turn
    literal backslash-n sequences into spaces, drop URLs, lowercase, expand
    contractions (best effort), drop possessive ``'s``, delete every character
    that is not an ASCII letter or whitespace, and collapse whitespace.

    Args:
        text (str): Raw text (e.g. title + body of a news article).

    Returns:
        str: Cleaned text containing only ``a-z`` words separated by single spaces.
    """
    # Remove HTML tags (e.g., <b>, <i>) BEFORE unescaping, so that entity-encoded
    # angle brackets (&lt; ... &gt;) are not mistaken for markup afterwards.
    text = re.sub(r'<.*?>', '', text)

    # Unescape HTML entities (&amp; -> &); done twice to cover doubly encoded entities.
    text = html.unescape(html.unescape(text))

    # Replace literal "\n" sequences (backslash followed by 'n') with a space.
    # This has to happen before the non-letter strip below: once the backslash is
    # deleted, the leftover 'n' glues the neighbouring words together
    # ("foo\nbar" -> "foonbar"). It also precedes URL removal so that a URL
    # directly followed by a literal "\n" does not swallow the next word.
    # The group repeats the whole "\n" pair, so a following word starting with 'n'
    # keeps its first letter.
    text = re.sub(r'(?:\\n)+', ' ', text)

    text = re.sub(r'http\S+|www\.\S+', '', text)             # Remove URLs (http/https or www.)
    text = text.lower()
    try:
        text = contractions.fix(text)                       # Expand contractions (e.g., "don't" -> "do not")
    except Exception:
        # Deliberately best-effort: if contraction expansion fails on an edge
        # case, keep going with the unexpanded text.
        pass

    text = re.sub(r"\b(\w+)'s\b", r"\1", text)              # e.g., "amazon's" -> "amazon"
    text = re.sub(r'[^a-zA-Z\s]', '', text)                 # Remove all non-letter, non-space characters

    # Real newlines are whitespace, so they are folded into single spaces here too.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Shared lemmatizer instance used by tokenize_lemmatize_lowercase below.
lemmatizer = WordNetLemmatizer()

# Map Treebank POS tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Only the first letter of the tag matters: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN  # Default to noun

def resolve_us_token(token, pos_tag):
    """Tell 'us' the pronoun apart from 'US' the country.

    Returns "united_states" when the token is "us" in any casing AND its POS tag
    is exactly "NNP" (proper noun, singular); otherwise returns the token unchanged.
    """
    lowered = token.lower()
    return "united_states" if (lowered == "us" and pos_tag == "NNP") else token

def tokenize_lemmatize_lowercase(text):
    """Tokenize, POS-tag, lemmatize and lowercase ``text``.

    For each tagged token, "us" tagged as a proper noun (NNP) is mapped to
    "united_states" via ``resolve_us_token``; every other token is lemmatized
    with the WordNet POS derived from its Treebank tag. Tokens are then
    lowercased, leading apostrophes are stripped (leading single quotes only),
    and tokens left empty by that stripping are dropped.

    Args:
        text (str): Text to tokenize.

    Returns:
        list[str]: Lowercased, lemmatized tokens.
    """
    tagged = pos_tag(word_tokenize(text))

    normalized = []
    for token, pos in tagged:
        # Resolve the US/us ambiguity on the RAW token, before lemmatizing.
        # The WordNet lemmatizer reduces the noun "us" to "u", so checking the
        # lemmatized token against "us" could never match.
        # NOTE(review): if callers lowercase the text first, the tagger has less
        # signal for NNP, so this branch may fire rarely - confirm on real data.
        resolved = resolve_us_token(token, pos)
        if resolved != token:
            normalized.append(resolved)
        else:
            normalized.append(lemmatizer.lemmatize(token, get_wordnet_pos(pos)))

    # Lowercase AFTER lemmatization and US resolution.
    lowered = (token.lower() for token in normalized)

    # Strip leading apostrophes left by the tokenizer (e.g. "'s" -> "s").
    stripped = (token.lstrip("'") for token in lowered)

    # Drop tokens that became empty (e.g. a token that was just "'").
    return [token for token in stripped if token]

def remove_stopwords_topic_model(tokens, stopwords_set):
    """Filter a token list down to content tokens for LDA/NMF.

    For every token: one leading non-alphanumeric character (if any) is cut off,
    then the token is discarded when it is a stop word, a known punctuation
    artifact, at most one character long, or has no word character at all.
    Digit-only tokens longer than one character are NOT removed by this function.

    Args:
        tokens: Iterable of string tokens.
        stopwords_set: Collection of stop words to exclude.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    # Tokenizer leftovers that should never reach the topic model.
    punctuation_artifacts = {"''", "'s", "``", "--", "-", "\\\\-", "...", "`", "p."}

    kept = []
    for raw_token in tokens:
        # Cut off a single leading symbol such as a quote or a dash.
        if raw_token and not raw_token[0].isalnum():
            candidate = raw_token[1:]
        else:
            candidate = raw_token

        if candidate in stopwords_set or candidate in punctuation_artifacts:
            continue

        # Require more than one character and at least one word character.
        if len(candidate) <= 1 or re.search(r'\w', candidate) is None:
            continue

        kept.append(candidate)

    return kept

def preprocess_for_lda_nmf(text, stopwords_set):
    """Run the full LDA/NMF pipeline on raw text and return its token list.

    Chains ``initial_clean`` -> ``tokenize_lemmatize_lowercase`` ->
    ``remove_stopwords_topic_model`` (using ``stopwords_set``).
    """
    return remove_stopwords_topic_model(
        tokenize_lemmatize_lowercase(initial_clean(text)),
        stopwords_set,
    )

# --- Add Topic Labels Dictionary ---
# Human-readable labels keyed by 1-based topic number (1..25), listed in topic order.
topic_labels = dict(enumerate((
    "Tesla & Electric Vehicles",
    "Earnings Estimates & Surprises",
    "Elon Musk & Twitter",
    "Quarterly Financial Results",
    "Banks, Interest Rates & Inflation",
    "Stock Market Movements",
    "Corporate Announcements",
    "Yahoo Finance & Earnings Coverage",
    "Big Tech & AI",
    "Healthcare & Pharmaceuticals",
    "Market Indices & Economic Data",
    "Earnings Calls & Executive Commentary",
    "Retail & Consumer Spending",
    "Analyst Insights & Investment Ideas",
    "Media & Streaming",
    "Auto Industry & Labor Strikes",
    "Costco & Wholesale Retail",
    "Insider Trading & Share Activity",
    "Aerospace & Aviation",
    "E-commerce & Amazon",
    "Social Media & Advertising",
    "Dividends & Energy Stocks",
    "Industrial Tech & Conglomerates",
    "ETFs & Asset Management",
    "AI & Semiconductors",
), start=1))
# --- End of Topic Labels ---

# --- 1. Define the new text and preprocess it ---
# Sample article (headline + body) to classify.
title = "Nvidia and Other Chip Stocks Fall as Tariff-Fueled Rout Continues"
content = "Spencer Platt / Getty Images Traders work the floor of the New York Stock Exchange on Friday morning\n\nThe chips are down.\n\nChip stocks fell Friday, losing ground on the second trading day after President Donald Trump unveiled a set of global tariffs that sent stock markets dramatically lower—and the first since China announced its own retaliatory measures. The latest news offered investors yet another indication that trade war is here to stay, at least for now.\n\nThe PHLX Semiconductor Index (SOX), which tracks chip shares, was recently down about 7%. Four of its components, namely Marvell Technology (MRVL), Coherent (COHR), Entegris (ENTG) and Micron Technology (MU), were recently off more than 7% apiece.\n\n“Because many finished electronics goods and IT infrastructure goods are ultimately imported from many of [the countries affected by the new tariffs],” UBS analysts wrote Thursday, “this could have a profound negative impact on electronics demand.”\n\nNvidia (NVDA), one of last year’s stock-market darlings, was off more than 7% in recent trading.\n\nCiti analysts yesterday suggested that analog chipmakers like Analog Devices (ADI) could outperform the broader sector in a downturn; its shares were off some 6% in recent trading.\n\nThis week’s tariff announcements have hit stocks broadly. Today, all three major indexes were recently down roughly 4%, with the blue-chip Dow faring only a bit better. All 11 of the S&P 500’s sectors were recently in decline; its semiconductor and equipment subindex was down about 7%.\n\nChina’s new tariffs on US imports are set to take effect Thursday, according to that country’s finance ministry.\n\n\n\nRead the original article on Investopedia"

# Headline and body are joined into one string and run through the LDA/NMF
# token pipeline defined above; `cleaned` is the resulting list of tokens.
combined = f"{title} {content}"
cleaned = preprocess_for_lda_nmf(combined, default_stopwords)
print(cleaned)

project_root = '/content/drive/MyDrive/IS450 Project'

# --- 2. Load the saved models and dictionary ---
# All three saved files sit in the same output folder.
nmf_artifacts_dir = os.path.join(project_root, "outputs", "topic_modeling", "nmf_news")
model_path = os.path.join(nmf_artifacts_dir, 'gensim_nmf_tfidf.model')
dict_path = os.path.join(nmf_artifacts_dir, 'gensim_dictionary.dict')
tfidf_model_path = os.path.join(nmf_artifacts_dir, 'gensim_tfidf.model')

print(f"Loading NMF model from: {model_path}")
print(f"Loading Dictionary from: {dict_path}")
print(f"Loading TF-IDF model from: {tfidf_model_path}")

# Stays False unless all three files load.
models_loaded = False
try:
    loaded_nmf_model = Nmf.load(model_path)
    loaded_dictionary = Dictionary.load(dict_path)
    loaded_tfidf_model = TfidfModel.load(tfidf_model_path)
except FileNotFoundError:
    print("ERROR: One or more model/dictionary files not found.")
except Exception as e:
    print(f"ERROR: An unexpected error occurred loading models: {e}")
else:
    models_loaded = True
    print("Models and dictionary loaded successfully.")

if not models_loaded:
    print("Skipping prediction because models could not be loaded.")
else:
    # --- 3. Prepare the new text for the model ---
    # Tokens -> bag-of-words (loaded dictionary) -> TF-IDF weights (loaded TF-IDF model).
    new_text_bow = loaded_dictionary.doc2bow(cleaned)
    new_text_tfidf = loaded_tfidf_model[new_text_bow]

    # --- 4. Apply the NMF model ---
    topic_distribution = loaded_nmf_model[new_text_tfidf]

    # --- 5. Extract and display the topics ---
    if not topic_distribution:
        print("The NMF model did not assign any topics to this document (possibly due to all words being filtered out).")
    else:
        # Highest weight first.
        sorted_topics = sorted(topic_distribution, key=lambda item: item[1], reverse=True)

        print("\nPredicted Topic Distribution:")
        print("(Note: 'Probability' here refers to the NMF topic weight/contribution for this document)")
        for topic_idx, prob in sorted_topics:
            # Model topic indices are 0-based; topic_labels is keyed 1-based.
            topic_num = topic_idx + 1
            label = topic_labels.get(topic_num, "Unknown Topic")
            print(f"  Topic: {label}: Probability = {prob:.4f}")

        # --- Display the single highest-weighted topic ---
        top_topic_idx, top_prob = sorted_topics[0]
        top_topic_num = top_topic_idx + 1
        top_label = topic_labels.get(top_topic_num, "Unknown Topic")
        print(f"\nTop Topic: {top_label} (Weight/Probability: {top_prob:.4f})")
