In [None]:
# Mount Google Drive in the Colab session; the paths used by later cells live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import os

import pandas as pd

# Load the raw financial-news CSV straight from the Hugging Face dataset repo.
df = pd.read_csv("hf://datasets/Shadow-Blade/financialNews/financialNews.csv")

# Define the output file path. The Drive root is spelled "MyDrive" so this cell
# writes under the same root as the project_root used by the preprocessing cells.
output_path = "/content/drive/MyDrive/IS450 Project/Data/Historical News/Raw/raw_financial_news.csv"

# Create the target folder first: to_csv does not create missing directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Save as CSV
df.to_csv(output_path, index=False)

print(f"Data saved at: {output_path}")


In [None]:
import os
import pandas as pd
import json
import re
import html
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def _ensure_nltk_resource(lookup_path, package_name):
    """Download `package_name` only when `lookup_path` is not found in the local NLTK data."""
    try:
        nltk.data.find(lookup_path)
    except LookupError:
        print(f"Downloading NLTK resource: {package_name}")
        nltk.download(package_name)


# Tokenizer data is checked first, then the stopword corpus.
_ensure_nltk_resource('tokenizers/punkt_tab', 'punkt_tab')
_ensure_nltk_resource('corpora/stopwords', 'stopwords')

print("Imports complete and NLTK resources checked/downloaded.")


In [None]:
# Root folder of the project on the mounted Google Drive
project_root = '/content/drive/MyDrive/IS450 Project'

# Input workbook and the base folder that holds the derived datasets
input_xlsx = os.path.join(project_root, 'Data/Historical News/Supplementary/unique_financial_news.xlsx')
base_output_path = os.path.join(project_root, 'Data/Historical News')

# One output folder per downstream pipeline, created on first use
finbert_output_dir = os.path.join(base_output_path, "FinBERT_Data")
os.makedirs(finbert_output_dir, exist_ok=True)

ml_output_dir = os.path.join(base_output_path, "ML_Data")
os.makedirs(ml_output_dir, exist_ok=True)

# Echo the resolved locations so a wrong path is visible before processing starts
print(
    f"Input file: {input_xlsx}",
    f"FinBERT Output directory: {finbert_output_dir}",
    f"ML Output directory: {ml_output_dir}",
    sep="\n",
)


In [None]:
# Words that must survive stopword removal (negations and "but")
stopwords_to_keep = {"not", "no", "but"}

default_stopwords = set(stopwords.words('english'))
custom_stopwords = {word for word in default_stopwords if word not in stopwords_to_keep}

print(f"Custom stopwords set created. Total: {len(custom_stopwords)}")


In [None]:
def clean_news_text(text):
    """
    Clean news text by unescaping HTML, removing URLs/tags, and normalizing whitespace.

    Args:
        text: Raw text. Any value that is not a ``str`` is treated as missing.

    Returns:
        The cleaned string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Two passes also resolve double-encoded entities ("&amp;amp;" -> "&amp;" -> "&").
    text = html.unescape(html.unescape(text))
    text = re.sub(r'<.*?>', '', text)
    # In a raw string the regex classes take a single backslash: r'\S' means
    # "non-whitespace", while r'\\S' matches a literal backslash followed by "S"
    # and therefore never removes a URL.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Collapse every whitespace run (spaces, tabs, newlines) into one space.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def detect_truncated_description(description):
    """
    Heuristically decide whether a description looks cut off.

    Returns False for non-string or blank input. Otherwise returns True when
    the stripped text ends with an ellipsis, does not end with '.', '!' or '?',
    or when its final word (minus the closing punctuation mark) is shorter
    than three characters.
    """
    if not isinstance(description, str):
        return False

    body = description.strip()
    if not body:
        return False

    if body.endswith(("...", "…")):
        return True

    if body[-1] not in ".!?":
        return True

    # The text is non-empty and ends with sentence punctuation, so the final
    # whitespace-separated word exists and carries that punctuation mark.
    final_word = body.split()[-1]
    stem = final_word[:-1]
    return len(stem) < 3


def detect_truncated_title(title):
    """
    Report whether a title ends with an ellipsis ("..." or "…").

    Non-string and blank titles are never considered truncated.
    """
    if isinstance(title, str):
        return title.strip().endswith(("...", "…"))
    return False


In [None]:
def preprocess_for_finbert_news(title, description):
    """
    Combine title/description and apply basic cleaning for FinBERT.

    Missing scalars (None, NaN, pd.NA, NaT) contribute an empty string. pandas
    reports empty spreadsheet cells as NaN, and str(NaN) would otherwise put
    the literal word "nan" into the model input.

    Args:
        title: Headline value; non-string values are converted with str().
        description: Description value, handled the same way as ``title``.

    Returns:
        The combined text after clean_news_text() has been applied.
    """
    title_str, desc_str = (
        # is_scalar guards pd.isna, which returns an array for list-like input.
        "" if (pd.api.types.is_scalar(value) and pd.isna(value)) else str(value)
        for value in (title, description)
    )
    combined = f"{title_str} {desc_str}".strip()
    return clean_news_text(combined)


def preprocess_for_traditional_ml_news(title, description):
    """
    Combine title/description and clean aggressively for traditional ML.

    After the basic cleaning, the text is lower-cased, "/" becomes a space,
    every character other than a-z, 0-9, "_" and whitespace is dropped, and
    whitespace runs are collapsed to single spaces.

    Missing scalars (None, NaN, pd.NA, NaT) contribute an empty string. pandas
    reports empty spreadsheet cells as NaN, and str(NaN) would otherwise add
    a spurious "nan" token.

    Args:
        title: Headline value; non-string values are converted with str().
        description: Description value, handled the same way as ``title``.

    Returns:
        The normalized, space-separated lowercase text.
    """
    title_str, desc_str = (
        # is_scalar guards pd.isna, which returns an array for list-like input.
        "" if (pd.api.types.is_scalar(value) and pd.isna(value)) else str(value)
        for value in (title, description)
    )
    combined = f"{title_str} {desc_str}".strip()
    cleaned = clean_news_text(combined).lower()
    cleaned = cleaned.replace('/', ' ')
    # Single backslash in the raw string: r'\s' keeps whitespace in the allowed
    # set. With r'\\s' the class allowed a literal backslash and "s" instead, so
    # every space was deleted and the whole text fused into one token.
    cleaned = re.sub(r'[^a-z0-9_\s]', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned

def tokenize_and_remove_stopwords(text, stopwords_set):
    """
    Tokenize `text` with word_tokenize and filter the result.

    Tokens found in `stopwords_set` and tokens of length one are discarded.
    Non-string input yields an empty list.
    """
    if not isinstance(text, str):
        return []

    kept = []
    for candidate in word_tokenize(text):
        if candidate in stopwords_set or len(candidate) <= 1:
            continue
        kept.append(candidate)
    return kept


In [None]:
def process_news():
    """
    Read the news workbook, derive FinBERT text and ML tokens for each row,
    and write each result set to its own JSON file.

    Uses the module-level settings input_xlsx, finbert_output_dir,
    ml_output_dir and custom_stopwords. Load, per-row and save failures are
    reported with print() rather than raised; the function returns None.
    """
    print(f"Starting processing for: {input_xlsx}")
    try:
        news_df = pd.read_excel(input_xlsx)
        print(f"Loaded {len(news_df)} rows from Excel.")
    except FileNotFoundError:
        print(f"ERROR: Input file not found at {input_xlsx}")
        return
    except Exception as load_error:
        print(f"ERROR: Failed to load Excel file: {load_error}")
        return

    finbert_records, ml_records = [], []
    rows_done = 0

    for row_index, news_row in news_df.iterrows():
        try:
            news_id = int(row_index)  # Ensure ID is integer
            headline = news_row.get("title", "")
            summary = news_row.get("description", "")

            # The FinBERT record is stored before the ML steps run, so a
            # failure below leaves this row in the FinBERT output only.
            finbert_records.append({
                "id": news_id,
                "processed_text_finbert": preprocess_for_finbert_news(headline, summary),
            })

            normalized = preprocess_for_traditional_ml_news(headline, summary)
            ml_records.append({
                "id": news_id,
                "processed_tokens_ml": tokenize_and_remove_stopwords(normalized, custom_stopwords),
            })

            rows_done += 1
            if rows_done % 10000 == 0:  # Progress indicator every 10k records
                print(f"Processed {rows_done} records...")
        except Exception as row_error:
            # Report and skip problematic rows
            print(f"ERROR processing row index {row_index}: {row_error}")

    print(f"Finished processing {rows_done} records.")

    # Each output is written independently so one failed save does not block the other.
    outputs = (
        ("FinBERT", finbert_records, os.path.join(finbert_output_dir, "news_finbert.json")),
        ("ML", ml_records, os.path.join(ml_output_dir, "news_ml.json")),
    )
    for label, records, destination in outputs:
        try:
            with open(destination, 'w', encoding='utf-8') as handle:
                json.dump(records, handle, ensure_ascii=False, indent=2)
            print(f"Saved {len(records)} {label} records to {destination}")
        except Exception as save_error:
            print(f"ERROR saving {label} data: {save_error}")


In [None]:
# Run the preprocessing pipeline. process_news() prints its own load/row/save errors
# instead of raising, so the message below does not by itself indicate success.
process_news()
print("\nProcessing attempt complete.")
