In [6]:
"""
News article categorization script - Crime focus
This script analyzes news articles and categorizes them based on crime-related keywords
"""

import json
import pandas as pd
import spacy
import os
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter

# Configuration
# Sites to categorize; each name is used as a sub-directory of DATA_DIR.
SITE_LIST = ["foxnews"]
# Paths are resolved from the current working directory, so results depend on
# where the notebook kernel was started.
PROJECT_ROOT = os.getcwd()
DATA_DIR = os.path.join(PROJECT_ROOT, "..", "..", "data")
# Reference dataset (read as JSON Lines) used to derive the crime keyword list.
# Built with os.path.join for consistency with DATA_DIR and other path handling.
REFERENCE_DATA_PATH = os.path.join(DATA_DIR, "News_Category_Dataset_v3.json")

In [3]:
# Load spaCy model for text processing
# NOTE(review): `nlp` is not referenced by any other code visible in this
# notebook; text cleaning only uses spaCy's STOP_WORDS set. Confirm whether
# loading the `en_core_web_sm` model is still needed.
nlp = spacy.load("en_core_web_sm")

In [4]:
def load_jsonl_file(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Loading is best-effort: blank lines are ignored and lines that are not
    valid JSON are skipped. The number of skipped malformed lines is printed
    so that dropped records are visible instead of silently lost.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON document per line.

    Returns:
        pandas.DataFrame built from the parsed lines, in file order. The frame
        is empty (no rows, no columns) when no line could be parsed.

    Raises:
        FileNotFoundError: Propagated from ``open`` when ``file_path`` does
            not exist; callers in this notebook rely on catching it.
    """
    records = []
    malformed_count = 0
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank / whitespace-only lines are not data; skip them without
            # counting them as malformed.
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                malformed_count += 1

    if malformed_count:
        print(f"Warning: skipped {malformed_count} malformed line(s) in {file_path}")
    return pd.DataFrame(records)

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lowercase tokens.

    Anything that is not a ``str`` (for example None or NaN) yields "".
    Otherwise the text is lowercased, every character that is neither
    whitespace nor a non-digit alphanumeric is deleted, and the remaining
    whitespace-separated tokens that appear in STOP_WORDS are discarded.

    Characters are deleted rather than replaced with a space, so a hyphenated
    phrase such as "19-year-old" collapses to the single token "yearold".
    """
    if not isinstance(text, str):
        return ""

    # One pass over the lowercased text: keep whitespace and alphanumerics,
    # but never digits.
    kept_chars = [
        ch for ch in text.lower()
        if (ch.isalnum() or ch.isspace()) and not ch.isdigit()
    ]
    tokens = "".join(kept_chars).split()

    # Drop stop words and rebuild a single space-separated string.
    return " ".join(tok for tok in tokens if tok not in STOP_WORDS)

def extract_crime_keywords(top_n=100, candidate_pool=200):
    """Extract the most frequent crime-related keywords from the reference dataset.

    The reference file at REFERENCE_DATA_PATH is loaded, rows whose
    ``category`` equals "CRIME" are selected, and the preprocessed words of
    their ``headline`` and ``short_description`` fields are counted. The
    ``candidate_pool`` most common words are taken, a hand-maintained list of
    words judged unrelated to crime is removed, and the first ``top_n``
    survivors are returned.

    Args:
        top_n: Maximum number of keywords to return (default 100).
        candidate_pool: Number of most-common words considered before the
            unrelated-word filter is applied (default 200).

    Returns:
        list[str]: Keywords ordered from most to least frequent.

    Raises:
        FileNotFoundError: If the reference dataset file does not exist.
        KeyError: If the loaded reference data lacks a required column
            (which is also the case when no line of the file could be parsed).
    """
    print("Extracting crime keywords from reference dataset...")

    # Load reference dataset
    df_ref = load_jsonl_file(REFERENCE_DATA_PATH)

    # Fail with an explicit message instead of a bare KeyError on first access.
    required_columns = {"category", "headline", "short_description"}
    missing_columns = required_columns.difference(df_ref.columns)
    if missing_columns:
        raise KeyError(
            f"Reference dataset {REFERENCE_DATA_PATH} is missing columns: "
            f"{sorted(missing_columns)}"
        )

    # Filter for crime category
    df_crime = df_ref[df_ref["category"] == "CRIME"]

    # Count word frequencies. Rows are visited in order, headline before
    # description, because Counter.most_common breaks ties by first insertion.
    crime_word_counter = Counter()
    for headline, description in zip(df_crime["headline"], df_crime["short_description"]):
        crime_word_counter.update(preprocess_text(headline).split())
        crime_word_counter.update(preprocess_text(description).split())

    # Non-crime-related words to filter out
    non_related_words = {
        'new', 'florida', 'texas', 'nyc', 'california', 'says', 'idaho', 'chicago',
        'georgia', 'carolina', 'york', 'city', 'video', 'car', 'home', 'years', 'family',
        'los', 'mom', 'angeles', 'north', 'men', 'state', 'judge', 'officials', 'house',
        'near', 'people', 'sex', 'virginia', 'university', 'court', 'philadelphia', 'fire',
        'subway', 'nypd', 'arizona', 'washington', 'women', 'school', 'teen',
        'mother', 'woman', 'shows', 'girl', 'driver', 'student', 'child', 'search', 'leaves', 'found',
        'man', 'san', 'high', 'ohio', 'photo'
    }

    # The filter runs on the candidate pool only, so fewer than top_n keywords
    # are returned when too many candidates are filtered out.
    top_crime_words = [
        word for word, _ in crime_word_counter.most_common(candidate_pool)
        if word not in non_related_words
    ][:top_n]

    print(f"Extracted {len(top_crime_words)} crime keywords")
    print(f"Top 10 crime keywords: {', '.join(top_crime_words[:10])}")
    return top_crime_words

def categorize_articles(site_name, crime_keywords, min_keyword_matches=3):
    """Label each article of one site as "Crime" or "Other" and save the result.

    Articles are read from ``<DATA_DIR>/<site_name>/backup/articles_cleaned.jsonl``.
    An article is labelled "Crime" when the set of distinct preprocessed words
    from its ``title`` and ``description`` contains at least
    ``min_keyword_matches`` of the given keywords; repeated occurrences of the
    same word count once. The labelled frame is written to
    ``<DATA_DIR>/<site_name>/articles_categorized.json`` as JSON Lines (one
    record per line, despite the ``.json`` extension).

    Args:
        site_name: Name of the site sub-directory under DATA_DIR.
        crime_keywords: Iterable of keyword strings to look for.
        min_keyword_matches: Minimum number of distinct keywords an article
            must contain to be labelled "Crime" (default 3).

    Returns:
        pandas.DataFrame with an added/overwritten ``category`` column, or
        None when the input file is missing or contains no articles (nothing
        is written in that case).

    Raises:
        KeyError: If the loaded articles lack a ``title`` or ``description`` column.
    """
    print(f"Categorizing articles for {site_name}...")

    # Load articles
    file_path = os.path.join(DATA_DIR, site_name, "backup", "articles_cleaned.jsonl")
    try:
        df = load_jsonl_file(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None

    # An empty frame has no columns; bail out before the column lookups and
    # the percentage computation below can fail.
    if df.empty:
        print(f"No articles loaded from {file_path}")
        return None

    # Set lookup instead of scanning the keyword list for every word.
    keyword_set = set(crime_keywords)

    def count_keyword_matches(title, description):
        """Return how many distinct crime keywords appear in the article text."""
        words = set(preprocess_text(title).split())
        words.update(preprocess_text(description).split())
        return len(words & keyword_set)

    # Apply categorization positionally, without adding scratch columns to df.
    df["category"] = [
        "Crime" if count_keyword_matches(title, description) >= min_keyword_matches else "Other"
        for title, description in zip(df["title"], df["description"])
    ]

    # Save categorized articles
    output_path = os.path.join(DATA_DIR, site_name, "articles_categorized.json")
    df.to_json(output_path, orient='records', lines=True)

    crime_count = int((df["category"] == "Crime").sum())
    total_count = len(df)
    print(f"Categorized {total_count} articles for {site_name}: {crime_count} crime articles ({crime_count/total_count:.1%})")
    return df

In [7]:
"""Main function to run the categorization process"""
# Run the whole pipeline: derive the crime keywords, then categorize every
# site listed in SITE_LIST.
print("Starting crime article categorization...")

# Extract crime keywords from reference dataset
crime_keywords = extract_crime_keywords()

# Process each site, keeping the returned frames so they can be inspected
# in later cells instead of being discarded.
categorized_by_site = {
    site: categorize_articles(site, crime_keywords) for site in SITE_LIST
}

# categorize_articles returns None when it produced no output for a site
# (e.g. the input file was not found); surface that before declaring success.
skipped_sites = [site for site, articles in categorized_by_site.items() if articles is None]
if skipped_sites:
    print(f"Warning: no categorized output for: {', '.join(skipped_sites)}")

print("Categorization complete!")

Starting crime article categorization...
Extracting crime keywords from reference dataset...
Extracted 100 crime keywords
Top 10 crime keywords: police, shooting, said, yearold, allegedly, suspect, killed, shot, death, accused
Categorizing articles for foxnews...
File not found: /Users/weikuo/Documents/github-repositories/scrape_news_articles/src/preprocess/../../data/foxnews/backup/articles_cleaned.jsonl
Categorization complete!
