In [3]:
import pandas as pd
import json
import os
import requests
import zipfile
import re

# --- NLP Library Imports ---
import spacy
from spacy import displacy
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [4]:
json_file_name = 'News_Category_Dataset_v3.json'
def setup_data(random_state=None):
    """Ensure the dataset is available locally and load it into a shuffled DataFrame.

    If ``json_file_name`` is not present in the working directory, the Kaggle
    archive is downloaded, the JSON file is extracted from it and the archive
    is deleted. The file is then parsed as JSON Lines (one object per line).

    Args:
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. The default ``None`` leaves the shuffle
            unseeded, which is what calling ``setup_data()`` always did.

    Returns:
        A DataFrame with one row per record and an added ``text`` column
        (``headline + '. ' + short_description``), shuffled, with the index
        reset to 0..n-1. ``None`` if the download/extraction fails or the
        JSON file cannot be found.
    """
    # Check for the file and download if necessary
    if not os.path.exists(json_file_name):
        print(f"'{json_file_name}' not found. Attempting to download and extract.")
        dataset_url = "https://www.kaggle.com/api/v1/datasets/download/rmisra/news-category-dataset"
        zip_file_name = "news-category-dataset.zip"

        try:
            # Download. The timeout keeps a stalled connection from hanging the
            # notebook forever; it ends up in the same best-effort handler below.
            with requests.get(dataset_url, stream=True, timeout=30) as r:
                r.raise_for_status()
                with open(zip_file_name, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            print("Download successful.")

            # Extract the member named by the module constant, so the file we
            # extract is always the file we go on to open.
            with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
                zip_ref.extract(json_file_name)
            print("Extraction successful.")
            os.remove(zip_file_name) # Clean up the zip file

        except Exception as e:
            # Deliberately broad: any failure here degrades to "no dataset"
            # instead of aborting the notebook.
            print(f"Automatic download/extraction failed: {e}")
            print("Please download 'news-category-dataset.zip' from Kaggle and extract it manually.")
            return None

    # Load into DataFrame
    try:
        # Explicit UTF-8: the platform default encoding is not UTF-8 everywhere
        # and would fail on, or garble, non-ASCII text.
        with open(json_file_name, 'r', encoding='utf-8') as f:
            # Skip blank lines (e.g. a trailing newline); json.loads rejects them.
            data = [json.loads(line) for line in f if line.strip()]
        df = pd.DataFrame(data)
        df['text'] = df['headline'] + '. ' + df['short_description']
        print("Dataset loaded successfully.")
        # Shuffle the data
        return df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    except FileNotFoundError:
        print(f"Error: '{json_file_name}' could not be found.")
        return None

# Load once at notebook scope. setup_data() returns None when the dataset is
# unavailable, and the demonstration code checks `df is not None` before using it.
df = setup_data()


'News_Category_Dataset_v3.json' not found. Attempting to download and extract.
Download successful.
Extraction successful.
Dataset loaded successfully.


In [5]:
# --- 2. Information Extraction ---
print("\n--- Setting up Information Extraction Tools ---")

# A. Rule-Based Extraction (Dates)
def extract_dates(text):
    """Extract date strings from ``text`` using a simple regex pattern.

    Two formats are recognised:
      * month name (full or three-letter abbreviation), day and four-digit
        year, e.g. ``March 5, 2013`` or ``Jan 1 2020``;
      * ISO style ``YYYY-MM-DD``.

    Args:
        text: The text to scan. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a DataFrame) yields no matches.

    Returns:
        A list of the matched substrings in order of appearance; empty when
        nothing matches.
    """
    if not isinstance(text, str):
        return []
    # Both alternatives sit inside one group so that the word boundaries apply
    # to each of them. Without the group, `\b...|...\b` anchors only the start
    # of the month form and only the end of the ISO form, which lets matches
    # begin or end in the middle of a longer digit run.
    date_pattern = r'\b(?:(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2},?\s+\d{4}|\d{4}-\d{2}-\d{2})\b'
    return re.findall(date_pattern, text)

# B. Named Entity Recognition (NER) with spaCy
print("Loading spaCy model 'en_core_web_sm'...")
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # This code treats OSError from spacy.load as "model not installed".
    print(
        "spaCy model 'en_core_web_sm' not found.",
        "Please run: python -m spacy download en_core_web_sm",
        sep="\n",
    )
    # Leave a None placeholder so extract_named_entities() can return [] instead of crashing.
    nlp = None

def extract_named_entities(text):
    """Return the named entities spaCy finds in ``text``.

    Each entity is a ``(entity_text, entity_label)`` tuple taken from the
    ``ents`` of the document produced by the module-level ``nlp`` pipeline.
    An empty list is returned when ``nlp`` is unavailable, when ``text`` is
    not a string, or when it is empty or whitespace only.
    """
    can_process = nlp and isinstance(text, str) and text.strip()
    if not can_process:
        return []
    return [(span.text, span.label_) for span in nlp(text).ents]


--- Setting up Information Extraction Tools ---
Loading spaCy model 'en_core_web_sm'...


In [6]:
# --- 3. Abstractive Summarization ---
print("\n--- Setting up Summarization Model ---")
model_name = 'google-t5/t5-small'
try:
    # Tokenizer first, then model. If either load fails, both names are reset
    # below so generate_summary() sees a consistent "unavailable" state.
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    summarization_model = T5ForConditionalGeneration.from_pretrained(model_name)
    print("T5 model and tokenizer loaded successfully.")
except Exception as load_error:
    # Best-effort: report the problem and carry on without summarization.
    print(f"Could not load T5 model. Please ensure you have an internet connection. Error: {load_error}")
    tokenizer = summarization_model = None

def generate_summary(text, *, max_input_length=512, max_length=100, min_length=30,
                     num_beams=4, length_penalty=2.0):
    """Generate an abstractive summary of ``text`` with the module-level T5 model.

    The generation settings that used to be hard-coded are exposed as
    keyword-only parameters. Their defaults are the previous constants, so
    ``generate_summary(text)`` behaves exactly as before.

    Args:
        text: The text to summarize.
        max_input_length: ``max_length`` passed to ``tokenizer.encode`` with
            ``truncation=True``. The limit applies to the input with the
            ``"summarize: "`` prefix already prepended.
        max_length: Forwarded to ``summarization_model.generate``.
        min_length: Forwarded to ``summarization_model.generate``.
        num_beams: Forwarded to ``summarization_model.generate``.
        length_penalty: Forwarded to ``summarization_model.generate``.

    Returns:
        The decoded summary string (special tokens skipped). If the tokenizer
        or model is unavailable, or ``text`` is not a non-blank string, the
        fixed message ``"Summarization model not available or text is empty."``
        is returned instead.
    """
    if not tokenizer or not summarization_model or not isinstance(text, str) or not text.strip():
        return "Summarization model not available or text is empty."

    # Prepend the task prefix required by T5
    input_text = "summarize: " + text

    # Encode the text
    inputs = tokenizer.encode(
        input_text,
        return_tensors='pt',
        max_length=max_input_length,
        truncation=True,
    )

    # Generate the summary
    summary_ids = summarization_model.generate(
        inputs,
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True
    )

    # Decode and return the summary (only the first returned sequence is used)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# --- 4. Demonstration & Qualitative Evaluation ---
print("\n--- Running Demonstrations ---")

if df is None:
    print("Could not run demonstration as the dataframe failed to load.")
else:
    # setup_data() returns the rows already shuffled, so the first three rows
    # serve as the random sample.
    sample_articles = df.head(3)

    for index, row in sample_articles.iterrows():
        # Header: a rule, the 1-based article number with its category, a rule.
        print(
            "=" * 80,
            f"ARTICLE {index + 1} (Category: {row['category']})",
            "=" * 80,
            sep="\n",
        )

        article_text = row['text']
        print(f"ORIGINAL TEXT:\n{article_text}\n")

        # --- Extraction ---
        # The regex is run on the record's own 'date' field, not on the article text.
        dates = extract_dates(row['date'])
        print(f"RULE-BASED DATES (from 'date' column): {dates}")

        entities = extract_named_entities(article_text)
        print(f"NAMED ENTITIES (spaCy):\n{entities}\n")

        # Optional: Visualize entities in a Jupyter/Colab environment
        # if nlp:
        #     doc = nlp(article_text)
        #     displacy.render(doc, style="ent", jupyter=True)

        # --- Summarization ---
        summary = generate_summary(article_text)
        print(f"GENERATED SUMMARY (T5):\n{summary}\n")


--- Setting up Summarization Model ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5 model and tokenizer loaded successfully.

--- Running Demonstrations ---
ARTICLE 1 (Category: WELLNESS)
ORIGINAL TEXT:
Is Your Relationship System Working Well?. All relationships have a system. Some work well and some don't. Since I have been working with relationships for the last 44 years, I've become very attuned to what kind of a system two people have between them.

RULE-BASED DATES (from 'date' column): ['2013-03-05']
NAMED ENTITIES (spaCy):
[('the last 44 years', 'DATE'), ('two', 'CARDINAL')]

GENERATED SUMMARY (T5):
all relationships have a system. some work well and others don't. I've become very attuned to what kind of a system two people have between them.

ARTICLE 2 (Category: BUSINESS)
ORIGINAL TEXT:
Customer Loyalty Management Via the Customer Service Silo. Your customer service department is the most important. Here are four ways you can leverage your customer service team to effectively manage customer loyalty, build relationships and turn customers into fans.

RULE