In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from collections import Counter
import json
import os
import requests
import zipfile

In [6]:
# --- Setup and Configuration ---
# Download each NLTK resource quietly, one after another, in a fixed order.
print("Downloading NLTK data...")
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)
print("NLTK downloads complete.")

Downloading NLTK data...
NLTK downloads complete.


In [7]:
# Plot defaults for the whole notebook: seaborn's 'whitegrid' style and a
# default figure size of (12, 8).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

# --- 1. Data Loading & Automated Download ---
# Where the archive is fetched from and the name it is saved under locally.
dataset_url = (
    "https://www.kaggle.com/api/v1/datasets/download/"
    "rmisra/news-category-dataset"
)
zip_file_name = "news-category-dataset.zip"
# Local name of the JSON data file (version 3 of the dataset). The entry
# inside the zip archive may be named differently.
json_file_name = "News_Category_Dataset_v3.json"

# --- Function to download the file ---
def download_file(url, filename, timeout=30):
    """Download ``url`` and save the response body as ``filename``.

    The body is streamed to ``<filename>.part`` and only moved to
    ``filename`` once it has been written completely, so a failed or
    interrupted download never leaves a truncated file under the final name.

    Args:
        url: Address to fetch with an HTTP GET request.
        filename: Path the downloaded content is saved to.
        timeout: Seconds to wait for the server to connect or to send data
            before giving up (passed through to ``requests.get``).

    Returns:
        True if the file was downloaded and saved, False otherwise. Failures
        are reported with ``print`` instead of being raised.
    """
    print(f"Downloading {filename} from {url}...")
    # Note: Kaggle may require authentication. This direct download might fail.
    # A more robust solution involves using the Kaggle API with an API key.
    partial_path = f"{filename}.part"
    try:
        # Without a timeout a stalled connection would block the cell forever.
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the file under its final name only after a complete write.
        os.replace(partial_path, filename)
    except (requests.exceptions.RequestException, OSError) as e:
        # Drop whatever was written so far; callers decide whether to
        # download by checking if `filename` exists.
        try:
            os.remove(partial_path)
        except OSError:
            pass  # nothing had been written yet
        print(f"Error downloading the file: {e}")
        print("Please try downloading the dataset manually from Kaggle or configure your Kaggle API credentials.")
        return False

    print("Download successful.")
    return True

# --- Main data loading logic ---
# Nothing to do when the JSON file is already on disk. Otherwise download the
# archive (unless a copy is already present) and pull the JSON file out of it.
if not os.path.exists(json_file_name):
    print(f"'{json_file_name}' not found.")
    if not os.path.exists(zip_file_name):
        download_file(dataset_url, zip_file_name)

    if os.path.exists(zip_file_name):
        print(f"Extracting '{zip_file_name}'...")
        try:
            with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
                # Compare base names against json_file_name so the member is
                # found inside a sub-folder too, while entries that merely
                # contain the name as a substring are not picked by mistake.
                target_file = next(
                    (
                        name
                        for name in zip_ref.namelist()
                        if os.path.basename(name) == json_file_name
                    ),
                    None,
                )

                if target_file:
                    # extract() returns the path it actually wrote, which can
                    # differ from the raw member name; move that path.
                    extracted_path = zip_ref.extract(target_file)
                    # Rename to the expected filename for consistency
                    os.replace(extracted_path, json_file_name)
                    print(f"Extracted and renamed to '{json_file_name}'.")
                else:
                    print(f"Could not find '{json_file_name}' inside the zip archive.")
        except zipfile.BadZipFile:
            print(f"Error: '{zip_file_name}' is not a valid zip file or is corrupted.")


# Now, try to load the dataframe


In [8]:
try:
    # The data is stored as one JSON object per line. Decode it as UTF-8
    # explicitly, because the platform default encoding is not UTF-8 everywhere.
    with open(json_file_name, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline) are skipped; json.loads
        # would raise on them.
        data = [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame(data)
    print("Dataset loaded successfully.")
    print(f"Dataset shape: {df.shape}")
except (FileNotFoundError, NameError):
    # Fallback: a two-row placeholder frame with the columns used below, so
    # the remaining cells can still run without the real data.
    print("Error: Could not load the dataset.")
    print("Creating a dummy dataframe to allow the rest of the script to run.")
    df = pd.DataFrame({
        'headline': ['Sample headline 1', 'Sample headline 2'],
        'short_description': ['This is a sample description.', 'This is another sample text for processing.'],
        'category': ['TECH', 'SPORTS']
    })

# --- 2. Initial Inspection ---
print("\n--- Initial Data Inspection ---")
print("First 5 rows of the dataset:")
print(df.head())

# Combine headline and short description for a richer text source.
# Missing values are treated as empty strings so that one missing field does
# not turn the whole combined text into NaN.
df['text'] = df['headline'].fillna('') + ' ' + df['short_description'].fillna('')


Dataset loaded successfully.
Dataset shape: (209527, 6)

--- Initial Data Inspection ---
First 5 rows of the dataset:
                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/funniest-tweets...   
3  https://www.huffpost.com/entry/funniest-parent...   
4  https://www.huffpost.com/entry/amy-cooper-lose...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. J

In [9]:
# --- 3. Text Preprocessing ---
print("\n--- Preprocessing Text Data ---")
# Module-level lookups read by preprocess_text: the lemmatizer instance, the
# English stopword list as a set, and the punctuation characters as a set.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}
punctuation = {*string.punctuation}

def preprocess_text(text):
    """
    Turn one raw text string into a list of cleaned, lemmatized tokens.

    The text is lower-cased and tokenized. A token is kept only when it is
    purely alphabetic and appears in neither the stopword set nor the
    punctuation set; every kept token is lemmatized.

    Returns an empty list when `text` is not a string.
    """
    if not isinstance(text, str):
        return []

    cleaned = []
    for word in word_tokenize(text.lower()):
        if not word.isalpha():
            continue
        if word in stop_words or word in punctuation:
            continue
        cleaned.append(lemmatizer.lemmatize(word))

    return cleaned

# Build a separate frame that carries the token lists in a new
# 'processed_text' column; `df` itself is left as it was.
df_processed = df.assign(processed_text=df['text'].apply(preprocess_text))

print("Preprocessing complete. A 'processed_text' column has been added.")


--- Preprocessing Text Data ---
Preprocessing complete. A 'processed_text' column has been added.


In [None]:
# --- 4. Exploratory Data Analysis (EDA) ---
print("\n--- Performing Exploratory Data Analysis ---")

# A. Distribution of Document Lengths
# A document's length is the number of tokens in its 'processed_text' list.
df_processed['doc_length'] = df_processed['processed_text'].apply(len)

length_fig, length_ax = plt.subplots(figsize=(12, 6))
sns.histplot(df_processed['doc_length'], bins=50, kde=True, ax=length_ax)
length_ax.set_title('Distribution of Document Lengths (Number of Words)', fontsize=16)
length_ax.set_xlabel('Document Length', fontsize=12)
length_ax.set_ylabel('Frequency', fontsize=12)
plt.show()

# B. Most Frequent Words
# Flatten every document's token list into one list and count the tokens.
all_words = [word for tokens in df_processed['processed_text'] for word in tokens]
word_counts = Counter(all_words)
most_common_words = word_counts.most_common(20)

df_most_common = pd.DataFrame(most_common_words, columns=['Word', 'Frequency'])

plt.figure(figsize=(12, 8))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the bar
# variable is also assigned to `hue`. dodge=False keeps one full-width bar
# per word on releases that would otherwise reserve a slot per hue level.
words_ax = sns.barplot(
    x='Frequency',
    y='Word',
    hue='Word',
    data=df_most_common,
    palette='viridis',
    dodge=False,
)
# Some seaborn releases draw a legend for `hue`; it would only repeat the
# axis labels, so remove it when present.
words_legend = words_ax.get_legend()
if words_legend is not None:
    words_legend.remove()
plt.title('Top 20 Most Frequent Words', fontsize=16)
plt.xlabel('Frequency', fontsize=12)
plt.ylabel('Word', fontsize=12)
plt.show()

# C. Category Distribution
# Categories ordered from most to fewest articles.
category_order = df_processed['category'].value_counts().index

plt.figure(figsize=(12, 10))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
# category is also assigned to `hue`. hue_order matches `order` so colours
# follow the bar ranking, and dodge=False keeps one full-width bar each.
category_ax = sns.countplot(
    y='category',
    hue='category',
    data=df_processed,
    order=category_order,
    hue_order=category_order,
    palette='plasma',
    dodge=False,
)
# Some seaborn releases draw a legend for `hue`; it would only repeat the
# axis labels, so remove it when present.
category_legend = category_ax.get_legend()
if category_legend is not None:
    category_legend.remove()
plt.title('Distribution of News Categories', fontsize=16)
plt.xlabel('Number of Articles', fontsize=12)
plt.ylabel('Category', fontsize=12)
plt.tight_layout()
plt.show()

print("\nEDA visualizations have been generated.")