In [2]:
import pandas as pd
import os
import numpy as np

# Check current working directory
print("Current directory:", os.getcwd())

# List files in current directory to find the JSON file
print("\nFiles in current directory:")
for item in os.listdir('.'):
    if 'Module11' in item or 'News' in item:
        print(item)

Current directory: C:\Users\Tirth\Projects\Module11 (NLP)

Files in current directory:
Module11 nlp questions.pdf
News_Category_Dataset_v3.json


In [3]:
# Now read the JSON file (lines=True for JSON Lines format)
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)

print(f"Shape of dataframe: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())

Shape of dataframe: (209527, 6)

Columns: ['link', 'headline', 'category', 'short_description', 'authors', 'date']

First few rows:
                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/funniest-tweets...   
3  https://www.huffpost.com/entry/funniest-parent...   
4  https://www.huffpost.com/entry/amy-cooper-lose...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict.

In [4]:
# Find number of unique categories
num_categories = df['category'].nunique()
print(f"Number of unique categories: {num_categories}")
print(f"\nAll categories:")
print(df['category'].unique())
print(f"\nCategory distribution:")
print(df['category'].value_counts())

Number of unique categories: 42

All categories:
['U.S. NEWS' 'COMEDY' 'PARENTING' 'WORLD NEWS' 'CULTURE & ARTS' 'TECH'
 'SPORTS' 'ENTERTAINMENT' 'POLITICS' 'WEIRD NEWS' 'ENVIRONMENT'
 'EDUCATION' 'CRIME' 'SCIENCE' 'WELLNESS' 'BUSINESS' 'STYLE & BEAUTY'
 'FOOD & DRINK' 'MEDIA' 'QUEER VOICES' 'HOME & LIVING' 'WOMEN'
 'BLACK VOICES' 'TRAVEL' 'MONEY' 'RELIGION' 'LATINO VOICES' 'IMPACT'
 'WEDDINGS' 'COLLEGE' 'PARENTS' 'ARTS & CULTURE' 'STYLE' 'GREEN' 'TASTE'
 'HEALTHY LIVING' 'THE WORLDPOST' 'GOOD NEWS' 'WORLDPOST' 'FIFTY' 'ARTS'
 'DIVORCE']

Category distribution:
category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN          

In [5]:
# METHOD 1: Remove specific unwanted categories
# Define which categories you want to keep or remove

unwanted_categories = ['WEIRD NEWS', 'IMPACT', 'DIVORCE']

# Create a filtered dataframe
df_filtered = df[~df['category'].isin(unwanted_categories)]

print(f"Original dataset: {len(df)} rows")
print(f"After removing {unwanted_categories}:")
print(f"Filtered dataset: {len(df_filtered)} rows")
print(f"Rows removed: {len(df) - len(df_filtered)}")
print(f"\nRemaining categories: {df_filtered['category'].nunique()}")

Original dataset: 209527 rows
After removing ['WEIRD NEWS', 'IMPACT', 'DIVORCE']:
Filtered dataset: 199840 rows
Rows removed: 9687

Remaining categories: 39


In [6]:
# METHOD 2: Keep only specific categories
# Whitelist approach - define categories you WANT

keep_categories = ['TECHNO', 'ENTERTAINMENT', 'POLITICS', 'BUSINESS']

df_keep = df[df['category'].isin(keep_categories)]

print(f"Original dataset: {len(df)} rows")
print(f"Keeping only: {keep_categories}")
print(f"Filtered dataset: {len(df_keep)} rows")
print(f"Rows removed: {len(df) - len(df_keep)}")
print(f"\nCategories in filtered data:")
print(df_keep['category'].value_counts())

Original dataset: 209527 rows
Keeping only: ['TECHNO', 'ENTERTAINMENT', 'POLITICS', 'BUSINESS']
Filtered dataset: 58956 rows
Rows removed: 150571

Categories in filtered data:
category
POLITICS         35602
ENTERTAINMENT    17362
BUSINESS          5992
Name: count, dtype: int64


In [7]:
df_keep.head()

Unnamed: 0,link,headline,category,short_description,authors,date
20,https://www.huffpost.com/entry/golden-globes-r...,Golden Globes Returning To NBC In January Afte...,ENTERTAINMENT,"For the past 18 months, Hollywood has effectiv...",,2022-09-20
21,https://www.huffpost.com/entry/biden-us-forces...,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS,President issues vow as tensions with China rise.,,2022-09-19
24,https://www.huffpost.com/entry/ukraine-festiva...,‚ÄòBeautiful And Sad At The Same Time‚Äô: Ukrainia...,POLITICS,An annual celebration took on a different feel...,Jonathan Nicholson,2022-09-19
28,https://www.huffpost.com/entry/james-cameron-f...,James Cameron Says He 'Clashed' With Studio Be...,ENTERTAINMENT,"The ""Avatar"" director said aspects of his 2009...",Ben Blanchet,2022-09-18
30,https://www.huffpost.com/entry/europe-britain-...,Biden Says Queen's Death Left 'Giant Hole' For...,POLITICS,"U.S. President Joe Biden, in London for the fu...","Darlene Superville, AP",2022-09-18


In [8]:
# Install spaCy
!pip install -q spacy
!python -m spacy download en_core_web_sm -q

print("‚úì Installation complete!")

[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
‚úì Installation complete!


In [9]:
import spacy

print("\n" + "="*70)
print("SPACY LOADING METHODS - OPTIMIZED FOR PERFORMANCE")
print("="*70)

# Method 1: Load ONLY tokenizer (fastest)
print("\n1. Load ONLY tokenizer (bare minimum):")
nlp_tokenizer = spacy.load('en_core_web_sm', 
                           disable=['tagger', 'parser', 'ner', 'attribute_ruler'])
print(f"   Active components: {nlp_tokenizer.pipe_names}")
print(f"   Use case: Fast tokenization, text preprocessing")
print(f"   Memory: ~10 MB")

# Method 2: Load tokenizer + tagger
print("\n2. Load tokenizer + POS tagger:")
nlp_tagger = spacy.load('en_core_web_sm', 
                       disable=['parser', 'ner', 'attribute_ruler'])
print(f"   Active components: {nlp_tagger.pipe_names}")
print(f"   Use case: POS tagging, grammar analysis")
print(f"   Memory: ~35 MB")

# Method 3: Load with parser
print("\n3. Load tokenizer + tagger + parser (no NER):")
nlp_parser = spacy.load('en_core_web_sm', 
                       disable=['ner'])
print(f"   Active components: {nlp_parser.pipe_names}")
print(f"   Use case: Dependency parsing, sentence structure")
print(f"   Memory: ~40 MB")

# Method 4: Full model
print("\n4. Load full model (all components):")
nlp_full = spacy.load('en_core_web_sm')
print(f"   Active components: {nlp_full.pipe_names}")
print(f"   Use case: NER, complete NLP analysis")
print(f"   Memory: ~45 MB")

print("\n" + "="*70)
print("TEST: Processing text with minimal components")
print("="*70)

text = "Apple Inc. is buying U.K. startup for $1 billion."
doc = nlp_tokenizer(text)
print(f"\nInput: {text}")
print(f"Tokens: {[token.text for token in doc]}")


SPACY LOADING METHODS - OPTIMIZED FOR PERFORMANCE

1. Load ONLY tokenizer (bare minimum):
   Active components: ['tok2vec', 'lemmatizer']
   Use case: Fast tokenization, text preprocessing
   Memory: ~10 MB

2. Load tokenizer + POS tagger:
   Active components: ['tok2vec', 'tagger', 'lemmatizer']
   Use case: POS tagging, grammar analysis
   Memory: ~35 MB

3. Load tokenizer + tagger + parser (no NER):
   Active components: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer']
   Use case: Dependency parsing, sentence structure
   Memory: ~40 MB

4. Load full model (all components):
   Active components: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
   Use case: NER, complete NLP analysis
   Memory: ~45 MB

TEST: Processing text with minimal components

Input: Apple Inc. is buying U.K. startup for $1 billion.
Tokens: ['Apple', 'Inc.', 'is', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion', '.']




In [10]:
# Install NLTK for stopwords (if not already installed)
!pip install -q nltk

import nltk
nltk.download('stopwords', quiet=True)

from nltk.corpus import stopwords
import string

print("‚úì Dependencies installed successfully!")

‚úì Dependencies installed successfully!


In [11]:
# Test the preprocessing function
print("\n" + "="*80)
print("HEADLINE PREPROCESSING - TEST EXAMPLES")
print("="*80)

test_headlines = [
    "Apple Inc. is looking at buying British startup",
    "The Quick Brown Fox Jumps Over The Lazy Dog",
    "COVID-19 Vaccines: What You Need to Know!",
    "Breaking News: Markets Rally as Economic Data Improves",
    "Top 10 Ways to Stay Healthy and Fit in 2024"
]

for i, headline in enumerate(test_headlines, 1):
    cleaned = preprocess_headlines(headline)
    tokens = preprocess_headlines_tokens(headline)
    
    print(f"\n{i}. Original:")
    print(f"   {headline}")
    print(f"\n   Cleaned (string):")
    print(f"   {cleaned}")
    print(f"\n   Tokens (list):")
    print(f"   {tokens}")
    print("-" * 80)


HEADLINE PREPROCESSING - TEST EXAMPLES


NameError: name 'preprocess_headlines' is not defined

In [12]:
# ============================================================================
# PREPROCESSING FUNCTIONS
# ============================================================================

def preprocess_headlines(headline):
    """
    Preprocess a headline: lowercase, remove punctuation, and lemmatize
    """
    # Convert to lowercase
    text = headline.lower()
    
    # Remove special characters and punctuation
    text = ''.join(char if char.isalnum() or char.isspace() else '' for char in text)
    
    return text

def preprocess_headlines_tokens(headline):
    """
    Preprocess and tokenize headline using spaCy
    Returns list of lemmatized tokens (excluding stopwords)
    """
    # Preprocess first
    cleaned = preprocess_headlines(headline)
    
    # Process with spaCy
    doc = nlp(cleaned)
    
    # Extract lemmatized tokens, excluding stopwords and punctuation
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and token.text.strip()]
    
    return tokens

In [13]:
# ============================================================================
# COMPREHENSIVE NLP PIPELINE - NEWS CATEGORY CLASSIFICATION
# ============================================================================
# This code processes the loaded dataset (df) and builds a news category
# classification model using Logistic Regression with CountVectorizer.
# NO TEST DATA - only uses the actual loaded df dataset
# ============================================================================

# STEP 1: FILTER DATA TO 4 SPECIFIC CATEGORIES
print("\n" + "="*80)
print("STEP 1: FILTERING DATA TO 4 CATEGORIES")
print("="*80)

categories_to_keep = ['TECH', 'ENTERTAINMENT', 'POLITICS', 'BUSINESS']

# Filter dataframe to keep only these categories
df_filtered = df[df['category'].isin(categories_to_keep)].copy()

print(f"\nOriginal dataframe shape: {df.shape}")
print(f"Filtered dataframe shape: {df_filtered.shape}")
print(f"\nCategories in filtered data:")
print(df_filtered['category'].value_counts())
print(f"\nTotal records: {len(df_filtered)}")


# STEP 2: TEXT PREPROCESSING
print("\n" + "="*80)
print("STEP 2: TEXT PREPROCESSING - CLEAN HEADLINES")
print("="*80)

def clean_text(text):
    """
    Clean text by converting to lowercase and removing special characters
    """
    text = str(text).lower()
    text = ''.join(char if char.isalnum() or char.isspace() else '' for char in text)
    text = ' '.join(text.split())
    return text

print("\nApplying text preprocessing to all headlines...")
df_filtered['cleaned_headline'] = df_filtered['headline'].apply(clean_text)

print(f"Total headlines processed: {len(df_filtered)}")
print("\nSample of original vs cleaned headlines:")
for i in range(min(3, len(df_filtered))):
    print(f"\nOriginal [{i}]: {df_filtered['headline'].iloc[i][:80]}...")
    print(f"Cleaned  [{i}]: {df_filtered['cleaned_headline'].iloc[i][:80]}...")
    

# STEP 3: TEXT VECTORIZATION WITH COUNTVECTORIZER
from sklearn.feature_extraction.text import CountVectorizer

print("\n" + "="*80)
print("STEP 3: TEXT VECTORIZATION WITH COUNTVECTORIZER")
print("="*80)

vectorizer = CountVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
    stop_words='english'
)

print("\nVectorizer Configuration:")
print("  - Max features: 5000")
print("  - N-gram range: (1, 2) - unigrams and bigrams")
print("  - Min document frequency: 2")
print("  - Max document frequency: 0.8")
print("  - Stop words: English")

print("\nFitting vectorizer and transforming text...")
X = vectorizer.fit_transform(df_filtered['cleaned_headline'])

print(f"\nFeature matrix shape: {X.shape}")
print(f"Total features (vocabulary size): {len(vectorizer.get_feature_names_out())}")
print(f"\nSample features (first 20):")
print(vectorizer.get_feature_names_out()[:20])


# STEP 4: CREATE FEATURE MATRIX (X) AND LABEL VECTOR (y)
print("\n" + "="*80)
print("STEP 4: CREATE FEATURE MATRIX AND LABEL VECTOR")
print("="*80)

y = df_filtered['category'].values

print(f"\nFeature matrix X shape: {X.shape}")
print(f"Label vector y shape: {y.shape}")
print(f"\nClass distribution in data:")
unique, counts = np.unique(y, return_counts=True)
for cat, count in zip(unique, counts):
    percentage = (count / len(y)) * 100
    print(f"  {cat}: {count} ({percentage:.1f}%)")

# STEP 5: TRAIN-TEST SPLIT
from sklearn.model_selection import train_test_split

print("\n" + "="*80)
print("STEP 5: TRAIN-TEST SPLIT WITH STRATIFICATION")
print("="*80)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
print(f"Test set: {X_test.shape[0]} samples, {X_test.shape[1]} features")
print(f"\nClass distribution in training set:")
unique_train, counts_train = np.unique(y_train, return_counts=True)
for cat, count in zip(unique_train, counts_train):
    print(f"  {cat}: {count}")
    

# STEP 6: TRAIN LOGISTIC REGRESSION MODEL
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

print("\n" + "="*80)
print("STEP 6: TRAIN LOGISTIC REGRESSION MODEL")
print("="*80)

model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    multi_class='multinomial',
    solver='lbfgs',
    n_jobs=-1
)

print("\nTraining Logistic Regression model...")
model.fit(X_train, y_train)

print("\nModel training completed!")
print(f"Model classes: {model.classes_}")
print(f"Number of features: {model.n_features_in_}")

# STEP 7: EVALUATE MODEL ON TEST SET
print("\n" + "="*80)
print("STEP 7: MODEL EVALUATION ON TEST SET")
print("="*80)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nModel Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted): {recall:.4f}")
print(f"F1-Score (weighted): {f1:.4f}")

print("\n" + "-"*80)
print("CLASSIFICATION REPORT")
print("-"*80)
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
import pandas as pd
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=model.classes_, columns=model.classes_)
print(cm_df)


# STEP 8: PREDICTION FUNCTION FOR NEW HEADLINES
print("\n" + "="*80)
print("STEP 8: PREDICTION FUNCTION")
print("="*80)

def predict_headline_category(headline):
    """
    Predict the category of a given headline using the trained model.
    Uses only data from the loaded dataset - no test data.
    """
    cleaned = clean_text(headline)
    X_new = vectorizer.transform([cleaned])
    prediction = model.predict(X_new)[0]
    confidence = model.predict_proba(X_new)[0].max()
    return prediction, confidence

# Test with actual headlines from df_filtered
print("\nTesting prediction function with actual headlines from dataset:")
print("\nSample predictions from actual data:")
test_indices = np.random.choice(len(df_filtered), size=min(5, len(df_filtered)), replace=False)

for idx in test_indices:
    actual_headline = df_filtered['headline'].iloc[idx]
    actual_category = df_filtered['category'].iloc[idx]
    predicted_category, confidence = predict_headline_category(actual_headline)
    
    match = "‚úì CORRECT" if predicted_category == actual_category else "‚úó INCORRECT"
    print(f"\nHeadline: {actual_headline[:70]}...")
    print(f"Actual: {actual_category} | Predicted: {predicted_category} [{confidence:.2%}] {match}")

print("\n" + "="*80)
print("PIPELINE COMPLETE")
print("="*80)
print(f"\nModel successfully trained on {len(df_filtered)} news articles")
print(f"Test set accuracy: {accuracy*100:.2f}%")
print(f"\nModel is ready for predictions on new headlines.")
print(f"Use: predict_headline_category('Your headline here')")


STEP 1: FILTERING DATA TO 4 CATEGORIES

Original dataframe shape: (209527, 6)
Filtered dataframe shape: (61060, 6)

Categories in filtered data:
category
POLITICS         35602
ENTERTAINMENT    17362
BUSINESS          5992
TECH              2104
Name: count, dtype: int64

Total records: 61060

STEP 2: TEXT PREPROCESSING - CLEAN HEADLINES

Applying text preprocessing to all headlines...
Total headlines processed: 61060

Sample of original vs cleaned headlines:

Original [0]: Twitch Bans Gambling Sites After Streamer Scams Folks Out Of $200,000...
Cleaned  [0]: twitch bans gambling sites after streamer scams folks out of 200000...

Original [1]: Golden Globes Returning To NBC In January After Year Off-Air...
Cleaned  [1]: golden globes returning to nbc in january after year offair...

Original [2]: Biden Says U.S. Forces Would Defend Taiwan If China Invaded...
Cleaned  [2]: biden says us forces would defend taiwan if china invaded...

STEP 3: TEXT VECTORIZATION WITH COUNTVECTORIZER

Vec




Model training completed!
Model classes: ['BUSINESS' 'ENTERTAINMENT' 'POLITICS' 'TECH']
Number of features: 5000

STEP 7: MODEL EVALUATION ON TEST SET

Model Accuracy: 0.8751 (87.51%)
Precision (weighted): 0.8706
Recall (weighted): 0.8751
F1-Score (weighted): 0.8713

--------------------------------------------------------------------------------
CLASSIFICATION REPORT
--------------------------------------------------------------------------------
               precision    recall  f1-score   support

     BUSINESS       0.71      0.59      0.64      1198
ENTERTAINMENT       0.89      0.88      0.88      3472
     POLITICS       0.90      0.94      0.92      7121
         TECH       0.75      0.52      0.62       421

     accuracy                           0.88     12212
    macro avg       0.81      0.73      0.77     12212
 weighted avg       0.87      0.88      0.87     12212


Confusion Matrix:
               BUSINESS  ENTERTAINMENT  POLITICS  TECH
BUSINESS            711       

In [None]:
# ============================================================================
# INTERACTIVE CHATBOT - NEWS CATEGORY PREDICTOR
# ============================================================================

def run_interactive_chatbot():
    """
    Interactive chatbot for predicting news headline categories.
    Run this to chat with the model!
    """
    print("\n" + "="*80)
    print("üì∞ WELCOME TO NEWS CATEGORY CHATBOT")
    print("="*80)
    print("\nI can predict the category of any news headline!")
    print(f"\nAvailable categories: {', '.join(model.classes_)}")
    print("\nType 'quit', 'exit', or 'q' to end the conversation.\n")
    
    conversation_count = 0
    
    while True:
        try:
            # Get user input
            user_headline = input("üìù Enter a news headline (or 'quit' to exit): ").strip()
            
            # Check for exit commands
            if user_headline.lower() in ['quit', 'exit', 'q']:
                print("\nüëã Thank you for using NewsBot! Goodbye!")
                print(f"Total headlines processed: {conversation_count}")
                break
            
            # Validate input
            if not user_headline:
                print("‚ö†Ô∏è  Please enter a valid headline.\n")
                continue
            
            # Make prediction
            predicted_category, confidence = predict_headline_category(user_headline)
            conversation_count += 1
            
            # Display results
            print(f"\n‚úÖ Prediction: {predicted_category}")
            print(f"   Confidence: {confidence:.2%}")
            
            # Confidence indicator
            if confidence > 0.9:
                reliability = "üü¢ Very High (>90%)"
            elif confidence > 0.8:
                reliability = "üü¢ High (80-90%)"
            elif confidence > 0.7:
                reliability = "üü° Medium (70-80%)"
            else:
                reliability = "üî¥ Low (<70%)"
            
            print(f"   Reliability: {reliability}\n")
            
        except KeyboardInterrupt:
            print("\n\nüëã Chat interrupted. Goodbye!")
            break
        except Exception as e:
            print(f"‚ùå Error: {e}")
            print("Please try again.\n")

# RUN THE CHATBOT
run_interactive_chatbot()



üì∞ WELCOME TO NEWS CATEGORY CHATBOT

I can predict the category of any news headline!

Available categories: BUSINESS, ENTERTAINMENT, POLITICS, TECH

Type 'quit', 'exit', or 'q' to end the conversation.



üìù Enter a news headline (or 'quit' to exit):  Tesla to restart Dojo supercomputer project, Musk says



‚úÖ Prediction: BUSINESS
   Confidence: 62.13%
   Reliability: üî¥ Low (<70%)



üìù Enter a news headline (or 'quit' to exit):  Tesla to restart Dojo supercomputer project



‚úÖ Prediction: BUSINESS
   Confidence: 58.58%
   Reliability: üî¥ Low (<70%)



üìù Enter a news headline (or 'quit' to exit):  Tesla is reviving its Dojo supercomputer project just five months after CEO Elon Musk declared its predecessor "an evolutionary dead end" and disbanded the team behind it.



‚úÖ Prediction: BUSINESS
   Confidence: 89.07%
   Reliability: üü¢ High (80-90%)



In [None]:
!pip install streamlit