In [1]:
# ## 1. Import Libraries
import json
import re
import numpy as np
import pandas as pd
from collections import defaultdict
from time import time
import os
import gzip
import pickle 

In [2]:
# ## 2. Load and Prepare Dataset (Optimized with Pandas)
def load_and_prepare_data(filepath, sample_size=300):
    # Load JSON data efficiently
    print("Loading data...")
    start = time()
    data = pd.read_json(filepath, lines=True)
    print(f"Loaded {len(data)} entries in {time()-start:.2f}s")
    
    # Filter for required categories
    categories = ['WELLNESS', 'ENTERTAINMENT', 'CRIME', 'POLITICS']
    filtered_data = data[data['category'].isin(categories)].copy()
    
    # Sample evenly from each category
    sampled_data = (filtered_data.groupby('category', group_keys=False, observed=True)
                .apply(lambda x: x.sample(n=min(len(x), sample_size//len(categories)))))

    # Create labels
    sampled_data['label'] = sampled_data['category'].apply(
        lambda x: 1 if x in ['WELLNESS', 'ENTERTAINMENT'] else 0
    )
    
    # Test data (remaining entries)
    test_data = filtered_data[~filtered_data.index.isin(sampled_data.index)]
    
    return sampled_data, test_data


In [3]:
# ## 3. Text Preprocessing (Vectorized with Pandas)
basic_stopwords = {
    # Articles/determiners
    'the', 'a', 'an', 'some', 'any', 'all', 'both', 'each', 'every', 'either', 'neither',
    
    # Pronouns
    'i', 'me', 'my', 'myself', 'we', 'us', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'this', 'that', 'these', 'those',
    'who', 'whom', 'whose', 'which', 'what', 'whatever', 'whoever', 'whichever',
    
    # Common verbs (all forms)
    'be', 'am', 'is', 'are', 'was', 'were', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    'can', 'could', 'shall', 'should', 'will', 'would', 'may', 'might', 'must',
    
    # Prepositions
    'about', 'above', 'across', 'after', 'against', 'along', 'among', 'around', 'at',
    'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond', 'by',
    'concerning', 'despite', 'down', 'during', 'except', 'for', 'from', 'in', 'inside',
    'into', 'like', 'near', 'of', 'off', 'on', 'onto', 'out', 'outside', 'over',
    'past', 'regarding', 'since', 'through', 'throughout', 'to', 'toward', 'under',
    'until', 'up', 'upon', 'with', 'within', 'without',
    
    # Conjunctions
    'and', 'but', 'or', 'nor', 'for', 'yet', 'so', 'as', 'if', 'because', 'although',
    'while', 'when', 'where', 'whether', 'since', 'unless', 'until', 'before', 'after',
    
    # Other common words
    'there', 'here', 'very', 'just', 'not', 'no', 'yes', 'more', 'most', 'many', 'much',
    'few', 'little', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    'well', 'also', 'even', 'just', 'now', 'then', 'again', 'already', 'still', 'always',
    'never', 'often', 'usually', 'sometimes', 'rather', 'quite', 'perhaps', 'maybe',
    'once', 'twice', 'new', 'old', 'good', 'bad', 'high', 'low', 'first', 'last',
    'next', 'other', 'another', 'each', 'every', 'any', 'some', 'all', 'both', 'either',
    'neither', 'few', 'many', 'several', 'such', 'what', 'which', 'whose', 'these',
    'those', 'them', 'then', 'therefore', 'thus', 'hence', 'however', 'nevertheless',
    'nonetheless', 'otherwise', 'accordingly', 'consequently', 'meanwhile','how', 'why','more', 'new', 'all', 'they',
    
    # Contractions
    "don't", "doesn't", "didn't", "isn't", "aren't", "wasn't", "weren't", "haven't",
    "hasn't", "hadn't", "won't", "wouldn't", "can't", "couldn't", "shouldn't",
    "mightn't", "mustn't", "i'm", "you're", "he's", "she's", "it's", "we're", "they're",
    "i've", "you've", "we've", "they've", "i'd", "you'd", "he'd", "she'd", "we'd", "they'd",
    "i'll", "you'll", "he'll", "she'll", "we'll", "they'll", "let's", "that's", "who's",
    "what's", "where's", "when's", "why's", "how's"
}

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return [word for word in text.split() if word not in basic_stopwords]

def preprocess_dataframe(df):
    print("Preprocessing text...")
    start = time()
    df['processed'] = (df['headline'] + ' ' + df['short_description']).apply(preprocess_text)
    print(f"Preprocessing completed in {time()-start:.2f}s")
    return df

In [4]:
# ## 4. Optimized Naïve Bayes Implementation
class OptimizedNaiveBayes:
    def __init__(self, max_vocab_size=5000):
        self.class_probs = None
        self.word_probs = None
        self.vocabulary = None
        self.max_vocab_size = max_vocab_size  # Store the limit

    
    def train(self, data):
        print("\nTraining model...")
        start = time()
        
        # Calculate class probabilities
        class_counts = data['label'].value_counts()
        self.class_probs = (class_counts / class_counts.sum()).to_dict()
        
        # Keep only top 5000 most frequent words
        word_counts = defaultdict(int)
        for words in data['processed']:
            for word in words:
                word_counts[word] += 1

        self.vocabulary = [word for word, _ in 
                  sorted(word_counts.items(), key=lambda x: -x[1])[:5000]]

        # Build vocabulary
        all_words = [word for words in data['processed'] for word in words]
        self.vocabulary = list(set(all_words))
        
        # Count words per class using vectorized operations
        word_counts = {0: defaultdict(int), 1: defaultdict(int)}
        class_totals = {0: 0, 1: 0}
        
        for _, row in data.iterrows():
            cls = row['label']
            for word in row['processed']:
                word_counts[cls][word] += 1
                class_totals[cls] += 1
        
        # Calculate probabilities with Laplace smoothing
        self.word_probs = {0: {}, 1: {}}
        v_size = len(self.vocabulary)
        
        for cls in [0, 1]:
            for word in self.vocabulary:
                count = word_counts[cls].get(word, 0)
                self.word_probs[cls][word] = (count + 1) / (class_totals[cls] + v_size)
        
        print(f"Training completed in {time()-start:.2f}s")
    
    def predict(self, text):
        log_probs = {}
        
        for cls in [0, 1]:
            # Start with log class probability
            log_probs[cls] = np.log(self.class_probs[cls])
            
            # Add log word probabilities
            for word in text:
                if word in self.word_probs[cls]:
                    log_probs[cls] += np.log(self.word_probs[cls][word])
        
        return max(log_probs.items(), key=lambda x: x[1])[0]

    def save_model(self, filename='naive_bayes_model.pkl.gz', compressed=True):
        model_data = {
            'class_probs': self.class_probs,
            'word_probs': dict(self.word_probs),
            'vocabulary': self.vocabulary
        }
        
        if compressed:
            with gzip.open(filename, 'wb') as f:
                pickle.dump(model_data, f)
        else:
            with open(filename, 'wb') as f:
                pickle.dump(model_data, f)
        
        print(f"Model saved to {filename}")

def load_model(filename='naive_bayes_model.pkl.gz'):
    try:
        # Detect compression from file extension
        if filename.endswith('.gz'):
            with gzip.open(filename, 'rb') as f:
                data = pickle.load(f)
        else:
            with open(filename, 'rb') as f:
                data = pickle.load(f)
        
        model = OptimizedNaiveBayes()
        model.class_probs = data['class_probs']
        model.word_probs = defaultdict(dict, data['word_probs'])
        model.vocabulary = data['vocabulary']
        
        print(f"Model loaded from {filename}")
        return model
    except FileNotFoundError:
        print(f"Model file {filename} not found")
        return None
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None
    
def evaluate_model(model, test_data):
    confusion_matrix = defaultdict(int)  # TP, FP, TN, FN
    class_stats = defaultdict(lambda: {'correct': 0, 'total': 0})
    
    for _, row in test_data.iterrows():
        true_label = row['label']
        pred = model.predict(row['processed'])
        
        # Update confusion matrix
        if true_label == 1 and pred == 1:
            confusion_matrix['TP'] += 1
        elif true_label == 1 and pred == 0:
            confusion_matrix['FN'] += 1
        elif true_label == 0 and pred == 1:
            confusion_matrix['FP'] += 1
        else:
            confusion_matrix['TN'] += 1
        
        # Update class stats
        class_stats[true_label]['total'] += 1
        if pred == true_label:
            class_stats[true_label]['correct'] += 1
    
    # Calculate metrics
    accuracy = (confusion_matrix['TP'] + confusion_matrix['TN']) / len(test_data)
    precision = confusion_matrix['TP'] / (confusion_matrix['TP'] + confusion_matrix['FP'])
    recall = confusion_matrix['TP'] / (confusion_matrix['TP'] + confusion_matrix['FN'])
    f1 = 2 * (precision * recall) / (precision + recall)
    
    print(f"\nConfusion Matrix:")
    print(f"               Predicted Like  Predicted Dislike")
    print(f"Actual Like      {confusion_matrix['TP']:^12}      {confusion_matrix['FN']:^12}")
    print(f"Actual Dislike   {confusion_matrix['FP']:^12}      {confusion_matrix['TN']:^12}")
    
    print(f"\nOverall Accuracy: {accuracy:.2%}")
    print(f"Precision: {precision:.2%}")
    print(f"Recall: {recall:.2%}")
    print(f"F1 Score: {f1:.2%}")
    
    for label, stats in class_stats.items():
        print(f"Class {'Like' if label else 'Dislike'} Accuracy: {stats['correct']/stats['total']:.2%}")


In [5]:
# ## 5. Train and Evaluate Model
# Load and prepare data
sampled_data, test_data = load_and_prepare_data('News_Category_Dataset_v3.json')

# Preprocess text
sampled_data = preprocess_dataframe(sampled_data)
test_data = preprocess_dataframe(test_data)

test_data['label'] = test_data['category'].apply(
    lambda x: 1 if x in ['WELLNESS', 'ENTERTAINMENT'] else 0
)

# Try to load existing model, otherwise train new one
nb = load_model()
if nb is None:
    nb = OptimizedNaiveBayes(max_vocab_size=5000)  # Initialize with vocab limit
    nb.train(sampled_data)
    nb.save_model()  # Save the newly trained model

# Evaluate
print("\nEvaluating model...")
evaluate_model(nb, test_data)

print("Training labels:", sampled_data['label'].value_counts())
print("Test labels:", test_data['label'].value_counts())

# Top 10 "Like" words
print("\nTop 10 'Like' words:")
print(sorted(nb.word_probs[1].items(), key=lambda x: -x[1])[:10])

# Top 10 "Dislike" words  
print("\nTop 10 'Dislike' words:")
print(sorted(nb.word_probs[0].items(), key=lambda x: -x[1])[:10])



Loading data...
Loaded 209527 entries in 0.83s
Preprocessing text...
Preprocessing completed in 0.00s
Preprocessing text...


  sampled_data = (filtered_data.groupby('category', group_keys=False, observed=True)


Preprocessing completed in 0.46s
Model loaded from naive_bayes_model.pkl.gz

Evaluating model...

Confusion Matrix:
               Predicted Like  Predicted Dislike
Actual Like         27017              8140    
Actual Dislike       5875             33139    

Overall Accuracy: 81.10%
Precision: 82.14%
Recall: 76.85%
F1 Score: 79.40%
Class Like Accuracy: 76.85%
Class Dislike Accuracy: 84.94%
Training labels: label
0    150
1    150
Name: count, dtype: int64
Test labels: label
0    39014
1    35157
Name: count, dtype: int64

Top 10 'Like' words:
[('one', 0.003609456776755098), ('help', 0.002165674066053059), ('life', 0.002165674066053059), ('want', 0.002165674066053059), ('make', 0.001985201227215304), ('day', 0.001804728388377549), ('people', 0.001804728388377549), ('health', 0.001804728388377549), ('weight', 0.0016242555495397943), ('healthy', 0.0016242555495397943)]

Top 10 'Dislike' words:
[('police', 0.0035889686437476386), ('trump', 0.0028333962976955043), ('man', 0.0020778239516

In [6]:
# ## 6. Make Predictions
def predict_sentiment(text, model):
    processed = preprocess_text(text)
    return "Like 👍" if model.predict(processed) == 1 else "Dislike 👎"

# Example prediction
test_article = "I love playing"
print(f"\nPrediction for: '{test_article}'")
print("Result:", predict_sentiment(test_article, nb))

# Additional examples
examples = [
    "New yoga techniques help reduce stress",
    "Political tensions rise in the Middle East",
    "Celebrity wedding announcement",
    "Crime rate increases in urban areas"
]

print("\nAdditional Predictions:")
for example in examples:
    print(f"{example[:50]}... -> {predict_sentiment(example, nb)}")


Prediction for: 'I love playing'
Result: Like 👍

Additional Predictions:
New yoga techniques help reduce stress... -> Like 👍
Political tensions rise in the Middle East... -> Dislike 👎
Celebrity wedding announcement... -> Like 👍
Crime rate increases in urban areas... -> Dislike 👎
