# Importing The Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
import re
import string
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Loading The Dataset

In [2]:
categories = ['sci.med', 'comp.graphics']
newsgroups = fetch_20newsgroups(subset='all', categories=categories,
                                remove=('headers', 'footers', 'quotes'), random_state=42)

In [3]:
df = pd.DataFrame({
    'text': newsgroups.data,
    'target': newsgroups.target,
    'category': [categories[i] for i in newsgroups.target]
})

In [4]:
print(f"Dataset shape: {df.shape}")
print(f"Category distribution:\n{df['category'].value_counts()}")

Dataset shape: (1963, 3)
Category distribution:
category
comp.graphics    990
sci.med          973
Name: count, dtype: int64


# Displaying Sample Data

In [5]:
print("\nSample texts from each category:")
for category in categories:
    sample_text = df[df['category'] == category]['text'].iloc[0][:200] + "..."
    print(f"\n{category}: {sample_text}")


Sample texts from each category:

sci.med: Hi all,

  I am looking for a recommandation on a good royalty free graphics
library package for C and C++ program.  This is mainly use to write
children games and education software.  I heard someone...

comp.graphics: 
You certainly do not see OTC preparations advertised as such.
The only such ridiculous concoctions are nostrums for premenstrual
syndrome, ostensibly to treat headache and "bloating" simultaneously.
...


# Text Preprocessing

Text Cleaning

In [6]:
def clean_text(text):
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [7]:
df['cleaned_text'] = df['text'].apply(clean_text)

Removing Frequent Words (Custom Stopwords)

In [8]:
def get_frequent_words(texts, n=50):
    all_words = ' '.join(texts).split()
    word_freq = Counter(all_words)
    return [word for word, freq in word_freq.most_common(n)]

In [9]:
frequent_words = get_frequent_words(df['cleaned_text'].tolist(), 30)
custom_stopwords = set(frequent_words)

print(f"Custom stopwords (top 10): {list(custom_stopwords)[:10]}")

Custom stopwords (top 10): ['that', 'of', 'is', 'by', 'in', 'to', 'with', 'and', 'can', 'it']


Removing Stopwords

In [10]:
def remove_stopwords(text, stop_words):
    if not isinstance(text, str):
        return ""

    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

In [11]:
df['processed_text'] = df['cleaned_text'].apply(
    lambda x: remove_stopwords(x, custom_stopwords)
)

# Displaying Cleaned and Original Text

In [12]:
print("\nOriginal vs Cleaned Text Comparison:")
sample_idx = 10
print(f"\nOriginal text:\n{df['text'].iloc[sample_idx][:150]}...")
print(f"\nCleaned text:\n{df['processed_text'].iloc[sample_idx][:150]}...")


Original vs Cleaned Text Comparison:

Original text:

And to add further fuel to the flame war, I read about 20 years ago that
the "natural" MSG - extracted from the sources you mention above - does not
...

Cleaned text:
add further fuel flame war read about years ago natural msg extracted sources mention above does cause reported aftereffects its only nasty artificial...


# Splitting The Dataset

In [13]:
X = df['processed_text']
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 1374
Test set size: 589
