# 1. Project Setup and Library Imports

Description: Import all required libraries for preprocessing, model training, and evaluation.

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.pipeline import Pipeline
import joblib


# 2. Load the Dataset

Description: Load the 20 Newsgroups dataset for text classification.

In [2]:
categories = ["sci.space", "rec.sport.baseball"]
newsgroups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers','footers','quotes'))

df = pd.DataFrame({
    "text": newsgroups.data,
    "label": newsgroups.target
})

df.head()


Unnamed: 0,text,label
0,\nDo you really have *that* much information o...,0
1,\n ^^^^^^\n\n\tSure they might. ...,0
2,------------------------- Original Article ---...,0
3,"\nNot really, though I wouldn't personally say...",0
4,\nCan anybody name a player who was 'rushed' t...,0


# 3. Inspect Dataset Sample

Description: View a few rows to understand the structure.

In [3]:
df.sample(5)

Unnamed: 0,text,label
1595,Can someone please remind me who said a well k...,1
114,"\n\n\n\nTed, you're missing a vital point. A...",0
635,\n\nConsidering the magnitude of loss of life ...,1
1033,\n<<<most of message deleted>>>\n\n,1
1940,"In the April edition of ""One Small Step for a ...",1


# 4. Check Class Distribution

Description: Examine dataset balance across categories.

In [4]:
df['label'].value_counts()

label
0    994
1    987
Name: count, dtype: int64

# 5. Train–Test Split

Description: Split dataset into train and test sets

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.3, random_state=42
)

# 6. TF-IDF Vectoriser Setup

Description: Create TF–IDF vectoriser for converting text to features.

In [6]:
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)

# 7. Fit TF-IDF on Training Data

Description: Learn vocabulary and weighting from training data.

In [7]:
tfidf.fit(X_train)

# 8. Transform Train and Test Data

Description: Convert text into TF–IDF feature vectors.

In [8]:
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# 9. Train Baseline Model (LinearSVC)

Description: Train a baseline SVM classifier.

In [9]:
baseline_model = LinearSVC()
baseline_model.fit(X_train_tfidf, y_train)

# 10. Evaluate Baseline Model

Description: Evaluate accuracy and classification report.

In [10]:
baseline_pred = baseline_model.predict(X_test_tfidf)

print("Baseline Accuracy:", accuracy_score(y_test, baseline_pred))
print(classification_report(y_test, baseline_pred, target_names=categories))

Baseline Accuracy: 0.9394957983193277
                    precision    recall  f1-score   support

         sci.space       0.91      0.97      0.94       286
rec.sport.baseball       0.97      0.91      0.94       309

          accuracy                           0.94       595
         macro avg       0.94      0.94      0.94       595
      weighted avg       0.94      0.94      0.94       595



# 11. Train Multiple Models for Comparison

Description: Train several algorithms to compare performance.

In [11]:
models = {
    "LinearSVC": LinearSVC(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "MultinomialNB": MultinomialNB()
}

results = {}


# 12. Compare Model Performance

Description: Measure accuracy and F1-score for each model.

In [12]:
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    preds = model.predict(X_test_tfidf)
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="weighted")
    results[name] = (acc, f1)
    print(f"\n{name}")
    print("Accuracy:", acc)
    print("F1-score:", f1)



LinearSVC
Accuracy: 0.9394957983193277
F1-score: 0.9395060528052855

LogisticRegression
Accuracy: 0.9529411764705882
F1-score: 0.9528971758252455

MultinomialNB
Accuracy: 0.9512605042016806
F1-score: 0.951274548221612


# 13. Select Best Model

Description: Choose the highest-performing model.

In [13]:
best_model_name = max(results, key=lambda x: results[x][0])
best_model_name

'LogisticRegression'

# 14. Build Scikit-Learn Pipeline (TF-IDF + Best Model)

Description: Create a reusable Pipeline for deployment.

In [14]:
best_model = models[best_model_name]

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.7)),
    ('clf', best_model)
])

# 15. Train the Pipeline on Raw Text

Description: Train end-to-end pipeline on raw text.

In [15]:
pipeline.fit(X_train, y_train)

# 16. Evaluate Final Pipeline

Description: Evaluate the performance of the Pipeline.

In [16]:
pipe_pred = pipeline.predict(X_test)
print("Pipeline Accuracy:", accuracy_score(y_test, pipe_pred))
print(classification_report(y_test, pipe_pred, target_names=categories))

Pipeline Accuracy: 0.9529411764705882
                    precision    recall  f1-score   support

         sci.space       0.97      0.93      0.95       286
rec.sport.baseball       0.94      0.97      0.96       309

          accuracy                           0.95       595
         macro avg       0.95      0.95      0.95       595
      weighted avg       0.95      0.95      0.95       595



# 17. Save the Trained Pipeline (joblib)

Description: Save the final model for use in FastAPI.

In [17]:
joblib.dump(pipeline, "news_text_classifier.joblib")

['news_text_classifier.joblib']

# 18. Load Saved Pipeline for Verification

Description: Ensure saved model loads correctly.

In [19]:
loaded_model = joblib.load("news_text_classifier.joblib")

# 19. Test Prediction with Sample Text

Description: Run prediction using loaded Pipeline.

In [20]:
sample_text = "NASA has announced a new space mission."
loaded_model.predict([sample_text])[0]

1

# 20. Helper Function for Quick Predictions

Description: Define utility function for repeated tests.

In [21]:
def predict(text):
    return loaded_model.predict([text])[0]

predict("The Yankees won their game today.")

0