## **ML model for the 20 Newsgroups dataset**

In [6]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the predefined train and test partitions with the same fixed shuffle
# seed, so the document order is reproducible across runs.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, shuffle=True, random_state=42)
    for split in ('train', 'test')
)

# `.data` holds the raw post texts; `.target` holds the label indices that
# map into `.target_names`.
X_train = newsgroups_train.data
y_train = newsgroups_train.target
X_test = newsgroups_test.data
y_test = newsgroups_test.target

In [25]:
# Peek at one raw training post. print() is used rather than a bare
# expression so the newlines in the text render instead of showing as "\n".
sample_post = X_train[1000]
print(sample_post)

From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)
Subject: Diamond SS24X, Win 3.1, Mouse cursor
Organization: National Library of Medicine
Lines: 10


Anybody seen mouse cursor distortion running the Diamond 1024x768x256 driver?
Sorry, don't know the version of the driver (no indication in the menus) but it's a recently
delivered Gateway system.  Am going to try the latest drivers from Diamond BBS but wondered
if anyone else had seen this.

post or email

--Don Lindbergh
dabl2@lhc.nlm.nih.gov



In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [8]:
# Two-step text classifier: TF-IDF features (English stop words removed)
# feeding a linear model. The step names 'tfidf' and 'clf' are the prefixes
# used by the grid-search parameter keys.
tfidf_step = ('tfidf', TfidfVectorizer(stop_words='english'))
# Placeholder estimator only; the grid search swaps in each candidate classifier.
clf_step = ('clf', LogisticRegression())

pipeline = Pipeline(steps=[tfidf_step, clf_step])

In [9]:
# TF-IDF settings searched for every classifier family. Defined once so the
# three sub-grids below cannot drift apart.
tfidf_grid = {
    'tfidf__max_features': [10000, 20000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
}

# One sub-grid per classifier family. The 'clf' entry replaces the pipeline's
# placeholder step; the 'clf__*' entries tune that estimator's own
# hyper-parameter. Each sub-grid yields 2 * 2 * 3 = 12 candidates (36 total).
param_grid = [
    {
        **tfidf_grid,
        # max_iter raised above the default to give the solver room to converge.
        'clf': [LogisticRegression(max_iter=2000)],
        'clf__C': [0.1, 1, 5],
    },
    {
        **tfidf_grid,
        # Seeded because LinearSVC's dual solver shuffles samples; without a
        # fixed random_state the fitted model can differ between runs.
        'clf': [LinearSVC(random_state=42)],
        'clf__C': [0.1, 1, 5],
    },
    {
        **tfidf_grid,
        'clf': [MultinomialNB()],
        'clf__alpha': [0.1, 0.5, 1.0],
    },
]

In [10]:
# Exhaustive search over every candidate in param_grid, scored with 3-fold
# cross-validation. n_jobs=-1 uses all available cores; verbose=2 logs
# progress for each fit.
search_settings = {'cv': 3, 'n_jobs': -1, 'verbose': 2}
grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, **search_settings)

# Later cells rely on the fitted search object (best_params_, best_score_,
# best_estimator_, predict).
grid.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [12]:
# Winning hyper-parameter combination and its cross-validated score.
best_params = grid.best_params_
best_cv_score = grid.best_score_
print("Best Parameters:", best_params)
print("Best CV Score:", best_cv_score)

# Score the selected pipeline on the held-out test split.
y_pred = grid.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

# Per-class precision / recall / F1, labelled with the newsgroup names.
class_names = newsgroups_train.target_names
per_class_report = classification_report(y_test, y_pred, target_names=class_names)
print(per_class_report)

Best Parameters: {'clf': LinearSVC(), 'clf__C': 1, 'tfidf__max_features': 20000, 'tfidf__ngram_range': (1, 1)}
Best CV Score: 0.9106417986458583
Test Accuracy: 0.8422729686670207
                          precision    recall  f1-score   support

             alt.atheism       0.82      0.76      0.79       319
           comp.graphics       0.73      0.79      0.76       389
 comp.os.ms-windows.misc       0.75      0.73      0.74       394
comp.sys.ibm.pc.hardware       0.72      0.74      0.73       392
   comp.sys.mac.hardware       0.83      0.84      0.83       385
          comp.windows.x       0.86      0.75      0.80       395
            misc.forsale       0.81      0.91      0.85       390
               rec.autos       0.91      0.88      0.90       396
         rec.motorcycles       0.94      0.95      0.94       398
      rec.sport.baseball       0.90      0.94      0.92       397
        rec.sport.hockey       0.95      0.97      0.96       399
               sci.crypt    

In [13]:
import joblib

# Persist the selected pipeline (vectorizer + classifier together) so it can
# be reloaded and applied directly to raw text.
model_path = "news_classifier_pipeline.pkl"
joblib.dump(grid.best_estimator_, model_path)

['news_classifier_pipeline.pkl']

In [23]:
# Hand-written sanity-check sentences, each aimed at an obvious newsgroup.
examples = [
    "God does not exist, atheism makes more sense.",
    "The government is debating new gun control laws.",
    "I love driving my new sports car!",
    "NASA discovered new planets in the solar system.",
]

best_model = grid.best_estimator_
label_names = newsgroups_train.target_names

for text in examples:
    # predict() expects a sequence of documents, so wrap the single text in
    # a list and take the only prediction back out.
    predicted_index = best_model.predict([text])[0]
    print(f"Text: {text}")
    print(f"Predicted category: {label_names[predicted_index]}\n")


Text: God does not exist, atheism makes more sense.
Predicted category: alt.atheism

Text: The government is debating new gun control laws.
Predicted category: talk.politics.guns

Text: I love driving my new sports car!
Predicted category: rec.autos

Text: NASA discovered new planets in the solar system.
Predicted category: sci.space

