Get newsgroup data

In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
    'rec.sport.hockey'
]
data_train = fetch_20newsgroups(subset='train', categories=categories, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42)

Setup cleaning

In [2]:
def is_letter_only(word):
    for char in word:
        if not char.isalpha():
            return False
    return True

from nltk.corpus import names
all_names = set(names.words())
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def clean_text(data):
    data_cleaned = []
    for doc in data:
        doc = doc.lower()
        doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names)
        data_cleaned.append(doc_cleaned)
    return data_cleaned

Clean data

In [3]:
cleaned_train = clean_text(data_train.data)
label_train = data_train.target
cleaned_test = clean_text(data_test.data)
label_test = data_test.target
print(len(label_train), len(label_test))

2634 1752


Verify balance

In [4]:
from collections import Counter
print(Counter(label_train))
print(Counter(label_test))

Counter({2: 600, 3: 593, 1: 584, 0: 480, 4: 377})
Counter({2: 399, 3: 394, 1: 389, 0: 319, 4: 251})


Fit tfidfvectorizer

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=None)
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

Fit and predict SVM

In [6]:
from sklearn.svm import SVC
svm = SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(term_docs_train, label_train)
accuracy = svm.score(term_docs_test, label_test)
print('The accuracy of 5class classification is:', '{0:.1f}%'.format(accuracy*100))

The accuracy of 5class classification is: 88.5%


Accuracy for individual classes

In [7]:
from sklearn.metrics import classification_report
prediction = svm.predict(term_docs_test)
report = classification_report(label_test, prediction)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.76      0.78       319
           1       0.91      0.95      0.93       389
           2       0.98      0.96      0.97       399
           3       0.92      0.94      0.93       394
           4       0.73      0.73      0.73       251

    accuracy                           0.88      1752
   macro avg       0.87      0.87      0.87      1752
weighted avg       0.88      0.88      0.88      1752

