# 20 Newsgroups Text Classification

In [40]:
from sklearn.datasets import fetch_20newsgroups
# Load the dataset using fetch_20newsgroups' default arguments.
# NOTE(review): `groups` is not referenced by any later cell visible here — confirm it is still needed.
groups = fetch_20newsgroups()

In [41]:
# Fetch both predefined splits with the same seed (train first, then test).
data_train, data_test = (
    fetch_20newsgroups(subset=split, random_state=21)
    for split in ('train', 'test')
)
train_label, test_label = data_train.target, data_test.target
# Sanity check: documents per split, plus the number of test labels.
len(data_train.data), len(data_test.data), len(test_label)

(11314, 7532, 7532)

In [42]:
import numpy as np
# Distinct label values present in the test split.
np.unique(test_label)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [43]:
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk.corpus import names

# Word list from the NLTK `names` corpus; clean() drops tokens that appear in it.
all_names = names.words()
# Shared lemmatizer instance used by clean().
WNL = WordNetLemmatizer()
def clean(data):
    """Normalise raw documents into lemmatised, space-joined strings.

    Each document is split on whitespace. A token is kept only if it is purely
    alphabetic and is not an entry of ``all_names`` (the comparison uses the
    token as written, before lower-casing). Kept tokens are lower-cased and
    passed through ``WNL.lemmatize``.

    Parameters
    ----------
    data : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order. A document with
        no surviving tokens yields an empty string.
    """
    # `all_names` is consulted once per token, so build a hash-based lookup a
    # single time per call instead of scanning the whole sequence every time.
    excluded = frozenset(all_names)
    return [
        ' '.join(
            WNL.lemmatize(token.lower())
            for token in document.split()
            if token.isalpha() and token not in excluded
        )
        for document in data
    ]

In [44]:
# Clean every training document, then show the first one as a spot check.
x_train = clean(data_train.data)
x_train[0]

'bouncing lymenet lehigh university the following address are on the lymenet mailing but are rejecting since the list server originally accepted these address i assume these address have since been improperly functioning mail gateway might also be if you are listed here and would still like to remain on the please write to i will remove these address from the list before the next newsletter go a a general please remember to from all your mailing list before your account is this will save the listserv maintainer from many box lehigh university'

In [45]:
 # Number of cleaned training documents.
 # NOTE(review): this line appears to carry a stray leading space; IPython accepts it, but a plain .py export would not.
 len(x_train)

11314

In [46]:
# Apply the same cleaning to the test documents and report how many there are.
x_test = clean(data_test.data)
len(x_test)

7532

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with English stop words removed and the vocabulary capped at 1000 terms.
tf = TfidfVectorizer(stop_words='english', max_features=1000)
# Learn the vocabulary and IDF weights from the training text only ...
X_train = tf.fit_transform(x_train)
# ... and reuse the fitted vectorizer for the test text.
X_test = tf.transform(x_test)
# NOTE: X_train/X_test (feature matrices) differ from x_train/x_test (cleaned strings) only by case.
X_train.shape, X_test.shape

((11314, 1000), (7532, 1000))

## Multinomial Naive Bayes

In [48]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, trained on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train, train_label)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [49]:
# Predicted class ids for the test documents.
Y_predict = clf.predict(X_test)

In [50]:
# Mean accuracy of the Naive Bayes model on the test split.
clf.score(X_test, test_label)

0.6443175783324482

In [51]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Per-class precision, recall and F1 for the Naive Bayes predictions.
print(classification_report(test_label, Y_predict))

              precision    recall  f1-score   support

           0       0.57      0.46      0.51       319
           1       0.51      0.60      0.55       389
           2       0.52      0.55      0.53       394
           3       0.52      0.58      0.55       392
           4       0.60      0.54      0.57       385
           5       0.62      0.60      0.61       395
           6       0.66      0.74      0.70       390
           7       0.67      0.72      0.69       396
           8       0.71      0.73      0.72       398
           9       0.68      0.74      0.71       397
          10       0.82      0.81      0.82       399
          11       0.88      0.77      0.82       396
          12       0.57      0.50      0.53       393
          13       0.70      0.72      0.71       396
          14       0.82      0.70      0.75       394
          15       0.50      0.83      0.62       398
          16       0.58      0.73      0.64       364
          17       0.87    

## Linear SVC

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import timeit

# Candidate regularisation strengths for the linear SVM.
parameters = {'C': (0.5, 1, 10, 100)}
linear_svc = LinearSVC()

# 3-fold exhaustive search over C on the pre-computed TF-IDF matrix, using all cores.
grid_search2 = GridSearchCV(
    estimator=linear_svc,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
)

# Wall-clock timing of the whole search (all fits plus the final refit).
start_time = timeit.default_timer()
grid_search2.fit(X_train, train_label)
final = timeit.default_timer() - start_time
print("Execution Time : ", final)

Execution Time :  20.902394499993534


In [53]:
# Winning C value and its mean cross-validated score, one per line.
print(grid_search2.best_params_, grid_search2.best_score_, sep='\n')

{'C': 0.5}
0.7240587621538075


In [54]:
# best_estimator_ is the LinearSVC refit with the best C found by the search.
grid_search_best2 = grid_search2.best_estimator_
# Mean accuracy of that model on the test split.
accur2 = grid_search_best2.score(X_test, test_label)
accur2

0.6303770578863516

## Hyperparameter Tuning: TF-IDF + Linear SVC Pipeline

In [55]:
from sklearn.pipeline import Pipeline

# Vectorizer and classifier chained into one estimator so that the TF-IDF
# settings can be searched together with the model.
pipeline = Pipeline([
    ('tf_id', TfidfVectorizer(stop_words="english")),
    ('svm_im', LinearSVC()),
])

# Search space; the `tf_id__` prefix routes each key to the 'tf_id' step.
parameter = {
    'tf_id__max_features': (100, 1000, 2000, 8000),
    'tf_id__max_df': (0.25, 0.5),
    'tf_id__smooth_idf': (True, False),
    'tf_id__sublinear_tf': (True, False),
}

In [56]:
# Exhaustive 3-fold search over the pipeline grid. The input is the cleaned
# text (lowercase x_train) because vectorization happens inside the pipeline.
# n_jobs=-1 matches the earlier LinearSVC search and evaluates candidates in parallel.
grid_search = GridSearchCV(pipeline, parameter, n_jobs=-1, cv=3)
grid_search.fit(x_train, train_label)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tf_id',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [57]:
# Best TF-IDF settings found by the pipeline search.
print(grid_search.best_params_)

{'tf_id__max_df': 0.25, 'tf_id__max_features': 8000, 'tf_id__smooth_idf': False, 'tf_id__sublinear_tf': True}


In [58]:
# Mean cross-validated score of the best parameter combination.
print(grid_search.best_score_)

0.8700729197980644
