In [1]:
import nltk
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Classes
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose analyzer stems everything it emits.

    Stemming is delegated to the module-level ``stemmer`` object. The name is
    resolved each time a document is analyzed, so ``stemmer`` has to be defined
    before this vectorizer is fitted or used to transform text.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to a list of stemmed items."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Run the parent analyzer first, then stem each item it yields.
            return [stemmer.stem(item) for item in base_analyzer(doc)]

        return stemmed_analyzer


# Load the training and validation splits from local text files.
# Documents and labels are read line by line; line i of a labels file is used
# as the class of line i of the matching data file.
# The files are opened in `with` blocks so every handle is closed as soon as
# its lines have been read.
with open('data_train.txt') as text_train, open('labels_train_original.txt') as class_train:
    lines_train = text_train.readlines()
    target_train = class_train.readlines()

with open('data_valid.txt') as text_test, open('labels_valid_original.txt') as class_test:
    lines_test = text_test.readlines()
    target_test = class_test.readlines()

# Distinct class labels seen in the training split.
# NOTE: readlines() keeps the trailing newline, so every label ends in '\n'.
# Training and validation labels are read the same way, so they still compare equal.
classes = set(target_train)

print(classes)  # prints all the categories

{'Features\n', 'Classifieds\n', 'Opinion\n', 'News\n'}


In [19]:
# Sanity check: number of lines (documents) read from data_valid.txt.
len(lines_test)

2000

In [3]:
# Extracting features from text files
# Learn the vocabulary from the training lines and build the document-term count matrix in one pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(lines_train)
# Shape is (number of training lines, vocabulary size).
print(X_train_counts.shape)

(2000, 38749)


In [4]:
# TF-IDF
# Re-weight the raw term counts with a default-configured TfidfTransformer.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Only the weights change; the matrix keeps the shape of X_train_counts.
print(X_train_tfidf.shape)

(2000, 38749)


In [5]:
# Machine Learning
# Training Naive Bayes (NB) classifier on training data.
# This standalone model is fitted directly on the TF-IDF matrix built above;
# the cells shown below rebuild the same steps as a Pipeline and do not reuse `clf`.
clf = MultinomialNB().fit(X_train_tfidf, target_train)

In [6]:
# Building a pipeline: vectorizing, TF-IDF weighting and the NB classifier are
# bundled into one object so they are fitted and applied together.
# The step names 'vect', 'tfidf' and 'clf' are arbitrary labels; the grid-search
# cells use them as parameter prefixes.
# 'text_clf' is the model used going forward.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
).fit(lines_train, target_train)

In [7]:
# Performance of NB Classifier
# twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
# Predict a label for every validation line with the fitted pipeline.
predicted = text_clf.predict(lines_test)
# Accuracy: fraction of predictions equal to the reference label at the same position.
np.mean(predicted == target_test)

0.6595

In [8]:
# Linear SVM: an SGDClassifier with hinge loss on the same count + TF-IDF
# features, fitted on the training split and scored on the validation split.
text_clf_svm = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
    ]
).fit(lines_train, target_train)
predicted_svm = text_clf_svm.predict(lines_test)
# Accuracy: fraction of validation predictions that match the reference labels.
np.mean(predicted_svm == target_test)

0.6785

In [9]:
# Grid Search
# Candidate settings to tune for the NB pipeline.
# Every key has the form '<step name>__<parameter>', where the step name is the
# label given to that step when the pipeline was built.
# E.g. vect__ngram_range compares unigrams only against unigrams plus bigrams.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [10]:
# Cross-validated grid search over `parameters` for the NB pipeline.
# n_jobs=-1 lets the search use multiple cores of the local machine.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
).fit(lines_train, target_train)

In [11]:
# Report the best mean cross-validation score and the parameter combination
# that produced it, one per line.
print(gs_clf.best_score_, gs_clf.best_params_, sep='\n')

0.677
{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


In [12]:
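# Note: the ~90.6% accuracy and the parameters {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
# appear to be carried over from the original 20 Newsgroups tutorial and do not apply to this data set.
# Here the best mean cross-validation score for the NB classifier is 0.677, obtained with
# {'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)} (see the output above).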

In [13]:
# Same grid search for the SVM pipeline; the classifier step there is named
# 'clf-svm', so its alpha is addressed as 'clf-svm__alpha'.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

gs_clf_svm = GridSearchCV(
    estimator=text_clf_svm,
    param_grid=parameters_svm,
    n_jobs=-1,
).fit(lines_train, target_train)

# Best mean cross-validation score, then the parameters that achieved it.
print(gs_clf_svm.best_score_, gs_clf_svm.best_params_, sep='\n')

0.672
{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


In [14]:
# NLTK
# Removing stop words: same NB pipeline, but the vectorizer drops English stop words.
# NOTE: this rebinds `text_clf` to a new pipeline that is only constructed here,
# not fitted.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [15]:
# Stemming: NB pipeline whose vectorizer stems its tokens with a Snowball stemmer.
#nltk.download()
# `stemmer` must stay a module-level name: StemmedCountVectorizer looks it up
# when it analyzes a document.
stemmer = SnowballStemmer("english", ignore_stopwords=True)
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
text_mnb_stemmed = Pipeline(
    steps=[
        ('vect', stemmed_count_vect),
        ('tfidf', TfidfTransformer()),
        ('mnb', MultinomialNB(fit_prior=False)),
    ]
).fit(lines_train, target_train)
predicted_mnb_stemmed = text_mnb_stemmed.predict(lines_test)
# Accuracy of the stemmed pipeline on the validation split.
print(np.mean(predicted_mnb_stemmed == target_test))

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
0.656
