In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from nltk.corpus import names, movie_reviews
import nltk
import numpy as np
import pandas as pd
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

True

Fetching and importing the twenty news groups data set

In [None]:

twenty_users = datasets.fetch_20newsgroups()

Displaying the target names

In [None]:
twenty_users.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
from nltk import FreqDist

Using nltk distribution FreqDist to identify the words

In [120]:
words = [word for doc in twenty_users.data for word in doc.split()]
word_Freq = FreqDist(all_words)

Words that appears more than 97.5 and less than 2.5

Using numpy percentile to do so

In [None]:
freq_1 = np.percentile(list(word_Freq.values()),97.5)
freq_2 = np.percentile(list(word_Freq.values()),2.5)

In [None]:
total_docs = len(twenty_users.data)

In [None]:
high_freq1 = [word for word, freq in word_Freq.items() if freq >= freq_1]
low_freq2 = [word for word,freq in word_Freq.items() if freq <= freq_2]

In [None]:
df = pd.DataFrame({'doc' :twenty_users.data,'target' : twenty_users.target})

Adding up for the stop words

In [84]:
stop_words = high_freq1 + low_freq2

Splitting the train test data sets

In [105]:
traindf,testdf = train_test_split(df)

Using count vectorizer for dealing with stop words that helps in counting

In [106]:
vectorizer = CountVectorizer(stop_words = stop_words)

Creating a pipeline for counting and applying the multinomial classifier


In [107]:

pipeline = Pipeline([('vectorizer', CountVectorizer(binary=True)),
                     ('classifier', MultinomialNB())])
pipeline.fit(traindf['doc'], traindf['target'])

In [108]:
confusion_matrix(testdf['target'],pipeline.predict(testdf['doc']))

array([[115,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,  10,   0,   0,   0,   1],
       [  0, 114,   0,   9,   3,   9,   0,   0,   0,   0,   0,  10,   0,
          2,   1,   1,   0,   1,   0,   0],
       [  0,   5,  47,  65,   1,  16,   1,   0,   0,   0,   0,  15,   5,
          0,   2,   1,   1,   0,   0,   0],
       [  0,   2,   0, 116,   1,   2,   2,   4,   0,   0,   0,   3,   4,
          1,   2,   0,   0,   1,   0,   0],
       [  0,   0,   0,  12, 113,   2,   1,   0,   0,   0,   0,   6,   9,
          1,   1,   1,   0,   0,   0,   0],
       [  0,   8,   0,   4,   0, 140,   0,   0,   0,   0,   0,   4,   0,
          0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   1,  18,   2,   0, 103,   4,   1,   1,   2,   6,   8,
          2,   2,   0,   4,   0,   0,   0],
       [  0,   0,   0,   0,   1,   0,   0, 144,   1,   1,   0,   1,   3,
          0,   0,   1,   5,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   3, 146,   0,   0,  

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

Using TfidVectorizer to fit and transform

In [None]:
vectorizer = TfidfVectorizer()

In [110]:
X_train = vectorizer.fit_transform(df['doc'])
y_train = df['target']

Splitting the vector data into train and test split

In [115]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)

Using the classiier Multinomial to find the predictions and accuracy

In [116]:
classifier = MultinomialNB()

Fitting the data set and identifying the accuracy score

In [117]:
classifier.fit(X_train,y_train)

In [118]:
train_pred = classifier.predict(X_train)
test_pred = classifier.predict(X_test)

In [119]:
train_accuracy = accuracy_score(y_train,train_pred)
test_accuracy = accuracy_score(y_test,test_pred)
print(train_accuracy)
print(test_accuracy)

0.967848856479947
0.8422448077772868


Conclusion :Adding the stop words to the count vector helps in improving the accuracy score for both test  and train (when compared it was : 0.745)