In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from nltk.corpus import names, movie_reviews
import nltk
import numpy as np
import pandas as pd
# Download only the NLTK resources this notebook references instead of the
# entire 'all' collection (which is very large and floods the cell output):
#   - 'punkt' / 'punkt_tab': models behind nltk.tokenize.word_tokenize
#     (newer NLTK releases look for 'punkt_tab', older ones for 'punkt')
#   - 'names', 'movie_reviews': corpora imported from nltk.corpus above
# Add further package ids here if other cells need more resources.
for nltk_resource in ('punkt', 'punkt_tab', 'names', 'movie_reviews'):
    nltk.download(nltk_resource, quiet=True)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

True

In [None]:
# Load the 20 Newsgroups corpus; `subset='train'` is spelled out so it is
# obvious which portion of the dataset the rest of the notebook works with.
twenty_users = datasets.fetch_20newsgroups(subset='train')

In [None]:
# Display the category names that come with the loaded dataset.
twenty_users.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
# One row per post: the raw text in 'doc' and its integer class id in 'target'.
df = pd.DataFrame({'doc': twenty_users.data, 'target': twenty_users.target})

# Hold out part of the rows for evaluation (train_test_split's default
# proportions are used).  A fixed random_state makes the split -- and every
# number computed from it below -- identical on each Restart & Run All.
traindf, testdf = train_test_split(df, random_state=42)

In [None]:
from nltk.tokenize import word_tokenize

# Document frequency of every token in the training split.
#
# * Each document contributes at most 1 to a token's count (the inner set
#   de-duplicates tokens within a document), so dividing a count by the number
#   of training documents gives the fraction of documents containing the token.
#   Counting raw occurrences instead would make that ratio exceed 1 for common
#   words and the 0.975 / 0.025 cut-offs would not be proportions.
# * Tokens are lower-cased because CountVectorizer lower-cases text before it
#   applies its stop-word list; mixed-case entries such as 'The' would never
#   match anything and would also split the counts of a single word.
distribution = nltk.FreqDist(
    token
    for doc in traindf['doc']
    for token in {word.lower() for word in word_tokenize(doc)}
)

In [None]:
# NOTE: despite its name, `total_words` is the number of training documents
# (rows of traindf['doc']), not a count of tokens.
total_words = len(traindf['doc'])

# Collect every token whose count in `distribution`, divided by the number of
# training documents, lies outside the open interval (0.025, 0.975).  These
# are the tokens that will be excluded from the vocabulary.
words975 = []
for word, freq in distribution.items():
    ratio = freq / total_words
    if not (0.025 < ratio < 0.975):
        words975.append(word)

In [None]:
# Token-count vectorizer; every entry of `words975` is passed as a stop word,
# i.e. a token to be left out of the vocabulary.
vectorizer = CountVectorizer(stop_words=words975)

In [None]:
# Text classification pipeline: token counts from `vectorizer` feed a
# 50-tree random forest.
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    # random_state fixes the forest's internal randomness so that repeated
    # runs of the notebook produce the same model and the same scores.
    ('classifier', RandomForestClassifier(n_estimators=50, random_state=42)),
])
pipeline.fit(traindf['doc'], traindf['target'])



In [None]:
# Accuracy on the same rows the pipeline was fitted on.
training_accuracy = accuracy_score(
    y_true=traindf['target'],
    y_pred=pipeline.predict(traindf['doc']),
)
training_accuracy

0.9998821449616971

In [None]:
# Accuracy on the held-out rows the pipeline never saw during fitting.
testing_accuracy = accuracy_score(
    y_true=testdf['target'],
    y_pred=pipeline.predict(testdf['doc']),
)
testing_accuracy

0.7317073170731707

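The training accuracy is about 99.99%, which means the model fits the training data almost perfectly. Because the accuracy on the held-out test data is much lower, this gap indicates that the model has largely memorized the training data and is overfitting.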

The testing accuracy is about 73%, which is almost the same as when the dimensionality was decreased.