In [1]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Load the 20 newsgroups corpus (downloaded on first use, then read from the
# local scikit-learn cache). subset="train" is the loader's default; it is
# spelled out so it is clear that only the training portion is loaded here.
newsgroups_data = fetch_20newsgroups(subset="train")
newsgroups_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [3]:
# Show the description text bundled with the dataset object. print() is used
# so the multi-line string renders with real line breaks instead of as a repr.
print(newsgroups_data.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [4]:
# The 20 newsgroup (class) names that the integer labels in `target` refer to.
newsgroups_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# Number of documents and number of labels; the two must match (one label per
# document). This is the size of the loaded subset, not of the full corpus
# described in DESCR.
len(newsgroups_data.data), len(newsgroups_data.target)

(11314, 11314)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Turn the raw posts into TF-IDF weighted term vectors, dropping English stop words.
# NOTE(review): the vectorizer is fit on *all* loaded documents, and the
# train/test split is only applied afterwards to the transformed matrix. The
# vocabulary and IDF weights therefore already "see" the held-out documents,
# which can make the test score optimistic. Consider splitting the raw texts
# first, then fit on the training texts and only transform the test texts.
tfidf_vect = TfidfVectorizer(stop_words='english')
X_transformed = tfidf_vect.fit_transform(newsgroups_data.data)

In [8]:
# (number of documents, number of vocabulary terms) of the TF-IDF matrix.
X_transformed.shape

(11314, 129796)

In [10]:
# Size of the learned vocabulary; should equal the column count of X_transformed.
len(tfidf_vect.get_feature_names_out())

129796

In [11]:
import random

# Peek at 10 random (term, column index) pairs from the fitted vocabulary.
# random.sample() requires a sequence: passing the dict_items view directly is
# deprecated on Python 3.9/3.10 and raises TypeError on 3.11+, so the view is
# materialised into a list first.
random.sample(list(tfidf_vect.vocabulary_.items()), 10)

[('041033', 941),
 ('fromn', 56880),
 ('lenger', 75154),
 ('tigris', 114831),
 ('aiders', 26893),
 ('niclas', 87042),
 ('gts1000a', 60624),
 ('flstf', 55866),
 ('600cc', 16472),
 ('drchambe', 48897)]

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the TF-IDF rows (with their labels) for evaluation.
# random_state pins the shuffle so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    newsgroups_data.target,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [13]:
# Sanity check of the split: train and test keep the same feature width, and
# each label vector has as many entries as its matrix has rows.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9051, 129796), (2263, 129796), (9051,), (2263,))

In [15]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [16]:
# Train a multi-layer perceptron with one hidden layer of 32 ReLU units on the
# TF-IDF features, then report accuracy on the held-out split.
# random_state fixes the weight initialisation and Adam's mini-batch sampling,
# so the loss curve and the printed score are reproducible on a fresh kernel
# (42 matches the seed already used for train_test_split).
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(32,),
    activation='relu',
    solver='adam',
    verbose=True,      # print the training loss after every iteration
    max_iter=50,       # upper bound on training epochs for the adam solver
    random_state=42,
)
mlp_clf.fit(X_train, y_train)
# score() returns mean accuracy on the given samples and labels.
print(mlp_clf.score(X_test, y_test))

Iteration 1, loss = 2.91328903
Iteration 2, loss = 2.53366606
Iteration 3, loss = 2.01107542
Iteration 4, loss = 1.45322664
Iteration 5, loss = 0.98859055
Iteration 6, loss = 0.66508671
Iteration 7, loss = 0.45761381
Iteration 8, loss = 0.32608374
Iteration 9, loss = 0.24017913
Iteration 10, loss = 0.18226524
Iteration 11, loss = 0.14214852
Iteration 12, loss = 0.11328019
Iteration 13, loss = 0.09232408
Iteration 14, loss = 0.07661956
Iteration 15, loss = 0.06469559
Iteration 16, loss = 0.05538115
Iteration 17, loss = 0.04815556
Iteration 18, loss = 0.04227240
Iteration 19, loss = 0.03755508
Iteration 20, loss = 0.03363623
Iteration 21, loss = 0.03045449
Iteration 22, loss = 0.02774890
Iteration 23, loss = 0.02547260
Iteration 24, loss = 0.02350259
Iteration 25, loss = 0.02185636
Iteration 26, loss = 0.02037440
Iteration 27, loss = 0.01912951
Iteration 28, loss = 0.01803795
Iteration 29, loss = 0.01705026
Iteration 30, loss = 0.01613443
Iteration 31, loss = 0.01537545
Iteration 32, los

