# Text Classifier

For training and testing I used the 20 newsgroups dataset, loaded with [_fetch_20newsgroups_](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html) from _sklearn.datasets_.



## Importing our classes, the dataset loader, and helper functions

In [5]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import SGDClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

## Limiting the number of categories

In [7]:
# Keep the problem small: only five of the twenty newsgroups are loaded.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
    'rec.autos',
]

# Training split, shuffled with a fixed seed so the document order is reproducible.
train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

KeyboardInterrupt: 

## A first look at our data

In [None]:
# Label names of the loaded categories; a name's position in this list is the
# integer id stored in train.target (see the lookup train.target_names[t] below).
print(train.target_names)

['alt.atheism', 'comp.graphics', 'rec.autos', 'sci.med', 'soc.religion.christian']


In [None]:
# Integer class ids of the first ten training documents.
train.target[:10]

array([1, 0, 1, 1, 1, 1, 3, 4, 0, 1])

In [None]:
# Translate the first ten integer labels back into newsgroup names, one per line.
for label_name in (train.target_names[idx] for idx in train.target[:10]):
    print(label_name)

comp.graphics
alt.atheism
comp.graphics
comp.graphics
comp.graphics
comp.graphics
sci.med
soc.religion.christian
alt.atheism
comp.graphics


## Feature extraction for the classifier - TF-IDF

In [None]:
# TF-IDF with default settings turns each raw document into a weighted term vector.
vectorizer = TfidfVectorizer()

# Fit on the training texts only; the fitted vectorizer is reused later with
# .transform() on the test texts.
X_train = vectorizer.fit_transform(raw_documents=train.data)

## Training the classifier

In [None]:
# Linear model trained with stochastic gradient descent: hinge loss with an L2
# penalty. tol=None disables the early-stopping tolerance, so training runs for
# the full max_iter=5 passes over the data; random_state fixes the shuffling.
clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    max_iter=5,
    tol=None,
    random_state=42,
)
clf.fit(X_train, train.target)

## Evaluating the performance

In [None]:
# Held-out split, restricted to the same categories as the training data.
test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = test.data

# Only transform here: the vectorizer was already fitted on the training texts.
vect_transform = vectorizer.transform(docs_test)
predicted = clf.predict(vect_transform)

# Per-class precision / recall / F1, labelled with the newsgroup names.
print(
    metrics.classification_report(
        y_true=test.target,
        y_pred=predicted,
        target_names=test.target_names,
    )
)

# Integer class ids known to the fitted classifier.
print(clf.classes_)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.80      0.87       319
         comp.graphics       0.87      0.96      0.91       389
             rec.autos       0.96      0.98      0.97       396
               sci.med       0.95      0.87      0.91       396
soc.religion.christian       0.88      0.96      0.92       398

              accuracy                           0.92      1898
             macro avg       0.92      0.91      0.92      1898
          weighted avg       0.92      0.92      0.92      1898

[0 1 2 3 4]


In [None]:
# Confusion matrix of the test predictions: entry [i, j] counts documents whose
# true class is i and whose predicted class is j.
#
# The function is called through the `metrics` module on purpose. This cell
# rebinds the name `confusion_matrix` (imported as a function at the top of the
# notebook) to the resulting array, so calling the bare name would only work the
# first time the cell is run; going through `metrics` keeps the cell re-runnable.
# The variable name is kept because the plotting cell reads `confusion_matrix`.
confusion_matrix = metrics.confusion_matrix(test.target, predicted)
print(confusion_matrix)

[[256   7   1  13  42]
 [  2 374   8   1   4]
 [  0   8 387   1   0]
 [  7  31   6 346   6]
 [  5   9   0   2 382]]


In [8]:
# Heat-map view of the confusion matrix, easier to read than the raw numbers.
# NOTE: `confusion_matrix` must be the array computed in the previous cell. On a
# fresh kernel the name still refers to the function imported from
# sklearn.metrics, which matshow cannot draw.
plt.matshow(confusion_matrix)
plt.title('Matriz de confusão')
plt.colorbar()

# Columns (x axis) are the predicted classes, rows (y axis) the true classes.
plt.xlabel('Classificações obtidas')
plt.ylabel('Classificações corretas')
plt.show()

ValueError: not enough values to unpack (expected 2, got 0)