In [24]:
import pandas as pd

#Dataset
from sklearn.datasets import fetch_20newsgroups

from string import punctuation

#feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

#classificadores
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import cross_val_score


In [2]:
dataset = fetch_20newsgroups()
dataset.data[:1]

[u"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"]

In [3]:
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [25]:
#utilizar apenas duas categorias para demonstração
categories = ['rec.autos', 'rec.sport.baseball']

dataset_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42) 

In [5]:
#nomes das categorias
dataset_train.target_names

['rec.autos', 'rec.sport.baseball']

In [6]:
#rótulo de cada categoria
dataset_train.target[:5]

array([0, 1, 1, 1, 1])

In [7]:
#cálculo TF-IDF para identificação do peso (importância) de cada termo no dataset
vectorizer = TfidfVectorizer()
x_train_tfidf_vectorize = vectorizer.fit_transform(dataset_train.data)

In [8]:
#separação do conjunto de dados, onde X representa as features (neste caso, o texto) e y a classe de cada amostra
X, y = x_train_tfidf_vectorize, dataset_train.target

### Utilizando cross-validation

1. Necessário definir o número de folds (quantidade de grupos no qual o dataset será dividido)
2. Utiliza 1 fold para teste e os demais para treinamento
3. Cálculo do score
4. Repete os passos 2 e 3 até que cada fold tenha sido utilizado como teste
5. Calcula o score médio a partir do score de cada execução

Podemos utilizar *cross_val_score* ou *cross_validate*. A diferença é que a função *cross_validate* permite a especificação de várias métricas de avaliação e retorna um dicionário contendo informações como *training scores*, *fit-times* e *score-times*.

In [26]:
#métricas ponderadas
scoring = ['f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy' ]

In [27]:
from sklearn.model_selection import cross_validate

svm = SGDClassifier()
#10 folds e como score, passamos as métricas definidas em scoring
scores_svm = cross_validate(svm, X, y, cv=10, scoring=scoring)

sorted(scores_svm.keys())


['fit_time',
 'score_time',
 'test_accuracy',
 'test_f1_weighted',
 'test_precision_weighted',
 'test_recall_weighted',
 'train_accuracy',
 'train_f1_weighted',
 'train_precision_weighted',
 'train_recall_weighted']

In [11]:
print('Accuracy: {:.3f}'.format(scores_svm['test_accuracy'].mean()))

print('F1_weighted: {:.3f}'.format(scores_svm['test_f1_weighted'].mean()))

print('Precision_weighted: {:.3f}'.format(scores_svm['test_precision_weighted'].mean()))

print('Recall_weighted: {:.3f}'.format(scores_svm['test_recall_weighted'].mean()))

Accuracy: 0.988
F1_weighted: 0.988
Precision_weighted: 0.988
Recall_weighted: 0.988


In [12]:
nb = MultinomialNB()

scores_nb = cross_validate(nb, X, y, cv=10, scoring=scoring)


print('Accuracy: {:.3f}'.format(scores_nb['test_accuracy'].mean()))

print('F1_weighted: {:.3f}'.format(scores_nb['test_f1_weighted'].mean()))

print('Precision_weighted: {:.3f}'.format(scores_nb['test_precision_weighted'].mean()))

print('Recall_weighted: {:.3f}'.format(scores_nb['test_recall_weighted'].mean()))

Accuracy: 0.988
F1_weighted: 0.988
Precision_weighted: 0.988
Recall_weighted: 0.988


In [13]:
random_f = RandomForestClassifier()

scores_random = cross_validate(random_f, X, y, cv=10, scoring=scoring)

print('Accuracy: {:.3f}'.format(scores_random['test_accuracy'].mean()))

print('F1_weighted: {:.3f}'.format(scores_random['test_f1_weighted'].mean()))

print('Precision_weighted: {:.3f}'.format(scores_random['test_precision_weighted'].mean()))

print('Recall_weighted: {:.3f}'.format(scores_random['test_recall_weighted'].mean()))


Accuracy: 0.918
F1_weighted: 0.918
Precision_weighted: 0.921
Recall_weighted: 0.918


In [14]:
#Sumarização dos resultados - 10-folds cross-validation

models = pd.DataFrame({
    'Modelo': ['Stochastic Gradient Decent', 'Naive Bayes', 'Random Forest'],
    'Acuracia': [scores_svm['test_accuracy'].mean(), scores_nb['test_accuracy'].mean(), scores_random['test_accuracy'].mean()]
    })
models.sort_values(by='Acuracia', ascending=False)

Unnamed: 0,Acuracia,Modelo
1,0.988242,Naive Bayes
0,0.988228,Stochastic Gradient Decent
2,0.917756,Random Forest


#### Predição de novos dados


In [15]:
#notícias de baseball- 1º, 2º, 5º, 7º
#notícias sobre veículos - 3º, 4º, 6º

#frases tiradas de sites de notícias
new_docs = [
    "Major League Baseball has released its postseason schedule for 2018 and there's immediately good news: no November baseball.",
    "If the World Series runs all seven games -- and c'mon, it's best when it does -- the final game of the season would take place on Halloween.",
    "The government is preparing a fresh policy for promotion of electric vehicles, which will be rolled out initially on a smaller scale to ensure smoother transition and better cooperation from the automobile sector, a government official said.",
    "The swift way in which the Idea-Vodafone merger is progressing under a year shows the improvement in ease of doing business in India.",
    "MLB trade deadline: Orioles' Adam Jones explains why he vetoed a deal to the Phillies",
    "The company, however, said it is currently working out the details of the quantum of price increase, which will vary depending on the model",
    "Old standbys like Luis Severino and Corey Kluber haven't quite looked like themselves of late, and they're not alone. Scott White assesses the reliability of 20 presumed mainstays."
]

# 0 = autos, 1 = baseball
y_new_docs = [1,1,0,0,1,0,1]

x_new_tfidf_vectorize = vectorizer.transform(new_docs)

De acordo com a [documentação](http://scikit-learn.org/stable/modules/cross_validation.html), a função *cross_validate* vai fazer o *split*, *fit* e vai computar o *score* K vezes. ("splitting the data, fitting a model and computing the score K consecutive times").

E, de acordo com este [link](https://stats.stackexchange.com/a/52277) e [este](https://stackoverflow.com/questions/41560177/using-cross-val-predict-against-test-data-set), cross-validation é utilizado como uma forma de validar o modelo (algoritmo e parâmetros escolhidos) e não para a construção do modelo final. Sendo assim, para testar o modelo com as novas amostras, é necessário executar o *fit* com o conjunto de dados (vamos utilizar todo o dataset) para dar prosseguimento.


In [17]:
svm.fit(X, y)
svm_predicted = svm.predict(x_new_tfidf_vectorize)

for doc, category in zip(new_docs, svm_predicted):
    print('%r => %s' % (doc, dataset_train.target_names[category]))

"Major League Baseball has released its postseason schedule for 2018 and there's immediately good news: no November baseball." => rec.sport.baseball
"If the World Series runs all seven games -- and c'mon, it's best when it does -- the final game of the season would take place on Halloween." => rec.sport.baseball
'The government is preparing a fresh policy for promotion of electric vehicles, which will be rolled out initially on a smaller scale to ensure smoother transition and better cooperation from the automobile sector, a government official said.' => rec.autos
'The swift way in which the Idea-Vodafone merger is progressing under a year shows the improvement in ease of doing business in India.' => rec.autos
"MLB trade deadline: Orioles' Adam Jones explains why he vetoed a deal to the Phillies" => rec.sport.baseball
'The company, however, said it is currently working out the details of the quantum of price increase, which will vary depending on the model' => rec.autos
"Old standbys l

In [18]:
print(metrics.classification_report(y_new_docs, svm_predicted, target_names=categories, digits=3))

                    precision    recall  f1-score   support

         rec.autos      1.000     1.000     1.000         3
rec.sport.baseball      1.000     1.000     1.000         4

       avg / total      1.000     1.000     1.000         7



In [19]:
nb.fit(X, y)

nb_predicted = nb.predict(x_new_tfidf_vectorize)

for doc, category in zip(new_docs, nb_predicted):
    print('%r => %s' % (doc, dataset_train.target_names[category]))

"Major League Baseball has released its postseason schedule for 2018 and there's immediately good news: no November baseball." => rec.sport.baseball
"If the World Series runs all seven games -- and c'mon, it's best when it does -- the final game of the season would take place on Halloween." => rec.sport.baseball
'The government is preparing a fresh policy for promotion of electric vehicles, which will be rolled out initially on a smaller scale to ensure smoother transition and better cooperation from the automobile sector, a government official said.' => rec.autos
'The swift way in which the Idea-Vodafone merger is progressing under a year shows the improvement in ease of doing business in India.' => rec.autos
"MLB trade deadline: Orioles' Adam Jones explains why he vetoed a deal to the Phillies" => rec.sport.baseball
'The company, however, said it is currently working out the details of the quantum of price increase, which will vary depending on the model' => rec.autos
"Old standbys l

In [20]:
print(metrics.classification_report(y_new_docs, nb_predicted, target_names=categories, digits=3))

                    precision    recall  f1-score   support

         rec.autos      1.000     1.000     1.000         3
rec.sport.baseball      1.000     1.000     1.000         4

       avg / total      1.000     1.000     1.000         7



O modelo utilizando o classificador Naive Bayes foi capaz de predizer corretamente todos os documentos.

In [21]:
random_f.fit(X, y)
random_predicted = random_f.predict(x_new_tfidf_vectorize)

for doc, category in zip(new_docs, random_predicted):
    print('%r => %s' % (doc, dataset_train.target_names[category]))

"Major League Baseball has released its postseason schedule for 2018 and there's immediately good news: no November baseball." => rec.autos
"If the World Series runs all seven games -- and c'mon, it's best when it does -- the final game of the season would take place on Halloween." => rec.sport.baseball
'The government is preparing a fresh policy for promotion of electric vehicles, which will be rolled out initially on a smaller scale to ensure smoother transition and better cooperation from the automobile sector, a government official said.' => rec.autos
'The swift way in which the Idea-Vodafone merger is progressing under a year shows the improvement in ease of doing business in India.' => rec.autos
"MLB trade deadline: Orioles' Adam Jones explains why he vetoed a deal to the Phillies" => rec.autos
'The company, however, said it is currently working out the details of the quantum of price increase, which will vary depending on the model' => rec.autos
"Old standbys like Luis Severino 

In [22]:
print(metrics.classification_report(y_new_docs, random_predicted, target_names=categories, digits=3))

                    precision    recall  f1-score   support

         rec.autos      0.500     1.000     0.667         3
rec.sport.baseball      1.000     0.250     0.400         4

       avg / total      0.786     0.571     0.514         7



O modelo utilizando o classificador RandomForest não se saiu muito bem, principalmente considerando a classe *baseball*, já que das 7 amostras classificou apenas 1 como *baseball*.

Para cada métrica separadamente, é interessante observar que mesmo tendo um desempenho ruim para a classe *baseball*, a Precisão da mesma foi de 100%, já que a única amostra considerada como *baseball* pertencia realmente a essa classe.

O mesmo ocorre para a classe *autos*, neste caso para a métrica Recall. Mesmo tendo classificado amostras da classe *baseball* erroneamente, todas as amostras da classe *autos* foram classificadas corretamente. 

Por isso é importante avaliar várias métricas e não focar em apenas uma.

In [23]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(y_new_docs, random_predicted)
print(confusion_matrix)

[[3 0]
 [3 1]]


true positive = 3, true negative = 1, false positive = 3 e false negative = 0

Positive = autos e Negative = baseball

Logo, na diagonal principal temos:
- *true positive* (3) representa a quantidade de amostras classificadas como *autos* que realmente pertencem a essa classe, ou seja, que foram classificados corretamente.
- *true negative* (1) representa a quantidade de amostras classificadas como *baseball* que realmente pertencem a essa classe.

E na diagonal secundária:

- *false positive* (3) representa a quantidade de amostras classificadas como *autos*, mas que na verdade são da classe *baseball* (neste caso, 3 amostras foram classificadas erroneamente).
- *false negative* (0) representa a quantidade de amostras classificadas como *baseball*, mas que na verdade são da classe *autos* (neste caso, nenhuma amostra foi classificada erroneamente como *baseball*).