In [2]:
import pandas as pd

#Dataset
from sklearn.datasets import fetch_20newsgroups

from string import punctuation

#feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

#classificadores
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


In [3]:
dataset = fetch_20newsgroups()
dataset.data[:1]

[u"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"]

In [4]:
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
#utilizar apenas duas categorias para demonstração
categories = ['rec.autos', 'rec.sport.baseball']

#shuffle serve para embaralhar os dados
dataset_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42) 

In [6]:
#nomes das categorias
dataset_train.target_names

['rec.autos', 'rec.sport.baseball']

In [7]:
#rótulo de cada categoria
dataset_train.target[:5]

array([0, 1, 1, 1, 1])

In [8]:
#cálculo TF-IDF para identificação do peso (importância) de cada termo no dataset
vectorizer = TfidfVectorizer()
x_train_tfidf_vectorize = vectorizer.fit_transform(dataset_train.data)

In [9]:
#separação do conjunto em grupos para treinamento e teste
X, Y = x_train_tfidf_vectorize, dataset_train.target

# 30% dos dados é destinado para teste
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [10]:
# quantidade de dados em cada conjunto
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(833, 18391)
(833,)
(358, 18391)
(358,)


In [11]:
'''
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html

This estimator implements regularized linear models with stochastic gradient descent (SGD) learning: the gradient 
of the loss is estimated each sample at a time and the model is updated along the way with a decreasing strength
schedule (aka learning rate).

This implementation works with data represented as dense or sparse arrays of floating point values for the 
features. The model it fits can be controlled with the loss parameter; by default, it fits a linear support
vector machine (SVM).

http://scikit-learn.org/stable/modules/sgd.html

SGD has been successfully applied to large-scale and sparse machine learning problems often encountered in text 
classification and natural language processing.

'''

svm = SGDClassifier()
svm.fit(x_train, y_train)
y_pred_svm = svm.predict(x_test)

print(metrics.classification_report(y_test, y_pred_svm, target_names=categories, digits=3))



                    precision    recall  f1-score   support

         rec.autos      0.983     0.988     0.985       171
rec.sport.baseball      0.989     0.984     0.987       187

       avg / total      0.986     0.986     0.986       358



In [12]:
#resultado do treinamento - acurácia média
acc_svm = round(svm.score(x_train, y_train), 3)
acc_svm

1.0

In [13]:
'''
Naive Bayes methods are a set of supervised learning algorithms based on applying Bayes' theorem with the "naive"
assumption of independence between every pai of features.

Multinomial NB implements the NB algorithm for multinomially distributed data, and is one of the two classic naive 
bayes variants used in text classification (where the data are typically represented as word vector counts, 
although tf-idf vectors are also known to work well in practice)

'''

nb = MultinomialNB()

nb.fit(x_train, y_train)
y_pred_nb = nb.predict(x_test)
print(metrics.classification_report(y_test, y_pred_nb, target_names=categories, digits=3))

                    precision    recall  f1-score   support

         rec.autos      0.977     0.988     0.983       171
rec.sport.baseball      0.989     0.979     0.984       187

       avg / total      0.983     0.983     0.983       358



In [14]:
#resultado treinamento
acc_nb = round(nb.score(x_train, y_train), 3)
acc_nb

0.998

In [15]:
'''
A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the
datasat and use averaging to improve the predictive accuracy and control over-fitting.

'''


random_f = RandomForestClassifier()
random_f.fit(x_train, y_train)
y_pred_random_f = random_f.predict(x_test)

print(metrics.classification_report(y_test, y_pred_random_f, target_names=categories, digits=3))


                    precision    recall  f1-score   support

         rec.autos      0.842     0.936     0.886       171
rec.sport.baseball      0.935     0.840     0.885       187

       avg / total      0.890     0.885     0.885       358



In [16]:
#resultado treinamento
acc_random_f = round(random_f.score(x_train, y_train), 3)
acc_random_f

0.996

In [17]:
#Sumarização dos resultados

models = pd.DataFrame({
    'Modelo': ['Stochastic Gradient Decent', 'Naive Bayes', 'Random Forest'],
    'Acuracia': [acc_svm, acc_nb, acc_random_f]
    })
models.sort_values(by='Acuracia', ascending=False)

Unnamed: 0,Acuracia,Modelo
0,1.0,Stochastic Gradient Decent
1,0.998,Naive Bayes
2,0.996,Random Forest


Na média, os resultados para as métricas Precision, Recall e F1-Score foram os melhores para o classificador 
SGDClassifier


#### Predição de novos dados


In [43]:
#notícias de baseball- 1º, 2º, 5º, 7º
#notícias sobre veículos - 3º, 4º, 6º

#frases tiradas de sites de notícias
new_docs = [
    "Major League Baseball has released its postseason schedule for 2018 and there's immediately good news: no November baseball.",
    "If the World Series runs all seven games -- and c'mon, it's best when it does -- the final game of the season would take place on Halloween.",
    "The government is preparing a fresh policy for promotion of electric vehicles, which will be rolled out initially on a smaller scale to ensure smoother transition and better cooperation from the automobile sector, a government official said.",
    "The swift way in which the Idea-Vodafone merger is progressing under a year shows the improvement in ease of doing business in India.",
    "MLB trade deadline: Orioles' Adam Jones explains why he vetoed a deal to the Phillies",
    "The company, however, said it is currently working out the details of the quantum of price increase, which will vary depending on the model",
    "Old standbys like Luis Severino and Corey Kluber haven't quite looked like themselves of late, and they're not alone. Scott White assesses the reliability of 20 presumed mainstays."
]

# 0 = autos, 1 = baseball
y_new_docs = [1,1,0,0,1,0,1]

x_new_tfidf_vectorize = vectorizer.transform(new_docs)

In [33]:
svm_predicted = svm.predict(x_new_tfidf_vectorize)

for doc, category in zip(new_docs, svm_predicted):
    print('%r => %s' % (doc, dataset_train.target_names[category]))

"Major League Baseball has released its postseason schedule for 2018 and there's immediately good news: no November baseball." => rec.sport.baseball
"If the World Series runs all seven games -- and c'mon, it's best when it does -- the final game of the season would take place on Halloween." => rec.sport.baseball
'The government is preparing a fresh policy for promotion of electric vehicles, which will be rolled out initially on a smaller scale to ensure smoother transition and better cooperation from the automobile sector, a government official said.' => rec.autos
'The swift way in which the Idea-Vodafone merger is progressing under a year shows the improvement in ease of doing business in India.' => rec.sport.baseball
"MLB trade deadline: Orioles' Adam Jones explains why he vetoed a deal to the Phillies" => rec.sport.baseball
'The company, however, said it is currently working out the details of the quantum of price increase, which will vary depending on the model' => rec.autos
"Old s

In [34]:
print(metrics.classification_report(y_new_docs, svm_predicted, target_names=categories, digits=3))

                    precision    recall  f1-score   support

         rec.autos      1.000     0.667     0.800         3
rec.sport.baseball      0.800     1.000     0.889         4

       avg / total      0.886     0.857     0.851         7



Utilizando o classificador SGDClassifier, o modelo errou apenas uma das entradas: 

The swift way in which the Idea-Vodafone merger is progressing under a year shows the improvement in ease of doing business in India.' => rec.sport.baseball

O *recall* para classe *baseball* foi de 100%, pois o classificador conseguiu encontrar todos os documentos referentes a essa classe. E *precision* foi de 80%, já que um dos docs considerado como baseball, na realidade pertencia a outra classe.

In [39]:
nb_predicted = nb.predict(x_new_tfidf_vectorize)

for doc, category in zip(new_docs, nb_predicted):
    print('%r => %s' % (doc, dataset_train.target_names[category]))

"Major League Baseball has released its postseason schedule for 2018 and there's immediately good news: no November baseball." => rec.sport.baseball
"If the World Series runs all seven games -- and c'mon, it's best when it does -- the final game of the season would take place on Halloween." => rec.sport.baseball
'The government is preparing a fresh policy for promotion of electric vehicles, which will be rolled out initially on a smaller scale to ensure smoother transition and better cooperation from the automobile sector, a government official said.' => rec.autos
'The swift way in which the Idea-Vodafone merger is progressing under a year shows the improvement in ease of doing business in India.' => rec.autos
"MLB trade deadline: Orioles' Adam Jones explains why he vetoed a deal to the Phillies" => rec.sport.baseball
'The company, however, said it is currently working out the details of the quantum of price increase, which will vary depending on the model' => rec.autos
"Old standbys l

In [40]:
print(metrics.classification_report(y_new_docs, nb_predicted, target_names=categories, digits=3))

                    precision    recall  f1-score   support

         rec.autos      1.000     1.000     1.000         3
rec.sport.baseball      1.000     1.000     1.000         4

       avg / total      1.000     1.000     1.000         7



O modelo utilizando o classificador Naive Bayes foi capaz de predizer corretamente todos os documentos.

In [41]:
random_predicted = random_f.predict(x_new_tfidf_vectorize)

for doc, category in zip(new_docs, random_predicted):
    print('%r => %s' % (doc, dataset_train.target_names[category]))

"Major League Baseball has released its postseason schedule for 2018 and there's immediately good news: no November baseball." => rec.sport.baseball
"If the World Series runs all seven games -- and c'mon, it's best when it does -- the final game of the season would take place on Halloween." => rec.autos
'The government is preparing a fresh policy for promotion of electric vehicles, which will be rolled out initially on a smaller scale to ensure smoother transition and better cooperation from the automobile sector, a government official said.' => rec.sport.baseball
'The swift way in which the Idea-Vodafone merger is progressing under a year shows the improvement in ease of doing business in India.' => rec.sport.baseball
"MLB trade deadline: Orioles' Adam Jones explains why he vetoed a deal to the Phillies" => rec.sport.baseball
'The company, however, said it is currently working out the details of the quantum of price increase, which will vary depending on the model' => rec.autos
"Old s

In [42]:
print(metrics.classification_report(y_new_docs, random_predicted, target_names=categories, digits=3))

                    precision    recall  f1-score   support

         rec.autos      0.500     0.333     0.400         3
rec.sport.baseball      0.600     0.750     0.667         4

       avg / total      0.557     0.571     0.552         7



O modelo utilizando o classificador RandomForest não se saiu muito bem, principalmente considerando a classe *autos*.

"If the World Series runs all seven games -- and c'mon, it's best when it does -- the final game of the season would take place on Halloween." => rec.autos

'The government is preparing a fresh policy for promotion of electric vehicles, which will be rolled out initially on a smaller scale to ensure smoother transition and better cooperation from the automobile sector, a government official said.' => rec.sport.baseball

'The swift way in which the Idea-Vodafone merger is progressing under a year shows the improvement in ease of doing business in India.' => rec.sport.baseball