# Tidene Códigos - Classificadores

### Recebe os dados

In [1]:
import numpy as np
import pandas as pd
import nltk
import sklearn
import gensim
import pickle

In [2]:
# copiei aqui as classes definidas quando foram criados os vetorizadores... ela poderia ser importada do notebook no qual foi definida
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import *    #https://www.nltk.org/api/nltk.tokenize.html

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z']+")
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in self.tokenizer.tokenize(doc) if (len(t)>2)]

class StemTokenizer(object):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z']+")
    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in self.tokenizer.tokenize(doc) if (len(t)>2)]


In [3]:
# carrega a matriz de features e o vetorizador e a matriz de features 
tfidf_vectorizer = pickle.load(open("tfidf_vectorizer.pickle", "rb"))
X_train_tfidf = pickle.load(open("X_train_tfidf.pickle", "rb"))

X_train_tfidf.shape   # matriz de features

(17, 943)

#### Lê os textos e as classes de treinamento - e também as classes do arquivo de treinamento

In [4]:
# AQUI PODE-SE MUDAR QUAL A CLASSE QUE VAI SER CONSIDERADA
train_classes = pd.read_csv('toy.csv',encoding='utf8')['subgroup'].values.tolist()

test_docs = pd.read_csv('testtoy.csv',encoding='utf8')['data'].values.tolist()
test_classes = pd.read_csv('testtoy.csv',encoding='utf8')['subgroup'].values.tolist()
others_test_classes = pd.read_csv('testtoy.csv',encoding='utf8')['othersipcs'].values.tolist()


X_test_tfidf = tfidf_vectorizer.transform(test_docs) #representa os documentos com o padrao treinado


In [5]:
X_test_tfidf.shape

(8, 943)

### Classificador Bayesiano

In [6]:
#http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
from sklearn.naive_bayes import MultinomialNB
clf_NB = MultinomialNB().fit(X_train_tfidf, train_classes)

##### Predizendo classes para o texto novo

In [7]:
predicted_NB = clf_NB.predict(X_test_tfidf)
print("Acurácia: ", np.mean(predicted_NB == test_classes))           

print('classe real => classe predita ===++=== poderiam ser estas tambem ')

for real, pred, others in zip(test_classes, predicted_NB, others_test_classes):
    print('%s   => %s      ===++=== %s' % (real,pred,others))


Acurácia:  0.125
classe real => classe predita ===++=== poderiam ser estas tambem 
B03B00562   => B03B00562      ===++=== D21C00502
B03B00510   => B03B00562      ===++=== B03B00524
B03B00700   => B03B00562      ===++=== C09K00700
B03B00104   => B03B00562      ===++=== B03D00306-C08J00303-C04B03304-C09C00142
H03F00100   => B03B00562      ===++===  
H03F00102   => B03B00562      ===++=== H03G00320
H03M00514   => B03B00562      ===++=== G11B02014
H03M00300   => B03B00562      ===++===  


#### Avaliação

In [8]:
from sklearn import metrics
print(metrics.classification_report(test_classes, predicted_NB,target_names=set(test_classes)))


             precision    recall  f1-score   support

  B03B00104       0.00      0.00      0.00         1
  H03M00300       0.00      0.00      0.00         1
  H03F00102       0.12      1.00      0.22         1
  H03F00100       0.00      0.00      0.00         1
  B03B00510       0.00      0.00      0.00         1
  B03B00562       0.00      0.00      0.00         1
  H03M00514       0.00      0.00      0.00         1
  B03B00700       0.00      0.00      0.00         1

avg / total       0.02      0.12      0.03         8



  'precision', 'predicted', average, warn_for)


In [9]:
# Matriz de Confusao
print(metrics.confusion_matrix(test_classes, predicted_NB))

[[0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0]]


### Classificadore SGDC

In [10]:
#http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
from sklearn.linear_model import SGDClassifier

clf_SGDC = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=None)
clf_SGDC.fit(X_train_tfidf, train_classes)
predicted_SGDC = clf_SGDC.predict(X_test_tfidf)

print("Acurácia: ", np.mean(predicted_SGDC == test_classes))           

print('classe real => classe predita ===++=== poderiam ser estas tambem ')

for real, pred, others in zip(test_classes, predicted_SGDC, others_test_classes):
    print('%s   => %s      ===++=== %s' % (real,pred,others))


Acurácia:  0.125
classe real => classe predita ===++=== poderiam ser estas tambem 
B03B00562   => B03B00512      ===++=== D21C00502
B03B00510   => B03B00562      ===++=== B03B00524
B03B00700   => H03M00730      ===++=== C09K00700
B03B00104   => B03B00562      ===++=== B03D00306-C08J00303-C04B03304-C09C00142
H03F00100   => H03F00102      ===++===  
H03F00102   => H03F00102      ===++=== H03G00320
H03M00514   => H03M00740      ===++=== G11B02014
H03M00300   => B03B00562      ===++===  


#### Métricas de avaliação

In [11]:
from sklearn import metrics
print(metrics.classification_report(test_classes, predicted_SGDC,target_names=set(test_classes)))
      

             precision    recall  f1-score   support

  B03B00104       0.00      0.00      0.00         1
  H03M00300       0.00      0.00      0.00         1
  H03F00102       0.00      0.00      0.00         0
  H03F00100       0.00      0.00      0.00         1
  B03B00510       0.00      0.00      0.00         1
  B03B00562       0.00      0.00      0.00         1
  H03M00514       0.50      1.00      0.67         1
  B03B00700       0.00      0.00      0.00         1

avg / total       0.06      0.12      0.08         8



  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [12]:
print("Matriz de confusão")
print(metrics.confusion_matrix(test_classes, predicted_SGDC))

Matriz de confusão
[[0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0]]


### Classificador SVM

In [13]:
#http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
from sklearn.svm import LinearSVC

clf_SVC = LinearSVC(random_state=0)
clf_SVC.fit(X_train_tfidf, train_classes)
predicted_SVC = clf_SVC.predict(X_test_tfidf)

print("Acurácia: ", np.mean(predicted_SVC == test_classes))           


print('classe real => classe predita ===++=== poderiam ser estas tambem ')

for real, pred, others in zip(test_classes, predicted_SVC, others_test_classes):
    print('%s   => %s      ===++=== %s' % (real,pred,others))



Acurácia:  0.125
classe real => classe predita ===++=== poderiam ser estas tambem 
B03B00562   => B03B00562      ===++=== D21C00502
B03B00510   => B03B00562      ===++=== B03B00524
B03B00700   => B03B00562      ===++=== C09K00700
B03B00104   => B03B00562      ===++=== B03D00306-C08J00303-C04B03304-C09C00142
H03F00100   => H03F00126      ===++===  
H03F00102   => H03F00130      ===++=== H03G00320
H03M00514   => H03M00740      ===++=== G11B02014
H03M00300   => H03M00730      ===++===  


#### Avaliação

In [14]:
from sklearn import metrics
print(metrics.classification_report(test_classes, predicted_SVC,target_names=set(test_classes)))

             precision    recall  f1-score   support

  B03B00104       0.00      0.00      0.00         1
  H03M00300       0.00      0.00      0.00         1
  H03F00102       0.25      1.00      0.40         1
  H03F00100       0.00      0.00      0.00         1
  B03B00510       0.00      0.00      0.00         1
  B03B00562       0.00      0.00      0.00         1
  H03M00514       0.00      0.00      0.00         0
  B03B00700       0.00      0.00      0.00         0

avg / total       0.03      0.12      0.05         8



  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


#### Matriz de Confusão

In [15]:
print("Matriz de confusão")
print(metrics.confusion_matrix(test_classes, predicted_SVC))

Matriz de confusão
[[0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]]
