# Tidene Códigos - Classificadores

### Recebe os dados

In [None]:
import numpy as np
import pandas as pd
import nltk
import sklearn
import gensim
import pickle

In [None]:
# copiei aqui as classes definidas quando foram criados os vetorizadores... elas poderiam ser importadas do notebook no qual foram definidas
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import *    #https://www.nltk.org/api/nltk.tokenize.html

class LemmaTokenizer(object):
    """Callable that tokenizes a document and lemmatizes each kept token.

    The attribute names ``wnl`` and ``tokenizer`` are left untouched on
    purpose: presumably instances of this class were pickled together with
    the vectorizer, and unpickling restores attributes by name -- verify
    against the notebook that created the pickle.
    """

    def __init__(self):
        # Tokens are runs of ASCII letters and apostrophes.
        self.tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z']+")
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc`` that have more than two characters."""
        lemmas = []
        for token in self.tokenizer.tokenize(doc):
            if len(token) <= 2:
                # Very short tokens are dropped before lemmatization.
                continue
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

class StemTokenizer(object):
    """Callable that tokenizes a document and stems each kept token.

    The attribute names ``stemmer`` and ``tokenizer`` are left untouched on
    purpose: presumably instances of this class were pickled together with
    the vectorizer, and unpickling restores attributes by name -- verify
    against the notebook that created the pickle.
    """

    def __init__(self):
        # Tokens are runs of ASCII letters and apostrophes.
        self.tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z']+")
        self.stemmer = PorterStemmer()

    def __call__(self, doc):
        """Return the stemmed tokens of ``doc`` that have more than two characters."""
        stems = []
        for token in self.tokenizer.tokenize(doc):
            if len(token) <= 2:
                # Very short tokens are dropped before stemming.
                continue
            stems.append(self.stemmer.stem(token))
        return stems


In [None]:
# Load the fitted vectorizer and the training feature matrix from disk.
# Context managers close both file handles as soon as the objects are read
# (the bare open(...) calls used before left them open).
# NOTE: pickle.load can execute arbitrary code -- only load pickles that were
# produced by a trusted source.
with open("tfidf_vectorizer.pickle", "rb") as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

with open("X_train_tfidf.pickle", "rb") as features_file:
    X_train_tfidf = pickle.load(features_file)

X_train_tfidf.shape   # dimensions of the training feature matrix

#### Lê os textos e as classes de treinamento e de teste a partir do arquivo de treinamento

In [None]:
# The label column used as the target class can be changed here.
from sklearn.model_selection import train_test_split
import pandas as pd

corpus = pd.read_csv('data/train_min.csv', encoding='utf8')

# Seed the 70/30 split so the split -- and every metric computed from it --
# is reproducible across kernel restarts.
# NOTE(review): X_train_tfidf comes from a pickle built elsewhere; its rows
# only line up with train_classes if that pickle was produced from this very
# split (same data, train_size and random_state) -- TODO confirm.
corpus_train, corpus_test = train_test_split(corpus, train_size=0.7, random_state=42)

train_classes = corpus_train['sentiment'].values.tolist()

test_docs = corpus_test['review'].values.tolist()
test_classes = corpus_test['sentiment'].values.tolist()

# Fail early with a clear message when the pickled feature matrix cannot
# belong to this split. A matching row count is necessary but does not prove
# the rows are in the same order.
if X_train_tfidf.shape[0] != len(train_classes):
    raise ValueError(
        "X_train_tfidf has %d rows but the training split has %d labels; "
        "the pickled features were not built from this split."
        % (X_train_tfidf.shape[0], len(train_classes))
    )

# Represent the test documents with the already-fitted vectorizer.
X_test_tfidf = tfidf_vectorizer.transform(test_docs)


In [None]:
# Show the dimensions of the matrix obtained by transforming the test documents.
X_test_tfidf.shape

### Classificador Bayesiano

In [None]:
# Follows the scikit-learn text tutorial:
# http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
from sklearn.naive_bayes import MultinomialNB

# Create the estimator, then train it on the training features and labels;
# fit() returns the estimator itself, which is what clf_NB is bound to.
nb_estimator = MultinomialNB()
clf_NB = nb_estimator.fit(X_train_tfidf, train_classes)

##### Predizendo classes para o texto novo

In [None]:
# Predict a label for every test document and report the fraction of
# predictions that equal the true label.
predicted_NB = clf_NB.predict(X_test_tfidf)
accuracy_NB = np.mean(predicted_NB == test_classes)
print("Acurácia: ", accuracy_NB)

print('classe real => classe predita')

# One (true label, predicted label) tuple per line.
for label_pair in zip(test_classes, predicted_NB):
    print(label_pair)


#### Avaliação

In [None]:
from sklearn import metrics

# Text report of the classification metrics for the Naive Bayes predictions,
# with the two classes displayed as "negative" and "positive".
report_NB = metrics.classification_report(
    test_classes,
    predicted_NB,
    target_names=["negative", "positive"],
)
print(report_NB)


In [None]:
# Confusion matrix for the Naive Bayes predictions.
confusion_NB = metrics.confusion_matrix(test_classes, predicted_NB)
print(confusion_NB)

### Classificador SGDC

In [None]:
# Follows the scikit-learn text tutorial:
# http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
from sklearn.linear_model import SGDClassifier

# Hinge loss with an L2 penalty; a fixed seed and exactly five passes
# (tol=None disables the tolerance-based stopping criterion).
clf_SGDC = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)

# fit() returns the estimator, so training and prediction can be chained.
predicted_SGDC = clf_SGDC.fit(X_train_tfidf, train_classes).predict(X_test_tfidf)

accuracy_SGDC = np.mean(predicted_SGDC == test_classes)
print("Acurácia: ", accuracy_SGDC)

print('classe real => classe predita')

# One (true label, predicted label) tuple per line.
for label_pair in zip(test_classes, predicted_SGDC):
    print(label_pair)
    
    


#### Métricas de avaliação

In [None]:
from sklearn import metrics

# Text report of the classification metrics for the SGD predictions,
# with the two classes displayed as "negative" and "positive".
report_SGDC = metrics.classification_report(
    test_classes,
    predicted_SGDC,
    target_names=["negative", "positive"],
)
print(report_SGDC)
      

In [None]:
# Header first, then the confusion matrix for the SGD predictions.
print("Matriz de confusão")
confusion_SGDC = metrics.confusion_matrix(test_classes, predicted_SGDC)
print(confusion_SGDC)

### Classificador SVM

In [None]:
# Follows the scikit-learn text tutorial:
# http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
from sklearn.svm import LinearSVC

# Linear SVC with a fixed seed.
clf_SVC = LinearSVC(random_state=0)

# fit() returns the estimator, so training and prediction can be chained.
predicted_SVC = clf_SVC.fit(X_train_tfidf, train_classes).predict(X_test_tfidf)

accuracy_SVC = np.mean(predicted_SVC == test_classes)
print("Acurácia: ", accuracy_SVC)


print('classe real => classe predita')

# One (true label, predicted label) tuple per line.
for label_pair in zip(test_classes, predicted_SVC):
    print(label_pair)



#### Avaliação

In [None]:
from sklearn import metrics

# Text report of the classification metrics for the LinearSVC predictions.
# target_names matches the Naive Bayes and SGD reports so the three reports
# label their rows identically and can be compared side by side.
print(metrics.classification_report(test_classes, predicted_SVC, target_names=["negative", "positive"]))

#### Matriz de Confusão

In [None]:
# Header first, then the confusion matrix for the LinearSVC predictions.
print("Matriz de confusão")
confusion_SVC = metrics.confusion_matrix(test_classes, predicted_SVC)
print(confusion_SVC)