## Train Classifiers

In [10]:
import pandas as pd
import re
import string
import nltk
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

## Modelagem
Nesta etapa diversos algoritmos de ML serão utilizados visando encontrar os melhores modelos para classificação de mensagens SPAM ou não SPAM.

Serão aplicados os seguintes modelos: 

            LogisticRegression;
            MultinomialNB;
            KNeighborsClassifier;
            SVC;
            DecisionTreeClassifier;
            RandomForestClassifier;
            GradientBoostingClassifier;
            AdaBoostClassifier.

In [11]:
path = Path() / '..'/ 'data' / 'dataframe_to_modeling' / 'spam_ham.pickle'
spam_ham = pd.read_pickle(path)

In [12]:
spam_ham.head()

Unnamed: 0,ID,Message,SPAM,SPAM_int,text_norm,text_tokenized,text_lemmatized,Message_len,punct%
0,615317,Linda msg!,True,1,linda msg,"[linda, msg]","[linda, msg]",9,11.1
1,855344,To fazendo batida de kwui com coco,True,1,fazendo batida kwui coco,"[fazendo, batida, kwui, coco]","[fazendo, batida, kwui, coco]",28,0.0
2,1007222,Esse dinheiro só vai sair para quem está com o...,True,1,esse dinheiro sair nome lista esse dinheiro sa...,"[esse, dinheiro, sair, nome, lista, esse, dinh...","[esse, dinheiro, sair, nome, lista, esse, dinh...",113,7.1
3,811063,Isso msm Irmã.. nao lembrava dos nomes kkk,True,1,isso msm irma nao lembrava nomes kkk,"[isso, msm, irma, nao, lembrava, nomes, kkk]","[isso, msm, irma, nao, lembrava, nome, kkk]",35,5.7
4,913944,Ai xara eu sujiro esse,True,1,xara sujiro,"[xara, sujiro]","[xara, sujiro]",18,0.0


In [13]:
spam_ham.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1773 entries, 0 to 1772
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               1773 non-null   int64  
 1   Message          1773 non-null   object 
 2   SPAM             1773 non-null   bool   
 3   SPAM_int         1773 non-null   int64  
 4   text_norm        1773 non-null   object 
 5   text_tokenized   1773 non-null   object 
 6   text_lemmatized  1773 non-null   object 
 7   Message_len      1773 non-null   int64  
 8   punct%           1773 non-null   float64
dtypes: bool(1), float64(1), int64(3), object(4)
memory usage: 126.4+ KB


In [14]:
spam_ham.isnull().sum()

ID                 0
Message            0
SPAM               0
SPAM_int           0
text_norm          0
text_tokenized     0
text_lemmatized    0
Message_len        0
punct%             0
dtype: int64

## Vetorização
É importante a criação de um CountVectorizer para contar o número de palavras (frequência do termo), 
limitar o tamanho do seu vocabulário e, aplicar palavras de parada, por exemplo.

### Bag of words
A criação da 'Bag of words' se dá na necessidade de converter cada uma dessas mensagens (lemas) em um vetor que os modelos de algoritmos do SciKit Learn possam trabalhar.

### Passos:

O CountVectorizer converterá uma coleção de documentos de texto em uma matriz de contagens de token em matriz 2D e contará quantas vezes uma palavra ocorre em cada mensagem (conhecida como frequência de termo);

Normalizar os vetores para comprimento unitário, para abstrair do comprimento do texto original (norma L2) por TF-IDF, usando o TfidfTransformer do scikit-learn.

In [15]:
cv = CountVectorizer()
word_count_vector = cv.fit_transform(spam_ham['text_norm'])

In [16]:
#aqui as colunas reprensentam palavras únicas (9411)
word_count_vector.shape

(1773, 9317)

## Aplicando TFIDF 
Codificação de texto como inteiro em forma numérica para criar vetores de recursos para que possamos aplicar algoritmos de ML nele.

In [17]:
feature_extraction = TfidfVectorizer(min_df=1, lowercase = 'True')
# min_df = se uma palavra não repetir, ou repetir apenas uma vez, ela não é importante para análises predivitas.

In [None]:
X_tfidf = feature_extraction.fit_transform(spam_ham['text_norm'])
print(X_tfidf.shape)
print(feature_extraction.get_feature_names())

## Aplicando os modelos de Machine Learning

Nesta etapa diversos modelos de ML serão utilizados visando encontrar os melhores modelos para classificação de mensagens SPAM ou não SPAM.

Serão aplicados os seguintes modelos: 

            -LogisticRegression;
            -MultinomialNB;
            -KNeighborsClassifier;
            -SVC;
            -DecisionTreeClassifier;
            -RandomForestClassifier;
            -GradientBoostingClassifier;
            -AdaBoostClassifier.

Separando o Dataset em treino e teste

In [64]:
X=spam_ham[['Message', 'Message_len', 'punct%']]
y=spam_ham['SPAM']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42, stratify=y)

In [65]:
feature_extraction_fit = feature_extraction.fit(X_train['Message'])

tfidf_train = feature_extraction_fit.transform(X_train['Message'])
tfidf_test = feature_extraction_fit.transform(X_test['Message'])

X_train_vect = pd.concat([X_train[['Message_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['Message_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()

Unnamed: 0,Message_len,punct%,0,1,2,3,4,5,6,7,...,7743,7744,7745,7746,7747,7748,7749,7750,7751,7752
0,100,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,107,9.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,294,3.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.159512,0.0,0.0,0.0


Testando os modelos selecionados e suas acurácias

In [22]:
lr = LogisticRegression()
nb = MultinomialNB()
knc = KNeighborsClassifier()
svc = SVC(gamma = 'auto')
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(n_estimators=100)
gbc = GradientBoostingClassifier()
abc = AdaBoostClassifier()



models = {'Logistic Regression':lr, 'Naive Bayes classifier':nb, 'k-nearest neighbors':knc, 
          'Support Vector Machine':svc, 'Decision Tree Classifier':dtc, 
          'Random Forest Classifier':rfc, 'Gradient Boosting Classifier':gbc, 'AdaBoost Classifier':abc}

In [66]:
def eval_model(model):
    
    model.fit(tfidf_train, y_train)
    y_pred = model.predict(tfidf_test)    
    test_accuracy = accuracy_score(y_test, y_pred)   
    return test_accuracy

In [25]:
test_accuracies = []

for name, model in models.items():
    test_acc = eval_model(model) 
    test_accuracies.append(test_acc)
    print(f'{name} ---> Test accuracy - {test_acc*100:.2f}%')

Logistic Regression ---> Test accuracy - 82.71%
Naive Bayes classifier ---> Test accuracy - 84.77%
k-nearest neighbors ---> Test accuracy - 80.64%
Support Vector Machine ---> Test accuracy - 52.63%
Decision Tree Classifier ---> Test accuracy - 73.12%
Random Forest Classifier ---> Test accuracy - 80.45%
Gradient Boosting Classifier ---> Test accuracy - 77.82%
AdaBoost Classifier ---> Test accuracy - 79.70%


## Melhorando hiperparâmetros com Grid Search


Os hiperparâmetros controlam diretamente o comportamento do algoritmo de treinamento e têm um impacto significativo no desempenho do modelo que está sendo treinado. É uma técnica muito tradicional para implementar hiperparâmetros. Por meio de força bruta testa todas as combinações.

Visando aumentar o desempenho dos algoritmos, o Grid Search foi aplicado nos modelos com a acurácia mais elevada: Logistic Regression, Random Forest, Adaboost e Naive Bayes.

Grid search para Random Forest

In [53]:
param_grid = { 
    'n_estimators': [200, 400, 600, 800, 1200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4, 8, 16, 32, 64],
    'criterion' :['gini', 'entropy']
}

In [54]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_train, y_train)

In [55]:
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)
print(CV_rfc.best_score_)

{'criterion': 'entropy', 'max_depth': 64, 'max_features': 'auto', 'n_estimators': 1200}
RandomForestClassifier(criterion='entropy', max_depth=64, max_features='auto',
                       n_estimators=1200)
0.8356198989506414


GridSearch para Logistic Regression

In [56]:
param_grid = {'C':[0.001,0.01,0.1,1,10, 100], 
                    'penalty':['none', 'l1', 'l2', 'elasticnet'],
                    'solver' : ['newton-cg', 'lbfgs', 'liblinear']                   
                  }

In [57]:
CV_lr = GridSearchCV(estimator=lr, param_grid=param_grid, \
scoring='accuracy', cv=5, n_jobs=-1)

In [58]:
CV_lr.fit(tfidf_train, y_train)

In [59]:
print(CV_lr.best_params_)
print(CV_lr.best_estimator_)
print(CV_lr.best_score_)

{'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
LogisticRegression(C=10, solver='newton-cg')
0.853322969296541


Grid Search para Adaboost

In [8]:
param_grid = {'n_estimators':[10, 50, 100, 500, 1000], 
                    'learning_rate':[0.0001, 0.001, 0.01, 0.1, 1.0],
                    'algorithm' : ['SAMME', 'SAMME.R']                  
                  }

In [9]:
CV_abc = GridSearchCV(estimator=abc, param_grid=param_grid, n_jobs=-1, cv=5, scoring='accuracy')

NameError: name 'abc' is not defined

In [62]:
grid_result = CV_abc.fit(tfidf_train, y_train)

In [63]:
print(CV_abc.best_params_)
print(CV_abc.best_estimator_)
print(CV_abc.best_score_)

{'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 1000}
AdaBoostClassifier(learning_rate=0.1, n_estimators=1000)
0.8194843891695817


GridSearch Naive Bayes

In [77]:
param_grid = {'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]}

In [78]:
  CV_nb = GridSearchCV(estimator=nb, param_grid=param_grid, n_jobs=-1, cv=5, scoring='accuracy') 

In [79]:
grid_result = CV_nb.fit(tfidf_train, y_train)

In [67]:
print(CV_nb.best_params_)
print(CV_nb.best_estimator_)
print(CV_nb.best_score_)

{'alpha': 0.1}
MultinomialNB(alpha=0.1)
0.8549391112838451


O classificador Naive Bayes possui um conjunto de parâmetros muito limitado. Dependendo da implementação, às vezes o número de classes é o único parâmetro, sobre o qual, na prática, não temos controle. Portanto, o ajuste de hiperparâmetros é bastante limitado para melhorar a precisão do classificador Naive Bayes. 

# Teste da eficiência do algoritmo Naive Bayes no dataset de teste

Até aqui, verificamos que a Regressão Logística e Naive Bayes foram os modelos que mostraram melhor acurácia para o problema. A seguir, finalizaremos com o modelo Naive Bayes, treinando-o em todo o conjunto de dados de treinamento (sem validação cruzada) e faremos predições para o conjunto de dados de teste que foi separado anteriormente, a fim de confirmarmos nossas descobertas.

In [41]:
path = Path() / '..'/ 'data' / 'dataframe_to_modeling' / 'teste.pickle'
teste = pd.read_pickle(path)

In [42]:
teste.head()

Unnamed: 0,ID,Message,text_norm,text_tokenized,text_lemmatized,Message_len,punct%
0,405268,Bom dia!!! Divulga ae!!!🎼🎵🎶🎧📢\nVamos!!!🍻,bom dia divulga ae vamos,"[bom, dia, divulga, ae, vamos]","[bom, dia, divulga, ae, vamos]",36,25.0
1,810119,to bem\nfznd o que de bom?\ntbm nada,bem fznd bom tbm nada,"[bem, fznd, bom, tbm, nada]","[bem, fznd, bom, tbm, nada]",28,3.6
2,439979,E ai cambada\nTrankilo?\nTrankilo cmo eskilo\n...,cambada trankilo trankilo cmo eskilo loka tran...,"[cambada, trankilo, trankilo, cmo, eskilo, lok...","[cambada, trankilo, trankilo, cmo, eskilo, lok...",190,1.1
3,928898,Ouçam a conversa gravada no carro do Cristiano...,oucam conversa gravada carro cristiano araujo ...,"[oucam, conversa, gravada, carro, cristiano, a...","[oucam, conversa, gravada, carro, cristiano, a...",181,1.7
4,500412,"Nem, mete o piru no Pirra! Ele quer te dar",nem mete piru pirra ele quer dar,"[nem, mete, piru, pirra, ele, quer, dar]","[nem, mete, piru, pirra, ele, quer, dar]",33,6.1


# Tentativa de classificar os dados de Teste

O modelo já foi preparado anteriormente:

     - model = MultinomialNB(alpha=0.1)
     - grid_result = CV_nb.fit(tfidf_train, y_train)
     
     
     Aplicaremos: yhat = model.predict(X_test)

In [85]:
cv = CountVectorizer()
word_count_vector = cv.fit_transform(teste['text_norm'])

In [86]:
word_count_vector.shape

(500, 3785)

In [88]:
model = MultinomialNB(alpha=0.1)

In [89]:
model.fit(tfidf_train, y_train)

In [91]:
feature_extraction_fit = feature_extraction.fit(teste['Message'])
tfidf_x_teste = feature_extraction_fit.transform(teste['Message'])

In [None]:
yhat = model.predict(tfidf_x_teste)

In [None]:
X_test_vect_nb = pd.concat([teste[['text_norm']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_test_vect_nb.head()

Vetorização

In [75]:
#feature_extraction_fit = feature_extraction.fit(X_train['Message'])

#tfidf_train = feature_extraction_fit.transform(X_train['Message'])
#tfidf_test = feature_extraction_fit.transform(X_test['Message'])

#X_train_vect = pd.concat([X_train[['Message_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
#X_test_vect = pd.concat([X_test[['Message_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

#X_train_vect.head()

#X_teste_tfidf = feature_extraction.fit_transform(teste['text_norm'])
#print(X_teste_tfidf.shape)
#print(feature_extraction.get_feature_names())

Separando novamente em treino e teste

In [60]:
#X_train = feature_extraction.fit_transform(spam_ham['text_norm'])
#Y_train = spam_ham['SPAM']

In [61]:
#model = MultinomialNB(alpha=0.1).fit(X_train, y_train)
#grid_result = CV_nb.fit(tfidf_train, y_train)

Aplicando TFIDF

In [None]:
X_teste_tfidf = feature_extraction.fit_transform(teste['text_norm'])
print(X_teste_tfidf.shape)
print(feature_extraction.get_feature_names())

Estimativa da acurácia no conjunto de testes