## Train Classifiers

In [8]:
import pandas as pd
import re
import string
import nltk
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

## Modelagem
Nesta etapa diversos modelos de ML serão utilizados visando encontrar os melhores modelos para classificação de mensagens SPAM ou não SPAM.

Serão aplicados os seguintes modelos: 
            LogisticRegression
            MultinomialNB
            KNeighborsClassifier
            SVC
            DecisionTreeClassifier
            RandomForestClassifier
            GradientBoostingClassifier
            AdaBoostClassifier

In [9]:
spam_ham = pd.read_pickle('../data/dataframe_to_modeling/spam_ham.pickle')

In [10]:
Path() / '..'/ 'data' / 'dataframe_to_modeling' / 'spam_ham.pickle'

WindowsPath('../data/dataframe_to_modeling/spam_ham.pickle')

In [11]:
spam_ham.head()

Unnamed: 0,ID,Message,SPAM,SPAM_int,text_norm,text_tokenized,text_lemmatized,Message_len,punct%
0,615317,Linda msg!,True,1,lina msg,"[lina, msg]","[lina, msg]",9,11.1
1,855344,To fazendo batida de kwui com coco,True,1,fazeno batia kwui coco,"[fazeno, batia, kwui, coco]","[fazeno, batia, kwui, coco]",28,0.0
2,1007222,Esse dinheiro só vai sair para quem está com o...,True,1,esse inheiro sair nome lista esse inheiro sair...,"[esse, inheiro, sair, nome, lista, esse, inhei...","[esse, inheiro, sair, nome, lista, esse, inhei...",113,7.1
3,811063,Isso msm Irmã.. nao lembrava dos nomes kkk,True,1,isso msm irma nao lembrava nomes kkk,"[isso, msm, irma, nao, lembrava, nomes, kkk]","[isso, msm, irma, nao, lembrava, nome, kkk]",35,5.7
4,913944,Ai xara eu sujiro esse,True,1,ara sujiro,"[ara, sujiro]","[ara, sujiro]",18,0.0


In [12]:
spam_ham.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1773 entries, 0 to 1772
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               1773 non-null   int64  
 1   Message          1773 non-null   object 
 2   SPAM             1773 non-null   bool   
 3   SPAM_int         1773 non-null   int64  
 4   text_norm        1773 non-null   object 
 5   text_tokenized   1773 non-null   object 
 6   text_lemmatized  1773 non-null   object 
 7   Message_len      1773 non-null   int64  
 8   punct%           1773 non-null   float64
dtypes: bool(1), float64(1), int64(3), object(4)
memory usage: 126.4+ KB


In [13]:
spam_ham.isnull().sum()

ID                 0
Message            0
SPAM               0
SPAM_int           0
text_norm          0
text_tokenized     0
text_lemmatized    0
Message_len        0
punct%             0
dtype: int64

## Vetorização
É importante a criação de um CountVectorizer para contar o número de palavras (frequência do termo), 
limitar o tamanho do seu vocabulário e, aplicar palavras de parada, por exemplo.

## Bag of words
A criação da 'Bag of words' se dá na necessidade de converter cada uma dessas mensagens (lemmas) em um vetor que os modelos de algoritmos do SciKit Learn possam trabalhar.

### Passos:

Contar quantas vezes uma palavra ocorre em cada mensagem (conhecida como frequência de termo) pelo CountVectorizer;

Pesar as contagens, para que os tokens frequentes tenham um peso menor (frequência inversa do documento);

Normalizar os vetores para comprimento unitário, para abstrair do comprimento do texto original (norma L2) por TF-IDF, usando o TfidfTransformer do scikit-learn.

CountVectorizer : converterá uma coleção de documentos de texto em uma matriz de contagens de token em matriz 2D:

In [14]:
cv = CountVectorizer()
word_count_vector = cv.fit_transform(spam_ham['text_norm'])

In [15]:
#aqui as colunas reprensentam palavras únicas (9411)
word_count_vector.shape

(1773, 9307)

## Aplicando TFIDF 
Codificação de texto como inteiro em forma numérica para criar vetores de recursos para que possamos aplicar algoritmos de ML nele.

In [16]:
feature_extraction = TfidfVectorizer(min_df=1, lowercase = 'True')
# min_df = se uma palavra não repetir, ou repetir apenas uma vez, ela não é importante para análises predivitas.

In [17]:
X_tfidf = feature_extraction.fit_transform(spam_ham['text_norm'])
print(X_tfidf.shape)
print(feature_extraction.get_feature_names())

(1773, 9307)
['111', '1114', '113', '11343721', '114hs', '1195', '11hs', '11ivan', '121', '12294', '123', '124', '125', '125d', '1275d', '12alvaro', '13', '131', '134', '135', '13d', '13h', '13h3min', '13hs', '13rafaelzao', '141', '1411', '143', '1431578419', '143536592', '143568748', '143825166', '147', '14flavio', '14h', '14hrs', '15', '151239847', '1531', '15842', '15leonel', '163', '165', '16anos', '16hrs', '16rorigo', '171', '1723d', '1746', '1747', '175', '177', '17cm', '17d', '17h', '17hs', '183', '18963421', '18bragantino', '18d', '1915d', '1937d', '195', '19d', '1ano', '1h2', '1h21', '1hr', '1km', '1mauricelio', '1mil', '1min', '211', '2115', '211581172839', '212', '213', '214', '215', '216', '2192963419', '21964957222', '21d', '21h', '2212d', '22531177', '2269', '22bpm', '22d', '22h55', '23', '2338', '235', '23d', '24', '241343', '24d', '24h', '24hs', '25', '252', '25d', '261', '26a', '26d', '27', '275', '276', '27866d', '27am', '27d', '27em', '27puer', '27sbook', '27sim', '2

## Aplicando os modelos

Separando o Dataset em treino e teste

In [18]:
X=spam_ham[['Message', 'Message_len', 'punct%']]
y=spam_ham['SPAM']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42, stratify=y)

In [19]:
#feature_extraction = TfidfVectorizer(min_df=1, lowercase = 'True')

feature_extraction_fit = feature_extraction.fit(X_train['Message']) #Message ou Message

tfidf_train = feature_extraction_fit.transform(X_train['Message'])
tfidf_test = feature_extraction_fit.transform(X_test['Message'])

X_train_vect = pd.concat([X_train[['Message_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['Message_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

X_train_vect.head()

Unnamed: 0,Message_len,punct%,0,1,2,3,4,5,6,7,...,7757,7758,7759,7760,7761,7762,7763,7764,7765,7766
0,100,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,107,9.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,294,3.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.159563,0.0,0.0,0.0


Aplicação do modelos

In [20]:
lr = LogisticRegression()
nb = MultinomialNB()
knc = KNeighborsClassifier()
svc = SVC(gamma = 'auto')
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(n_estimators=100)
gbc = GradientBoostingClassifier()
abc = AdaBoostClassifier()



models = {'Logistic Regression':lr, 'Naive Bayes classifier':nb, 'k-nearest neighbors':knc, 
          'Support Vector Machine':svc, 'Decision Tree Classifier':dtc, 
          'Random Forest Classifier':rfc, 'Gradient Boosting Classifier':gbc, 'AdaBoost Classifier':abc}

In [21]:
def eval_model(model):
    
    model.fit(X_train_vect, y_train)
    y_pred = model.predict(X_test_vect)    
    test_accuracy = accuracy_score(y_test, y_pred)   
    return test_accuracy

In [124]:
test_accuracies = []

for name, model in models.items():
    test_acc = eval_model(model) 
    test_accuracies.append(test_acc)
    print(f'{name} ---> Test accuracy - {test_acc*100:.2f}%')

Logistic Regression ---> Test accuracy - 83.83%
Naive Bayes classifier ---> Test accuracy - 79.70%
k-nearest neighbors ---> Test accuracy - 61.65%
Support Vector Machine ---> Test accuracy - 66.17%
Decision Tree Classifier ---> Test accuracy - 69.92%
Random Forest Classifier ---> Test accuracy - 81.39%
Gradient Boosting Classifier ---> Test accuracy - 79.14%
AdaBoost Classifier ---> Test accuracy - 80.26%


## Melhorando hiperparâmetros com Grid Search

Visando aumentar o desempenho dos algoritmos, o Grid Search foi aplicado nos modelos com a acurácia mais elevada: Logistic Regression, Random Forest e Adaboost.

Os hiperparâmetros controlam diretamente o comportamento do algoritmo de treinamento e têm um impacto significativo no desempenho do modelo que está sendo treinado. É uma técnica muito tradicional para implementar hiperparâmetros. Por meio de força bruta testa todas as combinações.

Grid search para Random Forest

In [192]:
param_grid = { 
    'n_estimators': [200, 400, 600, 800, 1200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4, 8, 16, 32, 64],
    'criterion' :['gini', 'entropy']
}

In [193]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tfidf_train, y_train)


In [194]:
print(CV_rfc.best_params_)
print(CV_rfc.best_estimator_)
print(CV_rfc.best_score_)

{'criterion': 'entropy', 'max_depth': 64, 'max_features': 'auto', 'n_estimators': 800}
RandomForestClassifier(criterion='entropy', max_depth=64, max_features='auto',
                       n_estimators=800)
0.838839227879259


GridSearch para Logistic Regression

In [24]:
param_grid = {'C':[0.001,0.01,0.1,1,10, 100], 
                    'penalty':['none', 'l1', 'l2', 'elasticnet'],
                    'solver' : ['newton-cg', 'lbfgs', 'liblinear']                   
                  }

In [25]:
CV_lr = GridSearchCV(estimator=lr, param_grid=param_grid, \
scoring='accuracy', cv=5, n_jobs=-1)

In [26]:
CV_lr.fit(tfidf_train, y_train)

In [27]:
print(CV_lr.best_params_)
print(CV_lr.best_estimator_)
print(CV_lr.best_score_)

{'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
LogisticRegression(C=10, solver='newton-cg')
0.8557423241352506


Grid Search para Adaboost

In [33]:
param_grid = {'n_estimators':[10, 50, 100, 500, 1000], 
                    'learning_rate':[0.0001, 0.001, 0.01, 0.1, 1.0],
                    'algorithm' : ['SAMME', 'SAMME.R']                  
                  }

In [34]:
CV_abc = GridSearchCV(estimator=abc, param_grid=param_grid, n_jobs=-1, cv=5, scoring='accuracy')

In [35]:
grid_result = CV_abc.fit(tfidf_train, y_train)

In [36]:
print(CV_abc.best_params_)
print(CV_abc.best_estimator_)
print(CV_abc.best_score_)


{'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 1000}
AdaBoostClassifier(learning_rate=0.1, n_estimators=1000)
0.8291650472859178
