In [72]:
import numpy as np
import pandas as pd
import string
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\renat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Base de Dados de Treino

In [73]:
df = pd.read_csv('../data/train/Tweets_Mg.csv')
df.shape

(8199, 26)

In [74]:
# remoção de features inrelevantes
df = df.drop(columns=['Unnamed: 0', 'Created At', 'Geo Coordinates.latitude',
       'Geo Coordinates.longitude', 'User Location', 'Username',
       'User Screen Name', 'Retweet Count', 'Observação',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17',
       'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21',
       'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24'])

In [32]:
df.Classificacao.value_counts()

Positivo    3300
Neutro      2453
Negativo    2446
Name: Classificacao, dtype: int64

In [5]:
df.Text

0       ���⛪ @ Catedral de Santo Antônio - Governador ...
1       � @ Governador Valadares, Minas Gerais https:/...
2       �� @ Governador Valadares, Minas Gerais https:...
3                             ��� https://t.co/BnDsO34qK0
4       ��� PSOL vai questionar aumento de vereadores ...
                              ...                        
8194    Trio é preso suspeito de roubo, tráfico e abus...
8195    Trio é preso suspeito de roubo, tráfico e abus...
8196    Trio é preso suspeito de roubo, tráfico e abus...
8197    Trio é preso suspeito de roubo, tráfico e abus...
8198    Trio suspeito de roubo de cargas é preso em Sa...
Name: Text, Length: 8199, dtype: object

## Feature Engineering

### Features que serão adicionadas
    1 - Número de palavras no texto
    2 - Número de palavras únicas no texto
    3 - Número de caracteres no texto
    4 - Número de palavras irrelevantes (stopwords)
    5 - Número de pontuações
    6 - Número palavras maiúsculas
    7 - Número de palavras de caixa de título
    8 - Comprimento médio das palavras
    

In [75]:
# Numero de palavras no texto
df["num_words"] = df['Text'].apply(lambda x: len(str(x).split()))

# Numero de palavras unicas no texto
df['num_unique_words'] = df['Text'].apply(lambda x: len(set(str(x).split())))

# Número de caracteres no texto
df['num_chars'] = df['Text'].apply(lambda x: len(str(x)))

# Número de palavras irrelevantes (stopwords)
df['num_stopwords'] = df['Text'].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords]))

# Número de pontuações
df['num_punctuations'] = df['Text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

# Número palavras maiúsculas
df['num_words_upper'] = df['Text'].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

# Número de palavras de caixa de título
df['num_words_title'] = df['Text'].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

# Comprimento médio das palavras
df['mean_word_len'] = df['Text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))


In [6]:
df.head()

Unnamed: 0,Text,Classificacao,num_words,num_unique_words,num_chars,num_stopwords,num_punctuations,num_words_upper,num_words_title,mean_word_len
0,���⛪ @ Catedral de Santo Antônio - Governador ...,Neutro,10,10,82,1,8,0,4,7.3
1,"� @ Governador Valadares, Minas Gerais https:/...",Neutro,7,7,62,0,7,0,4,8.0
2,"�� @ Governador Valadares, Minas Gerais https:...",Neutro,7,7,63,0,7,0,4,8.142857
3,��� https://t.co/BnDsO34qK0,Neutro,2,2,27,0,5,0,0,13.0
4,��� PSOL vai questionar aumento de vereadores ...,Negativo,20,17,126,5,7,2,4,5.35


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8199 entries, 0 to 8198
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Text              8199 non-null   object 
 1   Classificacao     8199 non-null   object 
 2   num_words         8199 non-null   int64  
 3   num_unique_words  8199 non-null   int64  
 4   num_chars         8199 non-null   int64  
 5   num_stopwords     8199 non-null   int64  
 6   num_punctuations  8199 non-null   int64  
 7   num_words_upper   8199 non-null   int64  
 8   num_words_title   8199 non-null   int64  
 9   mean_word_len     8199 non-null   float64
dtypes: float64(1), int64(7), object(2)
memory usage: 640.7+ KB


## Data Preprocessing
- Converter texto para minúsculo
- Remover pontuações
- Limpeza de Número 
- remoção de stopwords

### Remoção de pontuação

In [7]:
# Removing punctuations
punctuation_list =[',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', 
        '•', '~', '@', '£', '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 
        '█', '…', '“', '★', '”', '–', '●', '►', '−', '¢', '¬', '░', '¡', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', 
        '—', '‹', '─', '▒', '：', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', '¯', '♦', '¤', '▲', '¸', '⋅', '‘', '∞', 
        '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '・', '╦', '╣', '╔', '╗', '▬', '❤', '≤', '‡', '√', '◄', '━', 
        '⇒', '▶', '≥', '╝', '♡', '◊', '。', '✈', '≡', '☺', '✔', '↵', '≈', '✓', '♣', '☎', '℃', '◦', '└', '‟', '～', '！', '○', 
        '◆', '№', '♠', '▌', '✿', '▸', '⁄', '□', '❖', '✦', '．', '÷', '｜', '┃', '／', '￥', '╠', '↩', '✭', '▐', '☼', '☻', '┐', 
        '├', '«', '∼', '┌', '℉', '☮', '฿', '≦', '♬', '✧', '〉', '－', '⌂', '✖', '･', '◕', '※', '‖', '◀', '‰', '\x97', '↺', 
        '∆', '┘', '┬', '╬', '،', '⌘', '⊂', '＞', '〈', '⎙', '？', '☠', '⇐', '▫', '∗', '∈', '≠', '♀', '♔', '˚', '℗', '┗', '＊', 
        '┼', '❀', '＆', '∩', '♂', '‿', '∑', '‣', '➜', '┛', '⇓', '☯', '⊖', '☀', '┳', '；', '∇', '⇑', '✰', '◇', '♯', '☞', '´', 
        '↔', '┏', '｡', '◘', '∂', '✌', '♭', '┣', '┴', '┓', '✨', '\xa0', '˜', '❥', '┫', '℠', '✒', '［', '∫', '\x93', '≧', '］', 
        '\x94', '∀', '♛', '\x96', '∨', '◎', '↻', '⇩', '＜', '≫', '✩', '✪', '♕', '؟', '₤', '☛', '╮', '␊', '＋', '┈', '％', 
        '╋', '▽', '⇨', '┻', '⊗', '￡', '।', '▂', '✯', '▇', '＿', '➤', '✞', '＝', '▷', '△', '◙', '▅', '✝', '∧', '␉', '☭', 
        '┊', '╯', '☾', '➔', '∴', '\x92', '▃', '↳', '＾', '׳', '➢', '╭', '➡', '＠', '⊙', '☢', '˝', '∏', '„', '∥', '❝', '☐', 
        '▆', '╱', '⋙', '๏', '☁', '⇔', '▔', '\x91', '➚', '◡', '╰', '\x85', '♢', '˙', '۞', '✘', '✮', '☑', '⋆', 'ⓘ', '❒', 
        '☣', '✉', '⌊', '➠', '∣', '❑', '◢', 'ⓒ', '\x80', '〒', '∕', '▮', '⦿', '✫', '✚', '⋯', '♩', '☂', '❞', '‗', '܂', '☜', 
        '‾', '✜', '╲', '∘', '⟩', '＼', '⟨', '·', '✗', '♚', '∅', 'ⓔ', '◣', '͡', '‛', '❦', '◠', '✄', '❄', '∃', '␣', '≪', '｢', 
        '≅', '◯', '☽', '∎', '｣', '❧', '̅', 'ⓐ', '↘', '⚓', '▣', '˘', '∪', '⇢', '✍', '⊥', '＃', '⎯', '↠', '۩', '☰', '◥', 
        '⊆', '✽', '⚡', '↪', '❁', '☹', '◼', '☃', '◤', '❏', 'ⓢ', '⊱', '➝', '̣', '✡', '∠', '｀', '▴', '┤', '∝', '♏', 'ⓐ', 
        '✎', ';', '␤', '＇', '❣', '✂', '✤', 'ⓞ', '☪', '✴', '⌒', '˛', '♒', '＄', '✶', '▻', 'ⓔ', '◌', '◈', '❚', '❂', '￦', 
        '◉', '╜', '̃', '✱', '╖', '❉', 'ⓡ', '↗', 'ⓣ', '♻', '➽', '׀', '✲', '✬', '☉', '▉', '≒', '☥', '⌐', '♨', '✕', 'ⓝ', 
        '⊰', '❘', '＂', '⇧', '̵', '➪', '▁', '▏', '⊃', 'ⓛ', '‚', '♰', '́', '✏', '⏑', '̶', 'ⓢ', '⩾', '￠', '❍', '≃', '⋰', '♋', 
        '､', '̂', '❋', '✳', 'ⓤ', '╤', '▕', '⌣', '✸', '℮', '⁺', '▨', '╨', 'ⓥ', '♈', '❃', '☝', '✻', '⊇', '≻', '♘', '♞', 
        '◂', '✟', '⌠', '✠', '☚', '✥', '❊', 'ⓒ', '⌈', '❅', 'ⓡ', '♧', 'ⓞ', '▭', '❱', 'ⓣ', '∟', '☕', '♺', '∵', '⍝', 'ⓑ', 
        '✵', '✣', '٭', '♆', 'ⓘ', '∶', '⚜', '◞', '்', '✹', '➥', '↕', '̳', '∷', '✋', '➧', '∋', '̿', 'ͧ', '┅', '⥤', '⬆', '⋱', 
        '☄', '↖', '⋮', '۔', '♌', 'ⓛ', '╕', '♓', '❯', '♍', '▋', '✺', '⭐', '✾', '♊', '➣', '▿', 'ⓑ', '♉', '⏠', '◾', '▹', 
        '⩽', '↦', '╥', '⍵', '⌋', '։', '➨', '∮', '⇥', 'ⓗ', 'ⓓ', '⁻', '⎝', '⌥', '⌉', '◔', '◑', '✼', '♎', '♐', '╪', '⊚', 
        '☒', '⇤', 'ⓜ', '⎠', '◐', '⚠', '╞', '◗', '⎕', 'ⓨ', '☟', 'ⓟ', '♟', '❈', '↬', 'ⓓ', '◻', '♮', '❙', '♤', '∉', '؛', 
        '⁂', 'ⓝ', '־', '♑', '╫', '╓', '╳', '⬅', '☔', '☸', '┄', '╧', '׃', '⎢', '❆', '⋄', '⚫', '̏', '☏', '➞', '͂', '␙', 
        'ⓤ', '◟', '̊', '⚐', '✙', '↙', '̾', '℘', '✷', '⍺', '❌', '⊢', '▵', '✅', 'ⓖ', '☨', '▰', '╡', 'ⓜ', '☤', '∽', '╘', 
        '˹', '↨', '♙', '⬇', '♱', '⌡', '⠀', '╛', '❕', '┉', 'ⓟ', '̀', '♖', 'ⓚ', '┆', '⎜', '◜', '⚾', '⤴', '✇', '╟', '⎛', 
        '☩', '➲', '➟', 'ⓥ', 'ⓗ', '⏝', '◃', '╢', '↯', '✆', '˃', '⍴', '❇', '⚽', '╒', '̸', '♜', '☓', '➳', '⇄', '☬', '⚑', 
        '✐', '⌃', '◅', '▢', '❐', '∊', '☈', '॥', '⎮', '▩', 'ு', '⊹', '‵', '␔', '☊', '➸', '̌', '☿', '⇉', '⊳', '╙', 'ⓦ', 
        '⇣', '｛', '̄', '↝', '⎟', '▍', '❗', '״', '΄', '▞', '◁', '⛄', '⇝', '⎪', '♁', '⇠', '☇', '✊', 'ி', '｝', '⭕', '➘', 
        '⁀', '☙', '❛', '❓', '⟲', '⇀', '≲', 'ⓕ', '⎥', '\u06dd', 'ͤ', '₋', '̱', '̎', '♝', '≳', '▙', '➭', '܀', 'ⓖ', '⇛', '▊', 
        '⇗', '̷', '⇱', '℅', 'ⓧ', '⚛', '̐', '̕', '⇌', '␀', '≌', 'ⓦ', '⊤', '̓', '☦', 'ⓕ', '▜', '➙', 'ⓨ', '⌨', '◮', '☷', 
        '◍', 'ⓚ', '≔', '⏩', '⍳', '℞', '┋', '˻', '▚', '≺', 'ْ', '▟', '➻', '̪', '⏪', '̉', '⎞', '┇', '⍟', '⇪', '▎', '⇦', '␝', 
        '⤷', '≖', '⟶', '♗', '̴', '♄', 'ͨ', '̈', '❜', '̡', '▛', '✁', '➩', 'ா', '˂', '↥', '⏎', '⎷', '̲', '➖', '↲', '⩵', '̗', '❢', 
        '≎', '⚔', '⇇', '̑', '⊿', '̖', '☍', '➹', '⥊', '⁁', '✢']

In [8]:
def remove_punctuation(text):
    for punctuation in punctuation_list:
        if punctuation in text:
            text = text.replace(punctuation, f'{punctuation}')
    return text

### Limpeza de números

In [9]:
def clean_numbers(text):
    if bool(re.search(r'\d', text)):
        text = re.sub('[0-9]{5,}', '#####', text)
        text = re.sub('[0-9]{4}', '####', text)
        text = re.sub('[0-9]{3}', '###', text)
        text = re.sub('[0-9]{2}', '##', text)
    return text

### Remoção de Stopwords

In [10]:
from nltk.tokenize.toktok import ToktokTokenizer

def remove_stopwords(text, is_lower_case=True):
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [11]:
# Aplica todos os preprocessamentos 
def clean_text(x):
    x = x.lower()
    x = remove_punctuation(x)
    x = clean_numbers(x)
    x = remove_stopwords(x)
    x = x.replace("'", "")
    return x

In [76]:
df['preprocessed_text'] = df['Text'].apply(lambda x: clean_text(x))


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8199 entries, 0 to 8198
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Text               8199 non-null   object 
 1   Classificacao      8199 non-null   object 
 2   num_words          8199 non-null   int64  
 3   num_unique_words   8199 non-null   int64  
 4   num_chars          8199 non-null   int64  
 5   num_stopwords      8199 non-null   int64  
 6   num_punctuations   8199 non-null   int64  
 7   num_words_upper    8199 non-null   int64  
 8   num_words_title    8199 non-null   int64  
 9   mean_word_len      8199 non-null   float64
 10  preprocessed_text  8199 non-null   object 
dtypes: float64(1), int64(7), object(3)
memory usage: 704.7+ KB


## Construindo Vetorizadores e Modelos

In [14]:
import copy
import time
from sklearn.metrics._classification import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer 
from sklearn.naive_bayes import MultinomialNB

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn import metrics



## Count Vectorize

In [77]:
dic_classificacao = {
    'Neutro' : 1,
    'Negativo': 0,
    'Positivo': 1,
}
df['Classificacao'] = df['Classificacao'].replace(dic_classificacao)

In [78]:
train, test = np.split(df.sample(frac=1), [int(.8*len(df))])
train.shape, test.shape

((6559, 11), (1640, 11))

In [79]:
# Creating CountVectorizer object
vectorizer = CountVectorizer(
    dtype=np.float32, 
    strip_accents='unicode', 
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    min_df=3
)
# Fit the vectorizer on training data after preprocessing
vectorizer.fit_transform(train['preprocessed_text'].values.tolist() + test['preprocessed_text'].values.tolist())
train_vectorizer = vectorizer.transform(train['preprocessed_text'].values.tolist())
test_vectorizer = vectorizer.transform(test['preprocessed_text'].values.tolist())

In [80]:
# For storing the threshold values and f1 score
threshold_list = []
best_f1_score_list = []

## Customizando função para construir o modelo e f1 score

In [81]:
train_y = train['Classificacao'].values

def buildModel(train_X, train_y, test_X, test_y, test_X2, model_obj):
    model = copy.deepcopy(model_obj)
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)[:,1]
    pred_test_y2 = model.predict_proba(test_X2)[:,1]
    return pred_test_y, pred_test_y2, model

def best_threshold_function(val_y, pred_val_y):
    threshold_dict = {}
    for thresh in np.arange(0.1, 0.201, 0.01):
        thresh = np.round(thresh, 2)
        # Updating the dict with threshold as key and f1 score as value
        threshold_dict[thresh] =  metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
        
    # Finding the max key
    best_threshold = max(threshold_dict, key=threshold_dict.get)
    
    # finding the max value
    best_f1_score = max(threshold_dict.values())
    
    print(f"Best F1 Score: {best_f1_score} for threshold {best_threshold}")
    # Appending the f1 score and threshold for count vectorizer
    threshold_list.append(best_threshold)
    best_f1_score_list.append(best_f1_score)

## Regreção Logistica

In [82]:
cv_scores = []
pred_full_test = 0
# Creating a zero list equal to the shape of training data
pred_train = np.zeros([train.shape[0]])

# kfold with 5 n_splits
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)

for dev_index, val_index in kf.split(train):
    dev_X, val_X = train_vectorizer[dev_index], train_vectorizer[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = buildModel(dev_X, dev_y, val_X, val_y, test_vectorizer, LogisticRegression(C=5., solver='sag'))
    pred_full_test = pred_full_test + pred_test_y
    
    # Updating the pred_train list with prediction value
    pred_train[val_index] = pred_val_y
    
    # appending the cv scores
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    break
    
best_threshold_function(val_y, pred_val_y)

Best F1 Score: 0.9802455953016552 for threshold 0.14




### Naive Bayes

In [83]:
cv_scores = []
pred_full_test = 0
# Creating a zero list equal to the shape of training data
pred_train = np.zeros([train.shape[0]])

# kfold with 5 n_splits
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = train_vectorizer[dev_index], train_vectorizer[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = buildModel(dev_X, dev_y, val_X, val_y, test_vectorizer, MultinomialNB())
    pred_full_test = pred_full_test + pred_test_y
    
    # Updating the pred_train list with prediction value
    pred_train[val_index] = pred_val_y
    
    # appending the cv scores
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    break
    
best_threshold_function(val_y, pred_val_y)

Best F1 Score: 0.9800323799244468 for threshold 0.18


### TFIDF Vextorizer

In [84]:
vectorizer = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,3))
vectorizer.fit_transform(train['preprocessed_text'].values.tolist() + test['preprocessed_text'].values.tolist())
train_vectorizer = vectorizer.transform(train['preprocessed_text'].values.tolist())
test_vectorizer = vectorizer.transform(test['preprocessed_text'].values.tolist())

### Logistic Regression

In [85]:
cv_scores = []
pred_full_test = 0

# Creating a zero list equal to the shape of training data
pred_train = np.zeros([train.shape[0]])

# kfold with 5 n_splits
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = train_vectorizer[dev_index], train_vectorizer[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = buildModel(dev_X, dev_y, val_X, val_y, test_vectorizer, LogisticRegression(C=5., solver='sag'))
    pred_full_test = pred_full_test + pred_test_y
    
    # Updating the pred_train list with prediction value
    pred_train[val_index] = pred_val_y
    
     # appending the cv scores
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    break
    
best_threshold_function(val_y, pred_val_y)

Best F1 Score: 0.9663865546218487 for threshold 0.19


### Naive Bayes

In [86]:
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0]])
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = train_vectorizer[dev_index], train_vectorizer[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = buildModel(dev_X, dev_y, val_X, val_y, test_vectorizer, MultinomialNB())
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    break
    
best_threshold_function(val_y, pred_val_y)

Best F1 Score: 0.9729729729729729 for threshold 0.18


### Hashing Vectorize

In [87]:
vectorizer = HashingVectorizer(
    dtype=np.float32,
    strip_accents='unicode', 
    analyzer='word',
    ngram_range=(1, 3),
    n_features=2**10
)
vectorizer.fit_transform(train['preprocessed_text'].values.tolist() + test['preprocessed_text'].values.tolist())
train_vectorizer = vectorizer.transform(train['preprocessed_text'].values.tolist())
test_vectorizer = vectorizer.transform(train['preprocessed_text'].values.tolist())

### Logistic Regression

In [88]:
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0]])
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = train_vectorizer[dev_index], train_vectorizer[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = buildModel(dev_X, dev_y, val_X, val_y, test_vectorizer, LogisticRegression(C=5., solver='sag'))
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    break
    
best_threshold_function(val_y, pred_val_y)

Best F1 Score: 0.9693446088794927 for threshold 0.2


## Comparando todos os modelos

In [89]:
from prettytable import PrettyTable
    
table = PrettyTable()
vect = (["CountVectorizer"] * 2) + (["TFIDFVectorizer"] * 2) + (["HashingVectorizer"])
model = (["Logistic Regression", "Naive Bayes"] * 2) + (["Logistic Regression"])
table.add_column("Model", model)
table.add_column("Vectorizer", vect)
table.add_column("Test F1-Score", best_f1_score_list)
table.add_column("Best Threshold", threshold_list)

In [90]:
print(table)

+---------------------+-------------------+--------------------+----------------+
|        Model        |     Vectorizer    |   Test F1-Score    | Best Threshold |
+---------------------+-------------------+--------------------+----------------+
| Logistic Regression |  CountVectorizer  | 0.9802455953016552 |      0.14      |
|     Naive Bayes     |  CountVectorizer  | 0.9800323799244468 |      0.18      |
| Logistic Regression |  TFIDFVectorizer  | 0.9663865546218487 |      0.19      |
|     Naive Bayes     |  TFIDFVectorizer  | 0.9729729729729729 |      0.18      |
| Logistic Regression | HashingVectorizer | 0.9693446088794927 |      0.2       |
+---------------------+-------------------+--------------------+----------------+
