In [114]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
import numpy as np

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Carregamento dos dados
for dirname, _, filenames in os.walk('/kaggle/input/nlp-getting-started'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

submission_df = pd.DataFrame(submission)
train_df = pd.DataFrame(train)
test_df = pd.DataFrame(test)

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


**LIMPEZA DO TEXTO**

In [115]:
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
nltk.download('punkt')  # Necessário para word_tokenize
nltk.download('stopwords')
stop_words = set(stopwords.words('english') + ['u', 'im', 'c'])

def clean_text(mensage):
    mensage = str(mensage).lower()
    mensage = re.sub('\[.*?\]', '', mensage)
    mensage = re.sub('https?://\S+|www\.\S+', '', mensage)
    mensage = re.sub(r'@\w+', '', mensage)
    mensage = re.sub('<.*?>+', '', mensage)
    mensage = re.sub('[%s]' % re.escape(string.punctuation), '', mensage)
    mensage = re.sub('\n', '', mensage)
    mensage = re.sub('\w*\d\w*', '', mensage)
    
    tokens = word_tokenize(mensage)
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]    
    
    return ' '.join(clean_tokens)

train_df['text'] = train_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)

# 2. Remoção de stopwords
def remove_stopwords(mensage):
    mensage = ' '.join(word for word in mensage.split(' ') if word not in stop_words)
    return mensage

train_df['text'] = train_df['text'].apply(remove_stopwords)
test_df['text'] = test_df['text'].apply(remove_stopwords)


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**DIVISÃO DOS DADOS**

In [116]:
x = train_df['text']
y = train_df['target']

# Removendo o documento vazio (índice 217)
empty_index = 217
if empty_index in x.index:
    x = x.drop(index=empty_index)
    y = y.drop(index=empty_index)
    y = y.reset_index(drop=True)
    x = x.reset_index(drop=True)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.23, random_state=19)

# Precisa resetar os índices do x_train e x_test
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)

**MODELO SENTENCE TRANSFORMER**

In [117]:
model_name = 'all-mpnet-base-v2'
embedding_model = SentenceTransformer(model_name)

# Gerar os embeddings
x_train_embeddings = embedding_model.encode(x_train, convert_to_tensor=True)
x_test_embeddings = embedding_model.encode(x_test, convert_to_tensor=True)

# Converter para tensores PyTorch (já estão como tensores após o encode)
x_train_embeddings = x_train_embeddings.float()
x_test_embeddings = x_test_embeddings.float()
y_train_tensor = torch.tensor(y_train.values).float().unsqueeze(1)
y_test_tensor = torch.tensor(y_test.values).float().unsqueeze(1)

Batches:   0%|          | 0/184 [00:00<?, ?it/s]

Batches:   0%|          | 0/55 [00:00<?, ?it/s]

**DATASET + DATALOADER**

In [118]:
class DisasterDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

    def check_distribution(self):
            labels_np = self.labels.cpu().numpy()
            unique, counts = np.unique(labels_np, return_counts=True)
            return dict(zip(unique, counts))

train_dataset = DisasterDataset(x_train_embeddings, y_train_tensor)
test_dataset = DisasterDataset(x_test_embeddings, y_test_tensor)

print(f"Distribuição no dataset de treino: {train_dataset.check_distribution()}")

# Criar DataLoader
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

Distribuição no dataset de treino: {0.0: 3373, 1.0: 2488}


**DEFININDO O MODELO**

In [119]:

class SimpleClassifier(nn.Module):
    def __init__(self, input_dim):
        super(SimpleClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.batch_norm1 = nn.BatchNorm1d(128)
        self.relu = nn.LeakyReLU()
        self.dropout = nn.Dropout(0.4)
        
        self.fc2 = nn.Linear(128, 64)  # ADICIONADO: Camada intermediária
        self.batch_norm2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(0.2)

        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()
        
        self._initialize_weights()

    
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

    
    def forward(self, x):
        x = self.fc1(x)
        x = self.batch_norm1(x)
        x = self.relu(x)
        x = self.dropout1(x)
        
        x = self.fc2(x)
        x = self.batch_norm2(x)
        x = self.relu(x)
        x = self.dropout2(x)
        
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

input_dim = x_train_embeddings.shape[1]
model = SimpleClassifier(input_dim)

**DEBBUGAR**

In [120]:
def train_with_debug(model, train_loader, criterion, optimizer, device, epochs=40):
    losses = []
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        all_train_preds = []
        all_train_labels = []
        
        for embeddings, labels in train_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            
            if epoch == 0 and len(all_train_preds) == 0:
                print(f"Amostra de embeddings: {embeddings[0][:5]}")
                print(f"Amostra de labels: {labels[:5]}")
            
            optimizer.zero_grad()
            outputs = model(embeddings)
            
            if epoch == 0 and len(all_train_preds) == 0:
                print(f"Amostra de outputs: {outputs[:5]}")
            
            loss = criterion(outputs, labels)
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            
            epoch_loss += loss.item()
            
            preds = (outputs > 0.5).float()  # ALTERADO: Threshold para 0.5
            all_train_preds.extend(preds.cpu().numpy())
            all_train_labels.extend(labels.cpu().numpy())

            if epoch % 5 == 0 or epoch == epochs - 1:
                train_acc = accuracy_score(all_train_labels, all_train_preds)
                train_f1 = f1_score(all_train_labels, all_train_preds)
                avg_loss = epoch_loss / len(train_loader)
                losses.append(avg_loss)
                print(f'Época {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Acc: {train_acc:.4f}, F1: {train_f1:.4f}')
            
            # ADICIONADO: Contar distribuição das previsões
            unique, counts = np.unique(np.array(all_train_preds), return_counts=True)
            pred_dist = dict(zip(unique, counts))
            print(f"Distribuição das previsões: {pred_dist}")
            
            # Verificação de gradientes depois do backprop
            if epoch % 10 == 0:
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        print(f"{name} - grad_mean: {param.grad.abs().mean().item():.6f}")
    
    # Plotar curva de loss
    plt.figure(figsize=(10, 4))
    plt.plot(range(0, epochs, 5), losses)
    plt.xlabel('Época')
    plt.ylabel('Loss')
    plt.title('Curva de Loss durante Treinamento')
    plt.grid(True)
    plt.show()
    
    return model

**FUNÇÃO DE PERDA E OTIMIZADOR**

In [124]:
def setup_training():
    # Use GPU se disponível, caso contrário CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Usando dispositivo: {device}")
    
    # Definir hiperparâmetros e modelo
    input_dim = x_train_embeddings.shape[1]
    model = ImprovedClassifier(input_dim).to(device)
    
    # ALTERADO: Weight balancing para classes desbalanceadas
    # Calcular weights para balancear as classes (apenas exemplo)
    # pos_weight = torch.tensor([1.5])  # Ajuste este valor com base na distribuição real
    # criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight).to(device)
    
    # OU use BCELoss padrão se preferir
    criterion = nn.BCELoss().to(device)
    
    # ALTERADO: Otimizador com melhores hiperparâmetros
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    
    # Treinar com debugging
    model = train_with_debug(model, train_loader, criterion, optimizer, device, epochs=30)
    
    return model, device

# Função de avaliação com análise detalhada
def evaluate_with_analysis(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    all_outputs = []  # Valor raw da sigmoide
    
    with torch.no_grad():
        for embeddings, labels in test_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings)
            
            # Salvar tanto os valores raw quanto as previsões binárias
            all_outputs.extend(outputs.cpu().numpy())
            preds = (outputs > 0.5).float()  # ALTERADO: Threshold para 0.5
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Converter para arrays numpy para análise
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    all_outputs = np.array(all_outputs)
    
    # ADICIONADO: Verifica se há variação nas previsões
    unique_preds = np.unique(all_preds)
    print(f"Valores únicos nas previsões: {unique_preds}")
    
    # ADICIONADO: Histograma dos scores (valores da sigmoide)
    plt.figure(figsize=(10, 4))
    plt.hist(all_outputs, bins=20)
    plt.xlabel('Score de Previsão')
    plt.ylabel('Contagem')
    plt.title('Distribuição dos Scores de Previsão')
    plt.show()
    
    # Calcular métricas
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    
    print(f'Acurácia: {accuracy:.4f}')
    print(f'F1-score: {f1:.4f}')
    print('\nRelatório de Classificação:')
    print(classification_report(all_labels, all_preds))
    
    # Matriz de confusão
    cm = confusion_matrix(all_labels, all_preds)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='magma',
                xticklabels=['Não Desastre', 'Desastre'], 
                yticklabels=['Não Desastre', 'Desastre'])
    plt.xlabel('Previsto')
    plt.ylabel('Real')
    plt.title('Matriz de Confusão')
    plt.show()
    
    return accuracy, f1


In [125]:
# Definir a função de perda e o otimizador
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Treinar o modelo
epochs = 40
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Define device
model.to(device) # Move model para device
criterion.to(device) 

for epoch in range(epochs):
    model.train()
    for embeddings, labels in train_loader:
        embeddings, labels = embeddings.to(device), labels.to(device) 
        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')

AttributeError: 'SimpleClassifier' object has no attribute 'dropout1'

**TREINO + AVALIAÇÃO DO MODELO**

In [None]:
model.eval()
all_preds = []
with torch.no_grad():
    for embeddings, labels in test_loader:
        embeddings, labels = embeddings.to(device), labels.to(device)  
        outputs = model(embeddings)
        preds = (outputs > 0.4462).float()
        all_preds.extend(preds.cpu().numpy())


accuracy_pytorch = accuracy_score(y_test, all_preds)
f1_pytorch = f1_score(y_test, all_preds)

print(f'Acurácia com PyTorch: {accuracy_pytorch:.4f}')
print(f'F1-score com PyTorch: {f1_pytorch:.4f}')

print('\nRelatório de Classificação (PyTorch):')
print(classification_report(y_test, all_preds))

cm_pytorch = confusion_matrix(y_test, all_preds)
plt.figure(figsize=(2,2))
sns.heatmap(cm_pytorch, annot=True, fmt='d', cmap='magma',
            xticklabels=['Não Desastre', 'Desastre'], yticklabels=['Não Desastre', 'Desastre'])
plt.xlabel('Previsto (PyTorch)')
plt.ylabel('Real')
plt.title('Matriz de Confusão (PyTorch)')
plt.show()

In [None]:
# Gerar embeddings para o conjunto de teste do Kaggle (usando o embedding_model treinado)
x_test_kaggle = test_df['text']  # Usar o texto já limpo
x_test_kaggle_embeddings = embedding_model.encode(x_test_kaggle, convert_to_tensor=True).float().to(device)

# Fazer previsões no conjunto de teste do Kaggle
model.eval()
all_probabilities_kaggle = []
all_predictions_kaggle = []
with torch.no_grad():
    for embeddings in x_test_kaggle_embeddings:
        output = model(embeddings.unsqueeze(0))  # Passar os embeddings pelo modelo
        probability = torch.sigmoid(output).item()  # Obter probabilidade escalar
        all_probabilities_kaggle.append(probability)
        prediction = 1 if probability > 0.4462 else 0  # Classificar com o limiar
        all_predictions_kaggle.append(prediction)

# Criar o dataframe de submissão
submission_df_kaggle = pd.DataFrame({'id': test_df['id'], 'target': all_predictions_kaggle})
submission_df_kaggle.to_csv('submission.csv', index=False)

print("Arquivo de submissão 'submission.csv' criado com as previsões no conjunto de teste do Kaggle.")
print("Primeiras 20 Previsões:", all_predictions_kaggle[:20])
print("Primeiras 20 Probabilidades:", all_probabilities_kaggle[:20])

**LOGISTIC REGRESSION**

In [None]:
"""
# 7. Treinamento do modelo de Regressão Logística (aplicando o melhor parâmetro C encontrado anteriormente)
logistic_model = LogisticRegression(solver='liblinear', random_state=19, C=10)
logistic_model.fit(x_train_tfidf, y_train)

# 8. Predições
y_pred_logistic = logistic_model.predict(x_test_tfidf)

# 9. Avaliação
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f'Acurácia do modelo de Regressão Logística: {accuracy_logistic:.4f}')
print('\nRelatório de Classificação (Regressão Logística):')
print(classification_report(y_test, y_pred_logistic))

cm_logistic = confusion_matrix(y_test, y_pred_logistic)
plt.figure(figsize=(2,2))
sns.heatmap(cm_logistic, annot=True, fmt='d', cmap='Blues',
            xticklabels=logistic_model.classes_, yticklabels=logistic_model.classes_)
plt.xlabel('Previsto (Regressão Logística)')
plt.ylabel('Real')
plt.title('Matriz de Confusão (Regressão Logística)')
plt.show()

f1_logistic = f1_score(y_test, y_pred_logistic)
print(f'F1-score da Regressão Logística: {f1_logistic:.4f}')"""

In [None]:
"""from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define os valores de C que você quer testar
param_grid_logistic = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Cria o modelo de Regressão Logística
logistic_model = LogisticRegression(solver='liblinear', random_state=19)

# Configura o GridSearchCV
grid_search_logistic = GridSearchCV(logistic_model, param_grid_logistic, cv=5, scoring='f1')

# Executa a busca nos dados de treinamento
grid_search_logistic.fit(x_train_tfidf, y_train)

# Melhores parâmetros encontrados
print("Melhores hiperparâmetros para Regressão Logística:", grid_search_logistic.best_params_)

# Melhor score (F1) obtido
print("Melhor F1-score para Regressão Logística:", grid_search_logistic.best_score_)

# Avalia o modelo com os melhores parâmetros no conjunto de teste
best_logistic_model = grid_search_logistic.best_estimator_
y_pred_best_logistic = best_logistic_model.predict(x_test_tfidf)

f1_best_logistic = f1_score(y_test, y_pred_best_logistic)
accuracy_best_logistic = accuracy_score(y_test, y_pred_best_logistic)

print(f'Acurácia da Regressão Logística (melhores parâmetros): {accuracy_best_logistic:.4f}')
print(f'F1-score da Regressão Logística (melhores parâmetros): {f1_best_logistic:.4f}')

print("\nRelatório de Classificação da Regressão Logística (melhores parâmetros):")
print(classification_report(y_test, y_pred_best_logistic))

cm_best_logistic = confusion_matrix(y_test, y_pred_best_logistic)
plt.figure(figsize=(2,2))
sns.heatmap(cm_best_logistic, annot=True, fmt='d', cmap='Blues',
            xticklabels=best_logistic_model.classes_, yticklabels=best_logistic_model.classes_)
plt.xlabel('Previsto')
plt.ylabel('Real')
plt.title('Matriz de Confusão (Regressão Logística - Melhores Parâmetros)')
plt.show()"""

XGBoost

In [None]:
"""
# 7. Treinamento do modelo XGBoost
xgb_model = XGBClassifier(random_state=19)
xgb_model.fit(x_train_tfidf, y_train)

# 8. Predições
y_pred_xgb = xgb_model.predict(x_test_tfidf)

# 9. Avaliação
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f'Acurácia do modelo XGBoost: {accuracy_xgb:.4f}')
print('\nRelatório de Classificação (XGBoost):')
print(classification_report(y_test, y_pred_xgb))

cm_xgb = confusion_matrix(y_test, y_pred_xgb)
plt.figure(figsize=(2,2))
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='plasma',
            xticklabels=xgb_model.classes_, yticklabels=xgb_model.classes_)
plt.xlabel('Previsto (XGBoost)')
plt.ylabel('Real')
plt.title('Matriz de Confusão (XGBoost)')
plt.show()

f1_xgb = f1_score(y_test, y_pred_xgb)
print(f'F1-score do XGBoost: {f1_xgb:.4f}')"""

SVM

In [None]:
"""from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.svm import SVC

# Define os valores de C e os kernels que você quer testar
param_grid_svm = {'C': [0.1, 1, 10, 100],
                  'kernel': ['linear', 'rbf']}

# Cria o modelo SVM
svm_model = SVC(random_state=19)

# Configura o GridSearchCV
grid_search_svm = GridSearchCV(svm_model, param_grid_svm, cv=5, scoring='f1')

# Executa a busca nos dados de treinamento
grid_search_svm.fit(x_train_tfidf, y_train)

# Melhores parâmetros encontrados
print("Melhores hiperparâmetros para SVM:", grid_search_svm.best_params_)

# Melhor score (F1) obtido
print("Melhor F1-score para SVM:", grid_search_svm.best_score_)

# Avalia o modelo com os melhores parâmetros no conjunto de teste
best_svm_model = grid_search_svm.best_estimator_
y_pred_best_svm = best_svm_model.predict(x_test_tfidf)

f1_best_svm = f1_score(y_test, y_pred_best_svm)
accuracy_best_svm = accuracy_score(y_test, y_pred_best_svm)

print(f'Acurácia do SVM (melhores parâmetros): {accuracy_best_svm:.4f}')
print(f'F1-score do SVM (melhores parâmetros): {f1_best_svm:.4f}')

print("\nRelatório de Classificação do SVM (melhores parâmetros):")
print(classification_report(y_test, y_pred_best_svm))

cm_best_svm = confusion_matrix(y_test, y_pred_best_svm)
plt.figure(figsize=(2,2))
sns.heatmap(cm_best_svm, annot=True, fmt='d', cmap='viridis',
            xticklabels=best_svm_model.classes_, yticklabels=best_svm_model.classes_)
plt.xlabel('Previsto')
plt.ylabel('Real')
plt.title('Matriz de Confusão (SVM - Melhores Parâmetros)')
plt.show()"""

In [None]:
"""# 2. Remoção de stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords
stop_words = set(stop_words)


def clean_text(mensage):
    mensage = str(mensage).lower()
    mensage = re.sub('\[.*?\]', '', mensage)
    mensage = re.sub('https?://\S+|www\.\S+', '', mensage)
    mensage = re.sub('<.*?>+', '', mensage)
    mensage = re.sub('[%s]' % re.escape(string.punctuation), '', mensage)
    mensage = re.sub('\n', '', mensage)
    mensage = re.sub('\w*\d\w*', '', mensage)
    return mensage

train_df['text'] = train_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)



def remove_stopwords(mensage):
    mensage = ' '.join(word for word in mensage.split(' ') if word not in stop_words)
    return mensage

train_df['text'] = train_df['text'].apply(remove_stopwords)
test_df['text'] = test_df['text'].apply(remove_stopwords)

# Divisão dos dados
x = train_df['text']
y = train_df['target']

# Removendo o documento vazio (índice 217)
empty_index = 217
if empty_index in x.index:
    x = x.drop(index=empty_index)
    y = y.drop(index=empty_index)
    # Precisa resetar o index
    x = x.reset_index(drop=True)  
    y = y.reset_index(drop=True) 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.21, random_state=19)

# Carregar o modelo Sentence Transformer
model_name = 'all-mpnet-base-v2'
embedding_model = SentenceTransformer(model_name)

# Gerar os embeddings
# Converter x_train e x_test para lista antes de  encode
x_train_embeddings = embedding_model.encode(x_train.tolist(), convert_to_tensor=True)  
x_test_embeddings = embedding_model.encode(x_test.tolist(), convert_to_tensor=True)  

# Move the tensors to CPU and convert to NumPy arrays
x_train_embeddings = x_train_embeddings.cpu().numpy()  
x_test_embeddings = x_test_embeddings.cpu().numpy()

# Treinar o modelo de Regressão Logística
from sklearn.linear_model import LogisticRegression # Importing the missing module
logistic_model = LogisticRegression(random_state=19, solver='liblinear')
logistic_model.fit(x_train_embeddings, y_train)

# Fazer as previsões
y_pred_sentence_transformer = logistic_model.predict(x_test_embeddings)

# Avaliar o modelo
accuracy_sentence_transformer = accuracy_score(y_test, y_pred_sentence_transformer)
f1_sentence_transformer = f1_score(y_test, y_pred_sentence_transformer)

print(f'Acurácia com Sentence Transformer: {accuracy_sentence_transformer:.4f}')
print(f'F1-score com Sentence Transformer: {f1_sentence_transformer:.4f}')

print('\nRelatório de Classificação (Sentence Transformer):')
print(classification_report(y_test, y_pred_sentence_transformer))

cm_sentence_transformer = confusion_matrix(y_test, y_pred_sentence_transformer)
plt.figure(figsize=(2,2))
sns.heatmap(cm_sentence_transformer, annot=True, fmt='d', cmap='viridis',
            xticklabels=['Não Desastre', 'Desastre'], yticklabels=['Não Desastre', 'Desastre'])
plt.xlabel('Previsto (Sentence Transformer)')
plt.ylabel('Real')
plt.title('Matriz de Confusão (Sentence Transformer)')
plt.show()"""