## Importando as bibliotecas

In [127]:
# Manipulação de dados
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import requests

# Visualização de dados
import matplotlib.pyplot as plt
import seaborn as sns

# Pré-processamento de dados
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Modelos e pipelines
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import NMF, PCA
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

# Métricas de avaliação
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics.pairwise import linear_kernel

# Processamento de linguagem natural (NLP)
import nltk
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
import random

# Baixar recursos do NLTK, se necessário
# nltk.download('wordnet')

## Tratando o dataset

In [36]:
df = pd.read_csv('archive/spam.csv', encoding='ISO-8859-1')
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'})
df = df[df['label'].isin(['ham', 'spam'])]
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

In [37]:
df

Unnamed: 0,label,text,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


## Criando o modelo

In [176]:
def build_pipeline(vectorizer, transformer, classifier):
    """
    Constrói o pipeline com as etapas de vetorização, transformação e classificação.
    
    :param vectorizer: Vetorizador (ex: CountVectorizer, TfidfVectorizer)
    :param transformer: Transformer (ex: NMF, PCA, ou None)
    :param classifier: Classificador (ex: BernoulliNB, LogisticRegression)
    
    :return: Pipeline configurado
    """
    steps = [('vectorizer', vectorizer)]
    
    if transformer:
        steps.append(('transformer', transformer))
    
    steps.append(('classifier', classifier))
    
    return Pipeline(steps)

def model_evaluation(df, vectorizer, transformer, classifier, test_size=0.2, n_runs=100, random_state=42):
    """
    Avalia um modelo utilizando diferentes vetorizadores, transformadores e classificadores.
    
    :param df: DataFrame contendo as colunas 'text' e 'label'
    :param vectorizer: Vetorizador (ex: CountVectorizer, TfidfVectorizer)
    :param transformer: Transformer (ex: NMF, PCA, ou None)
    :param classifier: Classificador (ex: BernoulliNB, LogisticRegression)
    :param test_size: Proporção do conjunto de teste
    :param n_runs: Número de execuções para calcular média e desvio padrão
    :param random_state: Semente aleatória para garantir repetibilidade
    
    :return: Média e desvio padrão da acurácia
    """
    accuracies = []
    
    for i in range(n_runs):
        # Divisão dos dados
        x_train, x_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=test_size, random_state=random_state+i)
        
        # Construir o pipeline
        model = build_pipeline(vectorizer, transformer, classifier)
        
        # Treinar o modelo
        model.fit(x_train, y_train)
        
        # Prever e avaliar
        y_pred = model.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
    
    return model, np.mean(accuracies), np.std(accuracies)

# Função para prever se um novo email é spam ou não
def predict_email(model, email_text, threshold=0.85):
    """
    Preve se o email é spam ou não com base no modelo treinado, usando um threshold de confiança.
    
    :param model: O modelo treinado
    :param email_text: O conteúdo do email como string
    :param threshold: A confiança mínima para classificar como spam (valor entre 0 e 1)
    :return: Previsão do modelo (1 = spam, 0 = não-spam) com probabilidade
    """
    for email in email_text:
        # Obter a probabilidade da classe 1 (spam)
        proba = model.predict_proba([email])[0][1]
        
        # Se a probabilidade for maior que o threshold, classifica como SPAM
        if proba >= threshold:
            print(f"SPAM (Probabilidade: {proba:.2f})")
        else:
            print(f"NÃO é SPAM (Probabilidade: {proba:.2f})")


In [177]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, PCA
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Lista de vetorizadores (CountVectorizer, TfidfVectorizer)
vectorizers = [
    CountVectorizer(binary=True),
    TfidfVectorizer()
]

# Lista de transformadores (NMF ou sem transformador)
transformers = [
    NMF(n_components=25),
    None  # Sem transformador
]

# Lista de classificadores (BernoulliNB, LogisticRegression, SVC, RandomForest)
classifiers = [
    BernoulliNB(),
    LogisticRegression(max_iter=1000),
    SVC(probability=True),  # Para poder usar predict_proba
    RandomForestClassifier()
]

# Para armazenar os resultados e evitar duplicações
results = []
existing_combinations = set()

# Calcula o total de combinações
total_combinations = len(vectorizers) * len(transformers) * len(classifiers)

# Barra de progresso única
with tqdm(total=total_combinations, desc="Total Progress") as pbar:
    for vectorizer in vectorizers:
        for transformer in transformers:
            for classifier in classifiers:
                # Criar uma combinação única de vetorizador, transformador e classificador
                vec_name = vectorizer.__class__.__name__.replace('Vectorizer', '')
                trans_name = transformer.__class__.__name__ if transformer else 'NoTrans'
                clf_name = classifier.__class__.__name__.replace('Classifier', '')
                combination_name = f"{vec_name}_{trans_name}_{clf_name}"
                
                # Verificar se a combinação já foi avaliada
                if combination_name not in existing_combinations:
                    existing_combinations.add(combination_name)
                    
                    # Avaliar o modelo
                    model, mean_accuracy, std_accuracy = model_evaluation(df, vectorizer, transformer, classifier)
                    
                    # Armazenar os resultados
                    results.append({
                        'Combination': combination_name,
                        'Mean Accuracy': mean_accuracy,
                        'Std Deviation': std_accuracy
                    })
                
                # Atualizar a barra de progresso
                pbar.update(1)


Total Progress: 100%|██████████| 16/16 [46:57<00:00, 176.12s/it]


In [178]:
import plotly.graph_objects as go
import pandas as pd

results = pd.DataFrame(results)
results_df = results.sort_values('Mean Accuracy', ascending=False)

# Criar o gráfico de barras com Plotly
fig = go.Figure()

fig.add_trace(go.Bar(
    x=results_df['Combination'],
    y=results_df['Mean Accuracy'],
    text=results_df['Mean Accuracy'].round(3),  # Exibir os valores arredondados
    textposition='auto',  # Posição automática do texto
    marker=dict(color=results_df['Mean Accuracy'], colorscale='Viridis'),  # Cores baseadas na acurácia
))

# Atualizar layout do gráfico
fig.update_layout(
    title='Mean Accuracy of Model Combinations',
    xaxis_title='Model Combination',
    yaxis_title='Mean Accuracy',
    xaxis_tickangle=-45,  # Rotacionar os rótulos em 45 graus
    bargap=0.2,  # Espaço entre as barras
    height=600,  # Altura do gráfico
    width=1000,  # Largura do gráfico
)

# Exibir o gráfico
fig.show()


In [117]:
emails_legitimos = [
    """We are pleased to inform you that you have been selected to move forward to the next stage of our hiring process! After carefully reviewing your profile, we are impressed by your skills and experience, and we believe you are a strong candidate for the Amazon role.
    The next step will be a virtual interview with our team, and we are excited to get to know you better. We will be sending you more details shortly, including the date, time, and location of the interview.
    Congratulations once again, and best of luck in the next stage!
    Best regards,  
    Emma Johnson""",
    
    """Hello John,
    We are happy to inform you that your refund request has been approved! The amount of $150 will be credited to your account within the next 3 to 5 business days. Please keep an eye on your bank statement to confirm the receipt.
    If you have any questions or need further information, feel free to contact us.
    Best regards,
    Liam Roberts
    Support Team""",
    
    """Dear John,
    Thank you for registering for our **Technology Workshop**. This email is to confirm your participation in the event, which will take place on October 15th at 10:00 AM at the São Paulo Convention Center.
    Please bring a valid ID on the day of the event. If you have any questions, feel free to contact us.
    We look forward to seeing you there!
    Best regards,
    Sarah Miller
    Event Organization Team""",
    
    """Hello John,
    We would love to hear your feedback on **Project Alpha**, which was implemented last week. Your feedback will help us improve our processes and ensure we are meeting the team's needs.
    Please fill out the feedback form by Friday, September 30th. If you have any questions, feel free to reach out.
    Best regards,
    David Thompson
    Project Management Team""",
    
    """Dear John,
    We are implementing a new layer of security on our platform. To ensure your account remains secure, we ask that you update your password by the end of the month.
    Please log in to your profile and follow the instructions to change your password. Should you need any assistance, our support team is available 24/7.
    Thank you for trusting Global Bank!
    Best regards,
    Olivia Parker
    Customer Support Team
    Global Bank""",
    
    """Hello Team,
    Just a quick reminder about our weekly meeting, which will take place next Monday at 10:00 AM. We will discuss the progress of ongoing projects and set goals for the upcoming week.
    Meeting link: [meeting link]
    If you are unable to attend, please let me know in advance.
    Best regards,
    Michael Davis
    Project Manager"""
]


In [123]:
# Testar com um novo email
resultado = predict_email(model_count, emails_legitimos)

print(f"Previsão para o novo email: {resultado}")

NÃO é SPAM (Probabilidade: 0.12)
NÃO é SPAM (Probabilidade: 0.64)
NÃO é SPAM (Probabilidade: 0.54)
NÃO é SPAM (Probabilidade: 0.04)
NÃO é SPAM (Probabilidade: 0.21)
NÃO é SPAM (Probabilidade: 0.07)
Previsão para o novo email: None
