# Trabalho prático - Predição de Spam


## 1) Bibliotecas necessárias 

In [5]:
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

# Palette returned by seaborn's color_palette() called with no arguments
# (presumably the currently active default palette — confirm).
# NOTE(review): `color` is not referenced again in the visible cells.
color = sns.color_palette()

## 2) Carregar dados do DB

In [6]:
# Column names handed to read_csv, in file order: 48 word-frequency columns,
# 6 character-frequency columns, 3 capital-run-length columns and the label.
nomes = [
    # --- word frequencies ---
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
    'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet',
    'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
    'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free',
    'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
    'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money',
    'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650',
    'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
    'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology',
    'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
    'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project',
    'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference',
    # --- character frequencies (names keep their percent-encoded form) ---
    'char_freq_%3B', 'char_freq_%28', 'char_freq_%5B',
    'char_freq_%21', 'char_freq_%24', 'char_freq_%23',
    # --- capital-letter run lengths ---
    'capital_run_length_average',
    'capital_run_length_longest',
    'capital_run_length_total',
    # --- label column ---
    'class',
]

# Read the data file, labelling its columns with the names above.
spam = pd.read_csv("./spambase.data", names=nomes)

# Preview the first rows.
spam.head()


Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_%3B,char_freq_%28,char_freq_%5B,char_freq_%21,char_freq_%24,char_freq_%23,capital_run_length_average,capital_run_length_longest,capital_run_length_total,class
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


## 3) Análise do DB


### 3.1) Funções úteis

In [7]:
# Bar chart
def barChart(x, y, title):
    """Draw a matplotlib bar chart of ``y`` against ``x`` and display it.

    Args:
        x: Bar positions, forwarded unchanged to ``bar``.
        y: Bar heights, forwarded unchanged to ``bar``.
        title: Text used as the plot title.
    """
    # Work on the current axes, which is where the pyplot helpers draw too.
    axes = plt.gca()
    axes.bar(x, y)
    axes.set_title(title)

    # One fixed legend entry labelled "bar".
    axes.legend(["bar"])
    plt.show()

# Scatter plot
def ScatterChart(x, y, title, data_frame=None):
    """Show a Plotly Express scatter plot with turquoise, outlined markers.

    Args:
        x: Column name or array-like of values for the x axis; passed to
            ``px.scatter`` as ``x``.
        y: Column name or array-like of values for the y axis; passed to
            ``px.scatter`` as ``y``.
        title: Text set as the figure title.
        data_frame: Optional DataFrame passed to ``px.scatter`` as its data
            source. When omitted, the notebook-level ``spam`` DataFrame is
            used, which keeps existing three-argument calls working exactly
            as before.

    Returns:
        None. The figure is rendered through ``fig.show()``; it is not
        returned so that a call ending a cell does not display it twice.
    """
    if data_frame is None:
        # Backward-compatible default: the dataset loaded earlier in the notebook.
        data_frame = spam

    fig = px.scatter(data_frame, x=x, y=y)
    fig.update_traces(
        marker_color="turquoise",
        marker_line_color='rgb(8,48,107)',
        marker_line_width=1.5,
    )
    fig.update_layout(title_text=title)
    fig.show()

### 3.1) Visualização das classes do DB 

In [8]:
# Number of rows for each distinct value of the 'class' column.
dist = spam['class'].value_counts()

# One pie slice per class value, sized by its row count.
trace = go.Pie(labels=dist.index, values=np.array(dist))
data = [trace]
layout = go.Layout(title='Spam')
fig = go.Figure(data=trace, layout=layout)

# Slice fill colors, plus a black outline of width 2 around every slice.
colors = ['mediumturquoise', 'darkorange']
fig.update_traces(
    marker={
        'colors': colors,
        'line': {'color': '#000000', 'width': 2},
    }
)
fig.show()

### 3.2) Análise das colunas 


In [9]:
# Scatter of the word_freq_3d column against the class column.
ScatterChart(
    x=spam['class'],
    y=spam['word_freq_3d'],
    title="class x word_freq_3d",
)

In [16]:
# Print the DataFrame summary: index range, column list, non-null counts and dtypes.
spam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   word_freq_make              4601 non-null   float64
 1   word_freq_address           4601 non-null   float64
 2   word_freq_all               4601 non-null   float64
 3   word_freq_3d                4601 non-null   float64
 4   word_freq_our               4601 non-null   float64
 5   word_freq_over              4601 non-null   float64
 6   word_freq_remove            4601 non-null   float64
 7   word_freq_internet          4601 non-null   float64
 8   word_freq_order             4601 non-null   float64
 9   word_freq_mail              4601 non-null   float64
 10  word_freq_receive           4601 non-null   float64
 11  word_freq_will              4601 non-null   float64
 12  word_freq_people            4601 non-null   float64
 13  word_freq_report            4601 

### 3.3) Conclusões sobre o DB

**Pontos Importantes**

- As colunas possuem escalas completamente diferentes. Logo, os dados precisam ser normalizados.
- As classes estão desbalanceadas 

## 4) Modelos 

### 4.1) SVM
#### a) Hiperparâmetros

- Parâmetro C: Penalidade para cada ponto de dados classificado incorretamente. Um C pequeno representa pouca penalidade para erros de classificação, enquanto um valor alto desse parâmetro pode resultar em overfitting.
- Parâmetro Gamma: Para kernels não lineares, o gamma define o grau de influência do kernel trick. Por exemplo, para a função kernel RBF, o gamma representa a distância de influência de um único ponto de dados.

#### b) Métrica de avaliação: 
Considerando o desbalanceamento das classes, a métrica utilizada para avaliação será a precisão.

### 4.2) MLP

#### a) Hiperparâmetros 
- Número de camadas escondidas 
- Função de ativação 
- Taxa de aprendizado 
- Número de neurônios por camada

#### b) Métrica de avaliação: 
Considerando o desbalanceamento das classes, a métrica utilizada para avaliação será a precisão.


### 4.3) KNN
#### a) Hiperparâmetros 
- Número de vizinhos 
- Pesos (weights): Influência dos vizinhos na classificação do ponto de dados.
- Métrica: Forma de calcular a distância entre os pontos, que representa a similaridade entre eles.

#### b) Métrica de avaliação: 
Considerando o desbalanceamento das classes, a métrica utilizada para avaliação será a precisão.

## Referências 
1) https://towardsdatascience.com/a-beginners-guide-to-data-analysis-in-python-188706df5447
2) https://medium.com/@tonop15/spambase-data-exploration-analysis-9a3d6d83ee78
3) https://towardsdatascience.com/hyperparameter-tuning-for-support-vector-machines-c-and-gamma-parameters-6a5097416167
4) https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/
5) https://scikit-learn.org/stable/index.html
6) https://www.analyticsvidhya.com/blog/2021/05/tuning-the-hyperparameters-and-layers-of-neural-network-deep-learning/
7) https://panjeh.medium.com/scikit-learn-hyperparameter-optimization-for-mlpclassifier-4d670413042b
8) https://www.kaggle.com/code/arunimsamudra/k-nn-with-hyperparameter-tuning
