# Laboratorio : Detección de Spam

In [1]:
#Importar las librerías
import pandas as pd
import numpy as np
import nltk
import unidecode
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Parte 1: Ingeniería de características

### Exploración de Datos

In [2]:
## Cargar el dataset proporcionado
ds1 = pd.read_csv('./Datasets/completeSpamAssassin.csv')
ds2 = pd.read_csv('./Datasets/enronSpamSubset.csv')

print('-------D-A-T-A-S-E-T--1-------\n', ds1.head(), '\n-------------------------------------')
print('-------D-A-T-A-S-E-T--2-------\n', ds2.head(), '\n-------------------------------------')



-------D-A-T-A-S-E-T--1-------
    Unnamed: 0                                               Body  Label
0           0  \r\nSave up to 70% on Life Insurance.\r\nWhy S...      1
1           1  1) Fight The Risk of Cancer!\r\nhttp://www.adc...      1
2           2  1) Fight The Risk of Cancer!\r\nhttp://www.adc...      1
3           3  ##############################################...      1
4           4  I thought you might like these:\r\n1) Slim Dow...      1 
-------------------------------------
-------D-A-T-A-S-E-T--2-------
    Unnamed: 0.1  Unnamed: 0  \
0          2469        2469   
1          5063        5063   
2         12564       12564   
3          2796        2796   
4          1468        1468   

                                                Body  Label  
0  Subject: stock promo mover : cwtd\r\n * * * ur...      1  
1  Subject: are you listed in major search engine...      1  
2  Subject: important information thu , 30 jun 20...      1  
3  Subject: = ? utf - 8 ? q ? 

In [3]:
# Unificación de tablas

# Limpieza dataset 1
columns = list(ds1)
columns.pop(0)
ds1 = ds1[columns]

# Limpieza dataset 2
columns = list([0,1])
ds2.drop(ds2.columns[[i for i in columns]], axis=1, inplace=True)

# Concatenación de datasets
df = pd.concat([ds1, ds2])
df = df.reset_index(drop=True)
print('-------D-A-T-A-S-E-T--F-I-N-A-L-------\n', df.head(), '\n-------------------------------------')


-------D-A-T-A-S-E-T--F-I-N-A-L-------
                                                 Body  Label
0  \r\nSave up to 70% on Life Insurance.\r\nWhy S...      1
1  1) Fight The Risk of Cancer!\r\nhttp://www.adc...      1
2  1) Fight The Risk of Cancer!\r\nhttp://www.adc...      1
3  ##############################################...      1
4  I thought you might like these:\r\n1) Slim Dow...      1 
-------------------------------------


### Preprocesamiento

In [4]:
def preprocessing(text):
    
    # Convierte todo a str
    if type(text) in [float, int]:
        text = str(text)
    elif type(text) != str:
        return None
    
    # Todo en minúsculas
    text = text.lower()
    # Remueve acentos
    text = unidecode.unidecode(text)
    
    # Tokeniza el texto en palabras
    words = nltk.word_tokenize(text)
    # Elimina stop words del texto
    stop_words = set(stopwords.words("english"))
    
    # Unifica el texto filtrado
    filtered_words = [word for word in words if word not in stop_words]
    filtered_text = " ".join(filtered_words)
    
    return filtered_text

In [5]:
# Aplicación de las funciones construídas

df['Body'] = df['Body'].apply(preprocessing)
df.to_csv('./Datasets/text_filtered.csv', index=False)
print(df.sample(n=5).head())

                                                    Body  Label
10205  subject : need lift support bra ! 1903 guarant...      1
9206   subject : bought new cd ? see evidence e - mai...      1
5266   instead spreading bbc fud , 's best go straigh...      0
3344   http : //www.hughes-family.org/bugzilla/show_b...      0
5962                                               empty      0


### Representación de Texto

### Modelo Bag of Words

In [43]:
import re
dBoW = pd.read_csv('./Datasets/text_filtered.csv')
dBoW = dBoW['Body']
dBoW

0        save 70 % life insurance . spend ? life quote ...
1        1 ) fight risk cancer ! http : //www.adclick.w...
2        1 ) fight risk cancer ! http : //www.adclick.w...
3        # # # # # # # # # # # # # # # # # # # # # # # ...
4        thought might like : 1 ) slim - guaranteed los...
                               ...                        
16041    subject : monday 22 nd oct louise , half hour ...
16042    subject : missing bloomberg deals stephanie - ...
16043    subject : eops salary survey questionnaire nee...
16044    subject : q 3 comparison hi louise , compariso...
16045    subject : confidential folder safely pass info...
Name: Body, Length: 16046, dtype: object

In [44]:
dBoW = np.array(dBoW, dtype=str)
dBoW

MemoryError: Unable to allocate 9.94 GiB for an array with shape (16046,) and data type <U166277

In [15]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
stop_words = nltk.corpus.stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

norm_corpus = normalize_corpus(dBoW)
norm_corpus

array(['save life insurance spend life quote savings ensuring family financial security important life quote savings makes buying life insurance simple affordable provide free access best companies lowest rateslife quote savings fast easy saves money let us help get started best values country new coverage save hundreds even thousands dollars requesting free quote lifequote savings service take less minutes complete shop compare save types life insurance click free quote protecting family best investment ever make receipt email error andor wish removed list please click type remove reside state prohibits email solicitations insurance please disregard email',
       'fight risk cancer http wwwadclickwspcfm spk slim guaranteed lose lbs days http wwwadclickwspcfm spk get child support deserve free legal advice http wwwadclickwspcfm spk join web fastest growing singles community http wwwadclickwspcfm spk start private photo album online http wwwadclickwspcfm spkhave wonderful day offer man

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0.2, max_df=0.8)
# Min_df y Max_df nos sirven para controlar el porcentaje mínimo y máximo de apariciones de un token
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix = cv_matrix.toarray()
cv_matrix = cv_matrix.astype(np.int32)
cv_matrix

array([[0, 3, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 2, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 3, 0, ..., 1, 0, 1]])

In [36]:
# Palabras únicas del corpus
vocabulario = cv.get_feature_names_out()
# Mostrar el vector
pd.DataFrame(cv_matrix, columns=vocabulario)

Unnamed: 0,com,email,get,http,list,new,one,please,subject,time,would
0,0,3,1,0,1,1,0,2,0,0,0
1,0,0,1,7,2,0,0,1,0,0,0
2,0,0,1,6,1,0,0,1,0,0,0
3,0,1,2,6,0,1,3,1,0,0,0
4,0,0,1,5,2,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
16041,0,0,0,0,0,0,0,0,1,0,0
16042,0,0,0,0,0,0,1,1,7,5,0
16043,0,0,0,0,0,0,0,1,2,0,0
16044,0,0,0,0,0,0,0,0,1,0,0


## Modelo Bag of N - grams

In [37]:
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus)

bv_matrix = bv_matrix.toarray()
vocabulario = bv.get_feature_names_out()
pd.DataFrame(bv_matrix, columns=vocabulario)

MemoryError: Unable to allocate 130. GiB for an array with shape (16046, 1088458) and data type int64

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()

vocabulario = tv.get_feature_names_out()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocabulario)

MemoryError: Unable to allocate 15.9 GiB for an array with shape (16046, 133269) and data type float64

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

MemoryError: Unable to allocate 15.9 GiB for an array with shape (16046, 133269) and data type float64