**Procesamos HTML**

In [91]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.fed = []
    def handle_data(self,d):
        self.fed.append(d)
    def get_data(self):
        return "".join(self.fed)

In [92]:
def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [93]:
#cadena HTML
t = """<html>

<head>
  <title>HTML</title>
</head>

<body>
  <p>Esta página web es una página HTML válida.</p>
</body>

</html>"""

#cadena sin etiquetas
print(strip_tags(t))




  HTML



  Esta página web es una página HTML válida.





**Procesamiento del Lenguaje Natural (NLTK)**

*NLTK: Biblioteca de Python que proporciona herramientas y recursps para trabajar con texto, análisis, manipulación y procesar
datos linguisticos, se usa para desarrollar sistemas que entiendan, interpreten y generen lenguaje humano.*

**Aplicaciones**

*-Análisis de sentimientos: Determinar si una opinión es negativa, positiva o neutral*

*-Sistemas de recomendación: Basados en la interpretación del texto*

*-Chatbots: Procesamiento y respuestas a entradas a entradas y textos del usuario*

*-Extracción de información: Encontrar información relevante dentro de grandes volúmenes de texto*

**Características**

**1. Tokenización**

*Dividir texto en palabras, oraciones u otros elementos más pequeños, Ejemplo: convertir un párrafo en una lista de palabras.*

In [94]:
#!pip install nltk
import nltk
nltk.download("punkt")
text = "I love eating pizza with my friends"
words = nltk.word_tokenize(text)
print(words)

['I', 'love', 'eating', 'pizza', 'with', 'my', 'friends']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\xavie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**2. Stemming**

*Reducción de las palabras a su raíz o forma base ("running" -> "run" )*

In [95]:
import nltk
from nltk.stem import PorterStemmer

nltk.download("punkt")
text = "I love eating pizza with my friends"
words = nltk.word_tokenize(text)
print(words)

stemmer = PorterStemmer()
stemmer_word = [stemmer.stem(word) for word in words]
print(stemmer_word)

['I', 'love', 'eating', 'pizza', 'with', 'my', 'friends']
['i', 'love', 'eat', 'pizza', 'with', 'my', 'friend']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\xavie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**3. StopWords**

*Palabras comunes que se consideran poco informativas para el análisis de texto, como "y", "el", "a" en español o "the", "is", "in" en ingles
a menudo estas palabras se eliminan del texto durante el preprocesamiento.*

In [96]:
import nltk
from nltk.corpus import stopwords

nltk.download("punkt")
nltk.download("stopwords")

text = "I love eating pizza with my friends"
words = nltk.word_tokenize(text)
print(words)

english_words = stopwords.words("english")
#print(english_words)
filtered_words = [ word for word in words if word.lower() not in english_words]
print(filtered_words)

['I', 'love', 'eating', 'pizza', 'with', 'my', 'friends']
['love', 'eating', 'pizza', 'friends']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\xavie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xavie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Caracteres especiales**

In [97]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [98]:
import string
class TextProcessor:
    def __init__(self):
        self.punctuation = string.punctuation

processor = TextProcessor()
print(list(processor.punctuation))

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


**Email**

In [99]:
import email
with open("ejemplo_correo.txt","r") as file:
    email_content = file.read()
msg = email.message_from_string(email_content)
#print(msg)

from_address = msg["FROM"]
to_address = msg["To"]
subject = msg["Subject"]
body = msg.get_payload()

#mostrar información
print("Remitente: ", from_address)
print("Destinatario: ", to_address)
print("Asunto: ", subject)
print("Cuerpo del mensaje: ", body)
print(msg.get_content_type())

Remitente:  sender@example.com
Destinatario:  recipient@example.com
Asunto:  Ejemplo de Correo
Cuerpo del mensaje:  Hola,

Este es un ejemplo de correo electrÃ³nico.

Saludos,
Remitente
text/plain


**Código de preprocesamiento**

*Elimina HTML, tokeniza, reduce a raiz y quita palabras vacias*

In [100]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.fed = []
    def handle_data(self,d):
        self.fed.append(d)
    def get_data(self):
        return "".join(self.fed)

In [101]:
def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [102]:
import email
import string
import nltk

class Parser:
    def __init__(self):
        self.stemmer = nltk.PorterStemmer() #raiz
        self.stopwords = set(nltk.corpus.stopwords.words("english")) #palabras vacías
        self.punctuation = list(string.punctuation) #Signos de puntuación
    def parser(self,email_path):
        with open(email_path, errors = "ignore") as e:
            msg = email.message_from_file(e)
        return None if not msg else self.get_email_content(msg)
    def get_email_content(self,msg):
        """Extrae el contenido del EMAIL"""
        subject = self.tokenize(msg["subject"] if msg["subject"] else [])
        body = self.get_email_body(msg.get_payload(), msg.get_content_type())
        content_type = msg.get_content_type()
        return {
                 "Subject": subject,
                 "body": body,
                 "content_type": content_type
                }
    def tokenize(self,text):
        for c in self.punctuation:
            text = text.replace(c,"")
        text = text.replace("\t"," ")
        text = text.replace("\n"," ")
        tokens = list(filter(None,text.split(" ")))
        return [self.stemmer.stem(w) for w in tokens if w not in self.stopwords]
    def get_email_body(self,payload,content_type):
        """Extrae el cuerpo"""
        body = []
        if type(payload) is str and content_type == "text/plain":
            return self.tokenize(payload)
        elif type(payload) is str and content_type == "text/html":
            return self.tokenize(strip_tags(payload))
        elif type(payload) is list:
            for p in payload :
                body += self.get_email_body(p.get_payload(),p.get_content_type())               
        return body

**Lectura de correos**

In [103]:
cnmail = open("datasets\\datasets\\trec07p\\data\\inmail.1").read()
print(inmail)

From RickyAmes@aol.com  Sun Apr  8 13:07:32 2007
Return-Path: <RickyAmes@aol.com>
Received: from 129.97.78.23 ([211.202.101.74])
	by speedy.uwaterloo.ca (8.12.8/8.12.5) with SMTP id l38H7G0I003017;
	Sun, 8 Apr 2007 13:07:21 -0400
Received: from 0.144.152.6 by 211.202.101.74; Sun, 08 Apr 2007 19:04:48 +0100
Message-ID: <WYADCKPDFWWTWTXNFVUE@yahoo.com>
From: "Tomas Jacobs" <RickyAmes@aol.com>
Reply-To: "Tomas Jacobs" <RickyAmes@aol.com>
To: the00@speedy.uwaterloo.ca
Subject: Generic Cialis, branded quality@ 
Date: Sun, 08 Apr 2007 21:00:48 +0300
X-Mailer: Microsoft Outlook Express 6.00.2600.0000
MIME-Version: 1.0
Content-Type: multipart/alternative;
	boundary="--8896484051606557286"
X-Priority: 3
X-MSMail-Priority: Normal
Status: RO
Content-Length: 988
Lines: 24

----8896484051606557286
Content-Type: text/html;
Content-Transfer-Encoding: 7Bit

<html>
<body bgcolor="#ffffff">
<div style="border-color: #00FFFF; border-right-width: 0px; border-bottom-width: 0px; margin-bottom: 0px;" align="

In [104]:
p = Parser()
p.parser("datasets\\datasets\\trec07p\\data\\inmail.1")

{'Subject': ['gener', 'ciali', 'brand', 'qualiti'],
 'body': ['do',
  'feel',
  'pressur',
  'perform',
  'rise',
  'occas',
  'tri',
  'viagra',
  'anxieti',
  'thing',
  'past',
  'back',
  'old',
  'self'],
 'content_type': 'multipart/alternative'}

In [105]:
#Lectura del indice
index = open("datasets\\datasets\\trec07p\\full\\index").readlines()
index[:10]

['spam ../data/inmail.1\n',
 'ham ../data/inmail.2\n',
 'spam ../data/inmail.3\n',
 'spam ../data/inmail.4\n',
 'spam ../data/inmail.5\n',
 'spam ../data/inmail.6\n',
 'spam ../data/inmail.7\n',
 'spam ../data/inmail.8\n',
 'spam ../data/inmail.9\n',
 'ham ../data/inmail.10\n']

In [106]:
import os
DATASET_PATH = "datasets\\datasets/trec07p"

def parse_index(path_to_index, n_elemento):
    ret_indexes = []
    index = open(path_to_index).readlines()
    for i in range(n_elemento):
        mail =  index[i].split(" ../")
        label = mail[0]
        path  = mail[1][:-1]
        ret_indexes.append({"label":label, "email_path": os.path.join(DATASET_PATH,path)})
    return ret_indexes

In [107]:
indexes = parse_index("datasets\\datasets\\trec07p\\full\\index",10)
indexes

[{'label': 'spam', 'email_path': 'datasets\\datasets/trec07p\\data/inmail.1'},
 {'label': 'ham', 'email_path': 'datasets\\datasets/trec07p\\data/inmail.2'},
 {'label': 'spam', 'email_path': 'datasets\\datasets/trec07p\\data/inmail.3'},
 {'label': 'spam', 'email_path': 'datasets\\datasets/trec07p\\data/inmail.4'},
 {'label': 'spam', 'email_path': 'datasets\\datasets/trec07p\\data/inmail.5'},
 {'label': 'spam', 'email_path': 'datasets\\datasets/trec07p\\data/inmail.6'},
 {'label': 'spam', 'email_path': 'datasets\\datasets/trec07p\\data/inmail.7'},
 {'label': 'spam', 'email_path': 'datasets\\datasets/trec07p\\data/inmail.8'},
 {'label': 'spam', 'email_path': 'datasets\\datasets/trec07p\\data/inmail.9'},
 {'label': 'ham', 'email_path': 'datasets\\datasets/trec07p\\data/inmail.10'}]

In [108]:
open("datasets\\datasets\\trec07p\\data\\inmail.1").read()

'From RickyAmes@aol.com  Sun Apr  8 13:07:32 2007\nReturn-Path: <RickyAmes@aol.com>\nReceived: from 129.97.78.23 ([211.202.101.74])\n\tby speedy.uwaterloo.ca (8.12.8/8.12.5) with SMTP id l38H7G0I003017;\n\tSun, 8 Apr 2007 13:07:21 -0400\nReceived: from 0.144.152.6 by 211.202.101.74; Sun, 08 Apr 2007 19:04:48 +0100\nMessage-ID: <WYADCKPDFWWTWTXNFVUE@yahoo.com>\nFrom: "Tomas Jacobs" <RickyAmes@aol.com>\nReply-To: "Tomas Jacobs" <RickyAmes@aol.com>\nTo: the00@speedy.uwaterloo.ca\nSubject: Generic Cialis, branded quality@ \nDate: Sun, 08 Apr 2007 21:00:48 +0300\nX-Mailer: Microsoft Outlook Express 6.00.2600.0000\nMIME-Version: 1.0\nContent-Type: multipart/alternative;\n\tboundary="--8896484051606557286"\nX-Priority: 3\nX-MSMail-Priority: Normal\nStatus: RO\nContent-Length: 988\nLines: 24\n\n----8896484051606557286\nContent-Type: text/html;\nContent-Transfer-Encoding: 7Bit\n\n<html>\n<body bgcolor="#ffffff">\n<div style="border-color: #00FFFF; border-right-width: 0px; border-bottom-width: 0

In [109]:
def parse_email(index):
    p = Parser()
    pemail = p.parser(index['email_path'])
    return pemail, index['label']

In [110]:
mail,label = parse_email(indexes[0])
print("El correo es: ", label)
print(mail)

El correo es:  spam
{'Subject': ['gener', 'ciali', 'brand', 'qualiti'], 'body': ['do', 'feel', 'pressur', 'perform', 'rise', 'occas', 'tri', 'viagra', 'anxieti', 'thing', 'past', 'back', 'old', 'self'], 'content_type': 'multipart/alternative'}


**CountVectorizer**

*Es una herramienta fundamental en el procesamiento de lenguaje natural, se utiliza para convertir texto sin procesar en datos
númericos.*

**Para que se utiliza?**

**1. Conversión de texto en datos númericos**

**2. Extracción de características**

**3. Análisis de texto**

In [111]:
from sklearn.feature_extraction.text import CountVectorizer
#Preparar email
prep_email = [" ".join(mail['Subject']) + " ".join(mail['body'])]
vectorizer = CountVectorizer()
vectorizer.fit(prep_email)
print("Email",prep_email)
print("Características de entrada", vectorizer.get_feature_names_out())

Email ['gener ciali brand qualitido feel pressur perform rise occas tri viagra anxieti thing past back old self']
Características de entrada ['anxieti' 'back' 'brand' 'ciali' 'feel' 'gener' 'occas' 'old' 'past'
 'perform' 'pressur' 'qualitido' 'rise' 'self' 'thing' 'tri' 'viagra']


In [112]:
x = vectorizer.transform(prep_email)
print(x.toarray())

[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]


In [113]:
#Función auxiliar para procesar los datos y para leer una cantidad x de correos
def create_prep_dataset(index_path,n_element):
    x = []
    y = []
    indexes = parse_index(index_path,n_element) #{'label': 'spam', 'email_path': 'datasets/trec07p\\data/inmail.1'}
    for i in range(n_element):
        mail,label  = parse_email(indexes[i])
        x.append(" ".join(mail['Subject']) + " ".join(mail['body']))
        y.append(label)
    return x,y

In [150]:
x_train, y_train = create_prep_dataset("datasets\\datasets\\trec07p\\full\\index",120)
#x_train

In [None]:
#y_train

In [130]:
#Aplicamos countVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(x_train)

In [131]:
print(X_train.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [132]:
import pandas as pd
pd.DataFrame(X_train.toarray(), columns =[vectorizer.get_feature_names_out()])

Unnamed: 0,0000,000000,00085,002,003,00450,0089,009,01,01000u,...,õôõôèõéï,ö¹,öð,öôööµæ,öø³ðåµ,öþ,öˆ,úàí,þîñòµ¼,šè
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
116,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
117,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
118,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**1. Entrenamiento del algoritmo de regresión logística**

In [138]:
#Con el conjunto de datos preparados
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train,y_train) #Crea el modelo

LogisticRegression()

**2. Predicción**

In [144]:
X, Y = create_prep_dataset("datasets\\datasets\\trec07p\\full\\index",120)
X_test = X[100:]
Y_test = Y[100:]
#Vectorizamos
X_test = vectorizer.transform(X_test)
#predicción
y_pred = clf.predict(X_test)
#y_pred

In [None]:
#Y_test

In [145]:
from sklearn.metrics import accuracy_score
print("Accuracy: {:.3f}".format(accuracy_score(Y_test,y_pred)))

Accuracy: 1.000
