In [54]:
!pip install scikit-learn
!pip install pandas
!pip install nltk



In [55]:
import pandas as pd

df = pd.read_csv('datasets/emails.csv')
new_df = df.copy()
new_df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [57]:
df['spam'].value_counts()

spam
0    4360
1    1368
Name: count, dtype: int64

In [58]:
import nltk
import string
nltk.download('stopwords')

class Parser():

    def __init__(self):
        self.stemmer = nltk.PorterStemmer()
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        self.punctuation = list(string.punctuation)

    def tokenize(self, text):
        for c in self.punctuation:
            text = text.replace(c, "")
        text = text.replace("\t", " ")
        text = text.replace("\n", " ")
        tokens = list(filter(None, text.split(" ")))
        # Stemming of the tokens
        return [self.stemmer.stem(w) for w in tokens if w not in self.stopwords]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RSSpe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Descargar las stopwords de nltk si es necesario
# nltk.download('stopwords')

# Inicializar el stemmer y las stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_email(text):
    # 1. Convertir a minúsculas
    text = text['text'].lower()

    # 2. Eliminar meta-datos del correo (Subject, To, Cc, etc.)
    meta_data_patterns = [r'subject\s*:', r'to\s*:', r'cc\s*:', r're\s*:']
    for pattern in meta_data_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    # 3. Eliminar direcciones de correo electrónico
    text = re.sub(r'\S+@\S+', '', text)

    # 4. Eliminar URLs
    text = re.sub(r'http\S+', '', text)

    # 5. Eliminar caracteres especiales y números
    text = re.sub(r'\d+', '', text)  # Eliminar números
    text = text.translate(str.maketrans('', '', string.punctuation))  # Eliminar puntuación

    # 6. Tokenizar el texto (dividir en palabras)
    tokens = text.split()

    # 7. Eliminar stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # 8. Aplicar stemming (o lematización)
    tokens = [stemmer.stem(word) for word in tokens]

    # 9. Juntar los tokens de nuevo en un texto limpio
    cleaned_text = ' '.join(tokens)

    return cleaned_text


new_df['text'] = new_df.apply(clean_email, axis=1)
new_df

Unnamed: 0,text,spam
0,natur irresist corpor ident lt realli hard rec...,1
1,stock trade gunsling fanni merril muzo colza a...,1
2,unbeliev new home made easi im want show homeo...,1
3,color print special request addit inform click...,1
4,money get softwar cd softwar compat great grow...,1
...,...,...
5723,research develop charg gpg forward shirley cre...,0
5724,receipt visit jim thank invit visit lsu shirle...,0
5725,enron case studi updat wow day super thank muc...,0
5726,interest david pleas call shirley crenshaw ass...,0


In [60]:
from sklearn.model_selection import train_test_split

def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)

In [61]:
train_set, val_set, test_set = train_val_test_split(new_df)

In [62]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit_transform(['natur irresist corpor ident lt realli hard'])

<1x7 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [63]:
text_list = train_set['text'].to_list()
labels = train_set['spam'].to_list()

In [64]:
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(text_list)

In [65]:
print("\nFeatures:", len(vectorizer.get_feature_names_out()))


Features: 19921


In [67]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(x_train, labels)

In [87]:
x_val = vectorizer.transform(val_set['text'])
x_val

<1146x19921 sparse matrix of type '<class 'numpy.int64'>'
	with 88642 stored elements in Compressed Sparse Row format>

In [89]:
labels_predicted = clf.predict(x_val)

In [91]:
from sklearn.metrics import accuracy_score

print('Accuracy: {:.3f}'.format(accuracy_score(val_set['spam'].to_list(), labels_predicted)))

Accuracy: 0.988


In [93]:
x_test = vectorizer.transform(test_set['text'])
x_test

<1146x19921 sparse matrix of type '<class 'numpy.int64'>'
	with 90602 stored elements in Compressed Sparse Row format>

In [95]:
labels_predicted = clf.predict(x_test)

In [97]:
from sklearn.metrics import accuracy_score

print('Accuracy: {:.3f}'.format(accuracy_score(test_set['spam'].to_list(), labels_predicted)))

Accuracy: 0.984
