# Spam Email Detector

## Importing Libraries

In [None]:
import string

import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

## Data Preprocessing

In [None]:
# Load the labelled email dataset from a CSV file into a DataFrame.
df = pd.read_csv('data/spam_ham_dataset.csv')

# Bare expression at the end of the cell: the notebook renders the frame.
df

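The dataset is imbalanced: the spam and ham classes are not equally represented, so plain accuracy should be interpreted with care.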

In [None]:
# Collapse Windows-style line breaks into single spaces so every email is one
# line of text. The vectorised .str accessor avoids a Python-level lambda and
# leaves missing values (NaN) untouched instead of raising AttributeError.
# regex=False makes '\r\n' a literal match regardless of the pandas version.
df['text'] = df['text'].str.replace('\r\n', ' ', regex=False)

# Display the updated frame.
df

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
stemmer = PorterStemmer()

# Translation table that deletes every punctuation character. It does not
# depend on the email, so build it once instead of once per loop iteration.
punct_table = str.maketrans('', '', string.punctuation)

# NLTK's English stopword list contains contractions such as "don't".
# Punctuation is stripped from the emails *before* the stopword filter runs,
# so those words arrive as "dont" and would never match the raw list. Add the
# punctuation-free spellings as well so contractions are filtered too.
# (A few stripped forms coincide with ordinary words, e.g. "wont"; that is
# accepted here because the stripped text cannot tell them apart anyway.)
english_stopwords = stopwords.words('english')
stopwords_set = set(english_stopwords) | {
    word.translate(punct_table) for word in english_stopwords
}

# One cleaned string per email: lower-cased, punctuation removed, stopwords
# dropped, remaining words stemmed and re-joined with single spaces.
corpus = []
for text in df['text']:
    words = text.lower().translate(punct_table).split()
    corpus.append(
        ' '.join(stemmer.stem(word) for word in words if word not in stopwords_set)
    )

In [None]:
# Show the first email as stored in the frame, followed by a blank line ...
print(df['text'].iloc[0], '\n')

# ... and its cleaned, stemmed counterpart from the corpus for comparison.
print(corpus[0])

In [None]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(corpus).toarray()
y = df.label_num

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Model Training & Predictions

In [None]:
# Random forest trained on all available CPU cores (n_jobs=-1).
# random_state fixes the forest's internal randomness so the fitted model,
# and therefore the reported score, is the same on every run.
model = RandomForestClassifier(n_jobs=-1, random_state=42)

model.fit(X_train, y_train)

In [None]:
# Score the trained model on the held-out test split.
model.score(X_test, y_test)

## Testing

### Test on a few emails

In [None]:
# Pick one raw email (row position 10) to push through the whole pipeline.
# NOTE(review): the train/test split is random, so this row may have been
# part of the training data -- confirm before treating it as an unseen email.
test_email = df.text.values[10]

print(test_email)

In [None]:
def pre_process(text):
    """Clean a single raw email for the vectorizer.

    The text is lower-cased, stripped of punctuation and split on whitespace;
    words found in ``stopwords_set`` are dropped and the remaining ones are
    stemmed with ``stemmer``.

    Returns a one-element list holding the cleaned words joined by spaces.
    """
    without_punctuation = text.lower().translate(
        str.maketrans('', '', string.punctuation)
    )

    stems = []
    for token in without_punctuation.split():
        if token in stopwords_set:
            # Stopwords are skipped entirely.
            continue
        stems.append(stemmer.stem(token))

    return [' '.join(stems)]

# Encode the cleaned email with the already-fitted vectorizer; transform
# (rather than fit_transform) is used so the vectorizer is not refitted.
X_email = vectorizer.transform(pre_process(test_email))

In [None]:
# Predict the class of the encoded email with the trained model.
model.predict(X_email)

In [None]:
# Label stored in the dataset for the same row (position 10), for comparison
# with the prediction.
df.label_num.iloc[10]