In [7]:
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [8]:
# Stop words are common words that carry little meaning on their own (and, the, in, ...);
# fetch NLTK's stop-word lists so they can be filtered out during preprocessing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mateusz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the labelled e-mail dataset from a CSV file expected in the working directory.
df = pd.read_csv('spam_ham_dataset.csv')

In [16]:
# Flatten each message onto one line. Line breaks are replaced with a space
# (not an empty string) so the last word of one line and the first word of the
# next stay separate tokens instead of being fused into one.
df['text'] = df['text'].str.replace('\r\n', ' ', regex=False)

In [18]:
# Spot-check one message after the line-break cleanup.
df.text.iloc[2]

"Subject: neon retreat ho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders retreat time ! i know that this time of year is extremely hectic , and that it ' s tough to think about anything past the holidays , but life does go on past the week of december 25 through january 1 , and that ' s what i ' d like you to think about for a minute . on the calender that i handed out at the beginning of the fall semester , the retreat was scheduled for the weekend of january 5 - 6 . but because of a youth ministers conference that brad and dustin are connected with that week , we ' re going to change the date to the following weekend , january 12 - 13 . now comes the part you need to think about . i think we all agree that it ' s important for us to get together and have some time to recharge our batteries before we get to far into the spring semester , but it can be a lot of trouble and difficult for us to get away without kids , etc . so , brad came up with a pote

In [19]:
# Check column names, dtypes and non-null counts before preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [21]:
# Single Porter stemmer instance, reused by the corpus preprocessing and by classify().
stemmer = PorterStemmer()

In [22]:
# A stemmer reduces words to their base (stem) form, e.g. 'running' -> 'run'.
stemmer.stem('running')

'run'

In [96]:
# Build the preprocessed corpus: one cleaned string per e-mail, in the same
# order as the rows of df.
stopwords_set = set(stopwords.words('english'))
# The punctuation-stripping table is identical for every message, so build it once.
punctuation_table = str.maketrans('', '', string.punctuation)

corpus = []
for message in df['text']:
    # lower-case -> drop punctuation -> split on whitespace
    tokens = message.lower().translate(punctuation_table).split()
    # drop stop words, then stem what is left
    stems = [stemmer.stem(token) for token in tokens if token not in stopwords_set]
    # CountVectorizer consumes raw strings, not token lists, so re-join the stems
    # into a single space-separated document (same format classify() produces).
    corpus.append(' '.join(stems))

In [25]:
# Text of the first e-mail, for comparison with its preprocessed form in `corpus`.
df.text.iloc[0]

"Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes ."

In [24]:
# The same e-mail after lower-casing, punctuation removal, stop-word filtering and stemming.
corpus[0]

'subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos'

In [26]:
# Bag-of-words features: one column per vocabulary term, one row per e-mail.
# The fitted vectorizer is kept so new e-mails can be mapped onto the same columns.
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(corpus).toarray()
y = df.label_num

# Hold out 20% of the messages for evaluation. The fixed random_state makes the
# split (and therefore the reported score) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Random forest trained on all available cores (n_jobs=-1). The fixed
# random_state makes the fitted model reproducible across kernel restarts.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)

In [83]:
# Mean accuracy on the held-out 20% test split.
clf.score(X_test, y_test)

0.9739130434782609

In [90]:
# Classify a single e-mail: prints [1] for spam, [0] for not spam.
def classify(email_to_classify):
    """Preprocess one raw e-mail and print the classifier's prediction.

    The text goes through the same steps as the training corpus: lower-casing,
    punctuation removal, whitespace tokenisation, stop-word filtering and
    stemming, after which the stems are re-joined into a single string and
    vectorised with the already-fitted ``vectorizer``.

    Parameters
    ----------
    email_to_classify : str
        Raw text of the e-mail.

    Returns
    -------
    None
        The predicted label array from ``clf.predict`` is printed, not returned.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokens = email_to_classify.lower().translate(strip_punctuation).split()

    stems = []
    for token in tokens:
        if token in stopwords_set:
            continue
        stems.append(stemmer.stem(token))

    # transform() expects an iterable of documents, hence the one-element list
    features = vectorizer.transform([' '.join(stems)])
    print(clf.predict(features))

In [79]:
# Pick one e-mail from the dataset (positional index 10) to try the classifier on.
email_to_classify = df.text.values[10]

In [80]:
# Preview the e-mail selected above before classifying it.
email_to_classify

"Subject: vocable % rnd - word asceticism vcsc - brand new stock for your attention vocalscape inc - the stock symbol is : vcsc vcsc will be our top stock pick for the month of april - stock expected to bounce to 12 cents level the stock hit its all time low and will bounce back stock is going to explode in next 5 days - watch it soar watch the stock go crazy this and next week . breaking news - vocalscape inc . announces agreement to resell mix network services current price : $ 0 . 025 we expect projected speculative price in next 5 days : $ 0 . 12 we expect projected speculative price in next 15 days : $ 0 . 15 vocalscape networks inc . is building a company that ' s revolutionizing the telecommunications industry with the most affordable phone systems , hardware , online software , and rates in canada and the us . vocalscape , a company with global reach , is receiving international attention for the development of voice over ip ( voip ) application solutions , including the award 

In [91]:
# Classify an e-mail taken from the dataset itself; because the train/test split is
# random, the model may have seen this message during training.
classify(email_to_classify)

[1]


In [92]:
# Out-of-dataset check with a typical "you won a prize" message.
test_email_1 = "Hello! You won a brand new Iphone 15! Congratulations! To recieve the prize, go to link below."
# Classify the message defined in this cell, so the cell runs on a fresh kernel
# without relying on any other variable.
classify(test_email_1)

[1]


In [94]:
# Out-of-dataset check with an ordinary business invitation (a ham-style message).
test_email_2 = "We are pleased to invite you to our Annual Company Conference, which will be held on August 25, 2024, at the Downtown Convention Center. This year's conference theme is Innovation and growth and we have an exciting lineup of speakers and workshops planned. Event Details:Date: August 25, 2024Time: 9:00 AM - 5:00 PM Location: Downtown Convention Center, 123 Main St, Cityville Please RSVP by August 15, 2024, to confirm your attendance. You can RSVP by clicking the link below: RSVP to Annual Company Conference If you have any questions or need further information, please do not hesitate to contact our event coordinator, Jane Smith, at jane.smith@companywebsite.com or (555) 123-4567. We look forward to seeing you there! Best regards,John DoeEvent Manager"
classify(test_email_2)

[0]
