In [26]:
import re
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LogisticRegression

In [27]:
# Suppress every warning, of any category, from this point on.
# NOTE(review): this would also hide any warnings raised by pandas or
# scikit-learn in later cells — confirm that is intended.
warnings.filterwarnings('ignore')

In [28]:
# Hand-written toy corpus: a list of (sentence, label) tuples where the label
# is either 'spam' or 'not spam'.
dataset = [
    ("You won a prize! Claim now.", 'spam'),
    ("Meeting tomorrow at 5 PM. Don't forget!", "not spam"),
    ("Win big now! Claim your prize.", 'spam'),
    ("Buy now! Lowest price available.", 'spam'),
    ("Claim your prize now!", 'spam'),
    ("Submit the project by Friday.", "not spam"),
    ("You won an iPhone! Get it now.", 'spam'),
    ("Attached is the report.", "not spam"),
    ("Reset your password here.", "spam"),
    ("Are we still going to the gym?", "not spam"),
    ("Send me the files today.", "not spam"),
    ("Limited stock! Buy now.", "spam"),
    ("See you tomorrow at 5 PM.", "not spam"),
    ("Claim your free vacation now.", "spam"),
    ("Check holiday deals today.", "spam"),
    ("Looking forward to our meeting.", "not spam"),
    ("Bring documents for tomorrow.", "not spam"),
    ("Lowest prices! Buy today.", "spam"),
    ("Gym session is at 6 AM.", "not spam"),
    ("Let's have lunch this weekend.", "not spam"),
    ("Claim your car prize now.", "spam"),
    ("Here’s the latest report.", "not spam"),
    ("Earn money from home now.", "spam"),
    ("Please confirm the meeting time.", "not spam"),
    ("30% off! Buy today.", "spam"),
    ("Win big with this survey.", "spam"),
    ("Catch up for lunch soon.", "not spam"),
    ("Loan approved! Get details here.", "spam"),
    ("See you at 9 AM for review.", "not spam"),
    ("Your free iPhone is waiting.", "spam"),
]

In [29]:
# Split the (sentence, label) pairs into two parallel lists.
text = [sentence for sentence, _ in dataset]
label = [tag for _, tag in dataset]

In [30]:
# Preview the first ten sentences.
text[:10]

['You won a prize! Claim now.',
 "Meeting tomorrow at 5 PM. Don't forget!",
 'Win big now! Claim your prize.',
 'Buy now! Lowest price available.',
 'Claim your prize now!',
 'Submit the project by Friday.',
 'You won an iPhone! Get it now.',
 'Attached is the report.',
 'Reset your password here.',
 'Are we still going to the gym?']

In [31]:
# Preview the first ten labels (same order as the sentences in `text`).
label[:10]

['spam',
 'not spam',
 'spam',
 'spam',
 'spam',
 'not spam',
 'spam',
 'not spam',
 'spam',
 'not spam']

In [32]:
# Number of (sentence, label) examples in the corpus.
len(dataset)

30

In [33]:
# Tabular view of the corpus: one row per example, with the raw sentence in
# 'Sentence' and its class in 'Label'.
df = pd.DataFrame(dataset, columns=['Sentence', 'Label'])
df.head()

Unnamed: 0,Sentence,Label
0,You won a prize! Claim now.,spam
1,Meeting tomorrow at 5 PM. Don't forget!,not spam
2,Win big now! Claim your prize.,spam
3,Buy now! Lowest price available.,spam
4,Claim your prize now!,spam


In [34]:
def tokenize(sentence):
    """Split a sentence into lowercase word tokens.

    The text is lowercased, every character that is neither a regex word
    character nor whitespace is deleted (so "Don't" becomes "dont"), and
    the remainder is split on runs of whitespace.

    Args:
        sentence: The raw sentence string.

    Returns:
        A list of token strings; empty when nothing but punctuation and
        whitespace remains.
    """
    cleaned = re.sub(r'[^\w\s]', '', sentence.lower())
    return cleaned.split()

In [35]:
# Tokenize every sentence of the corpus, keeping the original order.
tokenized_sentences = list(map(tokenize, text))
tokenized_sentences[:10]

[['you', 'won', 'a', 'prize', 'claim', 'now'],
 ['meeting', 'tomorrow', 'at', '5', 'pm', 'dont', 'forget'],
 ['win', 'big', 'now', 'claim', 'your', 'prize'],
 ['buy', 'now', 'lowest', 'price', 'available'],
 ['claim', 'your', 'prize', 'now'],
 ['submit', 'the', 'project', 'by', 'friday'],
 ['you', 'won', 'an', 'iphone', 'get', 'it', 'now'],
 ['attached', 'is', 'the', 'report'],
 ['reset', 'your', 'password', 'here'],
 ['are', 'we', 'still', 'going', 'to', 'the', 'gym']]

In [36]:
def build_vocabulary(tokenized_sentences):
    """Collect the distinct tokens of a corpus in sorted order.

    Args:
        tokenized_sentences: Iterable of token lists, one per sentence.

    Returns:
        A sorted list containing each distinct token exactly once.
    """
    unique_tokens = {token for tokens in tokenized_sentences for token in tokens}
    return sorted(unique_tokens)

In [37]:
# Sorted list of every distinct token in the corpus; a word's position in
# this list is its column index in the bag-of-words vectors built below.
vocabulary = build_vocabulary(tokenized_sentences)
vocabulary

['30',
 '5',
 '6',
 '9',
 'a',
 'am',
 'an',
 'approved',
 'are',
 'at',
 'attached',
 'available',
 'big',
 'bring',
 'buy',
 'by',
 'car',
 'catch',
 'check',
 'claim',
 'confirm',
 'deals',
 'details',
 'documents',
 'dont',
 'earn',
 'files',
 'for',
 'forget',
 'forward',
 'free',
 'friday',
 'from',
 'get',
 'going',
 'gym',
 'have',
 'here',
 'heres',
 'holiday',
 'home',
 'iphone',
 'is',
 'it',
 'latest',
 'lets',
 'limited',
 'loan',
 'looking',
 'lowest',
 'lunch',
 'me',
 'meeting',
 'money',
 'now',
 'off',
 'our',
 'password',
 'please',
 'pm',
 'price',
 'prices',
 'prize',
 'project',
 'report',
 'reset',
 'review',
 'see',
 'send',
 'session',
 'soon',
 'still',
 'stock',
 'submit',
 'survey',
 'the',
 'this',
 'time',
 'to',
 'today',
 'tomorrow',
 'up',
 'vacation',
 'waiting',
 'we',
 'weekend',
 'win',
 'with',
 'won',
 'you',
 'your']

In [38]:
# Vocabulary size, i.e. the length of every encoded vector.
len(vocabulary)

91

In [39]:
def create_bow_vector(sentence, vocabulary):
    """Encode a tokenized sentence as a binary word-presence vector.

    Args:
        sentence: Iterable of tokens (for example the list returned by
            ``tokenize``). A raw string must be tokenized first, because
            iterating a string yields single characters, not words.
        vocabulary: Ordered sequence of known words; position ``i`` of the
            result corresponds to ``vocabulary[i]``.

    Returns:
        A list of ints with ``len(vocabulary)`` entries: 1 where the word
        occurs at least once in ``sentence``, 0 otherwise. Tokens that are
        not in the vocabulary are ignored.
    """
    # Build the word -> position map once so each token costs a single hash
    # lookup instead of two linear scans (`in` followed by `.index`).
    position = {}
    for idx, word in enumerate(vocabulary):
        # setdefault keeps the first position, matching list.index semantics
        # should the vocabulary ever contain duplicates.
        position.setdefault(word, idx)

    vec = [0] * len(vocabulary)
    for word in sentence:
        idx = position.get(word)
        if idx is not None:
            vec[idx] = 1
    return vec

In [40]:
# Encode every tokenized sentence against the shared vocabulary.
bow_vectors = [create_bow_vector(tokens, vocabulary) for tokens in tokenized_sentences]
len(bow_vectors)

30

In [41]:
# Stack the per-sentence vectors into one NumPy array (a row per sentence,
# a column per vocabulary word) and show the first row.
encoded_vectors = np.array(bow_vectors)
encoded_vectors[0]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0])

In [42]:
# Column names for the encoded features: a copy of the vocabulary, in order.
cols = list(vocabulary)

In [43]:
# Add one column per vocabulary word to df, filled from the matching column
# of encoded_vectors. This modifies the existing df object in place.
df[cols] = encoded_vectors

In [44]:
# Preview the frame now that the word columns sit next to Sentence/Label.
df.head()

Unnamed: 0,Sentence,Label,30,5,6,9,a,am,an,approved,...,up,vacation,waiting,we,weekend,win,with,won,you,your
0,You won a prize! Claim now.,spam,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,Meeting tomorrow at 5 PM. Don't forget!,not spam,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Win big now! Claim your prize.,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,Buy now! Lowest price available.,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Claim your prize now!,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [45]:
# Logistic-regression classifier created with scikit-learn's default settings.
model = LogisticRegression()

In [46]:
# Train on the full word-presence matrix, using the string labels in
# df['Label'] as targets (no rows are held out).
model.fit(encoded_vectors, df['Label'])

In [47]:
# Mean accuracy on the very same rows the model was fitted on, i.e. training
# accuracy. It is not an estimate of performance on unseen sentences.
model.score(encoded_vectors, df['Label'])

1.0

In [48]:
# Sentence to classify. The tokens are only displayed here, not stored.
sen = ''
tokenize(sen)

[]

In [49]:
# Encode the query exactly like the training sentences: tokenize first, then
# vectorize. Passing the raw string would make create_bow_vector iterate over
# single characters, so characters such as 'a' or '5' would be matched against
# the vocabulary and the lowercasing/punctuation stripping would be skipped.
test_sen = create_bow_vector(tokenize(sen), vocabulary)
# Reshape to a single-row 2-D array: one sample, one column per vocabulary word.
test_sen = np.array(test_sen).reshape(1,-1)
test_sen

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]])

In [50]:
# Predict the label for the single encoded row.
model.predict(test_sen)

array(['spam'], dtype=object)