In [1]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
# Sample message used as the input to the preprocessing / prediction cells below.
text = "Hey, just wanted to check if we’re still meeting for lunch today at 1pm. Let me know!"

In [3]:
def remove_punctuations(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed;
    other symbols (for example a curly apostrophe) pass through untouched.
    Any input that is not a ``str`` yields an empty string.
    """
    # Guard clause: non-string values are mapped to "".
    if not isinstance(text, str):
        return ""

    # Character class built from the escaped punctuation set.
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    return re.sub(punctuation_class, "", text)

In [4]:
# Load the stopword file; each line of the file becomes one entry of `stwords`.
# The encoding is pinned so the read does not depend on the platform's default
# locale encoding.
with open("../artifacts/corpora/stopwords/english", "r", encoding="utf-8") as file:
    stwords = file.read().splitlines()

In [5]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
from nltk.stem import PorterStemmer
# Single shared stemmer instance; preprocess() calls stemmer.stem() on each token.
stemmer = PorterStemmer()

In [7]:
def preprocess(txt):
    """Normalise one raw message into the token string the vectorizer consumes.

    Steps, in order: lowercase, strip URLs, strip ASCII punctuation (via
    ``remove_punctuations``), strip digits, drop stopwords (``stwords``) and
    stem every remaining token (``stemmer``).

    Parameters
    ----------
    txt : str
        Raw message text.

    Returns
    -------
    pandas.Series
        One-element Series named ``"v2"`` (index 0) holding the cleaned,
        space-joined tokens.
    """
    # Set membership is O(1); scanning the stopword list per token is not.
    stopword_set = set(stwords)

    lowered = txt.lower()
    # The dot after "www" is escaped so only real "www." prefixes are stripped;
    # an unescaped dot matches any character and would also delete ordinary
    # words that merely contain "www".
    # NOTE(review): keep this pattern in sync with the training-time cleaning.
    without_urls = re.sub(r"http\S+|www\.\S+", "", lowered)
    without_punct = remove_punctuations(without_urls)
    without_digits = re.sub(r"\d+", "", without_punct)

    # The text is already lowercase, so tokens can be compared directly.
    kept_tokens = [tok for tok in without_digits.split() if tok not in stopword_set]
    cleaned = " ".join(stemmer.stem(tok) for tok in kept_tokens)

    # Same return shape as before: a single-row Series called "v2".
    return pd.Series([cleaned], name="v2")

In [8]:
# Clean the sample message; preprocess() returns a one-element Series named "v2".
preprocessed_text = preprocess(text)

In [9]:
preprocessed_text

0    hey want check we’r still meet lunch today pm ...
Name: v2, dtype: object

In [10]:
# header=None: without it, read_csv would consume the first line of the file
# as the column name instead of keeping it as a row.
# dtype=str + keep_default_na=False: keep every token as literal text. With the
# defaults, entries such as "null" or "nan" are parsed as NaN and numeric-looking
# entries as numbers, so those vocabulary slots could never match a word.
vocab = pd.read_csv(
    "../artifacts/vocabulary.txt",
    header=None,
    dtype=str,
    keep_default_na=False,
)
tokens = vocab[0].tolist()

In [11]:
tokens

['go',
 'point',
 'crazi',
 'avail',
 'n',
 'great',
 'world',
 'e',
 'got',
 'wat',
 'ok',
 'lar',
 'joke',
 'wif',
 'u',
 'free',
 'entri',
 'win',
 'final',
 'st',
 'may',
 'text',
 'receiv',
 'txt',
 'appli',
 'dun',
 'say',
 'earli',
 'c',
 'alreadi',
 'dont',
 'think',
 'goe',
 'usf',
 'live',
 'around',
 'though',
 'freemsg',
 'hey',
 'week',
 'word',
 'back',
 'id',
 'like',
 'fun',
 'still',
 'xxx',
 'send',
 'å£',
 'even',
 'brother',
 'speak',
 'treat',
 'per',
 'set',
 'caller',
 'friend',
 'winner',
 'valu',
 'network',
 'custom',
 'select',
 'prize',
 'reward',
 'claim',
 'call',
 'code',
 'valid',
 'hour',
 'mobil',
 'month',
 'r',
 'updat',
 'latest',
 'colour',
 'camera',
 'co',
 'im',
 'gonna',
 'home',
 'soon',
 'want',
 'talk',
 'stuff',
 'tonight',
 'k',
 'ive',
 'enough',
 'today',
 'chanc',
 'cash',
 'pound',
 'cost',
 'day',
 'repli',
 'info',
 'urgent',
 'tc',
 'pobox',
 'search',
 'right',
 'thank',
 'promis',
 'wont',
 'take',
 'help',
 'wonder',
 'time',
 'd

In [12]:
def vectorizor(data_set, vocabulary):
    """Encode each sentence as a binary bag-of-words vector over ``vocabulary``.

    Parameters
    ----------
    data_set : iterable of str
        Sentences whose tokens are separated by whitespace.
    vocabulary : sequence of str
        Ordered vocabulary; position ``i`` of each output row corresponds to
        ``vocabulary[i]``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with one row per sentence; an entry is 1 when the
        vocabulary token occurs as a whole whitespace-separated word in the
        sentence, otherwise 0.
    """
    vocab_size = len(vocabulary)
    vectorized_list = []

    for sentence in data_set:
        # Split once per sentence (not once per vocabulary entry) and use a
        # set so each membership test is O(1).
        words = set(sentence.split())
        row = np.zeros(vocab_size)

        for position, token in enumerate(vocabulary):
            if token in words:
                row[position] = 1

        vectorized_list.append(row)

    return np.asarray(vectorized_list, dtype=np.float32)

In [13]:
# One row per message in preprocessed_text, one 0/1 column per entry of `tokens`.
vectorized_text_list = vectorizor(preprocessed_text, tokens)

In [14]:
vectorized_text_list

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [15]:
import pickle

In [17]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.7.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.16.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.1-cp313-cp313-win_amd64.whl (8.7 MB)
Using cached scipy-1.16.1-cp313-cp313-win_amd64.whl (38.5 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn

   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- -------------------------- 1/3 [scipy]
   ------------- 


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load model artifacts that were produced by a trusted training run.
# NOTE(review): the saved cell output links to scikit-learn's model-persistence
# limitations page, which suggests a version-mismatch warning was emitted here —
# confirm the model was pickled with the scikit-learn version installed in this kernel.
with open("../artifacts/model/model.pickle", "rb") as file:
    model = pickle.load(file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [19]:
def get_prediction(txt):
    """Classify an already-vectorized message with the loaded ``model``.

    ``txt`` is passed to ``model.predict`` unchanged. Returns ``"spam"`` when
    the prediction compares equal to 1, otherwise ``"Not a spam"``.
    """
    # NOTE(review): the comparison result is used directly as a truth value,
    # so this presumably expects a single-row input — confirm against callers.
    is_spam = model.predict(txt) == 1
    return "spam" if is_spam else "Not a spam"

In [20]:
# Classify the vectorized sample message; the result is a human-readable label string.
predicted_result = get_prediction(vectorized_text_list)

In [21]:
predicted_result

'Not a spam'