In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize,RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw dataset; the file is read with an ISO-8859-1 (Latin-1) codec.
df = pd.read_csv("Dataset/spam.csv", encoding="ISO8859-1")
# Label encoder instance (not referenced by any other cell visible in this notebook).
en = LabelEncoder()

In [3]:
# Convert the DataFrame to a NumPy array so the columns can be sliced by position below.
data = df.to_numpy()
# Show (rows, columns) as the cell output.
data.shape

(5572, 5)

In [4]:
# Column 1 is the input that gets cleaned as text further down; column 0 is the target.
X, y = data[:, 1], data[:, 0]

In [5]:
# Raw string: '\w' is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape-sequence warning. The pattern itself is unchanged.
tokenizer = RegexpTokenizer(r'\w+')
# Set (rather than list) so the per-token membership test in `clean` is O(1).
sw = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [6]:
def clean(text):
    """Normalise a single message into a space-separated string of stems.

    The text is lowercased, split with the module-level `tokenizer`, filtered
    against the module-level stopword set `sw`, and each surviving token is
    passed through the module-level `stemmer`.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        Stemmed, stopword-free tokens joined by single spaces.
    """
    words = tokenizer.tokenize(text.lower())
    stems = (stemmer.stem(word) for word in words if word not in sw)
    return ' '.join(stems)
def cleandoc(doc):
    """Run `clean` over every item of `doc`.

    Parameters
    ----------
    doc : iterable of str
        Raw messages.

    Returns
    -------
    list of str
        The cleaned messages, in the same order as the input.
    """
    return [clean(sentence) for sentence in doc]

In [7]:
# Clean every raw message in X (lowercase, tokenise, drop stopwords, stem).
doc = cleandoc(X)

In [8]:
# Fit the vocabulary on the cleaned messages and build the count matrix.
cv = CountVectorizer()
vc = cv.fit_transform(doc)
# NOTE: this rebinds X from the raw text column to the count matrix, so
# re-running the earlier `doc = cleandoc(X)` cell after this one would feed
# numeric rows to `clean` instead of strings.
X = vc.toarray()

In [9]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
# (The pasted "..." REPL continuation prompt was removed: outside IPython,
# `... X` is a syntax error.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [10]:
from sklearn.naive_bayes import MultinomialNB

In [11]:
class CustomNaiveBayes:
    """Naive Bayes classifier over discrete feature values.

    Model
    -----
    p(y | x) is proportional to p(y) * prod_j p(x_j | y)

    * p(y) is the fraction of training rows carrying label ``y``.
    * p(x_j = v | y) is the fraction of training rows with label ``y`` whose
      feature ``j`` equals exactly ``v``. No smoothing is applied, so a value
      never seen for a class gives that class probability zero.

    Nothing is precomputed in ``fit``; every prediction scans the stored
    training rows of each class.
    """

    def fit(self, X, y):
        """Store the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
        """
        # asarray is a no-op for ndarrays and lets plain lists be used too.
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def prior_prob(self, label):
        """Return p(y = label): the share of training rows with this label."""
        return np.sum(self.y_train == label) / self.y_train.shape[0]

    def conditional_prob(self, label, feature_col, feature_val):
        """Return p(x[feature_col] = feature_val | y = label).

        Computed as the share of rows with this label whose value in column
        ``feature_col`` equals ``feature_val``.
        """
        X_mod = self.X_train[self.y_train == label]
        return np.sum(X_mod[:, feature_col] == feature_val) / float(X_mod.shape[0])

    def predict_point(self, X_test):
        """Predict the class label of a single feature vector.

        Parameters
        ----------
        X_test : array-like of shape (n_features,)

        Returns
        -------
        The element of ``np.unique(y_train)`` with the highest posterior.
        """
        x = np.asarray(X_test)
        classes = np.unique(self.y_train)
        log_post = []
        for label in classes:
            # Filter the rows of this class once and compare every feature in a
            # single vectorised pass instead of re-filtering per feature.
            X_mod = self.X_train[self.y_train == label]
            cond = np.sum(X_mod == x, axis=0) / float(X_mod.shape[0])
            # Sum logs rather than multiplying probabilities: a product over
            # thousands of features underflows to 0.0 for every class.
            # log(0) = -inf is intended (class ruled out), so silence the warning.
            with np.errstate(divide="ignore"):
                log_likelihood = np.sum(np.log(cond))
            log_post.append(log_likelihood + np.log(self.prior_prob(label)))
        # Map the winning index back to its label so predictions are directly
        # comparable with y (needed by `score`).
        return classes[np.argmax(log_post)]

    def predict(self, X):
        """Predict a label for every row of ``X``; returns a 1-D array."""
        return np.array([self.predict_point(point) for point in X])

    def score(self, X_test, y_test):
        """Return the accuracy of ``predict(X_test)`` against ``y_test``."""
        ans = self.predict(X_test)
        return (np.asarray(y_test) == ans).mean()

In [12]:
# Instantiate the hand-written classifier and hand it the training split.
model = CustomNaiveBayes()
model.fit(X_train,y_train)

In [13]:
# model.score(X_test,y_test)

In [None]:
# Three hand-picked raw messages used to try the fitted model on unseen text.
X_new = ["""
Join us today at 12:00 PM ET / 16:00 UTC for a Red Hat DevNation tech talk on AWS Lambda and serverless Java with Bill Burke.
Have you ever tried Java on AWS Lambda but found that the cold-start latency and memory usage were far too high? 
In this session, we will show how we optimized Java for serverless applications by leveraging GraalVM with Quarkus to 
provide both supersonic startup speed and a subatomic memory footprint.

""",
"""
Dear Harbor Freight Shopper,

We would like to offer you a unique opportunity to receive a brand new Milwaukee Drill Ace Set!
To claim, simply take this short survey about your experience with us.
START SURVEY NOW
""",
"""
Hello, Om_Alve.
You are welcome to register and compete in Codeforces Round 874 (Div. 3).
It starts on Friday, May, 19, 2023 14:35 (UTC). The contest duration is 2 hours 15 minutes. 
The allowed programming languages are C/C++, Pascal, Java, C#, Python, Ruby, Perl, PHP,
Haskell, Scala, OCaml, Go, D, JavaScript, Rust and Kotlin.
"""
]
# Apply the same text cleaning that was used on the training messages.
X_new = cleandoc(X_new)
# transform (not fit_transform): reuse the vocabulary already fitted on the
# training corpus. NOTE: this rebinds `vc`, which previously held the
# training count matrix.
vc = cv.transform(X_new)
Xnew = vc.toarray()
# Show the model's predictions for the three messages as the cell output.
model.predict(Xnew)
