In [2]:
import numpy as np

In [3]:
class TextVectorizer:
    """Bag-of-words count vectorizer.

    Each text is lowercased and split into runs of word characters (``\\w+``).
    ``fit_transform`` learns a vocabulary (token -> column index, tokens in
    sorted order) and returns the count matrix for the texts it was given;
    ``transform`` reuses that vocabulary and ignores tokens it has never seen.

    ``texts`` may be any iterable of strings (list, tuple, generator, ...).
    """

    def __init__(self):
        # Maps every token seen while fitting to its column in the matrix.
        self.vocab = {}

    @staticmethod
    def _tokenize(texts):
        """Return one list of lowercase word tokens per input text."""
        # Function-scope import keeps the class self-contained in its cell.
        import re
        return [re.findall(r"\w+", text.lower()) for text in texts]

    def _count(self, tokenized):
        """Count vocabulary tokens for already-tokenized texts."""
        # Sized from the token lists (not the raw input), so one-shot
        # iterables such as generators are supported.
        matrix = np.zeros((len(tokenized), len(self.vocab)), dtype=int)
        for row, tokens in enumerate(tokenized):
            for token in tokens:
                col = self.vocab.get(token)
                if col is not None:  # out-of-vocabulary tokens are skipped
                    matrix[row, col] += 1
        return matrix

    def fit_transform(self, texts):
        """Learn the vocabulary from ``texts`` and return their count matrix.

        Args:
            texts: iterable of strings.

        Returns:
            Integer ndarray of shape (number of texts, vocabulary size) where
            entry ``[i, j]`` is how often token ``j`` occurs in text ``i``.
        """
        tokenized = self._tokenize(texts)
        # Sorted order gives a deterministic column layout; keys are plain str.
        unique_tokens = sorted({token for tokens in tokenized for token in tokens})
        self.vocab = {token: index for index, token in enumerate(unique_tokens)}
        return self._count(tokenized)

    def transform(self, texts):
        """Return the count matrix of ``texts`` using the learned vocabulary.

        Args:
            texts: iterable of strings.

        Returns:
            Integer ndarray of shape (number of texts, vocabulary size).
            Tokens that are not in the vocabulary contribute nothing.
        """
        return self._count(self._tokenize(texts))

In [6]:
import numpy as np

class MultiNomialNB:
    """Multinomial Naive Bayes classifier with add-one (Laplace) smoothing.

    Attributes set by ``fit``:
        classes: sorted array of the distinct labels found in ``y``.
        log_class_priors: label -> log of the fraction of samples with it.
        word_log_probs: label -> array of smoothed log feature likelihoods.
    """

    def __init__(self):
        self.log_class_priors = {}
        self.word_log_probs = {}
        self.classes = []

    def fit(self, X, y):
        """Estimate class priors and smoothed per-class feature likelihoods.

        Args:
            X: array-like of shape (n_samples, n_features) with counts.
            y: array-like of n_samples labels, e.g. ``[0, 1, 1, 0]``. Labels
                may be any values ``np.unique`` can sort; they do not need to
                be the contiguous integers 0..k-1.
        """
        X = np.asarray(X)
        y = np.asarray(y)  # a plain list would make ``y == label`` a scalar

        # Counts come back aligned with ``classes``. ``np.bincount`` is indexed
        # by label *value*, which mismatches for labels such as [1, 2] or [5, 9].
        self.classes, class_counts = np.unique(y, return_counts=True)
        n_samples = class_counts.sum()
        vocab_size = X.shape[1]

        # Start clean so a refit does not keep entries for labels that are gone.
        self.log_class_priors = {}
        self.word_log_probs = {}

        for label, n_docs in zip(self.classes, class_counts):
            self.log_class_priors[label] = np.log(n_docs / n_samples)

            class_rows = X[y == label]
            word_counts = class_rows.sum(axis=0)
            # Add-one smoothing: no feature gets zero probability in any class.
            smoothed = (word_counts + 1) / (word_counts.sum() + vocab_size)
            self.word_log_probs[label] = np.log(smoothed)

    def predict(self, X):
        """Return the most probable label for every row of ``X``.

        Args:
            X: array-like of shape (n_samples, n_features) with counts.

        Returns:
            Array of n_samples labels taken from ``self.classes``.

        Raises:
            ValueError: if the model has not been fitted.
        """
        if len(self.classes) == 0:
            raise ValueError("MultiNomialNB.predict called before fit")

        X = np.asarray(X)
        scores = [
            self.log_class_priors[label] + np.dot(X, self.word_log_probs[label])
            for label in self.classes
        ]
        best = np.argmax(scores, axis=0)
        # Translate argmax positions back to the actual labels.
        return np.asarray(self.classes)[best]

In [7]:
# Toy training corpus; labels line up with the sentences (1 = spam, 0 = ham).
train_sentences = [
    "Win a free lottery prize now",
    "Meeting is scheduled for tomorrow",
    "Free money limited time offer",
    "Can we reschedule our meeting",
]
train_labels = np.array([1, 0, 1, 0])

# Learn the vocabulary and turn the training sentences into count vectors.
vectorizer = TextVectorizer()
X_train = vectorizer.fit_transform(train_sentences)

# Fit the classifier on those counts.
classifier = MultiNomialNB()
classifier.fit(X_train, train_labels)

# Classify an unseen sentence using the vocabulary learned above.
test_sentences = ["urgent free prize money"]
X_test = vectorizer.transform(test_sentences)
prediction = classifier.predict(X_test)

print(f"Sentence: '{test_sentences[0]}'")
print(f"Prediction: {'SPAM' if prediction[0] == 1 else 'HAM'}")

Sentence: 'urgent free prize money'
Prediction: SPAM


In [41]:
# NOTE(review): `dit` is not defined in any cell visible here, so this cell would
# raise NameError on a fresh Restart & Run All unless it is defined elsewhere.
# Presumably leftover scratch state; confirm, then define `dit` or delete the cell.
dit[0]

{0: 0.5, 1: 0.5, 2: 0.6}