In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from collections import Counter, defaultdict

In [None]:
# Toy training corpus: one whitespace-separated document per entry.
# The class of each document is given, in the same order, by `train_labels`.
train_data = [
    "Chinese Beijing Chinese",
    "Chinese Chinese Shanghai",
    "Chinese Macao",
    "Tokyo Japan Chinese",
]

In [None]:
# Single held-out document to classify.
test_data = ["Chinese Chinese Chinese Tokyo Japan"]
# One class label per training document, in corpus order.
train_labels = ["China", "China", "China", "Japan"]

In [None]:
# Learn the vocabulary from the training corpus and encode it as a
# document-term count matrix.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data)

# Vocabulary terms and how many of them there are.
vocab = vectorizer.get_feature_names_out()
vocab_size = len(vocab)

# Encode the test document(s) with the vocabulary learned above (no refit).
X_test = vectorizer.transform(test_data)

In [None]:
# alpha=1.0 is MultinomialNB's default (add-one smoothing); it is written out
# here only to make the smoothing explicit.
nb_model = MultinomialNB(alpha=1.0)
nb_model.fit(X=X_train, y=train_labels)

In [None]:
# Per-class probabilities for each test document.
predicted_prob = nb_model.predict_proba(X_test)
# Predicted class label for each test document.
predicted_label = nb_model.predict(X_test)

In [None]:
# Report the shape of each encoded matrix.
for caption, matrix in (("X_train shape:", X_train), ("X_test shape:", X_test)):
    print(caption, matrix.shape)

X_train shape: (4, 6)
X_test shape: (1, 6)


In [None]:
# Class priors P(c): the fraction of training documents carrying each label.
total_docs = len(train_labels)
label_counts = Counter(train_labels)
prior_probabilities = {
    label: label_counts[label] / total_docs
    for label in label_counts
}

In [None]:
# Reuse the vectorizer's own analyzer so the tokens counted here are the same
# ones CountVectorizer used to build `vocab`. A plain `doc.lower().split()`
# can produce tokens (e.g. with punctuation attached) that never appear in
# `vocab` yet would still inflate the per-class totals below.
# NOTE(review): assumes the vectorizer does not prune its vocabulary
# (min_df / max_df / max_features / stop_words), as with the defaults used
# in this notebook.
analyzer = vectorizer.build_analyzer()

# class_word_counts[label][word] -> occurrences of `word` in documents of `label`
class_word_counts = defaultdict(Counter)
# total_words_per_class[label]   -> total number of tokens in documents of `label`
total_words_per_class = Counter()

for doc, label in zip(train_data, train_labels):
    words = analyzer(doc)
    class_word_counts[label].update(words)
    total_words_per_class[label] += len(words)

# Add-one smoothed likelihoods:
#   P(word | label) = (count(word, label) + 1) / (tokens(label) + vocab_size)
likelihoods = {}
for label in label_counts:
    likelihoods[label] = {}
    for word in vocab:
        # Counter returns 0 for words never seen in this class.
        word_count = class_word_counts[label][word]
        likelihoods[label][word] = (word_count + 1) / (total_words_per_class[label] + vocab_size)


In [None]:
# scikit-learn's results next to the hand-computed priors and likelihoods.
print("Test Document:", test_data[0])
print("Prediction Probabilities:", predicted_prob)
print("Vocabulary:", vocab)
print("\nPrior Probabilities:", prior_probabilities)

print("\nLikelihoods (P(word|class)):")
for label, word_probs in likelihoods.items():
    print(f"\nClass = {label}")
    for word, prob in word_probs.items():
        print(f"P({word}|{label}) = {prob:.4f}")

print("\nPredicted Class:", predicted_label[0])


Test Document: Chinese Chinese Chinese Tokyo Japan
Prediction Probabilities: [[0.68975861 0.31024139]]
Vocabulary: ['beijing' 'chinese' 'japan' 'macao' 'shanghai' 'tokyo']

Prior Probabilities: {'China': 0.75, 'Japan': 0.25}

Likelihoods (P(word|class)):

Class = China
P(beijing|China) = 0.1429
P(chinese|China) = 0.4286
P(japan|China) = 0.0714
P(macao|China) = 0.1429
P(shanghai|China) = 0.1429
P(tokyo|China) = 0.0714

Class = Japan
P(beijing|Japan) = 0.1111
P(chinese|Japan) = 0.2222
P(japan|Japan) = 0.2222
P(macao|Japan) = 0.1111
P(shanghai|Japan) = 0.1111
P(tokyo|Japan) = 0.2222

Predicted Class: China


In [None]:
def predict(test_docs, prior_probabilities, likelihoods, vocab, tokenizer=None):
    """Classify documents with a hand-rolled multinomial Naive Bayes rule.

    For each document the score of class ``c`` is
    ``log P(c) + sum_w count(w) * log P(w | c)`` over the document's
    in-vocabulary words; the class with the highest score is returned.

    Args:
        test_docs: Iterable of document strings.
        prior_probabilities: Mapping ``class -> P(class)``.
        likelihoods: Mapping ``class -> {word: P(word | class)}``.
        vocab: Iterable of known words; tokens outside it are ignored.
        tokenizer: Optional callable ``doc -> list of tokens``. When omitted,
            documents are lower-cased and split on whitespace. Pass the
            analyzer used to build ``vocab`` to keep tokenisation consistent
            with training.

    Returns:
        A list with one predicted class per document, in input order.

    Raises:
        ValueError: If ``prior_probabilities`` is empty while ``test_docs``
            is not (there is no class to choose from).
    """
    # Build the membership set once instead of rescanning `vocab` for every
    # word of every class of every document.
    known_words = set(vocab)
    # Fallback probability for a vocabulary word missing from a class's
    # likelihood table.
    missing_likelihood = 1e-10

    predictions = []
    for doc in test_docs:
        words = tokenizer(doc) if tokenizer is not None else doc.lower().split()
        # Out-of-vocabulary tokens contribute nothing to any class score.
        word_counts = Counter(word for word in words if word in known_words)

        class_scores = {}
        for c in prior_probabilities:
            # Work in log space so products of small probabilities don't underflow.
            log_prob = np.log(prior_probabilities[c])
            for word, count in word_counts.items():
                log_prob += count * np.log(likelihoods[c].get(word, missing_likelihood))
            class_scores[c] = log_prob

        # Ties resolve to the class that appears first in `prior_probabilities`.
        predicted_class = max(class_scores, key=class_scores.get)
        predictions.append(predicted_class)
    return predictions

In [None]:
# Classify the test documents with the hand-computed priors and likelihoods.
predicted_labels = predict(
    test_docs=test_data,
    prior_probabilities=prior_probabilities,
    likelihoods=likelihoods,
    vocab=vocab,
)

print("\nPredicted class for test documents:")
for doc, pred in zip(test_data, predicted_labels):
    print(f"Doc: '{doc}' --> Predicted Class: {pred}")


Predicted class for test documents:
Doc: 'Chinese Chinese Chinese Tokyo Japan' --> Predicted Class: China
