In [1]:
import math
from collections import defaultdict

def train_naive_bayes(docs, labels):
    word_counts = defaultdict(lambda: defaultdict(int))
    doc_counts = defaultdict(int)
    train_vocab = set(word for doc in docs for word in doc.lower().split())

    for doc, label in zip(docs, labels):
        words = doc.lower().split()
        doc_counts[label] += 1
        for word in words:
            word_counts[label][word] += 1

    return train_vocab, doc_counts, word_counts

def classify(docs, vocab, doc_counts, word_counts):
    results = []
    total_docs = sum(doc_counts.values())
    total_words = {label: sum(word_counts[label].values()) + len(vocab) for label in doc_counts}

    for doc in docs:
        words = doc.lower().split()
        doc_probs = {label: math.log(doc_counts[label] / total_docs) for label in doc_counts}
        word_probs = {}

        for word in words:
            vocab.add(word)

            word_probs[word] = {
                label: (word_counts[label].get(word, 0) + 1) / (total_words[label] + len(vocab))
                for label in doc_counts
            }
            for label in doc_counts:
                doc_probs[label] += math.log(word_probs[word][label])

        inferred_class = max(doc_probs, key=doc_probs.get)
        results.append({
            'Test document': doc,
            'Inferred class': inferred_class,
            'Document probabilities': {label: math.exp(doc_probs[label]) for label in doc_counts},
            'Word conditional probabilities': word_probs
        })
    return results


train_docs = [
    "London is the Capital of GB",
    "Oxford is a city in GB",
    "Dublin is the capital of Ireland",
    "Limerick is a city in Ireland"
]
train_labels = ['GB', 'GB', 'IE', 'IE']

vocab, doc_counts, word_counts = train_naive_bayes(train_docs, train_labels)

test_docs = [
    "University of Limerick",
    "University College Dublin",
    "Imperial College London",
    "University of Oxford",
    "Ireland & GB"
]

def format_output(classified_docs, vocab):
    formatted_output = []

    for doc_info in classified_docs:
        formatted_doc = {
            'Test document': f'("{doc_info["Test document"]}", "?")',
            'Word conditional probabilities': [],
            'docProbGB': f"docProbGB= {doc_info['Document probabilities']['GB']:.6f}",
            'docProbIE': f"docProbIE= {doc_info['Document probabilities']['IE']:.6f}",
            'Inferred class': f"Inferred class= {doc_info['Inferred class']}"
        }

        max_word_length = max(len(word) for word in doc_info['Test document'].split())
        pad_width = max_word_length + 8
        for word in doc_info['Test document'].split():
            word_lower = word.lower()
            prob_gb = doc_info['Word conditional probabilities'].get(word_lower, {}).get('GB', '—')
            prob_ie = doc_info['Word conditional probabilities'].get(word_lower, {}).get('IE', '—')
            word_prob_str = create_aligned_string(word, prob_gb, prob_ie, pad_width)
            formatted_doc['Word conditional probabilities'].append(word_prob_str)

        formatted_output.append(formatted_doc)

    return formatted_output

def create_aligned_string(word, prob_gb, prob_ie, pad_width=20):
    word_str = f"word= '{word}'".ljust(pad_width)
    prob_gb_str = f"wordConditionalProbGB= {prob_gb}".ljust(pad_width)
    prob_ie_str = f"wordConditionalProbIE= {prob_ie}".ljust(pad_width)
    return f"{word_str} {prob_gb_str} {prob_ie_str}"

vocab, doc_counts, word_counts = train_naive_bayes(train_docs, train_labels)

classified = classify(test_docs, vocab, doc_counts, word_counts)

formatted_output_with_all_words_aligned = format_output(classified, vocab)


def additional_output(doc_counts, word_counts, train_vocab):
    mega_doc_gb = " ".join([word for word, count in word_counts['GB'].items() if count > 0])
    mega_doc_ie = " ".join([word for word, count in word_counts['IE'].items() if count > 0])

    total_docs = sum(doc_counts.values())
    prob_gb = doc_counts['GB'] / total_docs
    prob_ie = doc_counts['IE'] / total_docs

    gb_bow = {word: word_counts['GB'].get(word, 0) for word in train_vocab}
    ie_bow = {word: word_counts['IE'].get(word, 0) for word in train_vocab}

    v = list(train_vocab)
    vocab_size = len(train_vocab)

    additional_output = {
        'megaDocGB': mega_doc_gb,
        'megaDocIE': mega_doc_ie,
        'probGB': prob_gb,
        'probIE': prob_ie,
        'GB_BOW': gb_bow,
        'IE_BOW': ie_bow,
        'V': v,
        '|V|': vocab_size
    }

    return additional_output
train_vocab, doc_counts, word_counts = train_naive_bayes(train_docs, train_labels)
additional_output = additional_output(doc_counts, word_counts, train_vocab)

print(f"megaDocGB: {additional_output['megaDocGB']}")
print(f"megaDocIE: {additional_output['megaDocIE']}")
print(f"probGB: {additional_output['probGB']:.6f}")
print(f"probIE: {additional_output['probIE']:.6f}")
print(f"GB_BOW: {additional_output['GB_BOW']}")
print(f"IE_BOW: {additional_output['IE_BOW']}")
print(f"V: {additional_output['V']}")
print(f"|V|: {additional_output['|V|']}")
print("-" * 40)
for item in formatted_output_with_all_words_aligned:
    print('Test Document=', item['Test document'])
    for word_prob_str in item['Word conditional probabilities']:
        print(word_prob_str)
    print(item['docProbGB'])
    print(item['docProbIE'])
    print(item['Inferred class'])
    print("-" * 40)


megaDocGB: london is the capital of gb oxford a city in
megaDocIE: dublin is the capital of ireland limerick a city in
probGB: 0.500000
probIE: 0.500000
GB_BOW: {'of': 1, 'the': 1, 'gb': 2, 'ireland': 0, 'city': 1, 'limerick': 0, 'in': 1, 'is': 2, 'london': 1, 'dublin': 0, 'a': 1, 'oxford': 1, 'capital': 1}
IE_BOW: {'of': 1, 'the': 1, 'gb': 0, 'ireland': 2, 'city': 1, 'limerick': 1, 'in': 1, 'is': 2, 'london': 0, 'dublin': 1, 'a': 1, 'oxford': 0, 'capital': 1}
V: ['of', 'the', 'gb', 'ireland', 'city', 'limerick', 'in', 'is', 'london', 'dublin', 'a', 'oxford', 'capital']
|V|: 13
----------------------------------------
Test Document= ("University of Limerick", "?")
word= 'University' wordConditionalProbGB= 0.02564102564102564 wordConditionalProbIE= 0.02564102564102564
word= 'of'         wordConditionalProbGB= 0.05128205128205128 wordConditionalProbIE= 0.05128205128205128
word= 'Limerick'   wordConditionalProbGB= 0.02564102564102564 wordConditionalProbIE= 0.05128205128205128
docProbGB= 0