<a href="https://colab.research.google.com/github/Szymoniakfoltynson/ai-course-friday/blob/main/Email_Spam_Ham.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import json
import random
import math

def load_email_data(spam_path, ham_path):
    """Load the spam and ham JSON files and return their contents combined.

    Both files are opened as UTF-8 before either one is parsed. The two
    decoded JSON values are joined with ``+`` (spam first, then ham), so
    both files are expected to hold JSON arrays.

    Args:
        spam_path: Path to the JSON file with spam records.
        ham_path: Path to the JSON file with ham records.

    Returns:
        The decoded spam records followed by the decoded ham records.
    """
    with open(spam_path, encoding="utf-8") as spam_file:
        with open(ham_path, encoding="utf-8") as ham_file:
            spam_records = json.loads(spam_file.read())
            ham_records = json.loads(ham_file.read())
    combined = spam_records + ham_records
    return combined

def train_test_split(data, test_ratio=0.2):
    """Randomly split *data* into a training part and a test part.

    The records are shuffled in a private copy, so the caller's sequence
    keeps its original order (the previous implementation shuffled the
    argument in place). Any finite sequence or iterable is accepted.

    Args:
        data: Records to split.
        test_ratio: Fraction of the records that goes to the test part.

    Returns:
        A ``(train, test)`` tuple of lists; ``train`` holds the first
        ``int(len(data) * (1 - test_ratio))`` shuffled records and ``test``
        holds the rest.
    """
    # Copy first: random.shuffle works in place and would otherwise
    # reorder the list owned by the caller.
    shuffled = list(data)
    random.shuffle(shuffled)
    cut = int(len(shuffled) * (1 - test_ratio))
    return shuffled[:cut], shuffled[cut:]

def preprocess(text):
    """Turn *text* into a list of lowercase whitespace-separated tokens.

    Hyphens and em dashes are replaced with spaces before splitting, so
    words joined by them become separate tokens. Other punctuation stays
    attached to the neighbouring word.
    """
    normalized = text.lower()
    for dash in ("-", "—"):
        normalized = normalized.replace(dash, " ")
    return normalized.split()

def prepare_email_data(data):
    """Attach a token list to every record and return the same collection.

    For each record, the value under ``"text"`` is passed through
    ``preprocess`` and the result is stored under ``"tags"``. Records are
    modified in place; *data* itself is returned.
    """
    for record in data:
        tokens = preprocess(record["text"])
        record["tags"] = tokens
    return data

def build_vocabulary(data):
    """Return a list of the distinct tokens found in the records' ``"tags"``.

    The order of the returned list is that of the underlying set and is
    therefore not meaningful.
    """
    distinct_tokens = {token for record in data for token in record["tags"]}
    return list(distinct_tokens)

def train_nb(train, vocab, alpha=1.0):
    """Collect the counts needed by the Naive Bayes classifier.

    Args:
        train: Records with a ``"label"`` and a ``"tags"`` token sequence.
        vocab: Vocabulary stored in the model unchanged.
        alpha: Smoothing constant stored in the model unchanged.

    Returns:
        A dict with the per-class document counts (``"class_counts"``),
        per-class token frequencies (``"word_counts"``), per-class token
        totals (``"total_words"``), plus ``"vocab"``, ``"alpha"`` and the
        number of training records (``"total_docs"``).

    Side effects:
        Prints the complete model dict before returning it.
    """
    docs_per_class = {}
    tokens_per_class = {}
    token_totals = {}

    for record in train:
        label = record["label"]
        if label not in docs_per_class:
            # First record of this class: create all three entries at once.
            docs_per_class[label] = 0
            tokens_per_class[label] = {}
            token_totals[label] = 0
        docs_per_class[label] += 1

        frequencies = tokens_per_class[label]
        for token in record["tags"]:
            frequencies[token] = frequencies.get(token, 0) + 1
            token_totals[label] += 1

    model = dict(
        class_counts=docs_per_class,
        word_counts=tokens_per_class,
        total_words=token_totals,
        vocab=vocab,
        alpha=alpha,
        total_docs=len(train),
    )

    print(model)
    return model

def log_prob(model, rec, class_name):
    """Return the unnormalised log-probability of *class_name* for *rec*.

    The score is the log of the class prior (class document count divided
    by the total document count) plus, for every token in ``rec["tags"]``,
    the log of the additively smoothed token likelihood
    ``(count + alpha) / (class_token_total + alpha * vocabulary_size)``.

    Args:
        model: Dict produced by ``train_nb``.
        rec: Record whose ``"tags"`` tokens are scored.
        class_name: Class label; must be a key of the model's count tables.

    Returns:
        The log-probability score as a float.

    Raises:
        KeyError: If *class_name* is missing from the model.
        ValueError: If a likelihood is zero (possible only when
            ``alpha`` is 0 and a token was never seen in the class).
    """
    logp = math.log(model["class_counts"][class_name] / model["total_docs"])
    alpha = model["alpha"]

    # These values do not depend on the token, so compute them once
    # instead of on every loop iteration.
    token_counts = model["word_counts"][class_name]
    denominator = model["total_words"][class_name] + alpha * len(model["vocab"])

    for tag in rec["tags"]:
        logp += math.log((token_counts.get(tag, 0) + alpha) / denominator)
    return logp

# Prediction: pick the class with the highest log-probability.
def predict(model, rec):
    """Return the class label whose ``log_prob`` score for *rec* is highest.

    Classes are taken from ``model["class_counts"]``. When several classes
    share the top score, the one that comes first in that dict wins. If the
    model has no classes at all, ``None`` is returned.

    A true maximum is used instead of comparing against a fixed finite
    sentinel, so a class is selected however low the scores are.
    """
    return max(
        model["class_counts"],
        key=lambda label: log_prob(model, rec, label),
        default=None,
    )

# Evaluation on the test set.
def evaluate(model, test):
    """Print and return the classifier's accuracy on *test*.

    Accuracy is the fraction of records for which ``predict`` returns the
    record's ``"label"``.

    Args:
        model: Trained model passed through to ``predict``.
        test: Sized collection of labelled records.

    Returns:
        The accuracy as a float in ``[0, 1]``. An empty test set yields
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    if test:
        correct = sum(1 for rec in test if predict(model, rec) == rec["label"])
        accuracy = correct / len(test)
    else:
        # Nothing to evaluate; avoid dividing by zero.
        accuracy = 0.0
    print(f"Dokładność (accuracy) = {accuracy:.2%}")
    return accuracy

In [27]:
from pprint import pprint

# End-to-end run: load, tokenize, split, train, evaluate.
data = load_email_data("spam.json", "ham.json")
data = prepare_email_data(data)  # adds the "tags" token list to every record
train, test = train_test_split(data)
# Build the vocabulary from the training split only, so that words seen
# exclusively in the test split do not leak into the smoothing term.
vocab = build_vocabulary(train)
model = train_nb(train, vocab)
evaluate(model, test)

{'class_counts': {'ham': 877, 'spam': 811}, 'word_counts': {'ham': {'do': 161, 'usłyszenia': 2, 'później.': 4, 'cześć,': 55, 'możesz': 93, 'proszę': 86, 'oddzwonić,': 1, 'gdy': 4, 'będziesz': 15, 'mieć': 1, 'chwilę?': 1, 'nie': 32, 'pilne.': 1, 'przypominamy': 3, 'o': 152, 'wizycie': 3, 'u': 22, 'stomatologa': 1, 'jutro': 32, 'godzinie': 2, '10:30.': 3, 'prosimy': 3, 'potwierdzenie.': 7, 'miłego': 10, 'dnia': 6, 'i': 35, 'usłyszenia.': 5, 'dziękuję': 49, 'za': 128, 'cierpliwość.': 3, 'zaraz': 6, 'ciebie': 20, 'oddzwonię,': 1, 'kończę': 1, 'inne': 1, 'połączenie.': 1, 'czy': 200, 'ten': 18, 'dokument': 6, 'jest': 42, 'aktualny?': 1, 'wiesz,': 7, 'której': 9, 'otwiera': 1, 'się': 82, 'apteka?': 1, 'masz': 33, 'pożyczyć': 2, 'jakąś': 1, 'fajną': 1, 'książkę': 1, 'na': 187, 'weekend?': 3, 'zapomnij': 5, 'kupić': 11, 'mleka': 1, 'kawy,': 1, 'bo': 4, 'rano': 7, 'będzie': 10, 'co': 8, 'pić.': 1, 'zobaczenia': 4, 'spotkaniu.': 6, 'przypominam': 20, 'dzisiejszym': 3, 'szkoleniu.': 1, 'już': 24,