# Read data

In [6]:
from collections import Counter
from math import log
import math
import numpy as np
from pathlib import Path
from itertools import chain

def read_file(path):
  """Parse one message file into ``(subject, content, is_spam)``.

  The file is read as three lines: a subject line whose first
  whitespace-separated token is dropped and whose remaining tokens are
  integers, a line that is ignored, and a line of integer tokens forming
  the message content.

  Args:
    path: a ``pathlib.Path``-like object exposing ``name`` and ``open()``.

  Returns:
    A tuple ``(subject, content, is_spam)`` where ``subject`` and
    ``content`` are lists of ints and ``is_spam`` is ``True`` when the
    file name contains ``'spmsg'``.
  """
  spam_flag = 'spmsg' in path.name
  with path.open() as handle:
    subject_line = handle.readline()
    handle.readline()  # separator line between subject and content
    content_line = handle.readline()
  # The leading token of the subject line is a label, not a word id.
  subject_ids = [int(token) for token in subject_line.split()[1:]]
  content_ids = [int(token) for token in content_line.split()]
  return subject_ids, content_ids, spam_flag


def read_data(root_path='/content/drive/My Drive/Универ/ML/Bayes_messages',
              num_parts=10):
  """Load every message of every ``part<i>`` directory under ``root_path``.

  Args:
    root_path: directory (``str`` or ``Path``) containing the sub-directories
      ``part1`` .. ``part<num_parts>``. Defaults to the original hard-coded
      location.
    num_parts: number of ``part<i>`` directories to read, starting at 1.

  Returns:
    A list with one entry per part; each entry is the list of
    ``read_file`` results for the regular files in that directory.

  Raises:
    OSError: if one of the expected ``part<i>`` directories cannot be listed.
  """
  root_path = Path(root_path)
  splits = []
  for split in range(1, num_parts + 1):
    split_path = root_path / 'part{}'.format(split)
    # iterdir() yields entries in arbitrary order; sort so that the order of
    # samples inside a split is reproducible between runs.
    entries = sorted(entry for entry in split_path.iterdir() if entry.is_file())
    splits.append([read_file(entry) for entry in entries])
  return splits

# Bayes

In [3]:
EPS = 1e-100  # added to every log() argument so it is never exactly zero


def bayes(dataset, lambdas, alpha):
  """Train a Bernoulli naive Bayes model and return it as a closure.

  Every feature seen during training contributes to the score of a sample:
  ``log(p)`` when the sample contains it and ``log(1 - p)`` when it does
  not, where ``p = (count(class, feature) + alpha) / (count(class) + 2 * alpha)``.

  Args:
    dataset: iterable of ``(features, c)`` pairs. ``features`` is an iterable
      of hashable feature ids; ``c`` is the class label, an integer in
      ``range(len(lambdas))`` (``bool`` labels work because ``False == 0``
      and ``True == 1``). It is traversed once, so an iterator is accepted.
    lambdas: per-class weights; ``lambdas[c]`` multiplies the prior of
      class ``c``.
    alpha: additive smoothing constant.

  Returns:
    A function ``classifier(features)`` returning the index (``int``) of the
    class with the highest log score.
  """
  encountered_features = set()
  features_counter = Counter()
  class_counter = Counter()
  n = 0  # number of training samples

  for features, c in dataset:
    present = set(features)  # presence/absence model: count each feature once
    encountered_features.update(present)
    features_counter.update((c, feature) for feature in present)
    class_counter[c] += 1
    n += 1

  k = len(lambdas)  # number of classes

  def classifier(features):
    present = set(features)  # O(1) membership tests inside the loop below
    logprob = []
    for klass in range(k):
      # An empty training set gives every class a zero prior.
      prior = class_counter[klass] / n if n else 0.0
      logp = log(lambdas[klass] * prior + EPS)

      denominator = class_counter[klass] + alpha * 2
      for feature in encountered_features:
        if denominator:
          p_feature = (features_counter[(klass, feature)] + alpha) / denominator
        else:
          # Unseen class with alpha == 0: no evidence for the feature.
          p_feature = 0.0
        if feature in present:
          logp += log(p_feature + EPS)
        else:
          logp += log(1 - p_feature + EPS)

      logprob.append(logp)
    # Index into logprob equals the class label because classes start at 0.
    return int(np.argmax(logprob))

  return classifier

# N-gram creation

In [4]:
def extract_ngrams(features, n):
  """Return all contiguous windows of length ``n`` from ``features``.

  Args:
    features: a sliceable sequence supporting ``len()``.
    n: window length.

  Returns:
    A list of tuples, one per start position, in order of appearance.
    The list is empty when ``features`` is shorter than ``n``.
  """
  window_count = len(features) - n + 1
  return [tuple(features[start:start + n]) for start in range(window_count)]


def calculate_n_grams(features_set, n):
  """Assign a dense integer id to every distinct n-gram.

  Args:
    features_set: iterable of sequences, each passed to ``extract_ngrams``.
    n: n-gram length.

  Returns:
    A dict mapping each n-gram tuple to an id; ids are handed out from 0
    upwards in order of first appearance.
  """
  index_of = {}
  for sequence in features_set:
    for gram in extract_ngrams(sequence, n):
      # len() is evaluated before the insertion, so a new n-gram receives the
      # next free id and a known one keeps its existing id.
      index_of.setdefault(gram, len(index_of))

  return index_of


def calculate_combined_ngrams(dataset, n):
  """Build disjoint n-gram id mappings for subjects and for contents.

  Args:
    dataset: iterable of ``(subject, content, is_spam)`` triples.
    n: n-gram length.

  Returns:
    A tuple ``(subject_ngrams, content_ngrams)`` of dicts mapping n-gram
    tuples to ids. Subject ids start at 0; content ids start at
    ``len(subject_ngrams)``, so the two id ranges never overlap.
  """
  # The samples are traversed twice below; materialize them so that a
  # one-shot iterator does not leave the second pass empty.
  dataset = list(dataset)

  subject_ngrams = calculate_n_grams((subject for subject, _, _ in dataset), n)
  content_ngrams = calculate_n_grams((content for _, content, _ in dataset), n)

  # Shift content ids past the subject id range.
  offset = len(subject_ngrams)
  content_ngrams = {ngram: index + offset
                    for ngram, index in content_ngrams.items()}

  return subject_ngrams, content_ngrams


def encode_to_ngrams(dataset, subject_mapping, content_mapping, n):
  """Encode every message as a list of n-gram ids paired with its label.

  Args:
    dataset: iterable of ``(subject, content, is_spam)`` triples.
    subject_mapping: dict from subject n-gram tuple to id.
    content_mapping: dict from content n-gram tuple to id.
    n: n-gram length.

  Returns:
    A list of ``(encoded, is_spam)`` pairs, one per message, where
    ``encoded`` holds the subject n-gram ids followed by the content n-gram
    ids. N-grams missing from the corresponding mapping are skipped, since
    they have no id to be encoded with.
  """
  encoded_dataset = []
  for subject, content, is_spam in dataset:
    encoded = [subject_mapping[ngram]
               for ngram in extract_ngrams(subject, n)
               if ngram in subject_mapping]
    encoded.extend(content_mapping[ngram]
                   for ngram in extract_ngrams(content, n)
                   if ngram in content_mapping)
    # Keep the label next to the features: consumers unpack (features, label).
    encoded_dataset.append((encoded, is_spam))
  return encoded_dataset

# k-fold

In [None]:
def calc_kfold(splits, lambdas, alpha, k=10):
  """Estimate classifier accuracy with k-fold cross-validation.

  Each of the first ``k`` splits is held out in turn; a classifier is
  trained with ``bayes`` on all the other splits and evaluated on the
  held-out one.

  Args:
    splits: list of splits, each a list of ``(features, is_spam)`` pairs.
    lambdas: per-class weights forwarded to ``bayes``.
    alpha: smoothing constant forwarded to ``bayes``.
    k: number of folds to evaluate; at most ``len(splits)`` are used.

  Returns:
    The fraction of held-out samples classified correctly, or ``0.0`` when
    there are no samples to evaluate.
  """
  folds = splits[:k]
  total = sum(len(fold) for fold in folds)
  if total == 0:
    return 0.0

  correctly_predicted = 0
  for i, held_out in enumerate(folds):
    # Train on every split except the one being evaluated; including it
    # would leak the test samples into the training set.
    dataset = chain.from_iterable(
        split for n, split in enumerate(splits) if n != i)
    classifier = bayes(dataset, lambdas, alpha)

    for features, is_spam in held_out:
      if classifier(features) == is_spam:
        correctly_predicted += 1

  # chain objects have no len(); the sample count is computed from the folds.
  return correctly_predicted / total

# Main

In [7]:
def main():
  """Print the bigram id mappings of the first split and their sizes."""
  ngram_size = 2
  first_split = read_data()[0]

  subject_mapping, content_mapping = calculate_combined_ngrams(
      first_split, n=ngram_size)
  for mapping in (subject_mapping, content_mapping):
    print(mapping)
  print(len(subject_mapping), len(content_mapping))


main()

KeyboardInterrupt: ignored