In [1]:
from collections import Counter
import numpy as np

In [2]:
def read_dataset(data_dir="."):
    """Load the training and validation splits as lists of text lines.

    Reads the files ``train_X``, ``train_Y``, ``val_X`` and ``val_Y`` from
    ``data_dir``. Each file is read in binary mode; every line has its leading
    and trailing whitespace stripped (``bytes.strip``) and is then decoded as
    UTF-8.

    Args:
        data_dir: Directory that contains the four files. Defaults to the
            current working directory, which is where the files were looked
            up before this parameter existed.

    Returns:
        A 4-tuple ``(train_X, train_Y, val_X, val_Y)`` of lists of ``str``,
        one list entry per line of the corresponding file.

    Raises:
        OSError: If one of the files cannot be opened.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    import os  # local import: the notebook's import cell does not provide `os`

    def _read_lines(name):
        # `with` guarantees the handle is closed once the file has been read,
        # instead of leaving four open files for the garbage collector.
        with open(os.path.join(data_dir, name), "rb") as handler:
            return [line.strip().decode("utf-8") for line in handler]

    return tuple(
        _read_lines(name) for name in ("train_X", "train_Y", "val_X", "val_Y")
    )

In [3]:
# Load all four splits into memory as lists of str using read_dataset() from the cell above.
train_X, train_Y, val_X, val_Y = read_dataset()

In [4]:
# Sanity check: each text list should have exactly as many entries as its label list.
len(train_X), len(train_Y), len(val_X), len(val_Y)

(8024136, 8024136, 892966, 892966)

In [5]:
# Number of training examples per label. The functions below iterate over
# `classes`, so its iteration order also determines the order of the class axis.
classes = Counter(train_Y)

In [6]:
def make_classes_words_counters(X, Y):
    """Count word occurrences per class.

    Texts in ``X`` are paired positionally with labels in ``Y`` and split on
    whitespace; every resulting token contributes one occurrence to the key
    ``(label, token)``.

    Args:
        X: Iterable of text strings.
        Y: Iterable of labels, aligned with ``X``.

    Returns:
        A ``Counter`` mapping ``(label, word)`` to the number of times ``word``
        occurs in the texts carrying ``label``.
    """
    return Counter(
        (label, token)
        for text, label in zip(X, Y)
        for token in text.split()
    )

def count_words_in_classes(X, Y):
    """Count the number of distinct words seen in each class.

    Texts in ``X`` are paired positionally with labels in ``Y`` and split on
    whitespace. ``X`` and ``Y`` are each traversed exactly once, so one-shot
    iterators (generators, file objects) are supported as well as lists.

    Args:
        X: Iterable of text strings.
        Y: Iterable of hashable labels, aligned with ``X``.

    Returns:
        A dict mapping each label that occurs in ``Y`` to the number of
        distinct whitespace-separated words in its texts. Labels whose texts
        are all empty, or that have no paired text because ``Y`` is longer
        than ``X``, map to 0.
    """
    class_words = {}
    labels = iter(Y)
    # zip() pulls from X before `labels`, so when X runs out first no label is
    # silently consumed and the loop below still sees every unpaired label.
    for x, y in zip(X, labels):
        class_words.setdefault(y, set()).update(x.split())
    # Labels beyond the end of X still get an entry with an empty vocabulary.
    for y in labels:
        class_words.setdefault(y, set())
    return {y: len(words) for y, words in class_words.items()}

In [7]:
# Training-set statistics used by the scoring functions below:
#   classes_words_counters[(label, word)] -> occurrences of `word` in texts of `label`
#   words_in_classes[label]               -> number of distinct words seen for `label`
classes_words_counters = make_classes_words_counters(train_X, train_Y)
words_in_classes = count_words_in_classes(train_X, train_Y)

In [8]:
def make_predictions(words_in_classes, classes_words_counters, classes, X, limit=1000):
    """Predict a class for each text using add-one-smoothed per-class word scores.

    For every class ``c`` and word ``w`` of a text the word score is
    ``log(classes_words_counters[(c, w)] + 1) - log(words_in_classes[c] + classes[c])``.
    The class score is the mean of the word scores over the text plus the log
    prior ``log(classes[c] / sum(classes.values()))``; the class with the
    highest score is predicted.

    NOTE(review): the smoothing denominator adds the two per-class statistics
    that are passed in. In this notebook those are the number of distinct
    words and the number of training examples of the class, which is not the
    textbook multinomial Naive Bayes denominator (total tokens in the class
    plus vocabulary size) - confirm that this is intended.

    Args:
        words_in_classes: Mapping ``class -> int`` (must contain every class).
        classes_words_counters: Mapping ``(class, word) -> count``; missing
            pairs are treated as a count of 0.
        classes: Mapping ``class -> count``; its iteration order defines the
            class order used to break ties (first maximum wins).
        X: Sliceable sequence of text strings.
        limit: Maximum number of leading texts of ``X`` to score. Defaults to
            1000, the previously hard-coded prefix length; pass ``None`` to
            score all of ``X``.

    Returns:
        A numpy array with one predicted class per scored text.
    """
    class_names = list(classes)
    classes_array = np.array(class_names)
    classes_values = np.array([classes[cur_class] for cur_class in class_names])
    classes_probs = np.log(classes_values) - np.log(np.sum(classes_values))
    # Column vector so that it broadcasts over the word axis of `words` below.
    denum = np.log(
        np.array([words_in_classes[cur_class] + classes[cur_class] for cur_class in class_names])
    ).reshape([-1, 1])

    predictions = []
    for x in X[:limit]:
        tokens = x.split()
        if tokens:
            counts = np.array([
                [classes_words_counters.get((cur_class, word), 0) for word in tokens]
                for cur_class in class_names
            ]).astype(float)
            words = np.log(counts + 1) - denum
            scores = np.mean(words, axis=-1) + classes_probs
        else:
            # A text without words carries no evidence. Averaging over zero
            # words would produce NaN scores (and argmax would then always pick
            # the first class), so fall back to the class priors.
            scores = classes_probs
        predictions.append(classes_array[np.argmax(scores)])
    return np.array(predictions)

In [9]:
# Predict labels for the validation texts; make_predictions only scores the
# first 1000 entries of val_X, so val_prediction is shorter than val_Y.
val_prediction = make_predictions(words_in_classes, classes_words_counters, classes, val_X)

In [10]:
# Accuracy on the predicted prefix: the share of predictions that equal the
# label at the same position in val_Y (val_Y is truncated to the prediction count).
len(val_prediction[val_prediction == val_Y[:len(val_prediction)]]) / len(val_prediction)

0.578

In [12]:
def make_data(words_in_classes, classes_words_counters, classes, X,
              output_path="bayes_data", progress_every=1000):
    """Write per-class word-score features for every text in ``X`` to a file.

    For every class ``c`` and word ``w`` of a text the word score is
    ``log(classes_words_counters[(c, w)] + 1) - log(words_in_classes[c] + classes[c])``.
    One line is written per text: the ``repr`` of a list of three lists (the
    mean, the minimum and the maximum word score), each holding one plain
    Python float per class in the iteration order of ``classes``. A text
    without any words is written as three lists of integer zeros and its
    index is reported on stdout as ``!!! <index> !!!``.

    Args:
        words_in_classes: Mapping ``class -> int`` (must contain every class).
        classes_words_counters: Mapping ``(class, word) -> count``; missing
            pairs are treated as a count of 0.
        classes: Mapping ``class -> count``; its iteration order defines the
            order of the per-class values.
        X: Sized iterable of text strings (``len(X)`` is used for progress).
        output_path: File to (over)write. Defaults to ``"bayes_data"``, the
            previously hard-coded location.
        progress_every: Print ``"<i> / <len(X)>"`` every this many texts.
            Defaults to 1000; pass ``0`` or ``None`` to silence progress.

    Returns:
        None. The features are only written to ``output_path``.
    """
    class_names = list(classes)
    # Column vector so that it broadcasts over the word axis of `words` below.
    denum = np.log(
        np.array([words_in_classes[cur_class] + classes[cur_class] for cur_class in class_names])
    ).reshape([-1, 1])
    total = len(X)
    with open(output_path, "w") as handler:
        for i, x in enumerate(X):
            if progress_every and i % progress_every == 0:
                print("{} / {}".format(i, total))
            tokens = x.split()
            if not tokens:
                # No words to aggregate over: emit an all-zero placeholder row
                # so the output stays aligned line-by-line with X.
                print("!!!", i, "!!!")
                print([[0] * len(class_names)] * 3, file=handler)
                continue
            counts = np.array([
                [classes_words_counters.get((cur_class, word), 0) for word in tokens]
                for cur_class in class_names
            ]).astype(float)
            words = np.log(counts + 1) - denum
            # .tolist() converts to built-in floats. list(ndarray) keeps numpy
            # scalars, whose repr is `np.float64(...)` on NumPy >= 2 and would
            # make the written lines unparseable as plain lists of numbers.
            print([
                words.mean(axis=-1).tolist(),
                words.min(axis=-1).tolist(),
                words.max(axis=-1).tolist(),
            ], file=handler)

In [13]:
# Write the per-class features for every validation text to the file "bayes_data".
# make_data has no return statement, so val_data is None; read the file to get the features.
val_data = make_data(words_in_classes, classes_words_counters, classes, val_X)

0 / 892966
1000 / 892966
2000 / 892966
3000 / 892966
4000 / 892966
5000 / 892966
6000 / 892966
7000 / 892966
8000 / 892966
9000 / 892966
10000 / 892966
11000 / 892966
12000 / 892966
13000 / 892966
14000 / 892966
15000 / 892966
16000 / 892966
17000 / 892966
18000 / 892966
19000 / 892966
20000 / 892966
21000 / 892966
22000 / 892966
23000 / 892966
24000 / 892966
25000 / 892966
26000 / 892966
27000 / 892966
28000 / 892966
29000 / 892966
30000 / 892966
31000 / 892966
32000 / 892966
33000 / 892966
34000 / 892966
35000 / 892966
36000 / 892966
37000 / 892966
38000 / 892966
39000 / 892966
40000 / 892966
41000 / 892966
42000 / 892966
43000 / 892966
44000 / 892966
45000 / 892966
46000 / 892966
47000 / 892966
48000 / 892966
49000 / 892966
50000 / 892966
!!! 50166 !!!
51000 / 892966
52000 / 892966
53000 / 892966
54000 / 892966
!!! 54376 !!!
55000 / 892966
56000 / 892966
57000 / 892966
58000 / 892966
59000 / 892966
60000 / 892966
61000 / 892966
62000 / 892966
63000 / 892966
64000 / 892966
65000 / 89

504000 / 892966
505000 / 892966
506000 / 892966
507000 / 892966
508000 / 892966
509000 / 892966
510000 / 892966
511000 / 892966
512000 / 892966
513000 / 892966
514000 / 892966
515000 / 892966
516000 / 892966
517000 / 892966
518000 / 892966
519000 / 892966
520000 / 892966
521000 / 892966
522000 / 892966
523000 / 892966
524000 / 892966
525000 / 892966
526000 / 892966
527000 / 892966
528000 / 892966
529000 / 892966
530000 / 892966
531000 / 892966
532000 / 892966
533000 / 892966
534000 / 892966
535000 / 892966
536000 / 892966
537000 / 892966
538000 / 892966
539000 / 892966
540000 / 892966
!!! 540500 !!!
541000 / 892966
542000 / 892966
543000 / 892966
544000 / 892966
545000 / 892966
546000 / 892966
547000 / 892966
548000 / 892966
549000 / 892966
550000 / 892966
551000 / 892966
552000 / 892966
553000 / 892966
554000 / 892966
555000 / 892966
556000 / 892966
557000 / 892966
558000 / 892966
559000 / 892966
560000 / 892966
561000 / 892966
562000 / 892966
563000 / 892966
564000 / 892966
565000 / 

In [None]:
# Scratch check of the nested-list repetition used for make_data's placeholder row.
# The outer `*` repeats references to one inner list, so the rows are the same object.
[[0] * 3] * 4