In [1]:
from collections import Counter
from sklearn.model_selection import KFold
import numpy as np

In [2]:
def read_train(filename):
    """Load a tab-separated training file into parallel sample/label arrays.

    Every non-blank line must hold exactly two tab-separated fields: the
    sample text followed by its label. Surrounding whitespace is stripped
    from each line before it is split. Blank lines are skipped.

    Args:
        filename: Path of the file to read.

    Returns:
        A tuple ``(X, Y)`` of NumPy arrays of equal length, where ``X[i]`` is
        the sample and ``Y[i]`` the label taken from the i-th non-blank line.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            fields. The message names the file and the 1-based line number.
    """
    X, Y = [], []
    with open(filename) as handler:
        for line_number, line in enumerate(handler, start=1):
            stripped = line.strip()
            if not stripped:
                # A blank line (typically a trailing one at EOF) carries no
                # sample; skipping it avoids an opaque unpacking error.
                continue
            fields = stripped.split("\t")
            if len(fields) != 2:
                raise ValueError(
                    "{}:{}: expected 2 tab-separated fields, got {}".format(
                        filename, line_number, len(fields)
                    )
                )
            X.append(fields[0])
            Y.append(fields[1])
    return np.array(X), np.array(Y)

def read_test(filename):
    """Load unlabelled samples from a text file, one sample per line.

    Args:
        filename: Path of the file to read.

    Returns:
        A NumPy array with one entry per line of the file, each entry being
        that line with leading and trailing whitespace removed.
    """
    with open(filename) as source:
        samples = [raw.strip() for raw in source]
    return np.array(samples)

def acc_metric(labels, predictions):
    """Return the fraction of positions where ``predictions`` equals ``labels``.

    Args:
        labels: NumPy array of reference labels.
        predictions: NumPy array of predicted labels with the same shape as
            ``labels``.

    Returns:
        A Python float: matching positions divided by ``len(labels)``.

    Raises:
        AssertionError: If either argument is not a ``numpy.ndarray``.
        ValueError: If the shapes differ or the input is empty.
    """
    assert isinstance(labels, np.ndarray)
    assert isinstance(predictions, np.ndarray)
    if labels.shape != predictions.shape:
        # Without this check a mismatched comparison can collapse to a single
        # False and be reported as an accuracy of 0.0 instead of an error.
        raise ValueError(
            "labels and predictions must have the same shape, got {} and {}".format(
                labels.shape, predictions.shape
            )
        )
    if labels.size == 0:
        raise ValueError("accuracy is undefined for empty input")
    # Count matches directly rather than materialising a filtered copy of
    # ``labels`` only to take its length.
    return np.count_nonzero(labels == predictions) / len(labels)

def meashure_model(model, X, Y, n_splits=2, shuffle=False, random_state=None):
    """Estimate a model's accuracy with K-fold cross-validation.

    For every fold the model is refitted on the training part and scored with
    ``acc_metric`` on the held-out part. The same ``model`` object is reused,
    so after the call it is left fitted on the last fold's training data.

    Args:
        model: Object exposing ``fit(X, Y)`` and ``predict(X)``; ``predict``
            must return a NumPy array, because ``acc_metric`` requires one.
        X: NumPy array of samples (indexed with integer index arrays).
        Y: NumPy array of labels aligned with ``X``.
        n_splits: Number of folds; defaults to 2.
        shuffle: Passed to ``KFold``; ``False`` (the default) keeps the
            original order-based, unshuffled split.
        random_state: Passed to ``KFold``. NOTE(review): only meaningful
            together with ``shuffle=True``; confirm against the installed
            scikit-learn version.

    Returns:
        A tuple ``(mean, variance)`` of the per-fold accuracies.
    """
    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    metrics = []
    for train_index, test_index in kf.split(X):
        model.fit(X[train_index], Y[train_index])
        predictions = model.predict(X[test_index])
        # acc_metric is declared as (labels, predictions); pass them in that order.
        metrics.append(acc_metric(Y[test_index], predictions))
    return np.mean(metrics), np.var(metrics)

class MostPopularModel:
    """Baseline classifier that always predicts the most frequent training label."""

    def fit(self, X, Y):
        """Remember the most common label in ``Y``.

        Args:
            X: Training samples; ignored by this baseline.
            Y: Iterable of hashable training labels.

        Returns:
            ``self``, so that calls can be chained.

        Raises:
            ValueError: If ``Y`` contains no labels.
        """
        counts = Counter(Y)
        if not counts:
            raise ValueError("cannot fit MostPopularModel on an empty label set")
        # most_common(1) selects the top entry without sorting every label.
        self._answer = counts.most_common(1)[0][0]
        return self

    def predict(self, X):
        """Return the remembered label repeated once per sample in ``X``.

        Args:
            X: Samples to predict for; only ``len(X)`` is used.

        Returns:
            A NumPy array of length ``len(X)`` filled with the label stored
            by ``fit``.

        Raises:
            AttributeError: If called before ``fit``, because the stored
                label does not exist yet.
        """
        # np.full allocates the result directly instead of building a Python
        # list of len(X) references and converting it afterwards.
        return np.full(len(X), self._answer)

In [3]:
# Input files: the training file is parsed by read_train (sample<TAB>label per
# line) and the test file by read_test (one sample per line).
TRAIN_FILENAME = "names_and_rubrics_learn.tsv"
TEST_FILENAME = "names_and_rubrics_test_no_rubric.tsv"

In [4]:
# Load both datasets fully into memory as NumPy arrays.
train_X, train_Y = read_train(TRAIN_FILENAME)
test_X = read_test(TEST_FILENAME)

In [5]:
# Sanity check: number of training samples, training labels and test samples.
len(train_X), len(train_Y), len(test_X)

(8917102, 8917102, 1000000)

In [None]:
# Cross-validate the majority-label baseline; meashure_model returns the mean
# and variance of the per-fold accuracies.
most_popular_model = MostPopularModel()
mean, var = meashure_model(most_popular_model, train_X, train_Y)

In [None]:
# Display the baseline's mean fold accuracy and its variance.
mean, var