In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.sparse import csr_matrix, vstack
from collections import Counter
from tqdm import tqdm, trange
from sklearn.datasets import load_svmlight_file
from scipy.stats import norm
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# List the dataset directory so its contents show up in the cell output.
print(os.listdir("../input/movie-ratings"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the train/test splits (svmlight format).
# The test split is parsed with the training feature count so both matrices
# have the same number of columns even when the test file never mentions the
# highest-numbered features; load_svmlight_file raises if the test file
# contains more features than the training split.
training_data, training_labels = load_svmlight_file('../input/movie-ratings/movie-ratings/data-splits/data.train')
testing_data, testing_labels = load_svmlight_file(
    '../input/movie-ratings/movie-ratings/data-splits/data.test',
    n_features=training_data.shape[1],
)

In [None]:
# Sanity check: each split's feature matrix and label vector shapes, side by side.
print(f"{training_data.shape} {training_labels.shape}")
print(f"{testing_data.shape} {testing_labels.shape}")

In [None]:
gauss = norm()  # frozen standard normal, reused for every density evaluation


def gaussian(mean, std_dev, value):
    """Evaluate the N(mean, std_dev**2) density at `value`, floored at 1e-6.

    Parameters
    ----------
    mean : float
        Mean of the distribution.
    std_dev : float
        Standard deviation of the distribution (not the variance). Values
        below 1e-6 are raised to 1e-6 to avoid dividing by zero.
    value : float or np.ndarray
        Point(s) at which to evaluate the density.

    Returns
    -------
    float or np.ndarray
        Density at each point, clipped from below at 1e-6 so that a later
        `np.log` never receives zero.
    """
    # Floor the scale so constant features do not divide by zero.
    sigma = np.maximum(std_dev, 1e-6)
    # pdf(x; mean, sigma) = phi((x - mean) / sigma) / sigma, where phi is the
    # standard normal density.
    density = gauss.pdf((value - mean) / sigma) / sigma
    return np.clip(density, a_min=1e-6, a_max=None)

In [None]:
class NB:
    """Gaussian Naive Bayes for binary labels (0 and 1) on sparse feature matrices.

    Attributes (both are None until `fit` is called):
        priors: dict mapping each label seen in training to the log of its
            relative frequency.
        probs: list with one entry per feature column; `probs[j][c]` is the
            `(mean, std_dev)` of feature `j` over the training rows labelled `c`.
    """

    def __init__(self):
        self.priors = None
        self.probs = None

    def fit(self, X, y):
        """Estimate log priors and per-class feature statistics.

        Parameters
        ----------
        X : scipy sparse matrix, shape (n_samples, n_features)
            Training features.
        y : array-like, shape (n_samples,)
            Training labels. `predict` reads `priors[0]` and `priors[1]`, so
            both labels 0 and 1 must occur here.
        """
        y = np.asarray(y)
        n = len(y)
        self.priors = {label: np.log(count / n) for label, count in Counter(y).items()}

        # Row indices of each class, ordered so position c holds label c.
        class_rows = [np.nonzero(y == 0)[0], np.nonzero(y == 1)[0]]

        # CSC makes the per-column slicing below cheap.
        X = X.tocsc()
        self.probs = []
        for idx in tqdm(range(X.shape[1])):
            column = X[:, idx].toarray().ravel()
            self.probs.append(
                [(np.mean(column[rows]), np.std(column[rows])) for rows in class_rows]
            )

    def predict(self, X):
        """Return the most likely label (0 or 1) for every row of `X`.

        For each class the score starts at the log prior and adds, per
        feature, the log of the value returned by the notebook's `gaussian`
        helper for that feature's stored `(mean, std_dev)`.

        Parameters
        ----------
        X : scipy sparse matrix, shape (n_samples, n_features)
            Must have the same number of columns as the matrix passed to `fit`.

        Returns
        -------
        np.ndarray of int, shape (n_samples,)
            Index (0 or 1) of the higher-scoring class for each row.
        """
        log_scores = np.zeros((X.shape[0], 2))
        log_scores[:, 0] = self.priors[0]
        log_scores[:, 1] = self.priors[1]

        X = X.tocsc()
        for idx in tqdm(range(X.shape[1])):
            col = X[:, idx].toarray().ravel()
            for possible_y in range(2):
                mean, std_dev = self.probs[idx][possible_y]
                log_scores[:, possible_y] += np.log(gaussian(mean, std_dev, col))
        return np.argmax(log_scores, axis=1)

    def score(self, X, y):
        """Return the accuracy of `predict(X)` against the labels `y`.

        Parameters
        ----------
        X : scipy sparse matrix, shape (n_samples, n_features)
        y : array-like, shape (n_samples,)
            True labels for the rows of `X`.
        """
        y = np.asarray(y)
        assert X.shape[0] == y.shape[0]
        # Compare against the labels that were passed in, not a notebook global.
        return (self.predict(X) == y).mean()

In [None]:
# Build the classifier and estimate its parameters from the training split.
nb = NB()
nb.fit(X=training_data, y=training_labels)

In [None]:
# Accuracy on the held-out test split (displayed as the cell's output).
nb.score(X=testing_data, y=testing_labels)

In [None]:
# Compute the test predictions in this cell: no other cell defines `pred`,
# so without this line the notebook fails on Restart & Run All.
pred = nb.predict(testing_data)
(pred == testing_labels).mean()

In [None]:
# Load the anonymised evaluation split with the training feature count so its
# columns line up with what the model was fitted on; the labels in this file
# are discarded.
eval_data, _ = load_svmlight_file(
    '../input/movie-ratings/movie-ratings/data-splits/data.eval.anon',
    n_features=training_data.shape[1],
)
submission_pred = nb.predict(eval_data)

In [None]:
# Write the submission file: a header row followed by one "example_id,label"
# line per evaluation example, pairing each id with its prediction in order.
with open('submission.csv', 'w') as submission, \
        open('../input/movie-ratings/movie-ratings/data-splits/data.eval.anon.id', 'r') as example_ids:
    submission.write('example_id,label\n')
    for example_id, label in zip(example_ids, submission_pred):
        row = '{},{}\n'.format(example_id.strip(), int(label))
        submission.write(row)