In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.sparse import csr_matrix
from tqdm import tqdm
from sklearn.datasets import load_svmlight_file
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show what the competition dataset directory contains.
input_files = os.listdir("../input/movie-ratings")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
# Read the train and test splits; each call returns (sparse feature matrix, label array).
train_path = '../input/movie-ratings/movie-ratings/data-splits/data.train'
test_path = '../input/movie-ratings/movie-ratings/data-splits/data.test'

training_data, training_labels = load_svmlight_file(train_path)
testing_data, testing_labels = load_svmlight_file(test_path)

In [None]:
# The perceptron below compares label * activation against the margin,
# so the negative class is re-encoded from 0 to -1 (in place, on both splits).
for split_labels in (training_labels, testing_labels):
    np.putmask(split_labels, split_labels == 0, -1)

In [None]:
class MarginPerceptron:
    """Binary margin perceptron trained with per-example updates.

    The model keeps a dense 1-D weight vector ``w`` and a scalar bias ``b``.
    An example (x, y) with y in {-1, +1} triggers an update whenever
    ``y * (w . x + b) < margin``.

    The class exposes ``get_params`` / ``set_params`` / ``fit`` / ``predict`` /
    ``score`` so it can be driven by scikit-learn model-selection helpers.

    Parameters
    ----------
    learning_rate : float
        Base step size of each update.
    margin : float
        Required functional margin; examples below it cause an update.
    epochs : int
        Number of shuffled passes over the training data.
    decay : bool
        When true, the step size is divided by ``1 + t`` where ``t`` counts
        the examples visited so far during the current ``fit`` call.
    """

    # Hyper-parameters accepted by __init__, reported by get_params() and
    # overridable through keyword arguments of fit().
    _PARAM_NAMES = ('learning_rate', 'margin', 'epochs', 'decay')

    def __init__(self, learning_rate=0.01, margin=1, epochs=20, decay=False):
        self.w = None
        self.b = None
        self.learning_rate = learning_rate
        self.margin = margin
        self.epochs = epochs
        self.decay = decay

    def fit(self, X, y, updates=0, **kwargs):
        """Train on ``X`` / ``y`` and return the accuracy on that same data.

        Parameters
        ----------
        X : sparse matrix or array-like, one row per example.
        y : array-like of labels (-1 / +1), one per row of ``X``.
        updates : unused; kept so existing callers keep working.
        **kwargs : optional overrides for any of ``learning_rate``,
            ``margin``, ``epochs`` and ``decay``. Any subset may be given;
            other keys are ignored.

        Weights and bias are only initialised when they are still ``None``,
        so calling ``fit`` again continues from the current model.
        """
        # Accept any subset of overrides instead of requiring all four keys.
        for name in self._PARAM_NAMES:
            if name in kwargs:
                setattr(self, name, kwargs[name])

        X = csr_matrix(X)
        y = np.asarray(y)
        assert X.shape[0] == y.shape[0], 'X and y must have the same number of rows'

        n_samples, n_features = X.shape
        if self.w is None:
            # Small random start, uniform in [-0.1, 0.1).
            self.w = 0.2 * np.random.random_sample(n_features) - 0.1
        else:
            # Keep the weights a dense float vector whatever they were set to.
            self.w = np.array(self.w, dtype=float).ravel()
            if self.w.shape[0] != n_features:
                raise ValueError(
                    'X has {} features but the model was initialised with {}'.format(
                        n_features, self.w.shape[0]))
        if self.b is None:
            self.b = 0.2 * np.random.random_sample() - 0.1

        indptr, indices, data = X.indptr, X.indices, X.data
        t = 0
        for _ in tqdm(range(self.epochs)):
            # Visit the examples in a fresh random order every epoch.
            for i in np.random.permutation(n_samples):
                start, stop = indptr[i], indptr[i + 1]
                cols = indices[start:stop]
                vals = data[start:stop]
                activation = np.dot(vals, self.w[cols]) + self.b
                if y[i] * activation < self.margin:
                    step = self.learning_rate * y[i] / (1 + t)
                    # add.at accumulates correctly even if a row repeats a column index.
                    np.add.at(self.w, cols, step * vals)
                    self.b += step
                if self.decay:
                    t += 1

        return self.score(X, y)

    def predict(self, X):
        """Return ``sign(X . w + b)`` for every row of ``X`` as a 1-D float array.

        ``X`` may have fewer or more columns than the training data: missing
        columns are treated as zeros and extra columns get a zero weight.

        Raises
        ------
        AttributeError
            If the model has no weights yet (``fit`` was never called).
        """
        if self.w is None or self.b is None:
            raise AttributeError('MarginPerceptron has not been fitted yet; call fit() first')

        X = csr_matrix(X)
        weights = np.asarray(self.w, dtype=float).ravel()
        n_cols, n_features = X.shape[1], weights.shape[0]
        if n_cols < n_features:
            weights = weights[:n_cols]
        elif n_cols > n_features:
            weights = np.concatenate([weights, np.zeros(n_cols - n_features)])

        activations = np.asarray(X.dot(weights)).ravel() + self.b
        return np.sign(activations)

    def get_params(self, deep=False):
        """Return every constructor hyper-parameter.

        All of them must be listed: scikit-learn's ``clone`` rebuilds the
        estimator from this dictionary, so an omitted key silently falls back
        to its default value.
        """
        return {name: getattr(self, name) for name in self._PARAM_NAMES}

    def set_params(self, **parameters):
        """Set the given attributes on the estimator and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def score(self, X, y):
        """Return the fraction of rows whose predicted sign matches ``y``."""
        y = np.asarray(y)
        assert X.shape[0] == y.shape[0], 'X and y must have the same number of rows'
        predictions = self.predict(X)
        correct = int(np.count_nonzero(np.sign(y) == np.sign(predictions)))
        return correct / X.shape[0]

In [None]:
# Train a perceptron with the default hyper-parameters; fit() returns the training accuracy.
perceptron = MarginPerceptron()
training_accuracy = perceptron.fit(training_data, training_labels)
print('training accuracy: {}'.format(training_accuracy))

In [None]:
# Accuracy of the perceptron on the test split (shown as the cell output).
perceptron.score(testing_data, testing_labels)

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV

# Hyper-parameters are selected with 5-fold cross-validation on the TRAINING
# split only. Searching on the test split would leak it into model selection
# and leave the refitted best estimator trained on test data alone.
param_grid = {
    'learning_rate': [30, 20, 10, 1, 0.1, 0.01],
    'margin': [1, 2],
    'decay': [False, True],
    'epochs': [50],
}
clf = GridSearchCV(MarginPerceptron(), param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
clf.fit(training_data, training_labels)

In [None]:
# Winning hyper-parameter combination, then its mean cross-validated score.
print(clf.best_params_, clf.best_score_, sep='\n')

In [None]:
# Re-check the selected hyper-parameters with 5-fold CV on the training split.
# They are forwarded to MarginPerceptron.fit() as keyword overrides.
# NOTE(review): newer scikit-learn releases rename `fit_params` to `params` - confirm
# against the installed version.
cv_scores = cross_val_score(
    MarginPerceptron(),
    training_data,
    y=training_labels,
    scoring='accuracy',
    fit_params=clf.best_params_,
    cv=5,
    n_jobs=-1,
)
cv_scores

In [None]:
# Accuracy of the grid-search winner on the test split (shown as the cell output).
clf.best_estimator_.score(testing_data, testing_labels)

In [None]:
# Label the anonymised evaluation split with the grid-search winner.
eval_path = '../input/movie-ratings/movie-ratings/data-splits/data.eval.anon'
eval_data, _ = load_svmlight_file(eval_path)
perceptron = clf.best_estimator_

submission_pred = perceptron.predict(eval_data)
# Undo the earlier 0 -> -1 relabelling so negatives are reported as 0 again.
np.putmask(submission_pred, submission_pred == -1.0, 0)

In [None]:
# Read the example ids first so a mismatch is detected before anything is written.
ids_path = '../input/movie-ratings/movie-ratings/data-splits/data.eval.anon.id'
with open(ids_path, 'r') as example_ids:
    eval_ids = [line.strip() for line in example_ids if line.strip()]

# zip() would silently drop rows if the two sequences differed in length,
# producing an incomplete submission file; fail loudly instead.
if len(eval_ids) != len(submission_pred):
    raise ValueError(
        'number of example ids ({}) does not match number of predictions ({})'.format(
            len(eval_ids), len(submission_pred)))

with open('submission.csv', 'w') as submission:
    submission.write('example_id,label\n')
    submission.writelines(
        '{},{}\n'.format(example_id, int(label))
        for example_id, label in zip(eval_ids, submission_pred)
    )