In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.sparse import csr_matrix
from tqdm import tqdm, trange
from sklearn.datasets import load_svmlight_file
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# os.listdir returns entries in arbitrary order; sort so the cell output is
# the same on every run.
print(sorted(os.listdir("../input/movie-ratings")))

# Any results you write to the current directory are saved as output.

In [None]:
# Directory holding the svmlight-formatted data splits.
DATA_SPLITS_DIR = '../input/movie-ratings/movie-ratings/data-splits'

training_data, training_labels = load_svmlight_file(f'{DATA_SPLITS_DIR}/data.train')

# The number of columns is inferred per file, so the test split can come out
# narrower than the training split. Passing n_features makes the loader build
# the matrix with the training width directly (and raise a clear error if the
# test file references a feature index beyond it) instead of re-wrapping the
# raw CSR buffers by hand.
testing_data, testing_labels = load_svmlight_file(
    f'{DATA_SPLITS_DIR}/data.test',
    n_features=training_data.shape[1],
)

In [None]:
# Show the feature-matrix shape and label-vector shape of each split,
# training first, then testing.
for split_features, split_labels in (
    (training_data, training_labels),
    (testing_data, testing_labels),
):
    print(split_features.shape, split_labels.shape)

In [None]:
class SVM:
    """Linear multiclass SVM (hinge loss, margin 1) trained by mini-batch SGD.

    The model is a weight matrix ``W`` of shape ``(n_features, n_classes)``;
    the predicted class of a row is the arg-max of ``X.dot(W)``.

    Labels are used directly as column indices into the score matrix, so they
    are expected to be the integers ``0 .. K-1``.
    NOTE(review): negative labels would silently index from the last column;
    confirm the data files really use 0/1 labels.

    Parameters
    ----------
    learning_rate : float
        SGD step size.
    reg_const : float
        Multiplier applied to the regularisation term and its gradient.
    l1 : bool
        If True use the L1 penalty ``sum(|W|)``, otherwise the L2 penalty
        ``0.5 * sum(W**2)``.
    num_iters : int
        Number of SGD steps performed by ``fit``.
    batch_size : int
        Number of rows sampled (with replacement) per SGD step.
    """

    def __init__(self, learning_rate=0.1, reg_const=0.05, l1=False, num_iters=10, batch_size=200):
        self.W = None  # created lazily by fit()
        self.learning_rate = learning_rate
        self.reg_const = reg_const
        self.l1 = l1
        self.num_iters = num_iters
        self.batch_size = batch_size

    def reg_fun(self, x):
        """Return the regularisation penalty of weight array ``x``.

        The penalty is chosen from the current value of ``self.l1`` on every
        call, so changing ``l1`` through ``set_params`` or ``fit`` takes
        effect immediately.
        """
        if self.l1:
            return np.sum(np.abs(x))
        return 0.5 * np.sum(x * x)

    def reg_fun_grad(self, x):
        """Return the (sub)gradient of ``reg_fun`` at ``x``, same shape as ``x``.

        For L1 this is the element-wise sign, with entries within 1e-5 of
        zero given a zero subgradient. For L2 it is ``x`` itself.
        """
        if self.l1:
            return np.where(np.isclose(x, 0, atol=1e-5), 0.0, np.sign(x))
        return x

    def loss(self, X_batch, y_batch):
        """Compute the regularised hinge loss and its gradient for one batch.

        Parameters
        ----------
        X_batch : array or scipy sparse matrix, shape (n, n_features)
        y_batch : array-like of n class indices (cast to int)

        Returns
        -------
        (loss, dW) : float, ndarray with the same shape as ``self.W``
        """
        y_batch = np.asarray(y_batch).astype(int)
        num_train = X_batch.shape[0]
        rows = np.arange(num_train)

        scores = X_batch.dot(self.W)
        correct_class_scores = scores[rows, y_batch].reshape(-1, 1)
        margins = np.maximum(0, scores + 1 - correct_class_scores)
        # The true class never contributes to its own hinge term.
        margins[rows, y_batch] = 0
        loss = np.sum(margins) / num_train + self.reg_const * self.reg_fun(self.W)

        # Each violated margin adds +x to the offending class column and -x to
        # the true class column of the gradient.
        coeff_mat = (margins > 0).astype(float)
        coeff_mat[rows, y_batch] = -np.sum(coeff_mat, axis=1)
        dW = X_batch.T.dot(coeff_mat) / num_train
        dW = dW + self.reg_const * self.reg_fun_grad(self.W)

        return loss, dW

    def fit(self, X, y, **kwargs):
        """Run ``num_iters`` steps of mini-batch SGD on ``(X, y)``.

        Any of the hyperparameters ``learning_rate``, ``reg_const``, ``l1``,
        ``num_iters`` and ``batch_size`` may be passed as keyword arguments to
        override the stored value; omitted ones keep their current value and
        other keywords are ignored.

        If ``W`` already exists (from an earlier ``fit``) training continues
        from it; otherwise it is initialised with small Gaussian noise.
        Batches are drawn with NumPy's global random generator.

        Returns
        -------
        self
        """
        for name in ('learning_rate', 'reg_const', 'l1', 'num_iters', 'batch_size'):
            if name in kwargs:
                setattr(self, name, kwargs[name])

        y = np.asarray(y)
        num_train, dim = X.shape
        # Labels are column indices 0..K-1; always keep at least two columns so
        # a binary problem behaves the same even if a split only contains 0s.
        num_classes = max(2, int(np.max(y)) + 1)
        if self.W is None:
            # lazily initialize W
            self.W = 0.001 * np.random.randn(int(dim), int(num_classes))

        # Run stochastic gradient descent to optimize W
        for _ in trange(self.num_iters, mininterval=5):
            batch_idx = np.random.choice(num_train, self.batch_size, replace=True)
            _, grad = self.loss(X[batch_idx], y[batch_idx].astype(int))
            self.W -= self.learning_rate * grad

        return self

    def get_params(self, deep=False):
        """Return the hyperparameters as a dict (``deep`` is accepted but unused)."""
        return {'learning_rate': self.learning_rate, 'reg_const': self.reg_const,
                'l1': self.l1, 'num_iters': self.num_iters, 'batch_size': self.batch_size}

    def set_params(self, **parameters):
        """Set each given keyword as an attribute on the estimator; return self."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def predict(self, X):
        """Return, for each row of ``X``, the index of the highest-scoring class."""
        scores = X.dot(self.W)
        y_pred = np.argmax(scores, axis=1)
        return y_pred

    def score(self, X, y):
        """Return the fraction of rows in ``X`` whose prediction equals ``y``."""
        assert X.shape[0] == y.shape[0]
        correct = np.count_nonzero(self.predict(X) == np.asarray(y))
        return correct / X.shape[0]

In [None]:
# fit() draws the initial weights and every mini-batch from NumPy's global
# random generator, so seed it to make this baseline run repeatable.
np.random.seed(0)

svm = SVM(learning_rate=0.01, reg_const=0.05, l1=False, num_iters=100_000, batch_size=200)
svm.fit(training_data, training_labels);

In [None]:
# Accuracy of the baseline model on the held-out test split.
svm.score(X=testing_data, y=testing_labels)

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV
# Hyperparameter grid explored with 5-fold cross-validation.
param_grid = {
    'learning_rate': [0.0001, 0.0005, 0.001],
    'reg_const': [0.0001, 0.001, 0.01, 0.1],
    'l1': [False],
    'num_iters': [200_000],
    'batch_size': [200],
}
clf = GridSearchCV(SVM(), param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Tune (and refit the best estimator) on the training split only. Searching on
# the test split would leak it into model selection and make every later
# test-set score optimistic.
clf.fit(training_data, training_labels)

In [None]:
# Winning hyperparameter combination, then its mean cross-validated accuracy.
print(clf.best_params_, clf.best_score_, sep='\n')

In [None]:
# Cross-validate a fresh model built with the tuned hyperparameters.
# They are passed to the constructor rather than through cross_val_score's
# `fit_params` argument, which scikit-learn deprecated in 1.4 in favour of
# `params`; constructor arguments work on every version.
cross_val_score(SVM(**clf.best_params_), training_data, y=training_labels,
                scoring='accuracy', cv=5, n_jobs=-1)

In [None]:
# Accuracy of the refit best estimator on the test split.
clf.best_estimator_.score(X=testing_data, y=testing_labels)

In [None]:
svm = clf.best_estimator_

# Load the anonymised evaluation split with the training feature count so the
# matrix lines up with the learned weights. n_features lets the loader pad
# (and validate) the width instead of rebuilding the CSR matrix by hand.
eval_data, _ = load_svmlight_file(
    '../input/movie-ratings/movie-ratings/data-splits/data.eval.anon',
    n_features=training_data.shape[1],
)
submission_pred = svm.predict(eval_data)

In [None]:
# Write the submission file: a header row followed by one "id,label" row per
# evaluation example, pairing each line of the id file with its prediction.
with open('submission.csv', 'w') as submission, \
        open('../input/movie-ratings/movie-ratings/data-splits/data.eval.anon.id', 'r') as example_ids:
    submission.write('example_id,label\n')
    submission.writelines(
        f'{raw_id.strip()},{int(predicted)}\n'
        for raw_id, predicted in zip(example_ids, submission_pred)
    )