In [2]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from numpy import linalg
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn import preprocessing, cross_validation
import cvxopt
import cvxopt.solvers

In [3]:
# Read the blood-transfusion dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='transfusion.csv')

# Preview the first five rows (the default for `head`) as the cell output.
df.head(5)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),class
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [4]:
# Build the feature matrix / label vector and hold out 20% of the rows for testing.

# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer its
# replacement and only fall back to the legacy module on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    train_test_split = cross_validation.train_test_split

RANDOM_STATE = 42  # fixed seed so the train/test split is reproducible

# Rebind instead of mutating in place so re-running the cell is harmless.
df = df.dropna()

# The preview of `df` shows an 'Unnamed: 0' column that simply counts the rows
# (it looks like an index saved into the CSV).  It is not a measurement, so it
# is kept out of the features together with the label column.
feature_columns = [
    column for column in df.columns
    if column != 'class' and not str(column).startswith('Unnamed')
]

# NOTE(review): the scaling statistics are computed on all rows, including the
# ones that end up in the test split below.
X = preprocessing.scale(np.array(df[feature_columns], dtype=float))
y = np.array(df['class'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [5]:
# Baseline: scikit-learn's SVC with its default hyper-parameters.
# `fit` returns the estimator itself, so construction and training can be chained.
clf = SVC().fit(X_train, y_train)

# Fraction of held-out samples that are classified correctly.
accuracy = clf.score(X_test, y_test)
print(accuracy)

0.806666666667


In [6]:
def gaussian_kernel(x, y, sigma=5.0):
    """Gaussian (RBF) kernel: exp(-||x - y||^2 / (2 * sigma^2)).

    Parameters
    ----------
    x, y : numpy arrays of the same shape (they are subtracted element-wise).
    sigma : float, kernel width; larger values make the kernel decay more slowly.

    Returns
    -------
    float in (0, 1]; exactly 1.0 when ``x`` and ``y`` coincide.
    """
    distance = linalg.norm(x - y)
    return np.exp(-(distance ** 2) / (2 * sigma ** 2))

In [7]:
class SVM(object):
    """Binary kernel SVM trained by solving the dual quadratic program with cvxopt.

    Labels given to ``fit`` must be encoded as -1 / +1.

    Parameters
    ----------
    kernel : callable
        ``kernel(x1, x2) -> float``, evaluated on pairs of sample vectors.
    C : float or None, optional
        Upper bound on every dual coefficient (soft margin).  ``None`` (the
        default) keeps the hard-margin problem, in which the coefficients are
        only bounded below by zero; on data that the kernel cannot separate
        the solver then has no finite optimum to converge to.

    Attributes set by ``fit``
    -------------------------
    a : dual coefficients of the support vectors
    sv : the support vectors (rows of the training matrix)
    sv_y : labels of the support vectors
    b : intercept of the decision function
    """

    def __init__(self, kernel, C=None):
        if C is not None:
            C = float(C)
            if C <= 0:
                raise ValueError("C must be positive, got %r" % (C,))
        self.kernel = kernel
        self.C = C

    def fit(self, X, y):
        """Solve the dual problem for ``X`` / ``y`` and store the support vectors.

        Parameters
        ----------
        X : array-like, one sample per row.
        y : array-like of -1 / +1 labels, one per row of ``X``.
        """
        X = np.asarray(X)
        # cvxopt's QP solver needs double-precision matrices, so integer
        # labels are promoted to float here.
        y = np.asarray(y, dtype=float).ravel()
        n_samples = X.shape[0]

        # Gram matrix of the training set.
        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i, j] = self.kernel(X[i], X[j])

        # Dual problem in cvxopt's standard form:
        #   minimise 1/2 a'Pa + q'a   subject to   Ga <= h,  Aa = b
        P = cvxopt.matrix(np.outer(y, y) * K)
        q = cvxopt.matrix(-np.ones(n_samples))
        A = cvxopt.matrix(y, (1, n_samples))
        b = cvxopt.matrix(0.0)

        if self.C is None:
            # Hard margin: only a >= 0, written as -a <= 0.
            G = cvxopt.matrix(-np.eye(n_samples))
            h = cvxopt.matrix(np.zeros(n_samples))
        else:
            # Soft margin: 0 <= a <= C, i.e. -a <= 0 stacked on top of a <= C.
            G = cvxopt.matrix(np.vstack((-np.eye(n_samples), np.eye(n_samples))))
            h = cvxopt.matrix(np.hstack((np.zeros(n_samples),
                                         np.ones(n_samples) * self.C)))

        a = np.ravel(cvxopt.solvers.qp(P, q, G, h, A, b)['x'])

        # Multipliers that are numerically non-zero mark the support vectors.
        sv = a > 1e-5
        self.a = a[sv]
        self.sv = X[sv]
        self.sv_y = y[sv]

        if self.a.size == 0:
            # Nothing to average over; avoid dividing by zero.
            self.b = 0.0
            return

        # Per support vector: its label minus the kernel expansion evaluated
        # at that vector.  The intercept is the average of these residuals.
        residual = self.sv_y - np.dot(K[sv][:, sv], self.a * self.sv_y)
        if self.C is not None:
            # With a soft margin, vectors whose coefficient sits at the upper
            # bound may lie inside the margin; prefer the ones strictly below
            # C when there are any.
            on_margin = self.a < self.C - 1e-5
            if on_margin.any():
                residual = residual[on_margin]
        self.b = residual.mean()

    def decision_function(self, X):
        """Return the signed score ``sum_i a_i * y_i * kernel(x, sv_i) + b`` per row of ``X``."""
        X = np.asarray(X)
        scores = np.zeros(len(X))
        for i in range(len(X)):
            scores[i] = sum(
                a * sv_y * self.kernel(X[i], sv)
                for a, sv_y, sv in zip(self.a, self.sv_y, self.sv)
            )
        return scores + self.b

    def predict(self, X):
        """Return the sign of the decision function (-1.0, 0.0 or +1.0) per row of ``X``."""
        return np.sign(self.decision_function(X))


In [8]:
def zero_to_minus_one(data):
    """Re-encode labels for the hand-written SVM: 0 becomes -1.0, anything else +1.0.

    Parameters
    ----------
    data : iterable of label values.

    Returns
    -------
    numpy array of floats with one entry per input value.
    """
    return np.array([-1.0 if value == 0 else 1.0 for value in data])

In [9]:
# The hand-written SVM expects labels in {-1, +1} rather than {0, 1}.
y_train_my_svm, y_test_my_svm = (
    zero_to_minus_one(labels) for labels in (y_train, y_test)
)

# Train with the Gaussian kernel on the training split.
my_clf = SVM(gaussian_kernel)
my_clf.fit(X_train.astype(float), y_train_my_svm)

# Score on the held-out split with the same metric as the baseline.
predictions = my_clf.predict(X_test)
my_accuracy = accuracy_score(y_test_my_svm, predictions)
print(my_accuracy)

     pcost       dcost       gap    pres   dres
 0: -3.8023e+02 -1.1311e+03  2e+03  3e+01  3e+00
 1: -1.7478e+03 -2.5273e+03  9e+02  1e+01  1e+00
 2: -5.2998e+03 -6.3151e+03  1e+03  1e+01  1e+00
 3: -1.5246e+04 -1.7310e+04  2e+03  1e+01  1e+00
 4: -7.1047e+04 -7.6480e+04  5e+03  1e+01  1e+00
 5: -1.8544e+05 -1.9798e+05  1e+04  1e+01  1e+00
 6: -8.3889e+05 -8.8302e+05  4e+04  1e+01  1e+00
 7: -2.3110e+06 -2.4212e+06  1e+05  1e+01  1e+00
 8: -9.1470e+06 -9.5247e+06  4e+05  1e+01  1e+00
 9: -2.7187e+07 -2.8221e+07  1e+06  1e+01  1e+00
10: -5.5946e+07 -5.7989e+07  2e+06  1e+01  1e+00
11: -1.7929e+08 -1.8533e+08  6e+06  1e+01  1e+00
12: -5.0142e+08 -5.1732e+08  2e+07  1e+01  1e+00
13: -1.7120e+09 -1.7626e+09  5e+07  1e+01  1e+00
14: -3.3171e+09 -3.4125e+09  1e+08  1e+01  1e+00
15: -1.3267e+10 -1.3619e+10  4e+08  1e+01  1e+00
16: -2.2466e+10 -2.3048e+10  6e+08  1e+01  1e+00
17: -2.6588e+10 -2.7273e+10  7e+08  1e+01  1e+00
18: -2.8277e+10 -2.9003e+10  7e+08  1e+01  1e+00
19: -4.5414e+10 -4.65