In [1]:
import numpy as np
import pandas as pd
from numpy import linalg
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn import preprocessing, cross_validation
import cvxopt
import cvxopt.solvers



In [10]:
df = pd.read_csv('transfusion.csv')
df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),class
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [17]:
df.dropna(inplace=True)
X = np.array(df.drop('class', axis=1))
X = preprocessing.scale(X)
y = np.array(df['class'])

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)



In [18]:
clf = SVC()
clf.fit(X_train, y_train)

accuracy = clf.score(X_test, y_test)
print(accuracy)

0.806666666667


In [19]:
def gaussian_kernel(x, y, sigma=5.0):
    return np.exp(-linalg.norm(x-y)**2 / (2 * (sigma ** 2)))

In [20]:
class SVM(object):

    def __init__(self, kernel):
        self.kernel = kernel

    def fit(self, X, y):
        n_samples, n_features = X.shape

        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i,j] = self.kernel(X[i], X[j])

        # cvxopt is for convex optimization 
        P = cvxopt.matrix(np.outer(y,y) * K)
        q = cvxopt.matrix(np.ones(n_samples) * -1)
        A = cvxopt.matrix(y, (1,n_samples))
        b = cvxopt.matrix(0.0)

        G = cvxopt.matrix(np.diag(np.ones(n_samples) * -1))
        h = cvxopt.matrix(np.zeros(n_samples))

        a = np.ravel(cvxopt.solvers.qp(P, q, G, h, A, b)['x'])

        sv = a > 1e-5
        ind = np.arange(len(a))[sv]
        self.a = a[sv]
        self.sv = X[sv]
        self.sv_y = y[sv]

        self.b = 0
        for n in range(len(self.a)):
            self.b += self.sv_y[n]
            self.b -= np.sum(self.a * self.sv_y * K[ind[n],sv])
        self.b /= len(self.a)

    def predict(self, X):
        y_predict = np.zeros(len(X))
        for i in range(len(X)):
            s = 0
            for a, sv_y, sv in zip(self.a, self.sv_y, self.sv):
                s += a * sv_y * self.kernel(X[i], sv)
            y_predict[i] = s
        return np.sign(y_predict + self.b)


In [21]:
def zero_to_minus_one(data):
    y_tmp = []
    for value in data:
        if value == 0:
            y_tmp.append(-1.0)
        else: 
            y_tmp.append(1.0)
    return np.array(y_tmp)

In [22]:
y_train_my_svm = zero_to_minus_one(y_train)
y_test_my_svm = zero_to_minus_one(y_test)

my_clf = SVM(gaussian_kernel)
my_clf.fit(X_train.astype(float), y_train_my_svm)

predictions = my_clf.predict(X_test)
my_accuracy = accuracy_score(y_test_my_svm, predictions)
print(my_accuracy)

     pcost       dcost       gap    pres   dres
 0: -3.6346e+02 -1.1066e+03  2e+03  3e+01  3e+00
 1: -1.7994e+03 -2.5316e+03  9e+02  1e+01  1e+00
 2: -5.4051e+03 -6.5682e+03  1e+03  1e+01  1e+00
 3: -2.4087e+04 -2.6222e+04  2e+03  1e+01  1e+00
 4: -9.2626e+04 -9.8888e+04  6e+03  1e+01  1e+00
 5: -1.2310e+05 -1.3123e+05  8e+03  1e+01  1e+00
 6: -4.1250e+05 -4.3580e+05  2e+04  1e+01  1e+00
 7: -1.1125e+06 -1.1695e+06  6e+04  1e+01  1e+00
 8: -2.2408e+06 -2.3497e+06  1e+05  1e+01  1e+00
 9: -4.5501e+06 -4.7604e+06  2e+05  1e+01  1e+00
10: -3.3969e+07 -3.5102e+07  1e+06  1e+01  1e+00
11: -3.5906e+07 -3.7100e+07  1e+06  1e+01  1e+00
12: -4.3504e+07 -4.4935e+07  1e+06  1e+01  1e+00
13: -7.5427e+07 -7.7835e+07  2e+06  1e+01  1e+00
14: -1.4880e+08 -1.5340e+08  5e+06  1e+01  1e+00
15: -3.4017e+08 -3.5027e+08  1e+07  1e+01  1e+00
16: -4.6064e+08 -4.7417e+08  1e+07  1e+01  1e+00
17: -2.2282e+09 -2.2868e+09  6e+07  1e+01  1e+00
18: -5.8507e+09 -5.9972e+09  1e+08  1e+01  1e+00
19: -1.4080e+10 -1.44