In [1]:
# importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import scale
from math import exp
from collections import defaultdict 
from sklearn.metrics.pairwise import chi2_kernel
# import warnings
# warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Google Colab Notebooks/DMW/SVMC Markov/letter-recognition.csv')

# feature names were like - 'letter ' changed it to 'letter'
df.columns = ['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar',
       'ybar', 'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge',
       'xedgey', 'yedge', 'yedgex']
print(df.columns)

Index(['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar', 'ybar',
       'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge', 'xedgey',
       'yedge', 'yedgex'],
      dtype='object')


In [3]:
print(f"Number of rows = {df.shape[0]}, cols = {df.shape[1]}")
df.head()

Number of rows = 20000, cols = 17


Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [4]:
# number of different classes
classes = df['letter'].unique()
print(f'Number of different labels in dataset = {len(classes)}')
classes.sort()
for cls in classes:
    rows = len(df[df['letter'] == cls])
    print(f'Number of data points in class {cls} = {rows}')

Number of different labels in dataset = 26
Number of data points in class A = 789
Number of data points in class B = 766
Number of data points in class C = 736
Number of data points in class D = 805
Number of data points in class E = 768
Number of data points in class F = 775
Number of data points in class G = 773
Number of data points in class H = 734
Number of data points in class I = 755
Number of data points in class J = 747
Number of data points in class K = 739
Number of data points in class L = 761
Number of data points in class M = 792
Number of data points in class N = 783
Number of data points in class O = 753
Number of data points in class P = 803
Number of data points in class Q = 783
Number of data points in class R = 758
Number of data points in class S = 748
Number of data points in class T = 796
Number of data points in class U = 813
Number of data points in class V = 764
Number of data points in class W = 752
Number of data points in class X = 787
Number of data points

In [5]:
def predict(z):
  x = zt.loc[:, zt.columns != 'letter']
  y = zt.loc[:, zt.columns == 'letter']
  # doubtful
  fx = f0.predict(x)
  return fx

In [6]:
# returns the loss value calculated using model f0
def loss(z):
  y = z.iloc[0]['letter']
  # doubtful
  fx = predict(z)

  if fx*y > 1:
    return 0
  else:
    return 1-fx*y

In [7]:
def calculate_P(zs,zst):
  P = exp(-loss(zs))/exp(-loss(zt))
  return P

In [8]:
def acceptance_prob(zs,zt):
  ys = zs.iloc[0]['letter']
  yt = zt.iloc[0]['letter']

  fs = predict(zs)
  ft = predict(zt)

  return exp(-fs*ys)/exp(-ft*yt)

In [9]:
def hellinger(X1, X2):
  return np.sqrt(np.dot(X1,X2.T))

In [10]:
avg_acc = {'linear':0, 'rbf':0, 'poly':0, 'chi_squared':0, 'hellinger':0}

for alp in classes:
    print(f'\nRunning for alphabet = {alp}')

    df1 = df.copy()

    alphabet = alp
    df1.loc[(df1.letter != alphabet),'letter'] = -1
    df1.loc[(df1.letter == alphabet),'letter'] = 1
    pos = df1[df1['letter']==1].sample(frac=1)
    neg = df1[df1['letter']==-1].sample(frac=1/25)
    df_final = pos.append(neg,ignore_index=True)

    df_final_train, df_final_test = train_test_split(df_final, test_size=0.3, random_state=42)
    M = 1000
    N1 = round(M * 0.75)
    df_sample = df_final_train.sample(n=N1)
    df_sample_features = df_sample.loc[:, df_sample.columns != 'letter']
    df_sample_target = df_sample.loc[:, ['letter']]

    # df_sample_features = scale(df_sample_features)
    df_sample_target = df_sample_target.astype('int')

    f0 = SVC(kernel = 'linear')
    f0.fit(df_sample_features, df_sample_target.values.ravel())

    m_pos, m_neg = 0, 0
    zt = df_final_train.sample(n=1)
    # print(zt)
    if M%2 == 0:
        label = zt.iloc[0]['letter']
        print("")
        if label == 1:
            m_pos += 1
        else:
            m_neg += 1

    # Step 3
    # should its type be pandas.dataframe?
    sampled_data = []
    # keep count of how many times a sample is rejected
    rejected = defaultdict(int)

    # all variable names are as given in paper
    # constants used in algorithm, see section 4A point 1 in paper
    K = 5
    Q = 1.2
    while m_pos < M//2 or m_neg < M//2:
        # draw one sample randomly = z_star(zs)
        zs = df_final_train.sample(n=1)
        # print("considering ",zs)
        P = calculate_P(zs,zt)
        
        # yt corresponds to Y of zt
        yt = zt.iloc[0]['letter']
        ys = zs.iloc[0]['letter']

        # alpha is the acceptance probability of zs, given in step 5
        alpha = P
        # need to convert it to tuple so that it becomes hashable
        tpl = tuple(zs.to_records(index=False)[0])

        if P==1 and yt==-1 and ys==-1:
            alpha = acceptance_prob(zs,zt)  # P'
        elif P==1 and yt==1 and ys==1:    
            alpha = acceptance_prob(zs,zt)  # P'
        elif (P==1 and yt*ys==-1) or P<1 :
            alpha = P                       # P
        elif rejected[tpl]>K:
            alpha = QP                      # P''
                                
        if alpha>1:
            alpha=1

        # print(f"alpha {alpha}")
        if np.random.random() < alpha:
            sampled_data.append(zs)
        else:
            rejected[tpl]+=1

        if yt == 1:
            m_pos += 1 
        else:
            m_neg += 1
        
        # for next iteration 
        zt = zs
        # print(f"Positive = {m_pos} Negative = {m_neg}")

    dataset = pd.concat(sampled_data)
    features = dataset.loc[:, dataset.columns != 'letter']
    target = dataset.loc[:, ['letter']]

    # X_train = scale(features)
    X_train = features
    y_train = target
    y_train = y_train.astype('int')


    # X_test = scale(df_final_test.loc[:, df_final_test.columns != 'letter'])
    X_test = df_final_test.loc[:, df_final_test.columns != 'letter']
    y_test = df_final_test.loc[:, ['letter']]
    y_test = y_test.astype('int')


    # model training
    import warnings
    warnings.filterwarnings("ignore")
    kernels = ['linear', 'rbf', 'poly']
    for kernel in kernels:
        model = SVC(kernel = kernel)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_true = y_test, y_pred = y_pred)
        print(f"Kernel = {kernel}, accuracy = {acc * 100}")
        avg_acc[kernel] += acc

    model = SVC(kernel = hellinger)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_true = y_test, y_pred = y_pred)
    print(f"Kernel = hellinger, accuracy = {acc * 100}")
    avg_acc['hellinger'] += acc

    model = SVC(kernel = chi2_kernel)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_true = y_test, y_pred = y_pred)
    print(f"Kernel = chi2_kernel, accuracy = {acc * 100}")
    avg_acc['chi_squared'] += acc

print()
for item in avg_acc:
    val = avg_acc[item] / 26
    print(f"Average accuracy for {item} = {val * 100}")


Running for alphabet = A

Kernel = linear, accuracy = 95.94017094017094
Kernel = rbf, accuracy = 97.00854700854701
Kernel = poly, accuracy = 99.14529914529915
Kernel = hellinger, accuracy = 94.87179487179486
Kernel = chi2_kernel, accuracy = 96.15384615384616

Running for alphabet = B

Kernel = linear, accuracy = 89.80477223427332
Kernel = rbf, accuracy = 93.27548806941431
Kernel = poly, accuracy = 93.70932754880694
Kernel = hellinger, accuracy = 91.3232104121475
Kernel = chi2_kernel, accuracy = 96.74620390455532

Running for alphabet = C

Kernel = linear, accuracy = 92.05298013245033
Kernel = rbf, accuracy = 94.48123620309052
Kernel = poly, accuracy = 95.14348785871964
Kernel = hellinger, accuracy = 90.50772626931567
Kernel = chi2_kernel, accuracy = 97.57174392935983

Running for alphabet = D

Kernel = linear, accuracy = 87.07627118644068
Kernel = rbf, accuracy = 92.37288135593221
Kernel = poly, accuracy = 93.22033898305084
Kernel = hellinger, accuracy = 82.41525423728814
Kernel = chi