In [1]:
!curl -L https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data -o letter-recognition.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  695k  100  695k    0     0   170k      0  0:00:04  0:00:04 --:--:--  170k


In [2]:
# importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import scale
from math import exp
from collections import defaultdict 
from sklearn.metrics.pairwise import chi2_kernel
# import warnings
# warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('letter-recognition.csv')

# feature names were like - 'letter ' changed it to 'letter'
df.columns = ['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar',
       'ybar', 'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge',
       'xedgey', 'yedge', 'yedgex']
print(df.columns)

Index(['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar', 'ybar',
       'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge', 'xedgey',
       'yedge', 'yedgex'],
      dtype='object')


In [4]:
print(f"Number of rows = {df.shape[0]}, cols = {df.shape[1]}")
df.head()

Number of rows = 19999, cols = 17


Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
1,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
2,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
3,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
4,S,4,11,5,8,3,8,8,6,9,5,6,6,0,8,9,7


In [5]:
# number of different classes
classes = df['letter'].unique()
print(f'Number of different labels in dataset = {len(classes)}')
classes.sort()
for cls in classes:
    rows = len(df[df['letter'] == cls])
    print(f'Number of data points in class {cls} = {rows}')

Number of different labels in dataset = 26
Number of data points in class A = 789
Number of data points in class B = 766
Number of data points in class C = 736
Number of data points in class D = 805
Number of data points in class E = 768
Number of data points in class F = 775
Number of data points in class G = 773
Number of data points in class H = 734
Number of data points in class I = 755
Number of data points in class J = 747
Number of data points in class K = 739
Number of data points in class L = 761
Number of data points in class M = 792
Number of data points in class N = 783
Number of data points in class O = 753
Number of data points in class P = 803
Number of data points in class Q = 783
Number of data points in class R = 758
Number of data points in class S = 748
Number of data points in class T = 795
Number of data points in class U = 813
Number of data points in class V = 764
Number of data points in class W = 752
Number of data points in class X = 787
Number of data points

In [6]:
# Preparing the dataset

# binary labeling of the dataset
df1 = df.copy()

# one alphabet will be treated as positive sample and others as negative sample
alphabet = 'A'  
df1.loc[(df1.letter != alphabet),'letter'] = -1
df1.loc[(df1.letter == alphabet),'letter'] = 1

# we will make number of positve samples = number of negative samples otherwise
# otherwise the dataset will have huge number of negative samples which will slow
# down the sampling process
pos = df1[df1['letter']==1].sample(frac=1)
# we need roughly only 1/25 of total negative samples 
neg = df1[df1['letter']==-1].sample(frac=1/25)

df_final = pos.append(neg,ignore_index=True)
print(f"Length of final dataset {len(df_final)}")
df_final.head()

Length of final dataset 1557


Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,1,3,7,4,5,2,9,3,3,3,9,1,8,2,6,2,7
1,1,2,7,4,5,2,11,2,3,3,10,2,9,2,6,3,8
2,1,2,6,4,4,2,10,3,1,2,8,3,9,2,6,2,8
3,1,2,2,4,3,2,8,3,2,3,8,1,8,2,6,2,7
4,1,1,1,2,1,0,7,4,3,0,7,1,8,2,7,1,8


In [7]:
# train test split
df_final_train, df_final_test = train_test_split(df_final, test_size=0.3, random_state=42)


# we need to generate a preliminary model first = f0 using N1(< M) datapoints
M = 1000
N1 = round(M * 0.75)

# we will extract N1 data points randomly
df_sample = df_final_train.sample(n=N1)

df_sample_features = df_sample.loc[:, df_sample.columns != 'letter']
df_sample_target = df_sample.loc[:, ['letter']]

print(df_sample_features.head())
print(df_sample_target.head())

      xbox  ybox  width  height  onpix  xbar  ybar  x2bar  y2bar  xybar  \
1168     3    10      3       8      2     0     2      3      6      1   
457      3     6      5       4      2     8     4      3      0      7   
839      2     8      2       6      2     7     7      0      8      7   
434      6    11      8       8      8     8     9      8      5      6   
931      3     4      4       2      1     5    13      3      6     12   

      x2ybar  xy2bar  xedge  xedgey  yedge  yedgex  
1168       0       8      0       8      0       8  
457        1       8      2       7      1       8  
839        6       8      0       8      3       8  
434        6       8      3       7      8       4  
931        9       3      1      10      1       5  
     letter
1168     -1
457       1
839      -1
434       1
931      -1


In [8]:
# Markov Sampling 

# Step1 
# generating priliminary model f0

# scale? is it required
df_sample_features = scale(df_sample_features)
df_sample_target = df_sample_target.astype('int')

f0 = SVC(kernel = 'linear')
f0.fit(df_sample_features, df_sample_target.values.ravel())


SVC(kernel='linear')

In [9]:
# Step 2
# draw one sample randomly

m_pos, m_neg = 0, 0
zt = df_final_train.sample(n=1)
print(zt)
if M%2 == 0:
  label = zt.iloc[0]['letter']
  print("")
  if label == 1:
    m_pos += 1
  else:
    m_neg += 1


   letter  xbox  ybox  width  height  onpix  xbar  ybar  x2bar  y2bar  xybar  \
64      1     1     0      2       0      0     7     4      2      0      7   

    x2ybar  xy2bar  xedge  xedgey  yedge  yedgex  
64       2       8      2       7      1       8  



In [10]:
w_norm = np.linalg.norm(f0.coef_)  
def predict(z):
  x = zt.loc[:, zt.columns != 'letter']
  y = zt.loc[:, zt.columns == 'letter']
  # doubtful
  fx = f0.decision_function(x)/w_norm
  return fx[0]

In [11]:
# returns the loss value calculated using model f0
def loss(z):
  y = z.iloc[0]['letter']
  # doubtful
  fx = predict(z)

  if fx*y > 1:
    return 0
  else:
    return 1-fx*y

In [12]:
def calculate_P(zs,zst):
  P = exp(-loss(zs))/exp(-loss(zt))
  return P

In [13]:
def acceptance_prob(zs,zt):
  ys = zs.iloc[0]['letter']
  yt = zt.iloc[0]['letter']

  fs = predict(zs)
  ft = predict(zt)

  return exp(-fs*ys)/exp(-ft*yt)

In [14]:
# Step 3
# should its type be pandas.dataframe?
sampled_data = []
# keep count of how many times a sample is rejected
rejected = defaultdict(int)

# all variable names are as given in paper
# constants used in algorithm, see section 4A point 1 in paper
K = 5
Q = 1.2
while m_pos < M//2 or m_neg < M//2:
  # draw one sample randomly = z_star(zs)
  zs = df_final_train.sample(n=1)
  print("considering ",zs)
  P = calculate_P(zs,zt)
  
  # yt corresponds to Y of zt
  yt = zt.iloc[0]['letter']
  ys = zs.iloc[0]['letter']

  # alpha is the acceptance probability of zs, given in step 5
  alpha = P
  # need to convert it to tuple so that it becomes hashable
  tpl = tuple(zs.to_records(index=False)[0])

  if P==1 and yt==-1 and ys==-1:
    alpha = acceptance_prob(zs,zt)  # P'
  elif P==1 and yt==1 and ys==1:    
    alpha = acceptance_prob(zs,zt)  # P'
  elif (P==1 and yt*ys==-1) or P<1 :
    alpha = P                       # P
  elif rejected[tpl]>K:
    alpha = QP                      # P''
                         
  if alpha>1:
    alpha=1

  print(f"alpha {alpha}")
  if np.random.random() < alpha:
    sampled_data.append(zs)
  else:
    rejected[tpl]+=1

  if yt == 1:
    m_pos += 1 
  else:
    m_neg += 1
  
  # for next iteration 
  zt = zs
  print(f"Positive = {m_pos} Negative = {m_neg}")

     0     7     3      2      0      7   

     x2ybar  xy2bar  xedge  xedgey  yedge  yedgex  
253       2       8      2       6      1       8  
alpha 1.0
Positive = 501 Negative = 471
considering       letter  xbox  ybox  width  height  onpix  xbar  ybar  x2bar  y2bar  \
1012     -1     4     7      6       5      5     7     7      6      3   

      xybar  x2ybar  xy2bar  xedge  xedgey  yedge  yedgex  
1012      7       6      10      4       8      7       7  
alpha 1
Positive = 502 Negative = 471
considering      letter  xbox  ybox  width  height  onpix  xbar  ybar  x2bar  y2bar  xybar  \
138      1     6    11      8       9      8     7     7      8      4      8   

     x2ybar  xy2bar  xedge  xedgey  yedge  yedgex  
138       5       8      4       8     10       1  
alpha 0.0004952774059774304
Positive = 502 Negative = 472
considering      letter  xbox  ybox  width  height  onpix  xbar  ybar  x2bar  y2bar  xybar  \
698      1     4    10      5       7      4     9     4  

In [15]:
# Markov Sampling Done
# sampled data points are stored in sampled_data
print(len(sampled_data))
dataset = pd.concat(sampled_data)
print(dataset.head())

787
     letter  xbox  ybox  width  height  onpix  xbar  ybar  x2bar  y2bar  \
1366     -1     4     4      6       6      7     9     4      5      3   
1504     -1     5    10      7       7      5     7     8      8      5   
191       1     3     6      5       4      2    11     3      2      2   
866      -1     4     9      4       7      1     0     1      6      6   
1541     -1     6    10      7       8      4     4     8      6      8   

      xybar  x2ybar  xy2bar  xedge  xedgey  yedge  yedgex  
1366      7       7       8      5       9      4       8  
1504      7       7       9      3       7      4       7  
191       9       2       9      2       6      3       9  
866       0       0       6      0       8      0       8  
1541     10       9       9      3       9      2       5  


In [16]:
print("In the final dataset on which SVM is being trained ")

print(f"Number of positive samples {len(dataset[dataset['letter']==1])}")
print(f"Number of negative samples {len(dataset[dataset['letter']==-1])}")
features = dataset.loc[:, dataset.columns != 'letter']
target = dataset.loc[:, ['letter']]

print(features.head())
print(target.head())

In the final dataset on which SVM is being trained 
Number of positive samples 286
Number of negative samples 501
      xbox  ybox  width  height  onpix  xbar  ybar  x2bar  y2bar  xybar  \
1366     4     4      6       6      7     9     4      5      3      7   
1504     5    10      7       7      5     7     8      8      5      7   
191      3     6      5       4      2    11     3      2      2      9   
866      4     9      4       7      1     0     1      6      6      0   
1541     6    10      7       8      4     4     8      6      8     10   

      x2ybar  xy2bar  xedge  xedgey  yedge  yedgex  
1366       7       8      5       9      4       8  
1504       7       9      3       7      4       7  
191        2       9      2       6      3       9  
866        0       6      0       8      0       8  
1541       9       9      3       9      2       5  
     letter
1366     -1
1504     -1
191       1
866      -1
1541     -1


In [17]:
def hellinger(X1, X2):
  return np.sqrt(np.dot(X1,X2.T))

In [18]:
# final SVMC algorithm
# scaling the feature matrix values into a td distribution

# X_train = scale(features)
X_train = features
y_train = target
y_train = y_train.astype('int')


# X_test = scale(df_final_test.loc[:, df_final_test.columns != 'letter'])
X_test = df_final_test.loc[:, df_final_test.columns != 'letter']
y_test = df_final_test.loc[:, ['letter']]
y_test = y_test.astype('int')


# model training
import warnings
warnings.filterwarnings("ignore")
kernels = ['linear', 'rbf', 'poly']
for kernel in kernels:
    model = SVC(kernel = kernel)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_true = y_test, y_pred = y_pred)
    print(f"Kernel = {kernel}, accuracy = {acc * 100}")

model = SVC(kernel = hellinger)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_true = y_test, y_pred = y_pred)
print(f"Kernel = hellinger, accuracy = {acc * 100}")

model = SVC(kernel = chi2_kernel)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_true = y_test, y_pred = y_pred)
print(f"Kernel = chi2_kernel, accuracy = {acc * 100}")


Kernel = linear, accuracy = 94.44444444444444
Kernel = rbf, accuracy = 93.37606837606837
Kernel = poly, accuracy = 98.07692307692307
Kernel = hellinger, accuracy = 92.52136752136752
Kernel = chi2_kernel, accuracy = 95.94017094017094
