In [18]:
# importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import scale
from math import exp
from collections import defaultdict 
from sklearn.metrics.pairwise import chi2_kernel
# import warnings
# warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Google Colab Notebooks/DMW/SVMC Markov/letter-recognition.csv')

# feature names were like - 'letter ' changed it to 'letter'
df.columns = ['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar',
       'ybar', 'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge',
       'xedgey', 'yedge', 'yedgex']
print(df.columns)

Index(['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar', 'ybar',
       'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge', 'xedgey',
       'yedge', 'yedgex'],
      dtype='object')


In [3]:
print(f"Number of rows = {df.shape[0]}, cols = {df.shape[1]}")
df.head()

Number of rows = 20000, cols = 17


Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [4]:
# number of different classes
classes = df['letter'].unique()
print(f'Number of different labels in dataset = {len(classes)}')
classes.sort()
for cls in classes:
    rows = len(df[df['letter'] == cls])
    print(f'Number of data points in class {cls} = {rows}')

Number of different labels in dataset = 26
Number of data points in class A = 789
Number of data points in class B = 766
Number of data points in class C = 736
Number of data points in class D = 805
Number of data points in class E = 768
Number of data points in class F = 775
Number of data points in class G = 773
Number of data points in class H = 734
Number of data points in class I = 755
Number of data points in class J = 747
Number of data points in class K = 739
Number of data points in class L = 761
Number of data points in class M = 792
Number of data points in class N = 783
Number of data points in class O = 753
Number of data points in class P = 803
Number of data points in class Q = 783
Number of data points in class R = 758
Number of data points in class S = 748
Number of data points in class T = 796
Number of data points in class U = 813
Number of data points in class V = 764
Number of data points in class W = 752
Number of data points in class X = 787
Number of data points

In [5]:
# Preparing the dataset

# binary labeling of the dataset
df1 = df.copy()

# one alphabet will be treated as positive sample and others as negative sample
alphabet = 'A'  
df1.loc[(df1.letter != alphabet),'letter'] = -1
df1.loc[(df1.letter == alphabet),'letter'] = 1

# we will make number of positve samples = number of negative samples otherwise
# otherwise the dataset will have huge number of negative samples which will slow
# down the sampling process
pos = df1[df1['letter']==1].sample(frac=1)
# we need roughly only 1/25 of total negative samples 
neg = df1[df1['letter']==-1].sample(frac=1/25)

df_final = pos.append(neg,ignore_index=True)
print(f"Length of final dataset {len(df_final)}")
df_final.head()

Length of final dataset 1557


Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,1,3,8,5,5,2,9,6,3,1,8,0,8,2,7,1,8
1,1,3,3,5,4,1,7,5,3,0,7,1,8,2,7,2,8
2,1,4,10,5,7,4,9,4,3,1,8,2,8,2,7,2,8
3,1,2,6,4,4,2,12,3,4,2,11,1,8,2,6,2,9
4,1,2,1,4,2,1,7,2,2,2,6,2,8,2,6,2,7


In [6]:
# train test split
df_final_train, df_final_test = train_test_split(df_final, test_size=0.3, random_state=42)


# we need to generate a preliminary model first = f0 using N1(< M) datapoints
M = 1000
N1 = round(M * 0.75)

# we will extract N1 data points randomly
df_sample = df_final_train.sample(n=N1)

df_sample_features = df_sample.loc[:, df_sample.columns != 'letter']
df_sample_target = df_sample.loc[:, ['letter']]

print(df_sample_features.head())
print(df_sample_target.head())

      xbox  ybox  width  height  onpix  ...  xy2bar  xedge  xedgey  yedge  yedgex
917      7    10      7       8      4  ...       8      2      10      1       8
25       2     7      4       5      2  ...       9      2       6      3       9
1403     5     8      8       6      5  ...      11      5       8      2       7
195      3     7      5       5      3  ...       9      3       5      3       8
1480     3     6      4       4      3  ...       7      3       8      2       8

[5 rows x 16 columns]
     letter
917      -1
25        1
1403     -1
195       1
1480     -1


In [7]:
# Markov Sampling 

# Step1 
# generating priliminary model f0

# scale? is it required
df_sample_features = scale(df_sample_features)
df_sample_target = df_sample_target.astype('int')

f0 = SVC(kernel = 'linear')
f0.fit(df_sample_features, df_sample_target.values.ravel())


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
# Step 2
# draw one sample randomly

m_pos, m_neg = 0, 0
zt = df_final_train.sample(n=1)
print(zt)
if M%2 == 0:
  label = zt.iloc[0]['letter']
  print("")
  if label == 1:
    m_pos += 1
  else:
    m_neg += 1


    letter  xbox  ybox  width  height  ...  xy2bar  xedge  xedgey  yedge  yedgex
996     -1     6     9      4       4  ...       8      2       9      5       9

[1 rows x 17 columns]



In [9]:
def predict(z):
  x = zt.loc[:, zt.columns != 'letter']
  y = zt.loc[:, zt.columns == 'letter']
  # doubtful
  fx = f0.predict(x)
  return fx

In [10]:
# returns the loss value calculated using model f0
def loss(z):
  y = z.iloc[0]['letter']
  # doubtful
  fx = predict(z)

  if fx*y > 1:
    return 0
  else:
    return 1-fx*y

In [11]:
def calculate_P(zs,zst):
  P = exp(-loss(zs))/exp(-loss(zt))
  return P

In [12]:
def acceptance_prob(zs,zt):
  ys = zs.iloc[0]['letter']
  yt = zt.iloc[0]['letter']

  fs = predict(zs)
  ft = predict(zt)

  return exp(-fs*ys)/exp(-ft*yt)

In [13]:
# Step 3
# should its type be pandas.dataframe?
sampled_data = []
# keep count of how many times a sample is rejected
rejected = defaultdict(int)

# all variable names are as given in paper
# constants used in algorithm, see section 4A point 1 in paper
K = 5
Q = 1.2
while m_pos < M//2 or m_neg < M//2:
  # draw one sample randomly = z_star(zs)
  zs = df_final_train.sample(n=1)
  print("considering ",zs)
  P = calculate_P(zs,zt)
  
  # yt corresponds to Y of zt
  yt = zt.iloc[0]['letter']
  ys = zs.iloc[0]['letter']

  # alpha is the acceptance probability of zs, given in step 5
  alpha = P
  # need to convert it to tuple so that it becomes hashable
  tpl = tuple(zs.to_records(index=False)[0])

  if P==1 and yt==-1 and ys==-1:
    alpha = acceptance_prob(zs,zt)  # P'
  elif P==1 and yt==1 and ys==1:    
    alpha = acceptance_prob(zs,zt)  # P'
  elif (P==1 and yt*ys==-1) or P<1 :
    alpha = P                       # P
  elif rejected[tpl]>K:
    alpha = QP                      # P''
                         
  if alpha>1:
    alpha=1

  print(f"alpha {alpha}")
  if np.random.random() < alpha:
    sampled_data.append(zs)
  else:
    rejected[tpl]+=1

  if yt == 1:
    m_pos += 1 
  else:
    m_neg += 1
  
  # for next iteration 
  zt = zs
  print(f"Positive = {m_pos} Negative = {m_neg}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
alpha 0.1353352832366127
Positive = 96 Negative = 84
considering      letter  xbox  ybox  width  height  ...  xy2bar  xedge  xedgey  yedge  yedgex
994     -1     5     9      6       8  ...       9      5      10      9      11

[1 rows x 17 columns]
alpha 1
Positive = 97 Negative = 84
considering      letter  xbox  ybox  width  height  ...  xy2bar  xedge  xedgey  yedge  yedgex
732      1     3     4      5       3  ...       9      4       5      2       9

[1 rows x 17 columns]
alpha 0.1353352832366127
Positive = 97 Negative = 85
considering       letter  xbox  ybox  width  height  ...  xy2bar  xedge  xedgey  yedge  yedgex
1393     -1     6    10      9       7  ...       6      1       8      6       7

[1 rows x 17 columns]
alpha 1
Positive = 98 Negative = 85
considering      letter  xbox  ybox  width  height  ...  xy2bar  xedge  xedgey  yedge  yedgex
893     -1     6     9      8       8  ...       8      4       7  

In [14]:
# Markov Sampling Done
# sampled data points are stored in sampled_data
print(len(sampled_data))
dataset = pd.concat(sampled_data)
print(dataset.head())

798
     letter  xbox  ybox  width  height  ...  xy2bar  xedge  xedgey  yedge  yedgex
1016     -1     5     7      5       5  ...       6      6      12      2       5
686       1     3     6      5       4  ...       9      2       6      3       8
53        1     5    10      5       5  ...      11      4       3      4      10
741       1     4     9      5       7  ...       8      2       7      1       8
724       1     4     9      6       7  ...       8      2       7      3       6

[5 rows x 17 columns]


In [15]:
print("In the final dataset on which SVM is being trained ")

print(f"Number of positive samples {len(dataset[dataset['letter']==1])}")
print(f"Number of negative samples {len(dataset[dataset['letter']==-1])}")
features = dataset.loc[:, dataset.columns != 'letter']
target = dataset.loc[:, ['letter']]

print(features.head())
print(target.head())

In the final dataset on which SVM is being trained 
Number of positive samples 300
Number of negative samples 498
      xbox  ybox  width  height  onpix  ...  xy2bar  xedge  xedgey  yedge  yedgex
1016     5     7      5       5      4  ...       6      6      12      2       5
686      3     6      5       4      3  ...       9      2       6      3       8
53       5    10      5       5      3  ...      11      4       3      4      10
741      4     9      5       7      4  ...       8      2       7      1       8
724      4     9      6       7      4  ...       8      2       7      3       6

[5 rows x 16 columns]
     letter
1016     -1
686       1
53        1
741       1
724       1


In [16]:
def hellinger(X1, X2):
  return np.sqrt(np.dot(X1,X2.T))

In [20]:
# final SVMC algorithm
# scaling the feature matrix values into a td distribution

# X_train = scale(features)
X_train = features
y_train = target
y_train = y_train.astype('int')


# X_test = scale(df_final_test.loc[:, df_final_test.columns != 'letter'])
X_test = df_final_test.loc[:, df_final_test.columns != 'letter']
y_test = df_final_test.loc[:, ['letter']]
y_test = y_test.astype('int')


# model training
import warnings
warnings.filterwarnings("ignore")
kernels = ['linear', 'rbf', 'poly']
for kernel in kernels:
    model = SVC(kernel = kernel)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_true = y_test, y_pred = y_pred)
    print(f"Kernel = {kernel}, accuracy = {acc * 100}")

model = SVC(kernel = hellinger)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_true = y_test, y_pred = y_pred)
print(f"Kernel = hellinger, accuracy = {acc * 100}")

model = SVC(kernel = chi2_kernel)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_true = y_test, y_pred = y_pred)
print(f"Kernel = chi2_kernel, accuracy = {acc * 100}")


Kernel = linear, accuracy = 95.51282051282051
Kernel = rbf, accuracy = 95.72649572649573
Kernel = poly, accuracy = 95.94017094017094
Kernel = hellinger, accuracy = 93.58974358974359
Kernel = chi2_kernel, accuracy = 94.87179487179486
