In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn import svm, metrics
import timeit
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
import random

In [None]:
# Load CIFAR-10 through Keras (the archive is downloaded on first use) and
# verify the array shapes that the slicing and reshaping below rely on.
(x_train_og, y_train_og), (x_test_og, y_test_og) = keras.datasets.cifar10.load_data()
assert x_train_og.shape == (50000, 32, 32, 3)
assert x_test_og.shape == (10000, 32, 32, 3)
assert y_train_og.shape == (50000, 1)
assert y_test_og.shape == (10000, 1) 


Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [None]:
# Display the name of the GPU device TensorFlow reports for this runtime.
tf.test.gpu_device_name()


'/device:GPU:0'

Splitting the data: Train/Validation/Test
& Samples 

In [None]:
# Keep the original arrays reachable under descriptive names.
full_training_observations, full_training_labels = x_train_og, y_train_og

# First 35,000 training images are used for training, the remaining 15,000 for validation.
x_train, x_validate = full_training_observations[:35000], full_training_observations[35000:]
y_train, y_validate = full_training_labels[:35000], full_training_labels[35000:]
x_test = x_test_og

# Smaller working subsets: first 20,000 training rows, and every validation
# row from position 6,500 onwards.
train_sample, train_sample_labels = x_train[:20000], y_train[:20000]
validation_sample, validation_sample_labels = x_validate[6500:], y_validate[6500:]

# Class balance of the validation subset (displayed as the cell output).
np.unique(validation_sample_labels, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8),
 array([862, 862, 807, 877, 835, 886, 832, 832, 848, 859]))

RGB Conversion to Grayscale

In [None]:
#print(x_train[1].size) #-> 32x32x3 = 3072
#rgb_weights = np.array([0.299, 0.587,  0.114])
rgb_weights = np.array([1/3, 1/3, 1/3])  # plain average of the three channels


def _to_flat_gray(images, weights):
  """Collapse RGB images into flattened grayscale rows scaled by 1/255.

  images  -- array of shape (n, height, width, 3)
  weights -- length-3 array of per-channel weights
  Returns a float array of shape (n, height * width).
  """
  # np.dot contracts the trailing channel axis, giving shape (n, height, width).
  gray = np.dot(images, weights) / 255
  return gray.reshape((len(images), -1))


#Train Data in Gray:
n1 = len(train_sample)
train_sample_gray = _to_flat_gray(train_sample, rgb_weights)

#Validation Data in Gray:
n2 = len(validation_sample)
validation_sample_gray = _to_flat_gray(validation_sample, rgb_weights)

#Test Data in Gray:
# Convert every test image. The earlier per-row loop stopped after the first
# 1000 images and left the remaining rows of test_data_gray filled with zeros.
test_data_gray = _to_flat_gray(x_test, rgb_weights)

NameError: ignored

Final form of the data in RGB scale

In [None]:
# Flatten each 32x32x3 image into a single row of 3072 raw channel values.
train_sample_rgb = np.reshape(train_sample, (len(train_sample), 3072))
validation_sample_rgb = np.reshape(validation_sample, (len(validation_sample), 3072))
test_data_rgb = np.reshape(x_test, (10000, 3072))

# Classifiers: No Feature Selection


1. SVM



In [None]:
#Training Model
model = svm.SVC(gamma=0.1, C=10, kernel="poly")
start = timeit.default_timer()
# SVC expects a 1-D target. The labels are stored as an (n, 1) column, which
# scikit-learn would otherwise flatten itself while emitting a DataConversionWarning.
model.fit(train_sample_gray, np.ravel(train_sample_labels))
stop = timeit.default_timer()
print('Training Time: ', stop - start)


NameError: ignored

In [None]:
start = timeit.default_timer()
# accuracy_score is documented as (y_true, y_pred); pass the labels first and as
# a flat vector. The timed region still covers prediction plus scoring.
acc = metrics.accuracy_score(np.ravel(validation_sample_labels), model.predict(validation_sample_gray))
stop = timeit.default_timer()
print('Prediction Time: ', stop - start)
print('Accuracy:', acc)

2. Decision Tree Classifier

In [None]:
# Baseline decision tree (Gini impurity) fitted on the flattened grayscale
# training sample; only the fit call is timed.
tree = DecisionTreeClassifier(criterion="gini")
start = timeit.default_timer()
tree.fit(train_sample_gray,train_sample_labels)
stop = timeit.default_timer()
print('Training Time: ', stop - start)  


In [None]:
start = timeit.default_timer()
# accuracy_score is documented as (y_true, y_pred); pass the labels first and as
# a flat vector. The timed region still covers prediction plus scoring.
acc = metrics.accuracy_score(np.ravel(validation_sample_labels), tree.predict(validation_sample_gray))
stop = timeit.default_timer()
print('Prediction Time: ', stop - start)
print(acc)

# Recursive Feature Elimination
--With Decision Trees--

In [None]:
def RFE_features(step, features):
  """Select feature columns by recursive feature elimination with a decision tree.

  step     -- forwarded to RFE as its `step` argument.
  features -- forwarded to RFE as `n_features_to_select`.
  Fits on the globals train_sample_rgb / train_sample_labels and returns the
  indices of the retained columns as a list.
  """
  rfe = RFE(DecisionTreeClassifier(), n_features_to_select=features, step=step)
  rfe = rfe.fit(train_sample_rgb, train_sample_labels)
  kept_columns = rfe.get_support(indices=True)
  return [column for column in kept_columns]

In [None]:
def run_selected_features_on_SVM(indices):
  """Train and score the polynomial SVM on a subset of the RGB feature columns.

  indices -- column indices to keep from train_sample_rgb and validation_sample_rgb.
  Returns [train_time, prediction_time, acc]; both times are in seconds and
  prediction_time covers predicting plus scoring.
  """
  selected_train = train_sample_rgb[:, indices]
  selected_validation = validation_sample_rgb[:, indices]

  # Labels are stored as (n, 1) columns; flatten them so SVC.fit does not emit
  # a DataConversionWarning and accuracy_score receives plain vectors.
  train_labels = np.ravel(train_sample_labels)
  validation_labels = np.ravel(validation_sample_labels)

  model = svm.SVC(gamma=0.1, C=10, kernel="poly")

  start = timeit.default_timer()
  model.fit(selected_train, train_labels)
  train_time = timeit.default_timer() - start

  start = timeit.default_timer()
  # accuracy_score is documented as (y_true, y_pred).
  acc = metrics.accuracy_score(validation_labels, model.predict(selected_validation))
  prediction_time = timeit.default_timer() - start

  return [train_time, prediction_time, acc]

In [None]:
steps = [0.1]
n_features = [700, 800, 900, 1100]

# Alternative grid (disabled):
#steps = [0.01, 0.05, 0.1]
#n_features = [250, 300, 350, 400, 450, 500]

# results[i, j] holds [train_time, prediction_time, accuracy] for steps[i] / n_features[j].
results = np.zeros((len(steps), len(n_features), 3))

# Seconds spent in RFE itself, one list per step value.
time_to_select_features = {step: [] for step in steps}

for i, step in enumerate(steps):
  for j, feature_count in enumerate(n_features):
    start = timeit.default_timer()
    indices = RFE_features(step, feature_count)
    stop = timeit.default_timer()
    time_to_select_features[step].append(stop - start)

    res = run_selected_features_on_SVM(indices)
    print(res)
    results[i, j] = res

In [None]:
# Feature-selection times per step value, then the
# [train_time, prediction_time, accuracy] grid filled by the loop above.
print(time_to_select_features)
print(results)

{0.1: [341.86986907300025, 346.1487343400004, 351.48958407699956, 326.60679819600045]}
[[[168.40891511  81.10075825   0.45247059]
  [181.32610126  92.16081307   0.454     ]
  [200.54152799 100.91550172   0.44258824]
  [239.36692643 121.84208316   0.45705882]]]


# ReliefF

In [None]:
def get_kth_nearest_hits_and_misses(rand_instance, k=10, labels=None, n_classes=10):
  """Collect up to k neighbour indices per class around `rand_instance`.

  "Nearest" here means nearest by position in the array: the search walks
  outwards one step at a time (index - 1, index + 1, index - 2, ...), not by
  distance in feature space.

  rand_instance -- index of the reference sample.
  k             -- maximum number of neighbours kept per class.
  labels        -- class label of every sample; defaults to the global
                   train_sample_labels. May be 1-D or an (n, 1) column.
  n_classes     -- number of classes; labels must lie in range(n_classes).

  Returns a dict mapping each class to a list of sample indices. A list can be
  shorter than k when the data runs out before k samples of that class are found.
  """
  if labels is None:
    labels = train_sample_labels
  flat_labels = np.ravel(labels)  # lets int() below work on scalars, not 1-element arrays
  n_samples = len(flat_labels)

  nearest_neighbours = {c: [] for c in range(n_classes)}

  items_found = 0  # we should end up with k * n_classes items found
  offset = 1
  while items_found < n_classes * k:
    before = rand_instance - offset
    after = rand_instance + offset

    # Both directions have run off the array: stop instead of looping forever
    # when some class has fewer than k samples available.
    if before < 0 and after >= n_samples:
      break

    for candidate in (before, after):
      if 0 <= candidate < n_samples:
        bucket = nearest_neighbours[int(flat_labels[candidate])]
        if len(bucket) < k:
          bucket.append(candidate)
          items_found += 1

    offset += 1

  return nearest_neighbours

In [None]:
def sum_dist_point_list(point, list, data=None):
  """Sum the per-feature absolute differences between `point` and selected rows.

  point -- 1-D feature vector.
  list  -- iterable of row indices into `data` (name kept for existing callers).
  data  -- 2-D sample matrix; defaults to the global train_sample_rgb.

  Returns a float vector with one entry per feature; all zeros when `list`
  is empty.
  """
  if data is None:
    data = train_sample_rgb
  # Work in float64: subtracting two uint8 pixel arrays wraps modulo 256, so the
  # "absolute difference" would be wrong whenever the neighbour is brighter.
  reference = np.asarray(point, dtype=np.float64)
  rows = np.asarray(data)[np.asarray(list, dtype=np.intp)].astype(np.float64)
  return np.absolute(rows - reference).sum(axis=0)



In [None]:
def reliefF(k,n):
  """Rank the RGB features with a ReliefF-style weighting.

  k -- number of neighbours requested per class for every sampled instance.
  n -- number of random instances drawn from train_sample_rgb.

  Reads the globals train_sample_rgb and train_sample_labels. Returns the
  feature indices sorted from highest to lowest accumulated weight.
  """
  weights = np.zeros(train_sample_rgb.shape[1])
  labels = np.ravel(train_sample_labels)

  for _ in range(n):
    rand_instance = random.randint(0, len(train_sample_rgb) - 1)
    # Look the class up in the labels that belong to train_sample_rgb. y_train is
    # a separate array that no longer lines up once the "sample" globals are
    # rebound to the full training set.
    class_of_instance = int(labels[rand_instance])
    nearest_neighbours = get_kth_nearest_hits_and_misses(rand_instance, k)
    instance = train_sample_rgb[rand_instance]

    sum_dist_hit = sum_dist_point_list(instance, nearest_neighbours[class_of_instance])
    sum_dist_hit = sum_dist_hit/(n*k)

    sum_dist_misses = 0
    for c in range(0, 10):
      if c != class_of_instance:
        sum_dist_misses += sum_dist_point_list(instance, nearest_neighbours[c])

    sum_dist_misses = sum_dist_misses/(10*n*k) # the 10 is there bec probability of each class is 1/10

    weights += sum_dist_misses - sum_dist_hit

  # Rank only after all n instances have contributed. Returning from inside the
  # loop meant a single instance decided the whole ranking.
  weight_indices_sorted = np.argsort(-weights) #negative so we can get it in descending order
  return weight_indices_sorted

In [None]:
def svm_predict_relieff(weights, n_features=500):
  """Train and score the polynomial SVM on the top-ranked ReliefF features.

  weights    -- despite the name, an array of feature indices ordered best
                first (the value returned by reliefF).
  n_features -- how many of the leading indices to keep.

  Returns [train_time, prediction_time, acc]; both times are in seconds and
  prediction_time covers predicting plus scoring.
  """
  best_features_index = weights[:n_features]

  selected_train = train_sample_rgb[:,best_features_index]
  selected_valid = validation_sample_rgb[:,best_features_index]

  # Labels are stored as (n, 1) columns; flatten them so SVC.fit does not emit
  # the column_or_1d DataConversionWarning.
  train_labels = np.ravel(train_sample_labels)
  valid_labels = np.ravel(validation_sample_labels)

  model = svm.SVC(gamma=0.1, C=10, kernel="poly")

  #training model + time
  start_train = timeit.default_timer()
  model.fit(selected_train, train_labels)
  stop_train = timeit.default_timer()
  train_time = stop_train-start_train

  #making predictions + time
  start_prediction = timeit.default_timer()
  # accuracy_score is documented as (y_true, y_pred).
  acc = metrics.accuracy_score(valid_labels, model.predict(selected_valid))
  stop_prediction = timeit.default_timer()
  prediction_time = stop_prediction-start_prediction

  return [train_time, prediction_time, acc]

In [None]:
#random.seed(5)
num_neighbours = [5, 10, 15, 20, 25] #list of k's we want to try
num_instances = [10,50, 100,200] #list of n's we want to try

# results[i, j] holds [train_time, prediction_time, accuracy] for
# num_neighbours[i] / num_instances[j].
results = np.zeros((len(num_neighbours), len(num_instances), 3))

# Seconds spent ranking features, one list per k.
time_to_select_features = {neighbour_count: [] for neighbour_count in num_neighbours}

for i, neighbour_count in enumerate(num_neighbours):
  for j, instance_count in enumerate(num_instances):
    start = timeit.default_timer()
    weights = reliefF(k=neighbour_count, n=instance_count)
    stop = timeit.default_timer()
    time_to_select_features[neighbour_count].append(stop - start)

    res = svm_predict_relieff(weights)
    print(res)
    results[i, j] = res




  y = column_or_1d(y, warn=True)


KeyboardInterrupt: ignored

In [None]:
# One ReliefF ranking (k=25, n=50), evaluated for each feature count in the list.
num_features = [500]
results = np.zeros((len(num_features), 3))
weights = reliefF(k=25, n=50)

for i, feature_count in enumerate(num_features):
  res = svm_predict_relieff(weights, feature_count)
  print(res)
  results[i] = res


  y = column_or_1d(y, warn=True)


[182.87745250699982, 57.42041029899997, 0.4475294117647059]


In [None]:
# Display the collected [train_time, prediction_time, accuracy] entries.
results

array([[[3.50702813e+02, 5.85608068e+01, 4.19294118e-01],
        [2.92040771e+02, 5.97351894e+01, 4.32588235e-01],
        [4.23118944e+02, 5.64582362e+01, 3.97647059e-01],
        [2.99742749e+02, 5.65666655e+01, 4.23764706e-01]],

       [[4.89353542e+02, 5.73470438e+01, 3.98235294e-01],
        [2.98741014e+02, 5.69045656e+01, 4.14470588e-01],
        [3.28922386e+02, 5.73482608e+01, 4.14235294e-01],
        [9.02668453e+02, 5.84017194e+01, 3.61647059e-01]],

       [[2.95438535e+02, 5.60317150e+01, 4.20352941e-01],
        [4.19576223e+02, 5.64589617e+01, 3.98000000e-01],
        [2.72280782e+02, 5.86172357e+01, 4.33176471e-01],
        [2.89483794e+02, 6.10252594e+01, 4.19647059e-01]],

       [[3.28096487e+02, 5.97862002e+01, 4.33294118e-01],
        [2.78778695e+02, 6.24166572e+01, 4.47294118e-01],
        [3.43730168e+02, 5.95704243e+01, 4.30235294e-01],
        [4.18718760e+02, 6.01825586e+01, 4.06470588e-01]]])

# Running ReliefF and RFE on the full training and testing set




In [None]:
# The helper functions defined earlier read the globals "train_sample_*" and
# "validation_sample_*". Rebind those names here so the same functions run on
# the full training set and the full test set.
# NOTE: cells above that use these names behave differently if re-run after this cell.

# The training "sample" becomes the full training set.
train_sample_rgb = np.reshape(full_training_observations, (50000, 3072))
train_sample_labels = full_training_labels

# The "validation sample" becomes the full test set.
validation_sample_rgb = np.reshape(test_data_rgb, (10000, 3072))
validation_sample_labels = y_test_og


In [None]:
# RFE on the rebound "sample" globals: select 100 features with step=0.1,
# timing the selection separately from the SVM training and scoring.

time_to_select_features = 0


start = timeit.default_timer()
indices = RFE_features(step=0.1, features=100)
stop = timeit.default_timer()
time_to_select_features = stop - start
# res is [train_time, prediction_time, accuracy] for the SVM on the selected columns.
res = run_selected_features_on_SVM(indices)
print(res)
print(time_to_select_features)

NameError: ignored

In [None]:
# Same experiment with ReliefF: time only the feature ranking (k=25, n=50),
# then train and score the SVM on the 300 top-ranked features.

start = timeit.default_timer()
weights = reliefF(k=25, n=50)
stop = timeit.default_timer()
print(stop-start)

# res is [train_time, prediction_time, accuracy].
res = svm_predict_relieff(weights, n_features=300)
print(res)


0.006503205000058188


  y = column_or_1d(y, warn=True)


[3986.566000233, 120.929680233, 0.4538]


In [None]:

# Baseline without feature selection: polynomial SVM on all 3072 RGB features.
model = svm.SVC(gamma=0.1, C=10, kernel="poly")
start = timeit.default_timer()
# Flatten the (n, 1) label column so SVC.fit does not emit a DataConversionWarning.
model.fit(train_sample_rgb, np.ravel(train_sample_labels))
stop = timeit.default_timer()
print('Training Time: ', stop - start)
start = timeit.default_timer()
# Score the SVM that was just trained. The earlier code called tree.predict,
# i.e. the decision tree fitted on 1024 grayscale features, which is neither
# the model under test nor compatible with 3072-column input.
acc = metrics.accuracy_score(np.ravel(validation_sample_labels), model.predict(validation_sample_rgb))
stop = timeit.default_timer()
print('Prediction Time: ', stop - start)
print(acc)

  y = column_or_1d(y, warn=True)
