In [0]:
import numpy as np
import random
import math

In [0]:
# Loading the data
dataframe = []
file = open("House-votes-data.TXT", "r")
for line in file:
    dataframe.append(line.rstrip())
random.shuffle(dataframe)
split_length = int(len(dataframe)/5)
set1 = dataframe[:split_length]
set2 = dataframe[split_length:split_length * 2]
set3 = dataframe[split_length * 2:split_length * 3]
set4 = dataframe[split_length * 3:split_length * 4]
set5 = dataframe[split_length * 4:]
data = [set1, set2, set3, set4, set5]

# Naive Bayes

In [0]:
class NaiveBayesClassifier:
  def __init__(self):
    self.apriori_dem = 0
    self.apriori_rep = 0
    self.count_democrat = 0
    self.count_republican = 0
    self.democrat = {'y':[0]*16, 'n':[0]*16}
    self.republican = {'y':[0]*16, 'n':[0]*16}

  def train(self, train):
    train_all = []
    train_fill = []
    for line in train:
        if('?' in line):
            train_fill.append(line)
        else:
            train_all.append(line)
    for line in train_all:
        attb = line.split(',')
        last = attb[len(attb) - 1]
        if(last == 'democrat'):
            self.count_democrat += 1
            for index in range(len(attb) - 1):
                if(attb[index] == 'y'):
                    self.democrat['y'][index] += 1
                else:
                    self.democrat['n'][index] += 1
        else:
            self.count_republican += 1
            for index in range(len(attb) - 1):
                if(attb[index] == 'y'):
                    self.republican['y'][index] += 1
                else:
                    self.republican['n'][index] += 1

    for line in train_fill:
        attb = line.split(',')
        last = attb[len(attb) - 1]
        if(last == 'democrat'):
            self.count_democrat += 2
            for index in range(len(attb) - 1):
                if(attb[index] == 'y'):
                    self.democrat['y'][index] += 2
                elif(attb[index] == 'n'):
                    self.democrat['n'][index] += 2
                else:
                    self.democrat['y'][index] += 1
                    self.democrat['n'][index] += 1
        else:
            self.count_republican += 2
            for index in range(len(attb) - 1):
                if(attb[index] == 'y'):
                    self.republican['y'][index] += 2
                elif(attb[index] == 'n'):
                    self.republican['n'][index] += 2
                else:
                    self.republican['y'][index] += 1
                    self.republican['n'][index] += 1  

    self.apriori_dem = (self.count_democrat)/(self.count_democrat + self.count_republican)
    self.apriori_rep = 1 - self.apriori_dem
  
  def predict(self, test):
    true_pos = 0
    true_neg = 0
    false_pos = 0
    false_neg = 0
    predictions = []
    for line in test:
        attb = line.split(',')
        ind = -1
        p_dem = self.apriori_dem
        p_rep = self.apriori_rep
        for index in range(len(attb) - 1):
            if(attb[index]  == 'y' or attb[index] == 'n'):
                p_dem = p_dem * (self.democrat[attb[index]][index]/self.count_democrat)
                p_rep = p_rep * (self.republican[attb[index]][index]/self.count_republican)

            else:
                if(self.democrat['y'][index] > self.democrat['n'][index]):
                    p_dem = p_dem * (self.democrat['y'][index]/self.count_democrat)
                else:
                    p_dem = p_dem * (self.democrat['n'][index]/self.count_democrat)
                if(self.republican['y'][index] > self.republican['n'][index]):
                    p_rep = p_rep * (self.republican['y'][index]/self.count_republican)
                else:
                    p_rep = p_rep * (self.republican['n'][index]/self.count_republican)

        if(p_dem > p_rep):
          predictions.append('democrat')
        else:
          predictions.append('republican')

    return predictions
  
  def metrics(self, predictions, actual_values):
    true_pos = 0
    true_neg = 0
    false_pos = 0
    false_neg = 0    
    for i in range(len(predictions)):
      if predictions[i] == 'democrat':
        if actual_values[i] =='democrat':
          true_neg += 1
        else:
          false_pos += 1
      else:
        if actual_values[i] == 'republican':
          true_pos += 1
        else:
          false_neg += 1

    total =   true_pos + false_pos + false_neg + true_neg
    confMat = [[true_pos,false_pos],[false_neg,true_neg]] 
    metrics_dict = {  'accuracy' : (true_pos + true_neg)/total,
                      'recall' : true_pos/(true_pos + false_pos),
                      'precision' : true_pos/(true_pos + false_neg),
                      'confMat'  :  confMat
      
    }
    return metrics_dict    

  def configs(self):
    return {'apriori_dem' :self.apriori_dem ,
    'apriori_rep':self.apriori_rep,
    'count_democrat':self.count_democrat ,
    'count_republican':self.count_republican ,
    'dem':self.democrat ,
    'rep':self.republican }

In [0]:
# K fold validation
classifiers = []
metrics_dict = []
for num in range(5):
    test = data[num]
    train = []
    for x in range(5):
        if(num == x):
            continue
        train.append(data[x])
    train = [item for sublist in train for item in sublist]

    classifiers.append(NaiveBayesClassifier())
    classifiers[num].train(train)
    predictions = classifiers[num].predict(test)

    actual_values = []
    for case in test:
      actual_values.append(case.split(',')[-1])    
      
    metrics_dict.append(classifiers[num].metrics(predictions, actual_values))

In [0]:
print("Metrics of each Plain Naive Bayes Classifier in 5 fold validation")
print("First row of confusion matrix corresponds to 'republican' label")
i = 1
for metric in metrics_dict:
  print("Model ",i," ",metric)
  i+=1

Metrics of each Plain Naive Bayes Classifier in 5 fold validation
First row of confusion matrix corresponds to 'republican' label
Model  1   {'accuracy': 0.8620689655172413, 'recall': 0.8974358974358975, 'precision': 0.813953488372093, 'confMat': [[35, 4], [8, 40]]}
Model  2   {'accuracy': 0.8620689655172413, 'recall': 0.9166666666666666, 'precision': 0.7857142857142857, 'confMat': [[33, 3], [9, 42]]}
Model  3   {'accuracy': 0.9425287356321839, 'recall': 1.0, 'precision': 0.8571428571428571, 'confMat': [[30, 0], [5, 52]]}
Model  4   {'accuracy': 0.9310344827586207, 'recall': 0.90625, 'precision': 0.90625, 'confMat': [[29, 3], [3, 52]]}
Model  5   {'accuracy': 0.9310344827586207, 'recall': 0.9354838709677419, 'precision': 0.8787878787878788, 'confMat': [[29, 2], [4, 52]]}


In [0]:
print("5 fold validation metrics of Plain Naive Bayes Classifier")
acc = 0
for lst in metrics_dict:
    acc += lst['accuracy']
acc = acc/5
print("\nAccuracy:", acc)

#finding recall
rec = 0
for lst in metrics_dict:
    rec += lst['recall']
rec = rec/5
print("\nRecall:", rec)

#finding precision
prec = 0
for lst in metrics_dict:
    prec += lst['precision']
prec = prec/5
print("\nPrecision:", prec)

#finding F score
f_score = 0
print("\nF-score:", (2*prec*rec)/(prec+rec))


5 fold validation metrics of Plain Naive Bayes Classifier

Accuracy: 0.9057471264367816

Recall: 0.9311672870140612

Precision: 0.848369702003423

F-score: 0.8878423080552144


#Randomized Boosting

In [0]:
# Boosting 
class randomBoostClassifier:
  def __init__(self, classifier, numberBoosting, samplingRatio):
    self.numberBoosting = numberBoosting
    self.samplingRatio = samplingRatio
    self.classifiers = [classifier() for _ in range(numberBoosting)]
    self.alphas = [0]*numberBoosting
  
  def train(self, train):
    train_size = len(train)
    data_weights = [1/train_size] * train_size
    sample_size = int(self.samplingRatio * train_size)

    target_value =[]
    for case in train:
      target_value.append(case.split(',')[-1])   
    # Randomised Boosting
    for i in range(self.numberBoosting):
      indices = np.random.randint(train_size, size = sample_size)
      sampled_data = {}  
      for ind in indices:
          sampled_data[ind] = train[ind]
      self.classifiers[i].train(list(sampled_data.values()))
      data_weights = [1/train_size] * train_size

      # Boosting
      error = 0
      for j in indices: 
          predicted_value = self.classifiers[i].predict([sampled_data[j]])[0]
          if predicted_value != target_value[j]:
            error += data_weights[j]

      # Normalising error
      error = error / sum(data_weights)

      # Setting alpha
      if error!=0:
        self.alphas[i] = (1/2)*math.log((1-error)/error)
      else: 
        self.alphas[i] = 349

  def predict(self, test):
    predictions = []

    for i in range(len(test)):
      all_predictions = []
      for j in range(self.numberBoosting):
        if self.classifiers[j].predict([test[i]])[0] == 'democrat':
          all_predictions.append(self.alphas[j])
        else:
          all_predictions.append(-1* self.alphas[j])
        
      if sum(all_predictions) > 0:
        predictions.append('democrat')
      else:
        predictions.append('republican')
      
    return predictions

  def metrics(self, predictions, actual_values):
      true_pos = 0
      true_neg = 0
      false_pos = 0
      false_neg = 0   
      for i in range(len(predictions)):
        if predictions[i] == 'democrat':
          if actual_values[i] =='democrat':
            true_neg += 1
          else:
            false_pos += 1
        else:
          if actual_values[i] == 'republican':
            true_pos += 1
          else:
            false_neg += 1
      total =   true_pos + false_pos + false_neg + true_neg
      confMat = [[true_pos,false_pos],[false_neg,true_neg]]
      metrics_dict = {  'accuracy' : (true_pos + true_neg)/total,
                        'recall' : true_pos/(true_pos + false_pos),
                        'precision' : true_pos/(true_pos + false_neg),
                        'confMat'  :  confMat
        
      }
      return metrics_dict      

In [0]:
random_classifiers = []
random_metrics_dict = []
for num in range(5):
    test = data[num]
    train = []
    for x in range(5):
        if(num == x):
            continue
        train.append(data[x])
    train = [item for sublist in train for item in sublist]
    random_classifiers.append(randomBoostClassifier(NaiveBayesClassifier, 200, 0.1))
    random_classifiers[num].train(train)
    random_predictions = random_classifiers[num].predict(test)
  
    actual_values = []
    for case in test:
      actual_values.append(case.split(',')[-1])    
    random_metrics_dict.append(random_classifiers[num].metrics(random_predictions, actual_values))

In [0]:
print("Metrics of each Randomized Boosted Naive Bayes classifier in 5 fold validation")
print("First row of confusion matrix corresponds to 'republican' label")
i = 1
for metric in random_metrics_dict:
  print("Model ",i," ",metric)
  i+=1

Metrics of each Randomized Boosted Naive Bayes classifier in 5 fold validation
First row of confusion matrix corresponds to 'republican' label
Model  1   {'accuracy': 0.896551724137931, 'recall': 0.9487179487179487, 'precision': 0.8409090909090909, 'confMat': [[37, 2], [7, 41]]}
Model  2   {'accuracy': 0.9310344827586207, 'recall': 0.8888888888888888, 'precision': 0.9411764705882353, 'confMat': [[32, 4], [2, 49]]}
Model  3   {'accuracy': 0.9770114942528736, 'recall': 1.0, 'precision': 0.9375, 'confMat': [[30, 0], [2, 55]]}
Model  4   {'accuracy': 0.9540229885057471, 'recall': 0.90625, 'precision': 0.9666666666666667, 'confMat': [[29, 3], [1, 54]]}
Model  5   {'accuracy': 0.9655172413793104, 'recall': 1.0, 'precision': 0.9117647058823529, 'confMat': [[31, 0], [3, 53]]}


In [0]:
print("With randomized boosting (200 boosted Naive Bayes classifier,0.1 sampling ratio), the metrics are: ")
acc = 0
for lst in random_metrics_dict:
    acc += lst['accuracy']
acc = acc/5
print("\nAccuracy:", acc)
rec = 0
for lst in random_metrics_dict:
    rec += lst['recall']
rec = rec/5
print("\nRecall:", rec)
prec = 0
for lst in random_metrics_dict:
    prec += lst['precision']
prec = prec/5
print("\nPrecision:", prec)
f_score = 0
print("\nF-score:", (2*prec*rec)/(prec+rec))


With randomized boosting (200 boosted Naive Bayes classifier,0.1 sampling ratio), the metrics are: 

Accuracy: 0.9448275862068964

Recall: 0.9487713675213675

Precision: 0.9196033868092692

F-score: 0.9339597003845094


# Plain Bagging

In [0]:
# Bagging
class BaggedClassifier:
  def __init__(self, classifier, numberClassifiers, baggingRatio):
    self.numberClassifiers = numberClassifiers
    self.baggingRatio = baggingRatio
    self.classifiers = [classifier() for _ in range(numberClassifiers)]
  
  def train(self, train):
    train_size = len(train)
    data_weights = [1/train_size] * train_size

    sample_size = int(self.baggingRatio * train_size)
    # Bagging
    for i in range(self.numberClassifiers):
      random_integers = np.random.randint(train_size, size = sample_size)
      sampled_data = []
      for ind in random_integers:
          sampled_data.append(train[ind])
      self.classifiers[i].train(sampled_data)


  def predict(self, test):
    predictions = []

    for i in range(len(test)):
      all_predictions = []
      for j in range(self.numberClassifiers):
        if self.classifiers[j].predict([test[i]])[0] == 'democrat':
          all_predictions.append(1)
        else:
          all_predictions.append(-1)
        
      if sum(all_predictions) > 0:
        predictions.append('democrat')
      else:
        predictions.append('republican')
      
    return predictions

  def metrics(self, predictions, actual_values):
      true_pos = 0
      true_neg = 0
      false_pos = 0
      false_neg = 0   
      for i in range(len(predictions)):
        if predictions[i] == 'democrat':
          if actual_values[i] =='democrat':
            true_neg += 1
          else:
            false_pos += 1
        else:
          if actual_values[i] == 'republican':
            true_pos += 1
          else:
            false_neg += 1
      total =   true_pos + false_pos + false_neg + true_neg
      confMat = [[true_pos,false_pos],[false_neg,true_neg]]
      metrics_dict = {  'accuracy' : (true_pos + true_neg)/total,
                        'recall' : true_pos/(true_pos + false_pos),
                        'precision' : true_pos/(true_pos + false_neg),
                        'confMat'  :  confMat
        
      }
      return metrics_dict      

In [0]:
bagged_classifiers = []
bagged_metrics_dict = []
for num in range(5):
    test = data[num]
    train = []
    for x in range(5):
        if(num == x):
            continue
        train.append(data[x])
    train = [item for sublist in train for item in sublist]
    bagged_classifiers.append(BaggedClassifier(NaiveBayesClassifier, 200, 0.1))
    bagged_classifiers[num].train(train)
    bagged_predictions = bagged_classifiers[num].predict(test)
  
    actual_values = []
    for case in test:
      actual_values.append(case.split(',')[-1])    
    bagged_metrics_dict.append(bagged_classifiers[num].metrics(bagged_predictions, actual_values))

In [0]:
print("Metrics of each Bagged Naive Bayes classifier in 5 fold validation")
print("First row of confusion matrix corresponds to 'republican' label")
i = 1
for metric in bagged_metrics_dict:
  print("Model ",i," ",metric)
  i+=1

Metrics of each Bagged Naive Bayes classifier in 5 fold validation
First row of confusion matrix corresponds to 'republican' label
Model  1   {'accuracy': 0.8735632183908046, 'recall': 0.8974358974358975, 'precision': 0.8333333333333334, 'confMat': [[35, 4], [7, 41]]}
Model  2   {'accuracy': 0.9310344827586207, 'recall': 0.8888888888888888, 'precision': 0.9411764705882353, 'confMat': [[32, 4], [2, 49]]}
Model  3   {'accuracy': 0.9770114942528736, 'recall': 1.0, 'precision': 0.9375, 'confMat': [[30, 0], [2, 55]]}
Model  4   {'accuracy': 0.9540229885057471, 'recall': 0.90625, 'precision': 0.9666666666666667, 'confMat': [[29, 3], [1, 54]]}
Model  5   {'accuracy': 0.9540229885057471, 'recall': 0.9354838709677419, 'precision': 0.9354838709677419, 'confMat': [[29, 2], [2, 54]]}


In [0]:
print("With bagging (200 Naive Bayes classifiers, 0.1 bagging ratio), the metrics are: ")
acc = 0
for lst in bagged_metrics_dict:
    acc += lst['accuracy']
acc = acc/5
print("\nAccuracy:", acc)
rec = 0
for lst in bagged_metrics_dict:
    rec += lst['recall']
rec = rec/5
print("\nRecall:", rec)
prec = 0
for lst in bagged_metrics_dict:
    prec += lst['precision']
prec = prec/5
print("\nPrecision:", prec)
f_score = 0
print("\nF-score:", (2*prec*rec)/(prec+rec))

With bagging (200 Naive Bayes classifiers, 0.1 bagging ratio), the metrics are: 

Accuracy: 0.9379310344827587

Recall: 0.9256117314585056

Precision: 0.9228320683111955

F-score: 0.9242198098761596


#Plain Boosting


In [0]:
# Boosting
class PlainBoosting:
  def __init__(self, classifier, numberBoosting, samplingRatio):
    self.numberBoosting = numberBoosting
    self.samplingRatio = samplingRatio
    self.classifiers = [classifier() for _ in range(numberBoosting)]
    self.alphas = [0]*numberBoosting
  
  def train(self, train):
    train_size = len(train)
    data_weights = [1/train_size] * train_size
    target_value =[]
    for case in train:
      target_value.append(case.split(',')[-1])   
    sample_size = int(self.samplingRatio * train_size)

    indices = list(range(0,len(train)))
    # Bagging
    for i in range(self.numberBoosting):
      sampled_data = {}
      for ind in indices:
        sampled_data[ind] = train[ind]
        self.classifiers[i].train(list(sampled_data.values()))

      # Boosting
      error = 0
      for j in range(len(train)):
        if j in indices:
          predicted_value = self.classifiers[i].predict([sampled_data[j]])[0]
          if predicted_value != target_value[j]:
            error += data_weights[j]

      # Normalising error
      error = error / sum(data_weights)

      # Setting alpha
      if error!=0:
        self.alphas[i] = (1/2)*math.log((1-error)/error)
      else: 
        self.alphas[i] = 349

      # Updating weights
      for j in range(len(train)):
        if j in indices:
          predicted_value = self.classifiers[i].predict([sampled_data[j]])[0]
          if predicted_value != target_value[j]:
              data_weights[j]  = data_weights[j]*math.exp((self.alphas[i]))
          else:
              data_weights[j]  = data_weights[j]*math.exp((self.alphas[i]))
      
      sum_data_weights = sum(data_weights)
      for j in range(len(train)):
         data_weights[j] = data_weights[j]/ sum_data_weights
      
      data_weights_threshold = sorted(data_weights, reverse=True)[int(len(data_weights)*0.5)]

      indices = []
      for ind in range(0,len(data_weights)):
        if data_weights[ind] >= data_weights_threshold:
          indices.append(ind)
      random.shuffle(indices)     
      indices = indices[:int(len(indices)*self.samplingRatio)]

  def predict(self, test):
    predictions = []

    for i in range(len(test)):
      all_predictions = []
      for j in range(self.numberBoosting):
        if self.classifiers[j].predict([test[i]])[0] == 'democrat':
          all_predictions.append(self.alphas[j])
        else:
          all_predictions.append(-1* self.alphas[j])
        
      if sum(all_predictions) > 0:
        predictions.append('democrat')
      else:
        predictions.append('republican')
      
    return predictions

  def metrics(self, predictions, actual_values):
      true_pos = 0
      true_neg = 0
      false_pos = 0
      false_neg = 0   
      for i in range(len(predictions)):
        if predictions[i] == 'democrat':
          if actual_values[i] =='democrat':
            true_neg += 1
          else:
            false_pos += 1
        else:
          if actual_values[i] == 'republican':
            true_pos += 1
          else:
            false_neg += 1
      total =   true_pos + false_pos + false_neg + true_neg
      confMat = [[true_pos,false_pos],[false_neg,true_neg]]
      metrics_dict = {  'accuracy' : (true_pos + true_neg)/total,
                        'recall' : true_pos/(true_pos + false_pos),
                        'precision' : true_pos/(true_pos + false_neg),
                        'confMat'  :  confMat
        
      }
      return metrics_dict      

In [0]:
from tqdm import tqdm
PlainBoosting_classifiers = []
PlainBoosting_metrics_dict = []
for num in tqdm(range(3)):
    test = data[num]
    train = []
    for x in range(5):
        if(num == x):
            continue
        train.append(data[x])
    train = [item for sublist in train for item in sublist]
    random.shuffle(train)
    PlainBoosting_classifiers.append(PlainBoosting(NaiveBayesClassifier, 200, 0.6))
    PlainBoosting_classifiers[num].train(train)
    PlainBoosting_predictions = PlainBoosting_classifiers[num].predict(test)
  
    actual_values = []
    for case in test:
      actual_values.append(case.split(',')[-1])    
    PlainBoosting_metrics_dict.append(PlainBoosting_classifiers[num].metrics(PlainBoosting_predictions, actual_values))


  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:13<00:27, 13.65s/it][A
 67%|██████▋   | 2/3 [00:25<00:13, 13.25s/it][A
100%|██████████| 3/3 [00:33<00:00, 11.67s/it][A
[A

In [0]:
print("Metrics of each Boosted Naive Bayes classifier in 3 fold validation")
print("First row of confusion matrix corresponds to 'republican' label")
i = 1
for metric in PlainBoosting_metrics_dict:
  print("Model ",i," ",metric)
  i+=1

Metrics of each Boosted Naive Bayes classifier in 3 fold validation
First row of confusion matrix corresponds to 'republican' label
Model  1   {'accuracy': 0.8620689655172413, 'recall': 0.8974358974358975, 'precision': 0.813953488372093, 'confMat': [[35, 4], [8, 40]]}
Model  2   {'accuracy': 0.8620689655172413, 'recall': 0.9166666666666666, 'precision': 0.7857142857142857, 'confMat': [[33, 3], [9, 42]]}
Model  3   {'accuracy': 0.9425287356321839, 'recall': 1.0, 'precision': 0.8571428571428571, 'confMat': [[30, 0], [5, 52]]}


In [0]:
print("With plain boosting (200 Naive Bayes classifiers), the metrics are: ")
acc = 0
for lst in PlainBoosting_metrics_dict:
    acc += lst['accuracy']
acc = acc/3
print("\nAccuracy:", acc)
rec = 0
for lst in PlainBoosting_metrics_dict:
    rec += lst['recall']
rec = rec/3
print("\nRecall:", rec)
prec = 0
for lst in PlainBoosting_metrics_dict:
    prec += lst['precision']
prec = prec/3
print("\nPrecision:", prec)
f_score = 0
print("\nF-score:", (2*prec*rec)/(prec+rec))

With plain boosting (200 Naive Bayes classifiers), the metrics are: 

Accuracy: 0.8888888888888888

Recall: 0.9380341880341879

Precision: 0.8189368770764119

F-score: 0.8744489921252843
