In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
ionosphere_df = pd.read_csv('datasets/ionosphere.csv')

In [3]:
len(ionosphere_df)

351

# Data preprocessing

In this dataset, we need to have only 2 kinds of preprocessing: 
- Removing null values
- Removing duplicate rows

## Checking for null values

In [4]:
print("Total number of rows", len(ionosphere_df))
ionosphere_df.isna().sum()

('Total number of rows', 351)


0     0
1     0
2     1
3     0
4     1
5     0
6     0
7     0
8     2
9     1
10    0
11    1
12    0
13    0
14    1
15    1
16    1
17    0
18    0
19    1
20    0
21    0
22    0
23    0
24    3
25    0
26    2
27    2
28    0
29    0
30    2
31    1
32    0
33    0
34    0
dtype: int64

In the above output, we see that the number of missing valued rows are very less than the total number of rows. Hence, we drop all rows with any null value.

In [5]:
ionosphere_df = ionosphere_df.dropna()

In [6]:
print("Size of dataset", len(ionosphere_df))

('Size of dataset', 331)


## Removing duplicate rows

In [7]:
ionosphere_df = ionosphere_df.drop_duplicates()

In [8]:
print("The final dataset size is", len(ionosphere_df))

('The final dataset size is', 330)


In [9]:
ionosphere_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.00000,0.03760,...,-0.51171,0.41078,-0.46168,0.21266,-0.34090,0.42267,-0.54487,0.18641,-0.45300,g
1,1,0,1.00000,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.00000,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.19040,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.00000,-0.03365,1.00000,0.00485,1.00000,-0.12062,0.88965,0.01198,...,-0.40220,0.58984,-0.22145,0.43100,-0.17365,0.60436,-0.24180,0.56045,-0.38238,g
3,1,0,1.00000,-0.45161,1.00000,1.00000,0.71216,-1.00000,0.00000,0.00000,...,0.90695,0.51613,1.00000,1.00000,-0.20099,0.25682,1.00000,-0.32382,1.00000,b
4,1,0,1.00000,-0.02401,0.94140,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.13290,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g
5,1,0,0.02337,-0.00592,-0.09924,-0.11949,-0.00763,-0.11824,0.14706,0.06637,...,-0.01535,-0.03240,0.09223,-0.07859,0.00732,0.00000,0.00000,-0.00039,0.12011,b
6,1,0,0.97588,-0.10602,0.94601,-0.20800,0.92806,-0.28350,0.85996,-0.27342,...,-0.81634,0.13659,-0.82510,0.04606,-0.82395,-0.04262,-0.81318,-0.13832,-0.80975,g
7,0,0,0.00000,0.00000,0.00000,0.00000,1.00000,-1.00000,0.00000,0.00000,...,1.00000,1.00000,1.00000,0.00000,0.00000,1.00000,1.00000,0.00000,0.00000,b
8,1,0,0.96355,-0.07198,1.00000,-0.14333,1.00000,-0.21313,1.00000,-0.36174,...,-0.65440,0.57577,-0.69712,0.25435,-0.63919,0.45114,-0.72779,0.38895,-0.73420,g
9,1,0,-0.01864,-0.08459,0.00000,0.00000,0.00000,0.00000,0.11470,-0.26810,...,-0.01326,0.20645,-0.02294,0.00000,0.00000,0.16595,0.24086,-0.08208,0.38065,b


## Encoding categorical columns

The last column i.e column 34 has 2 types of entries: 'g' and 'b', standing for good and bad respectively.

Since the column takes only 2 kinds of values, we can use label encoding to encode the entire column.

In [10]:
ionosphere_df['34'] = ionosphere_df['34'].replace('g', 1)

In [11]:
ionosphere_df['34'] = ionosphere_df['34'].replace('b', 0)

In [12]:
ionosphere_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,1
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,0
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,1
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,0
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,1


In [13]:
cov_mat = np.asarray(ionosphere_df.cov())

In [14]:
cov_mat.shape

(35, 35)

In [15]:
print(sum(cov_mat[i].T.dot(cov_mat[i]) < 0 for i in range(len(cov_mat))))

0


## Splitting into train and test sets

In [16]:
import random

def train_test_split(train_percent, df, id_col):
  random.seed()
  df = df.sample(frac=1).reset_index(drop=True)
  train_set_len = int(train_percent * len(df[id_col]))

  return df.head(train_set_len), df.tail(len(df[id_col]) - train_set_len)

In [17]:
train_set, test_set = train_test_split(0.8, ionosphere_df, '0')

print("Size of the train set", len(train_set))
print("Size of the test set", len(test_set))

('Size of the train set', 264)
('Size of the test set', 66)


In [18]:
train_y = np.asarray(train_set['34'])
train_x = np.asarray(train_set.drop(columns = '34'))

In [19]:
test_y = np.asarray(test_set['34'])
test_x = np.asarray(test_set.drop(columns = '34'))

## Gradient Descent

In [20]:
import math

def sigmoid(x):
  return 1 / (1 + math.exp(-x))

In [21]:
def compute_gradient(x, y, w):
  return sum(x[i] * (sigmoid(w.T.dot(x[i])) - y[i]) for i in range(len(y)))

In [22]:
def gradient_descent(x, y, epochs, learning_rate):
  w = np.zeros(len(x[0]))

  for epoch in range(epochs):
    
    w = w - (learning_rate * compute_gradient(x, y, w))

  return w

In [23]:
def predict(x, w):
  return [1 if sigmoid(w.T.dot(x[i])) > 0.5 else 0 for i in range(len(x))]

In [35]:
def accuracy(pred, y):
  return sum(pred[i] == y[i] for i in range(len(y))) * 1.0 / len(y)

In [36]:
weights = gradient_descent(train_x, train_y, 20, 0.01)

test_preds = predict(test_x, weights)
train_preds = predict(train_x, weights)

print("Accuracy of the test set:", accuracy(test_preds, test_y))
print("Accuracy of the train set:", accuracy(train_preds, train_y))

('Accuracy of the test set:', 0.8939393939393939)
('Accuracy of the train set:', 0.8787878787878788)


## Newton's Method of Optimization

In [26]:
def double_derivative(x, w, k, j):
  return sum(x[i][j] * x[i][k] * sigmoid(w.T.dot(x[i])) * sigmoid(-w.T.dot(x[i])) for i in range(len(x)))

In [27]:
def compute_hessian(x, w):
  hessian = [[0 for j in range(len(w))] for i in range(len(w))]

  for i in range(len(w)):
    for j in range(len(w)):
      hessian[i][j] = double_derivative(x, w, i, j)
  
  # print(np.asarray(hessian).shape)
  return hessian

In [28]:
def newtons_method(x, y, epochs, lambda_correction):
  w = np.zeros(len(x[0]))

  for epoch in range(epochs):
    gradient = compute_gradient(x, y, w)
    hessian = (np.linalg.inv(compute_hessian(x, w) - lambda_correction * 
                           np.identity(34)))
    w = w - hessian.dot(gradient)

  return w

In [37]:
weights = newtons_method(train_x, train_y, 20, 0.0001)

test_preds = predict(test_x, weights)
train_preds = predict(train_x, weights)

print("Accuracy of the test set:", accuracy(test_preds, test_y))
print("Accuracy of the train set:", accuracy(train_preds, train_y))

('Accuracy of the test set:', 0.8484848484848485)
('Accuracy of the train set:', 0.8939393939393939)


## Naive Bayes Classifier 

In [30]:
corr = ionosphere_df.corr()

cmap = cmap=sns.diverging_palette(5, 250, as_cmap=True)

corr.style.background_gradient(cmap, axis=1)\
    .set_properties(**{'max-width': '60px', 'font-size': '10pt'})\
    .set_precision(1)\

  xa[xa < 0] = -1


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
0,1.0,,0.3,-0.006,0.2,0.1,0.2,0.03,0.2,-0.06,0.03,0.07,0.1,0.2,0.1,0.1,0.05,0.09,0.2,0.03,0.2,-0.2,0.004,-0.08,0.009,0.2,-0.2,-0.02,0.1,-0.1,0.2,-0.1,0.2,0.01,0.5
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0.3,,1.0,0.1,0.5,0.05,0.5,-0.02,0.5,0.06,0.3,0.2,0.2,0.2,0.2,0.1,0.2,0.2,0.3,0.2,0.2,0.2,0.2,-0.006,0.3,-0.05,0.07,0.1,0.4,0.04,0.2,-0.02,0.3,0.02,0.5
3,-0.006,,0.1,1.0,-0.03,-0.2,-0.04,0.2,-0.3,0.2,-0.2,0.3,-0.2,0.2,-0.3,0.2,-0.3,-0.1,-0.3,0.2,-0.3,-0.02,-0.2,0.2,-0.1,-0.2,-0.07,-0.006,-0.05,0.3,-0.2,-0.1,-0.2,0.04,0.1
4,0.2,,0.5,-0.03,1.0,0.06,0.6,-0.05,0.5,-0.02,0.4,0.04,0.5,0.1,0.4,0.1,0.3,0.03,0.2,0.05,0.3,0.2,0.5,0.1,0.2,-0.01,0.1,0.2,0.3,0.03,0.4,0.02,0.4,-0.09,0.5
5,0.1,,0.05,-0.2,0.06,1.0,-0.01,0.3,-0.1,0.2,-0.3,0.2,-0.3,0.1,-0.4,0.2,-0.3,0.2,-0.2,-0.07,-0.1,-0.1,-0.2,-0.3,-0.2,0.04,-0.2,-0.08,-0.03,-0.1,-0.1,0.3,0.01,0.2,0.2
6,0.2,,0.5,-0.04,0.6,-0.01,1.0,-0.1,0.4,-0.09,0.4,-0.01,0.6,0.09,0.6,-0.01,0.4,0.1,0.4,0.2,0.6,0.2,0.4,0.1,0.3,0.08,0.09,0.1,0.3,-0.005,0.4,0.003,0.5,-0.07,0.5
7,0.03,,-0.02,0.2,-0.05,0.3,-0.1,1.0,-0.3,0.4,-0.4,0.4,-0.4,0.2,-0.3,0.4,-0.5,0.06,-0.4,0.08,-0.4,-0.2,-0.3,0.02,-0.2,-0.1,-0.3,0.06,-0.1,0.06,-0.2,0.1,-0.2,0.4,0.2
8,0.2,,0.5,-0.3,0.5,-0.1,0.4,-0.3,1.0,-0.3,0.7,-0.2,0.6,-0.09,0.6,-0.02,0.6,0.2,0.7,0.07,0.5,0.2,0.4,0.2,0.4,0.1,0.2,0.1,0.3,-0.02,0.3,-0.05,0.3,-0.09,0.3
9,-0.06,,0.06,0.2,-0.02,0.2,-0.09,0.4,-0.3,1.0,-0.3,0.4,-0.4,0.3,-0.4,0.3,-0.4,0.1,-0.5,0.005,-0.4,-0.04,-0.3,0.1,-0.3,-0.05,-0.3,0.07,-0.1,-0.007,-0.2,-0.03,-0.2,0.08,0.1


### Univariate Guassian 

We can take any one of columns 0, 2, 4 or 6 for univariate bayes classifier due to the correlation these columns have on column 34 which is maximum among all other columns.

In [31]:
import math

def find_gaussian_params(x_column, y_column, df, outcome):
  mean = df[x_column][df[y_column] == outcome].mean()
  std_dev = math.sqrt(df[x_column][df[y_column] == outcome].var())

  return mean, std_dev

In [32]:
import math

def p_x_given_y(mu, sigma, x):
  # print(sigma)
  return (1 / (sigma * math.sqrt(2 * math.pi))) * math.exp(-0.5 * (((x - mu) / sigma) ** 2))

In [33]:
def p_y(y_col, n, outcome):
    no_highs = sum(y_col)
    print(no_highs, n)
    return no_highs * 1.0 / n if outcome == 1 else 1 - no_highs * 1.0 / n

In [43]:
def fit(x_col, y_col, df):
  mean_1, std_dev_1 = find_gaussian_params(x_col, y_col, df, 1)
  mean_0, std_dev_0 = find_gaussian_params(x_col, y_col, df, 0)

  return mean_1, std_dev_1, mean_0, std_dev_0

In [44]:
def predict(x, probability_c, df, mu_0, sigma_0, mu_1, sigma_1):
  # print("sigma", sigma_1)
  probability_0 = [p_x_given_y(mu_0, sigma_0, x[i]) * probability_c[0] for i in range(len(x))]
  probability_1 = [p_x_given_y(mu_1, sigma_1, x[i]) * probability_c[1] for i in range(len(x))]

  return [0 if probability_0[i] > probability_1[i] else 1 for i in range(len(x))]

In [45]:
def accuracy(y_pred, y_actual):
  return sum(y_pred[i] == y_actual[i] for i in range(len(y_actual))) * 1.0 / len(y_actual)

In [46]:
columns_considered = ['0', '2', '4', '6']

print("Mean and Std Deviation for outcome 0")
for column in columns_considered:
  mu, sigma = find_gaussian_params(column, '34', ionosphere_df, 0)
  print(mu, sigma)

print("Mean and Std Deviation for outcome 1")
for column in columns_considered:
  mu, sigma = find_gaussian_params(column, '34', ionosphere_df, 0)
  print(mu, sigma)

Mean and Std Deviation for outcome 0
(0.6890756302521008, 0.464829094885346)
(0.30057058823529414, 0.6622126454409145)
(0.24941873949579832, 0.7003173815257205)
(0.24371310924369746, 0.6328189176241198)
Mean and Std Deviation for outcome 1
(0.6890756302521008, 0.464829094885346)
(0.30057058823529414, 0.6622126454409145)
(0.24941873949579832, 0.7003173815257205)
(0.24371310924369746, 0.6328189176241198)


In [48]:
possible_columns = [ '2', '4', '6']

probability_c_train = [p_y(train_y, len(train_y), 0), p_y(train_y, len(train_y), 1)]
probability_c_test = [p_y(test_y, len(test_y), 0), p_y(test_y, len(test_y), 1)]

print(probability_c_test)
print(probability_c_train)

for column in possible_columns:
  mean_1, std_dev_1, mean_0, std_dev_0 = fit(column, '34', train_set)

  print("Accuracy of test set for col " + column + " is:", accuracy(predict(np.asarray(test_set[column]), probability_c_train, ionosphere_df, mean_0, std_dev_0, mean_1, std_dev_1), test_y))
  print("Accuracy of train set for col " + column + " is:", accuracy(predict(np.asarray(train_set[column]), probability_c_train, ionosphere_df, mean_0, std_dev_0, mean_1, std_dev_1), train_y))


(164, 264)
(164, 264)
(47, 66)
(47, 66)
[0.28787878787878785, 0.7121212121212122]
[0.3787878787878788, 0.6212121212121212]
('Accuracy of test set for col 2 is:', 0.7727272727272727)
('Accuracy of train set for col 2 is:', 0.7954545454545454)
('Accuracy of test set for col 4 is:', 0.803030303030303)
('Accuracy of train set for col 4 is:', 0.8143939393939394)
('Accuracy of test set for col 6 is:', 0.7727272727272727)
('Accuracy of train set for col 6 is:', 0.7916666666666666)


Since the accuracy is maximum for column 4, we take column 4.

## Multivariate Gaussian

#### Columns to be taken for multivariate Guassian

In [None]:
## From the correlation matrix we got above the value we predict depends highly on columns: 0, 2, 4, 6

train_x_gnb = train_set[['0', '2', '4', '5', '6', '8', '10', '11', '12', '30']].copy()
test_x_gnb = test_set[['0', '2', '4', '5', '6', '8', '10', '11', '12', '30']].copy()

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score 

In [None]:
gnb = GaussianNB()

In [None]:
gnb.fit(train_x_gnb, train_y)

In [None]:
train_yhat = gnb.predict(train_x_gnb)

In [None]:
print("Accuracy of the train data", accuracy_score(train_y, train_yhat))
print("f1 score of the train data", f1_score(train_y, train_yhat))

In [None]:
test_yhat = gnb.predict(test_x_gnb)

In [None]:
print("Accuracy of the test data", accuracy_score(test_y, test_yhat))
print("f1 score of the test data", f1_score(test_y, test_yhat))