In [1]:
import numpy as np
import pandas as pd


In [19]:
class GaussianDA:
  """Gaussian discriminant analysis (GDA) for binary 0/1 labels.

  ``fit`` estimates the class prior, the two class means and one shared
  covariance matrix in closed form and stores the resulting linear decision
  function in ``theta``: ``theta[0]`` is the intercept and ``theta[1:]`` are
  the per-feature weights.  ``predict`` returns ``sigmoid(x . theta)``.

  Args:
    step_size: Stored on the instance; not read by the closed-form ``fit``.
    max_iter: Stored on the instance; not read by the closed-form ``fit``.
    eps: Stored on the instance; not read by the closed-form ``fit``.
    theta: Optional initial parameter vector (intercept first).
    verbose: When true, ``fit`` prints the fitted ``theta``.
  """

  def __init__(self, step_size=0.01, max_iter=1000000, eps=1e-5, theta=None, verbose=True) -> None:
    self.theta = theta
    self.step_size = step_size
    self.max_iter = max_iter
    self.eps = eps
    self.verbose = verbose

  @staticmethod
  def sigmoid(z):
    """Element-wise logistic function ``1 / (1 + exp(-z))``."""
    return 1 / (1 + np.exp(-z))

  def predict(self, x):
    """Return ``sigmoid(x . theta)`` for the given inputs.

    Args:
      x: Inputs whose last dimension is either ``len(theta)`` (an intercept
        column is already present; ``x`` is used as-is) or ``len(theta) - 1``
        (raw features, as passed to ``fit``; a leading column of ones is
        added here).

    Returns:
      The sigmoid outputs, one per row of ``x``.

    Raises:
      ValueError: If ``theta`` is not set (no ``fit`` call and no ``theta``
        passed to the constructor).
    """
    if self.theta is None:
      raise ValueError('GaussianDA.predict called before fit: theta is not set')

    n_params = np.size(self.theta)
    shape = np.shape(x)
    # fit() takes raw features while theta carries an extra intercept entry,
    # so accept raw features here too instead of forcing every caller to
    # prepend the ones column by hand.
    if shape and shape[-1] == n_params - 1:
      raw = np.asarray(x, dtype=float)
      ones = np.ones(raw.shape[:-1] + (1,))
      x = np.concatenate((ones, raw), axis=-1)
    return self.sigmoid(x.dot(self.theta))

  def fit(self, x, y):
    """Fit the GDA parameters in closed form and store them in ``self.theta``.

    Args:
      x: Array-like of shape (m, n) holding the raw features, without an
        intercept column (a constant column makes the covariance singular).
      y: Array-like of m labels, each equal to 0 or 1.

    Raises:
      ValueError: If ``x`` is not 2-D, ``x`` and ``y`` disagree on the number
        of examples, a label is not 0/1, or a class has no examples.
      numpy.linalg.LinAlgError: If the shared covariance matrix is singular.
    """
    # Work on plain arrays so pandas index/column alignment cannot change
    # (or break) the arithmetic below.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y).reshape(-1)
    if x.ndim != 2:
      raise ValueError('x must be 2-D (examples x features), got shape {}'.format(x.shape))
    m, n = x.shape
    if y.shape[0] != m:
      raise ValueError('x has {} rows but y has {} labels'.format(m, y.shape[0]))

    is_pos = y == 1
    is_neg = y == 0
    if not np.all(is_pos | is_neg):
      raise ValueError('y must contain only the labels 0 and 1')
    n_pos = int(is_pos.sum())
    n_neg = int(is_neg.sum())
    if n_pos == 0 or n_neg == 0:
      # Without this check the class mean would be 0/0 and theta all-NaN.
      raise ValueError('both classes need at least one example')

    # Maximum-likelihood estimates: prior, class means, shared covariance.
    phi = n_pos / m
    mu_0 = x[is_neg].mean(axis=0)
    mu_1 = x[is_pos].mean(axis=0)
    centered = x - np.where(is_pos[:, None], mu_1, mu_0)
    sigma = centered.T.dot(centered) / m

    # weights = sigma^-1 (mu_1 - mu_0), obtained with a linear solve rather
    # than an explicit inverse.
    weights = np.linalg.solve(sigma, mu_1 - mu_0)
    # Because sigma is symmetric,
    #   mu_0' sigma^-1 mu_0 - mu_1' sigma^-1 mu_1 = -weights . (mu_0 + mu_1),
    # so the intercept needs no further solves.
    intercept = -0.5 * weights.dot(mu_0 + mu_1) + np.log(phi / (1 - phi))

    self.theta = np.zeros(n + 1)
    self.theta[0] = intercept
    self.theta[1:] = weights

    if self.verbose:
      print('Final theta (GDA): {}'.format(self.theta))



In [29]:
# Training split: the first two columns are the inputs, the third is the label.
df = pd.read_csv("ds2_train.csv")

x_train, y_train = df.iloc[:, :2], df.iloc[:, 2]

In [15]:
# Leave `x_train` as the raw feature table. GaussianDA.fit sizes theta as
# n + 1 and estimates the covariance from x itself, so a constant ones column
# in x would make that covariance singular and the fit would fail when the
# notebook is run top to bottom. The intercept-augmented copy therefore gets
# its own name instead of overwriting `x_train`.
x_ones = np.ones((x_train.shape[0], 1))
x_train_intercept = np.concatenate((x_ones, x_train), axis=1)

In [30]:
# Inspect the training inputs.
x_train

Unnamed: 0,x_1,x_2
0,1.259481,3.507940
1,0.922057,0.991203
2,0.278818,0.112071
3,1.518066,1.653732
4,-0.693938,0.685966
...,...,...
795,2.328355,2.173261
796,1.167061,1.456635
797,1.544105,1.149879
798,0.575461,1.017255


In [31]:
# Inspect the training labels.
y_train

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
795    1.0
796    1.0
797    1.0
798    1.0
799    1.0
Name: y, Length: 800, dtype: float64

In [32]:
# Closed-form GDA fit on the training split; with the default verbose=True
# the fitted theta is printed.
clf = GaussianDA()
clf.fit(x=x_train, y=y_train)

Final theta (GDA): [-2.11390101  0.85879252  0.84927843]


In [34]:
# Validation split, laid out like the training file: two input columns
# followed by the label column.
df_eval = pd.read_csv("ds2_valid.csv")

x_eval, y_eval = df_eval.iloc[:, :2], df_eval.iloc[:, 2]

In [35]:
# theta[0] is the intercept term, so prepend a column of ones to line the
# evaluation inputs up with theta.
x_ones = np.ones(shape=(len(x_eval), 1))
x_eval = np.concatenate([x_ones, x_eval], axis=1)

In [36]:
# Sigmoid outputs of the fitted model on the evaluation inputs.
p_eval = clf.predict(x_eval)

In [37]:
# Boolean predictions: True where the model output exceeds 0.5.
yhat = p_eval > 0.5

In [38]:
# Fraction of validation rows whose thresholded prediction matches the label.
# The classifier evaluated here is GaussianDA, so the report is labelled GDA
# rather than LR.
print('GDA Accuracy: %.2f' % np.mean( (yhat == 1) == (y_eval == 1)))

LR Accuracy: 0.86
