In [195]:
import numpy as np
import random
import pandas as pd

In [196]:
# read data
m0 = np.genfromtxt('DS1_m_0.txt', delimiter=',',usecols=range(0,20))
m1 = np.genfromtxt('DS1_m_1.txt', delimiter=',',usecols=range(0,20))
cov = np.genfromtxt('DS1_Cov.txt', delimiter=',',usecols=range(0,20))

# generate data
neg = np.random.multivariate_normal(m0, cov, 2000)
pos = np.random.multivariate_normal(m1, cov, 2000)

# pick 30% random examples as test set
index = np.arange(2000)
np.random.shuffle(index)
neg_test = neg[index[:600]]
pos_test = pos[index[:600]]
neg_train = neg[index[600:]]
pos_train = pos[index[600:]]

# use transpose so that features align vertically, examples align horizontally
train = np.vstack((neg_train, pos_train)).transpose()
test = np.vstack((neg_test,pos_test)).transpose()
np.savetxt("DS1.txt",test,delimiter=',')
np.savetxt("DS1_train.txt",train,delimiter=',')

In [197]:
# load training data
train = np.asmatrix(np.genfromtxt('DS1_train.txt', delimiter=','))
neg_train = train[:,:1400]
pos_train = train[:,1400:]

n0 = neg_train.shape[1]
n1 = pos_train.shape[1]

# for equal number of examples in each class, Y~Bernoulli(0.5)
p0 = 0.5
p1 = 0.5

# estimate mean
u0 = (np.sum(neg_train,axis=1) / n0).reshape(20,1)
u1 = (np.sum(pos_train,axis=1) / n1).reshape(20,1)

# estimate covariance
s0 = np.asmatrix(neg_train- u0).dot(np.asmatrix(neg_train - u0).transpose()) / n0
s1 = np.asmatrix(pos_train - u1).dot(np.asmatrix(pos_train - u1).transpose()) / n1
sigma = s0*n0/(n0+n1) + s1*n1/(n0+n1)

# compute w and w0
w = np.linalg.inv(sigma).dot(u0-u1)
w0 = -1/2*(u0.transpose().dot(np.linalg.inv(sigma)).dot(u0)) + 1/2* (u1.transpose().dot(np.linalg.inv(sigma)).dot(u1)) + np.log(p0/p1)

In [198]:
# logistic sigmoid function
def ls(a):
    return 1/(1+np.exp(-a))

def predict(x,w,w0):
    prob0 = ls(w.transpose().dot(x) + w0)
    res = np.zeros(x.shape[1])
    res[np.where(prob0<0.5)[1]] = 1
    return res

In [199]:
# LDA

# load test set 
test = np.asmatrix(np.genfromtxt('DS1.txt', delimiter=','))
res = predict(test,w,w0)
res_neg = res[:600]
res_pos = res[600:]

tn = len(np.where(res_neg == 0)[0]) # true negative
fp = len(np.where(res_neg == 1)[0]) # false postive
tp = len(np.where(res_pos == 1)[0]) # true positive
fn = len(np.where(res_pos == 0)[0]) # false negative

acc = (tn+tp)/(tn+fn+tp+fp)
prec = tp/(tp+fp)
rec = tp/(tp+fn)
f1 = 2*prec*rec/(prec+rec)

In [202]:
print("DS1_LDA Result")
print("Accuracy:",acc)
print("Precision:",prec)
print("Recall:",rec)
print("F1 Measure:",f1)

DS1_LDA Result
Accuracy: 0.9475
Precision: 0.9558573853989814
Recall: 0.9383333333333334
F1 Measure: 0.9470142977291843


In [206]:
# k-NN 

# find k nearest neighbors and return class
def fN(x, data, k):
    eDis = np.sqrt(np.sum(np.square(data - x),axis=0))
    rank = np.argsort(eDis)
    kN = rank[:,:k]
    kN[np.where(kN<1400)]=0
    kN[np.where(kN>=1400)]=1
    if (np.mean(kN) > 0.5):
        return 1
    else:
        return 0

# load test set
test = np.asmatrix(np.genfromtxt('DS1.txt', delimiter=','))

# set k range
kmax = 20


res = np.zeros((kmax,1200))
for k in range(0,kmax):
    for i in range(0,test.shape[1]):
        res[k,i] = fN(test[:,i],train,k+1)

res_neg = res[:,:600]
res_pos = res[:,600:]

acc = np.zeros(kmax)
prec = np.zeros(kmax)
rec = np.zeros(kmax)
f1 = np.zeros(kmax)
for k in range(0,kmax):
    tn = len(np.where(res_neg[k,:] == 0)[0]) # true negative
    fp = len(np.where(res_neg[k,:] == 1)[0]) # false postive
    tp = len(np.where(res_pos[k,:] == 1)[0]) # true positive
    fn = len(np.where(res_pos[k,:] == 0)[0]) # false negative
    acc[k] = (tn+tp)/(tn+fn+tp+fp)
    prec[k] = tp/(tp+fp)
    rec[k] = tp/(tp+fn)
    f1[k] = 2*prec[k]*rec[k]/(prec[k]+rec[k])

In [208]:
print("DS1_kNN Result")
print("k = 1 to 20")
summary = np.vstack((acc,prec,rec, f1)).transpose()
pd.DataFrame(summary, np.arange(1,kmax+1),["Accuracy", "Precision", "Recall", "F1"])

DS1_kNN Result
k = 1 to 20


Unnamed: 0,Accuracy,Precision,Recall,F1
1,0.521667,0.522337,0.506667,0.514382
2,0.528333,0.556291,0.28,0.372506
3,0.519167,0.519931,0.5,0.509771
4,0.5325,0.552279,0.343333,0.423433
5,0.545,0.546392,0.53,0.538071
6,0.53,0.544118,0.37,0.440476
7,0.536667,0.539146,0.505,0.521515
8,0.533333,0.547619,0.383333,0.45098
9,0.530833,0.532062,0.511667,0.521665
10,0.5475,0.566745,0.403333,0.471276


In [209]:
# 3 mixture of Gaussian

# read data
m01 = np.genfromtxt('DS2_c2_m1.txt', delimiter=',',usecols=range(0,20))
m02 = np.genfromtxt('DS2_c2_m2.txt', delimiter=',',usecols=range(0,20))
m03 = np.genfromtxt('DS2_c2_m3.txt', delimiter=',',usecols=range(0,20))
m11 = np.genfromtxt('DS2_c1_m1.txt', delimiter=',',usecols=range(0,20))
m12 = np.genfromtxt('DS2_c1_m2.txt', delimiter=',',usecols=range(0,20))
m13 = np.genfromtxt('DS2_c1_m3.txt', delimiter=',',usecols=range(0,20))
cov1 = np.genfromtxt('DS2_Cov1.txt', delimiter=',',usecols=range(0,20))
cov2 = np.genfromtxt('DS2_Cov2.txt', delimiter=',',usecols=range(0,20))
cov3 = np.genfromtxt('DS2_Cov3.txt', delimiter=',',usecols=range(0,20))

# generate data
neg = np.random.multivariate_normal(m01, cov1, 2000)
pos = np.random.multivariate_normal(m11, cov1, 2000)

for i in range(0,2000):
    p = random.random()
    if p < 0.42:
        neg[i] = np.random.multivariate_normal(m02, cov2)
    elif p < 0.9:
        neg[i] = np.random.multivariate_normal(m03, cov3)
    
    p = random.random()
    if p < 0.42:
        pos[i] = np.random.multivariate_normal(m12, cov2)
    elif p < 0.9:
        pos[i] = np.random.multivariate_normal(m13, cov3)
    
# pick 30% random examples as test set
index = np.arange(2000)
np.random.shuffle(index)
neg_test = neg[index[:600]]
pos_test = pos[index[:600]]
neg_train = neg[index[600:]]
pos_train = pos[index[600:]]

# use transpose so that features align vertically, examples align horizontally
train = np.vstack((neg_train, pos_train)).transpose()
test = np.vstack((neg_test,pos_test)).transpose()
np.savetxt("DS2.txt",test,delimiter=',')
np.savetxt("DS2_train.txt",train,delimiter=',')

In [210]:
# load training data
train = np.asmatrix(np.genfromtxt('DS2_train.txt', delimiter=','))
neg_train = train[:,:1400]
pos_train = train[:,1400:]

n0 = neg_train.shape[1]
n1 = pos_train.shape[1]

# for equal number of examples in each class, Y~Bernoulli(0.5)
p0 = 0.5
p1 = 0.5

# estimate mean
u0 = (np.sum(neg_train,axis=1) / n0).reshape(20,1)
u1 = (np.sum(pos_train,axis=1) / n1).reshape(20,1)

# estimate covariance
s0 = np.asmatrix(neg_train- u0).dot(np.asmatrix(neg_train - u0).transpose()) / n0
s1 = np.asmatrix(pos_train - u1).dot(np.asmatrix(pos_train - u1).transpose()) / n1
sigma = s0*n0/(n0+n1) + s1*n1/(n0+n1)

# compute w and w0
w = np.linalg.inv(sigma).dot(u0-u1)
w0 = -1/2*(u0.transpose().dot(np.linalg.inv(sigma)).dot(u0)) + 1/2* (u1.transpose().dot(np.linalg.inv(sigma)).dot(u1)) + np.log(p0/p1)

In [211]:
# load test set 
test = np.asmatrix(np.genfromtxt('DS2.txt', delimiter=','))
res = predict(test,w,w0)
res_neg = res[:600]
res_pos = res[600:]

tn = len(np.where(res_neg == 0)[0]) # true negative
fp = len(np.where(res_neg == 1)[0]) # false postive
tp = len(np.where(res_pos == 1)[0]) # true positive
fn = len(np.where(res_pos == 0)[0]) # false negative

acc = (tn+tp)/(tn+fn+tp+fp)
prec = tp/(tp+fp)
rec = tp/(tp+fn)
f1 = 2*prec*rec/(prec+rec)

In [212]:
print("DS2_LDA Result")
print("Accuracy:",acc)
print("Precision:",prec)
print("Recall:",rec)
print("F1 measure:", f1)

DS2_LDA Result
Accuracy: 0.53
Precision: 0.528125
Recall: 0.5633333333333334
F1 measure: 0.5451612903225808


In [213]:
#k-NN

# load test set
test = np.asmatrix(np.genfromtxt('DS1.txt', delimiter=','))

# set k range
kmax = 20


res = np.zeros((kmax,1200))
for k in range(0,kmax):
    for i in range(0,test.shape[1]):
        res[k,i] = fN(test[:,i],train,k+1)

res_neg = res[:,:600]
res_pos = res[:,600:]

acc = np.zeros(kmax)
prec = np.zeros(kmax)
rec = np.zeros(kmax)
f1 = np.zeros(kmax)
for k in range(0,kmax):
    tn = len(np.where(res_neg[k,:] == 0)[0]) # true negative
    fp = len(np.where(res_neg[k,:] == 1)[0]) # false postive
    tp = len(np.where(res_pos[k,:] == 1)[0]) # true positive
    fn = len(np.where(res_pos[k,:] == 0)[0]) # false negative
    acc[k] = (tn+tp)/(tn+fn+tp+fp)
    prec[k] = tp/(tp+fp)
    rec[k] = tp/(tp+fn)
    f1[k] = 2*prec[k]*rec[k]/(prec[k]+rec[k])

In [214]:
print("DS2_kNN Result")
print("k = 1 to 20")
summary = np.vstack((acc,prec,rec, f1)).transpose()
pd.DataFrame(summary, np.arange(1,kmax+1),["Accuracy", "Precision", "Recall", "F1"])


DS2_kNN Result
k = 1 to 20


Unnamed: 0,Accuracy,Precision,Recall,F1
1,0.515833,0.51586,0.515,0.51543
2,0.510833,0.520505,0.275,0.359869
3,0.516667,0.515625,0.55,0.532258
4,0.510833,0.515081,0.37,0.43065
5,0.515,0.513636,0.565,0.538095
6,0.488333,0.48583,0.4,0.438757
7,0.481667,0.483631,0.541667,0.511006
8,0.499167,0.499072,0.448333,0.472344
9,0.498333,0.498542,0.57,0.531882
10,0.493333,0.49262,0.445,0.467601
