# Q1

In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

cov_data = pd.read_csv('datasets/DS1_Cov.txt', sep=",", header=None)
cov_data = cov_data.dropna(axis=1)
negMeandata = np.array(pd.read_csv('datasets/DS1_m_0.txt', sep=",", header=None))
posMeandata = np.array(pd.read_csv('datasets/DS1_m_1.txt', sep=",", header=None))


#finding randoms samples from multivariate normal distribution
x1 = np.random.multivariate_normal(negMeandata[0], cov_data, 2000)
x2 = np.random.multivariate_normal(posMeandata[0], cov_data, 2000)

#labelling both datasets according and converting into dataframes
x1 = np.c_[x1, np.zeros(2000)]
x1 = np.resize(x1, (2000,21))
df1 = pd.DataFrame(x1,columns = range(21), index=range(2000))
x2 = np.c_[x2, np.ones(2000)]
x2= np.resize(x2, (2000,21))
df2 = pd.DataFrame(x2,columns = range(21), index=range(2000))

#splitting the two dataframes into train, valid and test sets 
train1, valid1, test1 = np.split(df1.sample(frac=1), [int(.6*len(df1)), int(.8*len(df1))])
train2, valid2, test2 = np.split(df2.sample(frac=1), [int(.6*len(df2)), int(.8*len(df2))])

#concatenating the sets obtained
train = [train1, train2] 
ds1Train = pd.concat(train)
ds1Train = shuffle(ds1Train)
valid = [valid1, valid2]
ds1Valid = pd.concat(valid)
ds1Valid = ds1Valid.sample(frac=1).reset_index(drop=True)
test = [test1, test2]
ds1Test = pd.concat(test)
ds1Test = ds1Test.sample(frac=1).reset_index(drop=True)

#saving the three sets into csv
ds1Train.to_csv('datasets/DS1_Train.csv', sep=',', header=None, index=None)
ds1Valid.to_csv('datasets/DS1_Valid.csv', sep=',', header=None, index=None)
ds1Test.to_csv('datasets/DS1_Test.csv', sep=',', header=None, index=None)



# Q2

In [19]:
trainData = np.array(pd.read_csv('datasets/DS1_Train.csv', sep=",", header=None))
testData = np.array(pd.read_csv('datasets/DS1_Test.csv', sep=",", header=None,))

trainY = trainData [:,20]
testY = testData [:,20]

class GDA():

    
    #defining the constructor of the class
    def __init__ (data):
        
        data.m0 = 0
        data.m1 = 0
        data.sigma = 0
        data.w = 0
        data.w0 = 0
        data.predVal = 0
        data.accuracy = 0
        data.precision = 0
        data.recall = 0
        data.f1 = 0
        
    #method to find the means
    def findmu(data, X):
        m1 = 0
        m2 = 0
        i = 0
        c1 = 0
        c2 = 0
        for row in X:
            if (trainY[i] == 0):
                m1 += row[:-1]
                c1 +=1
            else:
                m2 += row[:-1]
                c2 +=1
            i += 1
        data.m0 = m1/c1
        data.m1= m2/c2

    #method to find the covariance
    def findSigma(data, X, m0, m1):
        d1 = 0
        d2 = 0
        i = 0
        c1 = 0
        c2 = 0
        for row in X:
            if (trainY[i] == 0):
                x1 = np.reshape(np.array(row[:-1]-m0), (20,1))
                d1+= np.dot(x1, x1.T)
                c1+=1
            else:
                x2 = np.reshape(np.array(row[:-1]-m1), (20,1))
                d2+=np.dot(x2, x2.T)
                c2+=1
            i+=1
        S1 = d1/c1
        S2 = d2/c2
        data.sigma= (S1*c1)/len(X) + (S2*c2)/len(X)
    
    #method to find the paramters
    def findParameters(data,mu0,mu1,sigma):
        sigmaInv = np.linalg.inv(sigma)
        data.w = np.dot((sigmaInv),(mu0-mu1))
        data.w0 = (-.5 * np.dot(np.dot(mu0.T,sigmaInv),mu0)) + (0.5*np.dot(np.dot(mu1.T,sigmaInv),mu1))
        
    #method to find the sigmoidal
    def sigmoidal(data, a):
        return 1/(1+np.exp(-a))
    
    #method for obtaining predicted values
    def predict(data, X, w, w0):
        data.predVal = np.zeros(len(X))
        i = 0
        for row in X:
            x = (w.T).dot(np.array(row[:-1])) + w0
            probVal = data.sigmoidal(x)
            if probVal < 0.5:
                data.predVal[i] = 1
            i+=1
    
    #method to find the performance measure
    def performanceMeasure(data, testData, w, w0):
        gda.predict(testData, w, w0)
        TP = 0
        FP =0
        FN = 0
        TN = 0
        for x in range (len(gda.predVal)): 
            if(testY[x]==0 and gda.predVal[x] ==0):
                TN+=1
            elif(testY[x]==1 and gda.predVal[x] ==0):
                FN+=1
            elif(testY[x]==0 and gda.predVal[x] ==1):
                FP+=1
            else:
                TP+=1
        data.accuracy = (TP+TN)/(TP+FP+FN+TN)
        data.precision = TP/(TP+FP)
        data.recall = TP/(TP+FN)
        data.f1 = 2*data.precision*data.recall/(data.precision+data.recall)
            
#main method      
if __name__ == "__main__":
    
    gda = GDA()
    gda.findmu(trainData)
    mu0 = gda.m0
    mu1 =gda.m1
    
    gda.findSigma(trainData, mu0, mu1)
    sigma = gda.sigma
    
    gda.findParameters(mu0,mu1,sigma)
    w = gda.w
    w0 = gda.w0
    sigma = gda.sigma
    
    gda.performanceMeasure(testData,w, w0)
    accuracy = gda.accuracy
    precision = gda.precision
    recall = gda.recall
    f1 = gda.f1
    
    print("Accuracy: ", accuracy*100)
    print("Precision: ", precision*100)
    print("Recall: ", recall*100)
    print("F1-Measure: ", f1*100)
    print("\n")
    print("w: ", w)
    print("w0: ", w0)



Accuracy:  94.625
Precision:  94.73684210526315
Recall:  94.5
F1-Measure:  94.61827284105131


w:  [ 1.38768522e+01 -8.28351494e+00 -5.00203974e+00 -2.64726116e+00
 -9.36686362e+00 -4.30585248e+00  1.56511613e+01 -2.35155981e+01
 -2.79248839e+01  9.07144231e+00 -1.25061415e+01 -1.13164129e+01
  1.47171493e+01  1.21992892e+01 -5.48384788e+00  1.27276718e+01
  2.78079652e+01 -6.56346024e+00  1.66660738e-02 -5.04207265e+00]
w0:  26.18169207660766


# Q3
    

In [36]:
validData = np.array(pd.read_csv('datasets/DS1_Valid.csv', sep=",", header=None))

import math
import operator

class KNN():
    
    def __init__ (data):
        
        data.predVal = []
        data.accuracy = 0
        data.precision = 0
        data.recall = 0
        data.f1 = 0
    
    def euclideanDistance(data, X, points, length):
        distance = 0
        for x in range(length):
            distance += pow((X[x] - points[x]), 2)
        return math.sqrt(distance)
 
    def getNeighbors(data, trainingSet, sample, k):
        distances = []
        length = len(sample)-1
        for x in range(len(trainingSet)):
            dist = data.euclideanDistance(sample, trainingSet[x], length)
            distances.append((trainingSet[x], dist))
        distances.sort(key=operator.itemgetter(1))
        neighbors = []
        for x in range(k):
            neighbors.append(distances[x][0])
        return neighbors
 
    def getResponse(data, neighbors):
        classVotes = {}
        for x in range(len(neighbors)):
            response = neighbors[x][-1]
            if response in classVotes:
                classVotes[response] += 1
            else:
                classVotes[response] = 1
        sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
        return sortedVotes[0][0]
    
    def predict(data, trainData, testData, k):
        data.predVal = []
        for x in range(len(testData)):
            neighbors = data.getNeighbors(trainData, testData[x], k)
            result = data.getResponse(neighbors)
            data.predVal.append(result)
            
    def performanceMeasure (data, X, k):
        knn.predict(trainData, X,k)
        X = X[:,20]
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        for x in range (len(knn.predVal)): 
            if(X[x]==0 and knn.predVal[x] ==0):
                TN+=1
            elif(X[x]==1 and knn.predVal[x] ==0):
                FN+=1
            elif(X[x]==0 and knn.predVal[x] ==1):
                FP+=1
            else:
                TP+=1
    
        data.accuracy = (TP+TN)/(TP+FP+FN+TN)
        data.precision = TP/(TP+FP)
        data.recall = TP/(TP+FN)
        data.f1 = 2*data.precision*data.recall/(data.precision+data.recall)

if __name__ == "__main__":
    # prepare data
    knn = KNN()
    
    for k in range (20, 32, 2):   
        knn.performanceMeasure(validData,k)
        f1 = knn.f1 
        print("F1-Measure: ", f1*100)
    
    

F1-Measure:  48.284960422163586
F1-Measure:  49.40239043824701
F1-Measure:  47.27755644090305
F1-Measure:  47.89473684210526
F1-Measure:  48.73501997336884
F1-Measure:  48.94179894179895


In [37]:
    knn.performanceMeasure(testData, 22)
    accuracy = knn.accuracy
    precision = knn.precision
    recall = knn.recall
    f1 = knn.f1   
    print("Accuracy: ", accuracy*100)
    print("Precision: ", precision*100)
    print("Recall: ", recall*100)
    print("F1-Measure: ", f1*100)

Accuracy:  46.875
Precision:  46.973365617433416
Recall:  48.5
F1-Measure:  47.724477244772444


# Q4

In [45]:
covData1 = pd.read_csv('datasets/DS2_Cov1.txt', sep=",", header=None)
covData1 = covData1.dropna(axis=1)
covData2 = pd.read_csv('datasets/DS2_Cov2.txt', sep=",", header=None)
covData2 = covData2.dropna(axis=1)
covData3 = pd.read_csv('datasets/DS2_Cov3.txt', sep=",", header=None)
covData3 = covData3.dropna(axis=1)
posMean1 = np.array(pd.read_csv('datasets/DS2_c1_m1.txt', sep=",", header=None))
posMean2 = np.array(pd.read_csv('datasets/DS2_c1_m2.txt', sep=",", header=None))
posMean3 = np.array(pd.read_csv('datasets/DS2_c1_m3.txt', sep=",", header=None))
negMean1 = np.array(pd.read_csv('datasets/DS2_c2_m1.txt', sep=",", header=None))
negMean2 = np.array(pd.read_csv('datasets/DS2_c2_m2.txt', sep=",", header=None))
negMean3 = np.array(pd.read_csv('datasets/DS2_c2_m3.txt', sep=",", header=None))

#finding randoms samples from multivariate normal distribution with given probabilites
posMean = [posMean1[0], posMean2[0],posMean3[0]]
negMean = [negMean1[0], negMean2[0], negMean3[0]]
cov = [covData1, covData2, covData3]
x1 = []
x2 = []
for i in range(2000):
    k = np.random.choice([0,1,2], p =[0.1,0.42,0.48])
    x1.append(np.random.multivariate_normal(negMean[k], cov[k]))   
    x2.append(np.random.multivariate_normal(negMean[k], cov[k]))

#labelling both datasets according and converting into dataframes
x1 = np.c_[x1, np.zeros(2000)]
x1 = np.resize(x1, (2000,21))
df1 = pd.DataFrame(x1,columns = range(21), index=range(2000))
x2 = np.c_[x2, np.ones(2000)]
x1 = np.resize(x2, (2000,21))
df2 = pd.DataFrame(x2,columns = range(21), index=range(2000))

#splitting the two dataframes into train, valid and test sets 
train1, valid1, test1 = np.split(df1.sample(frac=1), [int(.6*len(df1)), int(.8*len(df1))])
train2, valid2, test2 = np.split(df2.sample(frac=1), [int(.6*len(df2)), int(.8*len(df2))])

#concatenating the sets obtained
train = [train1, train2]
ds2Train = pd.concat(train)
ds2Train = shuffle(ds2Train)
valid = [valid1, valid2]
ds2Valid = pd.concat(valid)
ds2Valid = shuffle(ds2Valid)
test = [test1, test2]
ds2Test = pd.concat(test)
ds2Test = shuffle(ds2Test)

#saving the three sets into csv
ds2Train.to_csv('datasets/DS2_Train.csv', sep=',', header=None, index=None)
ds2Valid.to_csv('datasets/DS2_Valid.csv', sep=',', header=None, index=None)
ds2Test.to_csv('datasets/DS2_Test.csv', sep=',', header=None, index=None)



# Q5

In [52]:
trainData = np.array(pd.read_csv('datasets/DS2_Train.csv', sep=",", header=None))
validData = np.array(pd.read_csv('datasets/DS2_Valid.csv', sep=",", header=None,))
testData = np.array(pd.read_csv('datasets/DS2_Test.csv', sep=",", header=None,))

trainY = trainData[:,20]
testY = testData[:,20]

if __name__ == "__main__":
    
    gda = GDA()
    gda.findmu(trainData)
    mu0 = gda.m0
    mu1 =gda.m1
    
    gda.findSigma(trainData, mu0, mu1)
    sigma = gda.sigma
    
    gda.findParameters(mu0,mu1,sigma)
    w = gda.w
    w0 = gda.w0
    sigma = gda.sigma
    
    gda.performanceMeasure(testData,w, w0)
    accuracy = gda.accuracy
    precision = gda.precision
    recall = gda.recall
    f1 = gda.f1
    
    print("GDA performance measure:")
    print("Accuracy: ", accuracy*100)
    print("Precision: ", precision*100)
    print("Recall: ", recall*100)
    print("F1-Measure: ", f1*100)
    print("\n")
    print("w: ", w)
    print("w0: ", w0)


GDA performance measure:
Accuracy:  50.875
Precision:  50.89058524173028
Recall:  50.0
F1-Measure:  50.44136191677174


w:  [ 0.02612972 -0.04473386 -0.07967257 -0.05852181  0.04346625  0.0287968
 -0.02425289  0.01745191  0.02554836  0.01851677  0.0370603   0.02592644
 -0.07518714  0.02891718  0.02768238  0.01001033 -0.04085738  0.01336663
  0.01090744  0.01215722]
w0:  0.004263551114059683


array([[8.42906741, 5.9393321 , 5.10350445, 5.3988378 , 4.88055203,
        6.19594916, 6.44111712, 6.10997571, 5.1626616 , 5.70115922,
        6.01390217, 5.63362095, 5.49795036, 6.59046668, 5.88223238,
        6.33393983, 5.89492134, 6.01493146, 6.2090486 , 6.33389749],
       [5.9393321 , 7.62084937, 5.17785994, 5.48749818, 5.36572691,
        6.40939214, 6.88031124, 5.75806806, 4.95635624, 5.35385276,
        5.17576998, 5.45710834, 5.27954365, 6.51610183, 5.98047829,
        6.38672833, 5.62479725, 5.42745259, 6.54206321, 5.90954395],
       [5.10350445, 5.17785994, 6.852861  , 5.3687605 , 5.01817664,
        5.07580344, 6.2304002 , 4.86312853, 4.8719076 , 5.10207346,
        4.97303612, 5.31060242, 5.31232654, 6.21309674, 5.62487716,
        5.22081366, 5.59967067, 4.71853181, 6.09341739, 5.89580619],
       [5.3988378 , 5.48749818, 5.3687605 , 6.66128767, 4.49378978,
        5.80562685, 6.53459705, 5.58944451, 5.0031277 , 5.6810806 ,
        5.12777004, 5.16825152, 5.17667106, 5

In [41]:
if __name__ == "__main__":
    # prepare data
    knn = KNN()
    
    for k in range (1, 10, 2):   
        knn.performanceMeasure(validData,k)
        f1 = knn.f1 
        print("F1-Measure: ", f1*100)
    

F1-Measure:  50.44136191677174
F1-Measure:  47.382198952879584
F1-Measure:  46.728971962616825
F1-Measure:  47.248322147651
F1-Measure:  48.339973439575026


In [44]:
    knn.performanceMeasure(testData, 10)
    accuracy = knn.accuracy
    precision = knn.precision
    recall = knn.recall
    f1 = knn.f1   
    print("Accuracy: ", accuracy*100)
    print("Precision: ", precision*100)
    print("Recall: ", recall*100)
    print("F1-Measure: ", f1*100)

Accuracy:  49.0
Precision:  48.984771573604064
Recall:  48.25
F1-Measure:  48.614609571788414
