In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
import sys
from tqdm import tqdm
# Importing the StringIO module.
from io import StringIO
from zipfile import ZipFile
import os
import random
from skimage import color, io
import gc
import scipy
import cv2

In [None]:
class Q1DataLoader():
    """Dataset wrapper for the binary-classification question (Q1).

    The archive at ``data_path`` is opened with ``np.load``; the entries
    ``train_images``/``test_images``/``val_images`` and the matching
    ``*_labels`` entries are read from it by ``train_test_split``.
    """

    def __init__(self,data_path):
        # Keep the opened archive; individual arrays are fetched on demand.
        self.data = np.load(data_path)

    def train_test_split(self):
        """Flatten every image split to 2-D and return it with its labels.

        Returns:
            tuple: ``(x_train, y_train, x_test, y_test, x_val, y_val)`` where
            each ``x_*`` has shape ``(n_samples, -1)`` and each ``y_*`` is the
            stored label array, untouched.
        """
        def flatten(key):
            images = self.data[key]
            return images.reshape(images.shape[0],-1)

        x_train = flatten('train_images')
        x_test = flatten('test_images')
        x_val = flatten('val_images')

        return (
            x_train, self.data['train_labels'],
            x_test, self.data['test_labels'],
            x_val, self.data['val_labels'],
        )

    def get_metrics(self,pred,actual):
        """Score binary predictions against the ground truth.

        Label ``1`` is the positive class. Any ratio whose denominator is
        zero is reported as ``0``.

        Args:
            pred: indexable collection of predicted labels.
            actual: array of true labels; ``actual.shape[0]`` sets how many
                samples are scored.

        Returns:
            tuple: ``(accuracy, F1, AUC)`` where ``AUC`` is the mean of recall
            and specificity.
        """
        tp = fp = tn = fn = 0
        for idx in range(actual.shape[0]):
            matched = bool(pred[idx] == actual[idx])
            positive = bool(actual[idx]==1)
            if matched and positive:
                tp += 1
            elif matched:
                tn += 1
            elif positive:
                fn += 1
            else:
                fp += 1

        accuracy = (tp+tn)/(tp+fp+tn+fn)
        recall = tp/(tp+fn) if tp+fn != 0 else 0
        precision = tp/(tp+fp) if tp+fp != 0 else 0
        specificity = tn/(tn+fp) if tn+fp != 0 else 0

        if recall==0 and precision==0:
            F1 = 0
        else:
            F1 = 2*recall*precision/(recall+precision)

        AUC = (recall + specificity)/2

        return accuracy,F1,AUC


In [None]:
class Q2DataLoader():
    """Dataset wrapper for the multi-class classification question (Q2).

    The archive at ``data_path`` is opened with ``np.load``; image and label
    splits are read from it by ``train_test_split``.
    """

    def __init__(self,data_path):
        self.data = np.load(data_path)

    def train_test_split(self):
        """Return flattened image splits and their labels.

        Returns:
            tuple: ``(x_train, y_train, x_test, y_test, x_val, y_val)``; every
            ``x_*`` is reshaped to ``(n_samples, -1)``.
        """
        x_train = self.data['train_images'].reshape(self.data['train_images'].shape[0],-1)
        x_test = self.data['test_images'].reshape(self.data['test_images'].shape[0],-1)
        x_val = self.data['val_images'].reshape(self.data['val_images'].shape[0],-1)
        y_train = self.data['train_labels']
        y_test = self.data['test_labels']
        y_val = self.data['val_labels']

        return x_train,y_train,x_test,y_test,x_val,y_val

    def confusion_mat(self,actual, pred, classes=8):
        """Build a ``classes x classes`` confusion matrix.

        Args:
            actual: true labels (any shape; flattened and cast to int).
            pred: predicted labels (any shape; flattened and cast to int).
            classes: number of classes; defaults to 8 as in the original code.

        Returns:
            np.ndarray: float matrix where entry ``[i, j]`` counts samples of
            true class ``i`` predicted as class ``j``.
        """
        # Flatten so column-vector labels of shape (n, 1) index the matrix
        # with scalars instead of 1-element arrays.
        actual = np.asarray(actual).reshape(-1).astype(np.int64)
        pred = np.asarray(pred).reshape(-1).astype(np.int64)
        mat = np.zeros((classes, classes))
        # np.add.at accumulates correctly when the same cell appears twice.
        np.add.at(mat, (actual, pred), 1)
        return mat

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Element-wise ``numerator / denominator`` with 0 where the denominator is 0."""
        numerator = np.asarray(numerator, dtype=float)
        denominator = np.asarray(denominator, dtype=float)
        return np.divide(
            numerator, denominator,
            out=np.zeros_like(numerator), where=denominator != 0,
        )

    def get_metrics(self, y_prediction,y_true):
        """Compute one-vs-rest metrics for every class.

        Args:
            y_prediction: predicted labels.
            y_true: true labels.

        Returns:
            tuple: ``(ACC, F1, AUC)``, each an array with one entry per class.
            ``AUC`` is the mean of the true-positive and true-negative rates.
        """
        # Must be called through ``self``: the bare name is not defined at
        # module level and raised NameError.
        cnf_matrix = self.confusion_mat(y_true, y_prediction)

        TP = np.diag(cnf_matrix).astype(float)
        FP = cnf_matrix.sum(axis=0) - TP
        FN = cnf_matrix.sum(axis=1) - TP
        TN = cnf_matrix.sum() - (FP + FN + TP)

        TPR = self._safe_ratio(TP, TP + FN)
        TNR = self._safe_ratio(TN, TN + FP)
        PPV = self._safe_ratio(TP, TP + FP)

        ACC = (TP+TN)/(TP+FP+FN+TN)
        AUC = (TNR + TPR)/2
        F1 = self._safe_ratio(2*TPR*PPV, TPR + PPV)

        # The original also computed class-frequency-weighted f1/acc values but
        # never returned them; that dead code is dropped (it also failed to
        # broadcast whenever a class was absent from y_true).
        return ACC, F1, AUC

In [None]:
class Q3DataLoader():
    """Dataset wrapper for the bounding-box regression question (Q3).

    Args:
        ann_path: directory searched recursively for ``.xml`` annotations.
        im_path: directory searched recursively for ``png`` images.
    """

    def __init__(self,ann_path,im_path):

        self.ann_path = ann_path
        self.im_path = im_path

    def get_file_list(self,root, file_type):
        """Return every file under ``root`` whose name ends with ``file_type``.

        The result is sorted: ``os.walk`` yields entries in arbitrary order,
        and annotations are paired with images purely by list position.
        """
        return sorted(
            os.path.join(directory_path, f)
            for directory_path, _, files in os.walk(root)
            for f in files if f.endswith(file_type)
        )

    def get_train_df(self,ann_path=None, img_path=None):
        """Read one ``(xmin, ymin, xmax, ymax)`` box from every annotation file.

        Args:
            ann_path: annotation directory; falls back to ``self.ann_path``
                when ``None``.
            img_path: unused, kept for interface compatibility.

        Returns:
            np.ndarray: float array of shape ``(n_files, 4)``.

        Raises:
            ValueError: if a file lacks one of the ``object/bndbox`` fields.
        """
        # Local import: the notebook's import cell does not bring in ElementTree.
        import xml.etree.ElementTree as ET

        ann_path_list = self.get_file_list(self.ann_path if ann_path is None else ann_path, '.xml')
        ann = np.zeros((len(ann_path_list),4))
        for i, a_path in enumerate(ann_path_list):
            root = ET.parse(a_path).getroot()
            for j, tag in enumerate(("xmin", "ymin", "xmax", "ymax")):
                node = root.find("./object/bndbox/" + tag)
                if node is None:
                    raise ValueError(f"{a_path}: missing object/bndbox/{tag}")
                ann[i][j] = int(node.text)
        return ann

    def get_image_data(self):
        """Load every ``png`` under ``self.im_path`` with ``cv2.imread``."""
        image_list = self.get_file_list(self.im_path,'png')
        image_data = [ cv2.imread(image_path) for image_path in image_list]

        return image_data

    def resize_image_bounding_box(self):
        """Rescale images to 100x100 grayscale and scale their boxes to match.

        Returns:
            tuple: ``(train_box, images)`` where ``train_box`` holds the
            rescaled boxes and ``images`` has one flattened, min-max
            normalised 100x100 image per row.

        Raises:
            ValueError: if the image and annotation counts differ, or an
                image cannot be read.
        """
        targetSize = (100,100)
        image_list = self.get_file_list(self.im_path,'png')
        train_box = self.get_train_df(self.ann_path, self.im_path)
        if len(image_list) != len(train_box):
            raise ValueError(
                f"found {len(image_list)} images but {len(train_box)} annotations"
            )

        resized_image_list = []
        for i, image_path in enumerate(image_list):
            image = cv2.imread(image_path)
            if image is None:
                raise ValueError(f"could not read image {image_path}")
            # Image arrays are (rows, cols, ...): x scales with the width
            # (shape[1]) and y with the height (shape[0]).
            height, width = image.shape[:2]
            x_scale = targetSize[0]/width
            y_scale = targetSize[1]/height
            train_box[i][0] = int(np.round(train_box[i][0]*x_scale))
            train_box[i][1] = int(np.round(train_box[i][1]*y_scale))
            train_box[i][2] = int(np.round(train_box[i][2]*x_scale))
            train_box[i][3] = int(np.round(train_box[i][3]*y_scale))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) ## grayscale image
            norm_image = cv2.normalize(image, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F) ## normalize
            image = cv2.resize(norm_image,targetSize)  ## resize to 100*100
            resized_image_list.append(np.array(image).ravel())

        return train_box,np.array(resized_image_list)

    def train_test_split(self):
        """Split the resized images and boxes 70/30 with a fixed seed.

        NOTE(review): the bare ``train_test_split`` below resolves to a
        module-level function (presumably scikit-learn's), which is not
        imported in the cells visible here; confirm it is imported elsewhere.
        """
        train_box,resized_image_list = self.resize_image_bounding_box()
        return train_test_split(resized_image_list, train_box, test_size=0.3, random_state=34)

    def getMetrics(self,y_pred,actual):
        """Compare predicted boxes with ground-truth boxes.

        Args:
            y_pred: predicted ``(xmin, ymin, xmax, ymax)`` boxes, shape
                ``(n, 4)`` (a single 1-D box is also accepted).
            actual: ground-truth boxes with the same shape.

        Returns:
            tuple: ``(MSE, mIoU, MAE)`` where ``MSE`` is the mean of half the
            squared error, ``mIoU`` the mean intersection-over-union across
            boxes and ``MAE`` the mean absolute error.
        """
        y_pred = np.atleast_2d(np.asarray(y_pred, dtype=float))
        actual = np.atleast_2d(np.asarray(actual, dtype=float))

        # Corners of the intersection rectangle, one per box pair.
        xA = np.maximum(y_pred[:, 0], actual[:, 0])
        yA = np.maximum(y_pred[:, 1], actual[:, 1])
        xB = np.minimum(y_pred[:, 2], actual[:, 2])
        yB = np.minimum(y_pred[:, 3], actual[:, 3])
        # "+ 1" keeps the inclusive-pixel convention of the original formula.
        interArea = np.maximum(0, xB - xA + 1) * np.maximum(0, yB - yA + 1)
        boxAArea = (y_pred[:, 2] - y_pred[:, 0] + 1) * (y_pred[:, 3] - y_pred[:, 1] + 1)
        boxBArea = (actual[:, 2] - actual[:, 0] + 1) * (actual[:, 3] - actual[:, 1] + 1)
        union = boxAArea + boxBArea - interArea
        iou = np.divide(interArea, union, out=np.zeros_like(interArea), where=union != 0)
        mIoU = np.mean(iou)

        MSE = np.mean(0.5 * (actual - y_pred)**2)
        MAE = np.mean(np.abs(actual - y_pred))

        return MSE,mIoU,MAE

In [None]:
class Q4DataLoader():
    """Dataset wrapper for the phoneme (vowel vs. non-vowel) question (Q4).

    ``self.phoneme`` maps every phoneme symbol to a binary label: ``0`` when
    the symbol contains one of the letters a/e/i/o/u, ``1`` otherwise.
    """

    def __init__(self):

        self.phoneme = {}
        phoneme_list = ["bcl","dcl","gcl","pcl","tck","kcl","dcl","tcl","b","d","g","p","t","k","dx","q","jh","ch","s","sh","z","zh","f","th","v","dh","m","n","ng","em","en","eng","nx","l","r","w","y","hh","hv","el","iy","ih","eh","ey","ae","aa","aw","ay","ah","ao","oy","ow","uh","uw","ux","er","ax","ix","axr","ax-h","pau","epi","h#"]
        #create key value dictionary
        # NOTE(review): the letter test also labels symbols such as "pau",
        # "epi" and "en" as 0 - confirm this is the intended grouping.
        for ph in phoneme_list:
            if any(vowel in ph for vowel in "aeiou"):
                self.phoneme[ph] = 0                                                ##phoneme is vowel
            else:
                self.phoneme[ph] = 1

    def get_max_feature_len(self,x):
        """Return the length of the longest sequence in ``x`` (0 if ``x`` is empty)."""
        return max((len(item) for item in x), default=0)

    def add_padding(self,x,max_len):
        """Zero-pad (or truncate) every sequence in ``x`` to ``max_len`` entries.

        Returns:
            list: one 1-D ``np.ndarray`` of length ``max_len`` per input
            sequence. The truncation branch used to return a plain list
            slice; it now returns an array like the padding branch.
        """
        x_train = []
        for item in x:
            m = len(item)
            if m > max_len:
                temp = np.asarray(item[:max_len], dtype=float)
            else:
                temp = np.zeros(max_len)
                temp[:m] = item
            x_train.append(temp)
        return x_train

    def get_x_and_y(self,file_path):
        """Extract per-phoneme MFCC features and labels from a corpus folder.

        ``file_path`` is expected to contain one sub-folder per speaker, each
        holding ``<name>.WAV`` / ``<name>.PHN`` pairs. Every line of a
        ``.PHN`` file is parsed as ``<start> <end> <phoneme>``.

        NOTE(review): ``librosa`` is not imported in the visible import cell;
        confirm it is imported elsewhere in the notebook.

        Returns:
            tuple: ``(x, y)`` where ``x`` is a list of flattened MFCC lists
            (variable length) and ``y`` the matching labels from
            ``self.phoneme``.
        """
        x = []
        y = []
        for folder in sorted(os.listdir(file_path)):
            path = os.path.join(file_path, folder)
            if not os.path.isdir(path):
                continue
            # Each utterance has several sibling files sharing one stem. The
            # original looped over every file and so processed each utterance
            # once per sibling; collect the unique stems instead.
            names = sorted({files.split(".")[0] for files in os.listdir(path)})
            for name in names:
                wav_file = os.path.join(path, name + ".WAV")
                phn_file = os.path.join(path, name + ".PHN")

                data, sampling_freq = librosa.load(wav_file,sr=None, mono=True,offset=0.0,duration=None)

                # Context manager so the annotation file is always closed.
                with open(phn_file, 'r') as file_obj:
                    phonem_data = file_obj.readlines()

                for line in phonem_data:
                    fields = line.split()
                    if not fields:
                        continue  # tolerate blank lines
                    lower,upper,ph = fields
                    # float64 matches what the original list round-trip produced.
                    segment = np.asarray(data[int(lower):int(upper)], dtype=np.float64)
                    # Audio is passed by keyword: recent librosa releases
                    # reject it as a positional argument.
                    mfccs = librosa.feature.mfcc(y=segment, sr=sampling_freq)
                    x.append(mfccs.flatten().tolist())
                    y.append(self.phoneme[ph])

        return x,y

    def preprocessing_audio(self,path):
        """Build padded train/test feature matrices from ``path``.

        Reads ``path + "train/DR1/"`` and ``path + "test/DR1/"`` and pads all
        feature vectors to the longest one found in either split.

        Returns:
            tuple: ``(x_train, y_train, x_test, y_test)`` as NumPy arrays.
        """
        path_test  = path + "test/DR1/"
        path_train = path + "train/DR1/"
        x_train,y_train = self.get_x_and_y(path_train)
        x_test,y_test = self.get_x_and_y(path_test)
        max_len = self.get_max_feature_len(x_train)
        max_len = max(max_len,self.get_max_feature_len(x_test))
        x_train = self.add_padding(x_train,max_len)
        x_test = self.add_padding(x_test,max_len)

        return np.array(x_train), np.array(y_train), np.array(x_test), np.array(y_test)

    def get_metrics(self,pred,actual):
        """Return the fractions of TP, FP, TN and FN predictions.

        Label ``1`` is the positive class.

        Returns:
            tuple: ``(tp, fp, tn, fn)``, each divided by ``actual.shape[0]``.
        """
        tp = 0
        fp = 0
        tn = 0
        fn = 0
        for i in range(actual.shape[0]):
            if pred[i] == actual[i]:
                if actual[i]==1:
                    tp += 1
                else:
                    tn += 1
            else:
                if actual[i]==1:
                    fn += 1
                else:
                    fp += 1

        return tp/actual.shape[0],fp/actual.shape[0],tn/actual.shape[0],fn/actual.shape[0]


In [None]:
class Q5DataLoader():
    """Dataset wrapper for the Tiny-ImageNet question (Q5).

    Args:
        path: zip archive to extract into the current working directory.

    NOTE(review): the split paths are hard-coded under ``/content``, which
    only matches the extraction target when the working directory is
    ``/content`` - confirm the runtime environment.
    """

    def __init__(self, path):
        with ZipFile(path, 'r') as zipObj:
            zipObj.extractall()

        self.test_path = '/content/tiny-imagenet-200/test'
        self.train_path = '/content/tiny-imagenet-200/train'
        self.validation_path = '/content/tiny-imagenet-200/val'

    def extract(self, no_classes=10, no_images=100):
        """Load a grayscale 28x28 subset of the training images.

        Args:
            no_classes: number of class folders to read (default 10).
            no_images: number of images read per class (default 100).

        Returns:
            np.ndarray: one flattened image per row, divided by 255.
        """
        x = []
        # Sorted so the chosen subset does not depend on directory order.
        class_folders = sorted(os.listdir(self.train_path))

        # loop over classes
        for class_folder in class_folders[:no_classes]:
            # The loop body was not indented under the ``for`` and used a bare
            # ``train_path``; both are fixed here.
            image_folder_path = os.path.join(self.train_path, class_folder, 'images')
            image_file_names = sorted(os.listdir(image_folder_path))

            # loop over images
            for image_file_name in image_file_names[:no_images]:
                img = io.imread(os.path.join(image_folder_path, image_file_name))
                img = cv2.resize(img, (28, 28))
                if len(img.shape) == 3:
                    # Weighted channel sum converts colour images to grayscale.
                    imgGray = 0.3*img[:,:,0] + 0.59*img[:,:,1] + 0.11*img[:,:,2]
                    flat_image = imgGray.flatten()
                else:
                    flat_image = img.flatten()
                x.append(flat_image)

        X = np.array(x) / 255
        return X

In [None]:
class Experiments():
    """Runs every classifier/regressor on one dataset and prints its metrics.

    All methods use only their arguments. The original bodies referenced the
    notebook globals ``x_train``, ``X_train``, ``y_train`` and ``X_test``,
    which fail (or silently use stale data) on a fresh kernel.
    """

    def __init__(self):

        pass

    def _report(self, label, pred, y_test, dataset):
        """Score ``pred`` with ``dataset.get_metrics`` and print one result line."""
        accuracy,F1,AUC = dataset.get_metrics(pred,y_test)
        print(f'{label} = {accuracy,F1,AUC}')

    def binary_experiments(self,X,y,x_test,y_test,counts,dataset):
        """Evaluate the binary classifiers.

        Args:
            X, y: training features and labels.
            x_test, y_test: held-out features and labels.
            counts: per-class sample counts; the count-based models slice
                ``X`` in contiguous per-class blocks using them.
            dataset: object exposing ``get_metrics(pred, actual)``.
        """
        # The label used to read "Gaussian MLE" although the model is
        # GaussianMAP; an identical second GaussianMAP run further down was
        # removed.
        clf = GaussianMAP(n_classes=2)
        clf.fit(X,counts)
        self._report('metrics of my Gaussian MAP', clf.predict(x_test), y_test, dataset)

        logistic_runs = [
            ('metrics of Elastic Net Logistic Regression', ('Elastic',0.001,0.1)),
            # Previously mislabelled as "L1" although no regulariser is used.
            ('metrics of Simple Logistic Regression', (None,None,None)),
            ('metrics of L1 Logistic Regression', ('L1',0.001,None)),
            ('metrics of L2 Logistic Regression', ('L2',0.001,None)),
        ]
        for label, args in logistic_runs:
            clf = LogisticRegression(*args)
            clf.fit(X,y)
            self._report(label, clf.predict(x_test), y_test, dataset)
            clf.plots()

        clf = LDA(n_classes=2,n_features = X.shape[1])
        clf.fit(X,counts)
        self._report('metrics of LDA', clf.predict(x_test), y_test, dataset)

        clf = Multiclass_Gaussian_Naive_Bayes(X.shape[1],2)
        clf.fit(X,counts)
        self._report('metrics of Multiclass Gaussian Naive Bayes', clf.predict(x_test), y_test, dataset)

        clf = KNearestNeighbor(X,y,10)
        self._report('metrics of K nearest neighbor', clf.predict(x_test), y_test, dataset)

        clf = ParzenWindow(X,n_classes = 2)
        self._report('metrics of Parzen Window', clf.predict(x_test,counts), y_test, dataset)

        clf = GMM_classification(n_classes=2,n_clusters=3)
        self._report('accuracy of GMM with 3 clusters per class', clf.fit_predict(X,x_test,counts), y_test, dataset)

    def multi_class_experiments(self,X,y,x_test,y_test,counts,dataset):
        """Evaluate the 8-class classifiers; arguments as in ``binary_experiments``."""
        # The doubled spaces in some labels reproduce the original output.
        logistic_runs = [
            ('metrics of  Logistic Regression', (None,None,None)),
            ('metrics of Elastic Net Logistic Regression ', ('Elastic',0.1,0.5)),
            ('metrics of L1 Logistic Regression ', ('L1',0.1,None)),
            ('metrics of L2 Logistic Regression', ('L2',0.1,None)),
        ]
        for label, args in logistic_runs:
            clf = Multiclass_Logistic_Regression(*args)
            clf.fit(X,y)
            self._report(label, clf.predict(x_test), y_test, dataset)
            clf.plots()

        clf = Multiclass_Gaussian_Naive_Bayes(X.shape[1],8)
        clf.fit(X,counts)
        self._report('metrics of Multiclass Gaussian Naive Bayes', clf.predict(x_test), y_test, dataset)

        clf = Multi_LDA(n_classes=8,n_features = X.shape[1])
        clf.fit(X,counts)
        self._report('metrics of Multiclass LDA', clf.predict(x_test,n_components = 7), y_test, dataset)

        clf = ParzenWindow(X,n_classes = 8)
        self._report('metrics of Parzen Window', clf.predict(x_test,counts), y_test, dataset)

        clf = KNearestNeighbor(X,y,1)
        self._report('metrics of K nearest neighbor', clf.predict(x_test), y_test, dataset)

        clf = GaussianMLE(n_classes=8)
        clf.fit(X,counts)
        self._report('metrics of Gaussian MLE', clf.predict(x_test), y_test, dataset)

        clf = GMM_classification(n_classes = 8,n_clusters=3)
        self._report('metrics of GMM classification', clf.fit_predict(X,x_test,counts), y_test, dataset)

    def Regression_experiments(self,X,y,x_test,y_test,dataset):
        """Evaluate plain, L1, L2 and elastic-net linear regression.

        Args:
            X, y: training features and targets.
            x_test, y_test: held-out features and targets.
            dataset: object exposing ``getMetrics(y_pred, actual)``.
        """
        regression_runs = [
            ('Simple Linear', (1e-7,None,None,None)),
            ('L1', (1e-7,0.1,'L1',None)),
            ('L2', (1e-7,0.1,'L2',None)),
            ('Elastic', (1e-7,0.1,'Elastic',0.1)),
        ]
        for label, args in regression_runs:
            clf = LinearRegression(*args)
            clf.fit(X,y)
            y_pred = clf.predict(x_test)
            MSE,mIoU,MAE = dataset.getMetrics(y_pred,y_test)
            print(f'metrics for {label} are {MSE,mIoU,MAE}')
            clf.plot_loss()

In [None]:
class Multiclass_Gaussian_Naive_Bayes():
    """Gaussian naive Bayes with one independent normal per (class, feature).

    Args:
        n_features: number of input features.
        n_classes: number of classes.
    """

    def __init__(self,n_features,n_classes):

        self.n_classes = n_classes
        self.n_features = n_features
        self.mean = np.zeros((n_classes,n_features))
        self.variance = np.zeros((n_classes,n_features))
        self.eps = 1e-7

    def fit(self,X,counts):
        """Estimate per-class feature means and variances.

        Args:
            X: training samples laid out in contiguous per-class blocks, i.e.
                the first ``counts[0]`` rows are class 0, and so on.
            counts: number of samples in each class block.
        """
        counts = np.asarray(counts).reshape(-1).astype(int)
        starts = np.concatenate(([0], np.cumsum(counts)[:-1]))
        for i, (st_idx, size) in enumerate(zip(starts, counts)):
            curr_X = X[st_idx:st_idx+size]
            self.mean[i] = np.mean(curr_X,axis = 0)
            diff = curr_X - self.mean[i]
            # Floor the variance: a feature that is constant within a class
            # has zero variance, which turned the density into NaN and made
            # every prediction fall back to class 0.
            self.variance[i] = np.mean(np.square(diff),axis = 0) + self.eps

    def gaussian(self,X, mu, var):
        """Element-wise univariate normal density of ``X`` under ``(mu, var)``."""
        return 1 / ((2 * np.pi) ** (1 / 2) * var ** 0.5) * np.exp(-0.5 * (X-mu)**2/var)

    def predict(self,x_test):
        """Return the most likely class for each row of ``x_test``.

        Class priors are not used; the score is the sum over features of
        ``log(density + eps)``. Ties go to the lowest class index.

        Returns:
            np.ndarray: float array of class indices, one per sample.
        """
        x_test = np.asarray(x_test, dtype=float)
        scores = np.empty((self.n_classes, x_test.shape[0]))
        for i in range(self.n_classes):
            likelihood = self.gaussian(x_test,self.mean[i],self.variance[i])
            scores[i] = np.sum(np.log(likelihood+self.eps),axis = 1)
        # Float dtype matches the np.zeros buffer the original returned.
        return np.argmax(scores,axis = 0).astype(float)
                
   

In [None]:
class LDA():
    """Two-class Fisher linear discriminant.

    Args:
        n_features: number of input features.
        n_classes: number of classes (only the first two means are used).
    """

    def __init__(self,n_features,n_classes):

        self.n_features = n_features
        self.n_classes = n_classes
        self.S_w = np.zeros((n_features,n_features))
        self.S_b = np.zeros((n_features,n_features))
        self.mu = np.zeros(n_features)
        self.mu_class = np.zeros((self.n_classes,self.n_features))

    def fit(self,X,counts):
        """Compute class means and the within/between-class scatter matrices.

        Args:
            X: training samples; the first ``counts[0]`` rows are class 0 and
                the remaining rows are class 1.
            counts: per-class sample counts (only ``counts[0]`` is read).
        """
        X = np.asarray(X, dtype=float)
        split = int(counts[0])
        first, second = X[:split], X[split:]

        self.mu = np.mean(X,axis = 0)
        self.mu_class[0] = np.mean(first,axis = 0)
        self.mu_class[1] = np.mean(second,axis = 0)

        centered_first = first - self.mu_class[0]
        centered_second = second - self.mu_class[1]
        self.S_w = np.dot(centered_first.T,centered_first) + np.dot(centered_second.T,centered_second)

        # np.dot on two 1-D vectors is an inner product (a scalar); the
        # between-class scatter is the outer product of the mean difference.
        mean_gap = self.mu_class[0] - self.mu_class[1]
        self.S_b = np.outer(mean_gap,mean_gap)

    def predict(self,x_test):
        """Assign each sample to the class whose projected mean is closer.

        Samples are projected on ``pinv(S_w) @ (mu_0 - mu_1)``.

        Returns:
            list: ``0``/``1`` integers, one per row of ``x_test``.
        """
        x_test = np.asarray(x_test, dtype=float)
        v = np.dot(np.linalg.pinv(self.S_w),self.mu_class[0] - self.mu_class[1])
        projected = np.dot(x_test,v)
        dist_first = np.abs(projected - np.dot(self.mu_class[0],v))
        dist_second = np.abs(projected - np.dot(self.mu_class[1],v))
        # Strict ">" keeps the original tie-break in favour of class 0.
        return [1 if far else 0 for far in (dist_first > dist_second)]
    

In [None]:
import math

class LinearRegression():
    """Multi-output linear regression trained with batch gradient descent.

    Args:
        learning_rate: base step size; it is divided by 10 after every 1000
            iterations.
        alpha: regularisation strength (ignored when ``regularization`` is
            ``None``).
        regularization: ``None``, ``'L1'``, ``'L2'`` or ``'Elastic'``.
        L1_ratio: elastic-net mixing weight of the L1 term.
    """

    def __init__(self, learning_rate,alpha,regularization,L1_ratio):

        self.param = None
        self.learning_rate = learning_rate
        self.reg_rate = alpha
        self.regularization = regularization
        self.L1_ratio = L1_ratio
        self.n_iterations = 2000

    def initialize_parameters(self, input_dim,output_dim):
        """Initialise the weight matrix (bias row included) to all ones."""
        self.param = np.ones((input_dim,output_dim))

    def _penalty(self):
        """Return ``(penalty_value, penalty_gradient)`` for the current weights.

        Raises:
            ValueError: if ``self.regularization`` is not a supported name.
        """
        if self.regularization is None:
            return 0.0, 0.0
        # Entrywise norms, matching the sign/weight gradients below.
        # np.linalg.norm(matrix, ord=1) would be the maximum absolute column
        # sum, not the L1 penalty.
        l1_value = np.sum(np.abs(self.param))
        l2_value = 0.5*np.sum(self.param**2)
        l1_grad = np.sign(self.param)
        l2_grad = self.param
        if self.regularization == 'L1':
            return self.reg_rate*l1_value, self.reg_rate*l1_grad
        if self.regularization == 'L2':
            return self.reg_rate*l2_value, self.reg_rate*l2_grad
        if self.regularization == 'Elastic':
            ratio = self.L1_ratio
            return (
                self.reg_rate*(ratio*l1_value + (1-ratio)*l2_value),
                self.reg_rate*((1-ratio)*l2_grad + ratio*l1_grad),
            )
        raise ValueError(f'unknown regularization: {self.regularization!r}')

    def fit(self, x, y):
        """Fit the weights on ``x`` (n_samples, n_features) and ``y`` (n_samples, n_outputs).

        Fills ``self.training_errors`` (regularised half-MSE) and ``self.MAE``
        with one entry per iteration.
        """
        x = np.insert(x, 0, 1, axis=1)  # prepend the bias column
        self.training_errors = []
        self.MAE = []
        self.initialize_parameters(input_dim = x.shape[1],output_dim = y.shape[1])
        # Do gradient descent for n_iterations
        for i in tqdm(range(self.n_iterations)):

            # Step decay derived from the base rate. The original overwrote
            # self.learning_rate every iteration, so from iteration 1000 on
            # the rate shrank tenfold per step and stayed near zero for any
            # later fit.
            step_size = self.learning_rate/(10**(i//1000))
            y_pred = np.dot(x,self.param)
            residual = y_pred - y
            penalty, penalty_grad = self._penalty()

            self.training_errors.append(np.mean(0.5 * residual**2) + penalty)
            self.MAE.append(np.mean(np.abs(residual)))
            self.param = self.param - step_size * (np.dot(x.T,residual) + penalty_grad)

    def predict(self,x_test):
        """Return predictions for ``x_test`` (a bias column is added internally)."""
        x_test = np.insert(x_test, 0, 1, axis=1)
        pred = np.dot(x_test,self.param)
        return pred

    def plot_loss(self):
        """Save the MSE/MAE training curves (first 50 iterations skipped) as a PNG."""
        iters = np.arange(0,self.n_iterations,1)
        fig = plt.figure(figsize=(4,4))
        plt.plot(iters[50:],self.training_errors[50:],label = 'MSE')
        plt.plot(iters[50:],self.MAE[50:],label = 'MAE')
        plt.xlabel('No. of iterations')
        plt.ylabel('Loss')
        plt.legend()
        if self.regularization == None:
            plt.title('Simple Linear Regression')
            plt.savefig('Q3_LinearRegression.png')
        else:
            plt.title(f'{self.regularization}Regression')
            plt.savefig(f'Q3{self.regularization}.png')
        plt.close(fig)


In [None]:
class LogisticRegression():
    """Binary logistic regression trained with batch gradient descent.

    Args:
        regularization: ``None``, ``'L1'``, ``'L2'`` or ``'Elastic'``.
        reg_rate: regularisation strength.
        l1_ratio: elastic-net mixing weight of the L1 term.
        learning_rate: gradient-descent step size.
    """

    def __init__(self,regularization,reg_rate,l1_ratio,learning_rate=0.001):
        self.param = None
        self.learning_rate = learning_rate
        self.loss = []
        self.eps = 1e-7
        self.n_iterations = 2000
        self.regularization = regularization
        self.reg_rate = reg_rate
        self.l1_ratio = l1_ratio

    def initialize_parameters(self, X):
        """Initialise the weight column (bias included) to all ones."""
        n_features = np.shape(X)[1]
        self.param = np.ones((n_features,1))

    def sigmoid(self,x):
        """Logistic function, written with tanh so large |x| cannot overflow exp."""
        return 0.5*(1 + np.tanh(0.5*np.asarray(x, dtype=float)))

    def fit(self, X, y):
        """Fit the weights on ``X`` (n_samples, n_features) and 0/1 labels ``y``.

        ``self.loss`` receives the summed binary cross-entropy of every
        iteration (regularisation terms are not included in the logged value).
        """
        X = np.insert(X,0,1,axis = 1)  # prepend the bias column
        y = y.reshape(-1,1)
        self.initialize_parameters(X)
        # Start a fresh history so a second fit does not append to the first.
        self.loss = []
        l1_ratio = self.l1_ratio
        for i in tqdm(range(self.n_iterations)):

            y_pred = self.sigmoid(np.dot(X,self.param))
            # Full cross-entropy: the original logged only the y*log(p) term
            # and so ignored errors on negative samples.
            error = -(
                np.dot(y.T,np.log(y_pred + self.eps))
                + np.dot((1 - y).T,np.log(1 - y_pred + self.eps))
            ).item()
            grad = np.dot(X.T,y_pred - y)
            if self.regularization == 'L1':
                grad = grad + self.reg_rate*np.sign(self.param)
            elif self.regularization == 'L2':
                grad = grad + self.reg_rate*self.param
            elif self.regularization == 'Elastic':
                grad = grad + self.reg_rate*((1-l1_ratio)*self.param + l1_ratio*np.sign(self.param))
            elif self.regularization is not None:
                raise ValueError(f'unknown regularization: {self.regularization!r}')

            self.param = self.param - self.learning_rate * grad
            self.loss.append(error)

    def predict(self, X):
        """Return 0/1 predictions as an int array of shape ``(n_samples, 1)``."""
        X = np.insert(X,0,1,axis = 1)
        y_pred = np.round(self.sigmoid(np.dot(X,self.param))).astype(int)

        return y_pred

    def plots(self):
        """Save the training-loss curve as a PNG named after the regulariser."""
        iters = np.arange(0,self.n_iterations,1)
        fig = plt.figure()
        # A label is required for plt.legend() to have something to show.
        plt.plot(iters,self.loss,label = 'Loss')
        plt.xlabel('No. of iterations')
        plt.ylabel('Loss')
        plt.legend()
        if self.regularization == None:
            plt.title('Simple Logistic Regression')
            plt.savefig('Q1_LogisticRegression.png')
        else:
            plt.title(f'{self.regularization}_LogisticRegression')
            plt.savefig(f'Q1{self.regularization}.png')
        plt.close(fig)

In [None]:
class ParzenWindow():
    """Kernel-sum classifier with a fixed ``exp(-||d||^2)`` kernel.

    Args:
        X_train: training samples stored in contiguous per-class blocks.
        n_classes: number of classes.
    """

    def __init__(self,X_train,n_classes):
        self.x_train = X_train
        self.n_classes = n_classes

    def predict(self,X_test,counts):
        """Return, for each test row, the class with the largest kernel sum.

        Args:
            X_test: samples to classify, shape ``(n_queries, n_features)``.
            counts: per-class sample counts used to slice ``X_train`` into
                class blocks.

        Returns:
            np.ndarray: integer class index for every test row.
        """
        n_queries = X_test.shape[0]
        class_scores = np.zeros((self.n_classes,n_queries))

        for label in range(np.size(counts)):
            # Offset of this class block inside the training matrix.
            start = 0 if label==0 else np.sum(counts[0:label])
            block = self.x_train[start:start+counts[label]]

            kernel_sums = []
            for query in range(n_queries):
                distances = np.linalg.norm(block - X_test[query],axis=1)
                kernel_sums.append(np.sum(np.exp(-1*(distances**2))))
            class_scores[label,:] = np.array(kernel_sums)

        return np.argmax(class_scores,axis=0)

        

In [None]:
class Multiclass_Logistic_Regression():
    """Softmax regression trained with batch gradient descent.

    Args:
        regularization: ``None``, ``'L1'``, ``'L2'`` or ``'Elastic'``.
        reg_rate: regularisation strength.
        l1_ratio: elastic-net mixing weight of the L1 term.
        learning_rate: gradient-descent step size.
        n_classes: number of classes.
    """

    def __init__(self, regularization,reg_rate,l1_ratio,learning_rate=0.001,n_classes=8):

        self.param = None
        self.lr = learning_rate
        self.training_errors = []
        self.eps = 1e-7
        self.n_classes = n_classes
        self.regularization = regularization
        self.reg_rate = reg_rate
        self.l1_ratio = l1_ratio
        self.n_iterations=1000

    def initialize_parameters(self, X):
        """Initialise the ``(n_classes, n_features)`` weight matrix to ones."""
        n_features = np.shape(X)[1]
        self.param = np.ones((self.n_classes,n_features))

    def one_hot(self,y):
        """One-hot encode integer labels into shape ``(n_samples, n_classes)``."""
        return np.eye(self.n_classes)[np.asarray(y).reshape(-1).astype(int)]

    def softmax(self,probs):
        """Row-wise softmax of a ``(n_samples, n_classes)`` score matrix."""
        # Shift by the row maximum (not the mean) so every exponent is <= 0
        # and exp cannot overflow.
        shifted = probs - np.max(probs,axis=1).reshape(-1,1)
        exps = np.exp(shifted)
        return exps/np.sum(exps,axis = 1).reshape(-1,1)

    def _penalty(self):
        """Return ``(penalty_value, penalty_gradient)`` for the current weights.

        Raises:
            ValueError: if ``self.regularization`` is not a supported name.
        """
        if self.regularization is None:
            return 0.0, 0.0
        # Entrywise norms: np.linalg.norm(matrix, ord=1) is the maximum
        # absolute column sum, which does not match the sign() gradient.
        l1_value = np.sum(np.abs(self.param))
        l2_value = 0.5*np.sum(self.param**2)
        if self.regularization == 'L1':
            return self.reg_rate*l1_value, self.reg_rate*np.sign(self.param)
        if self.regularization == 'L2':
            return self.reg_rate*l2_value, self.reg_rate*self.param
        if self.regularization == 'Elastic':
            ratio = self.l1_ratio
            return (
                self.reg_rate*(ratio*l1_value + (1-ratio)*l2_value),
                self.reg_rate*((1-ratio)*self.param + ratio*np.sign(self.param)),
            )
        raise ValueError(f'unknown regularization: {self.regularization!r}')

    def fit(self, X, y):
        """Fit the weights on ``X`` (n_samples, n_features) and integer labels ``y``.

        ``self.training_errors`` receives the regularised cross-entropy of
        every iteration.
        """
        X = np.insert(X,0,1,axis=1)  # prepend the bias column
        y = self.one_hot(y)
        self.initialize_parameters(X)
        # Start a fresh history so a second fit does not append to the first.
        self.training_errors = []
        for i in tqdm(range(self.n_iterations)):
            # Use the class's own softmax: the bare ``softmax(..., axis=1)``
            # name is not defined in this notebook.
            y_pred = self.softmax(np.dot(X,self.param.T))
            penalty, penalty_grad = self._penalty()
            loss = -1*np.mean(y*np.log(y_pred + self.eps)) + penalty
            grad = np.dot((y_pred-y).T,X) + penalty_grad
            self.training_errors.append(loss)
            self.param = self.param - self.lr*grad

    def predict(self, X):
        """Return the arg-max class index for every row of ``X``."""
        X = np.insert(X,0,1,axis=1)
        y_pred = np.argmax(self.softmax(np.dot(X,self.param.T)),axis = 1)

        return y_pred

    def plots(self):
        """Save the training-loss curve as a PNG named after the regulariser."""
        iters = np.arange(0,self.n_iterations,1)
        fig = plt.figure()
        plt.plot(iters,self.training_errors)
        plt.xlabel('No. of iterations')
        plt.ylabel('Loss')
        if self.regularization == None:
            plt.title('Simple Logistic Regression')
            plt.savefig('Q2_LogisticRegression.png')
        else:
            plt.title(f'{self.regularization}_LogisticRegression')
            plt.savefig(f'Q2{self.regularization}.png')
        plt.close(fig)

In [None]:
class GaussianMAP():
    """MAP classifier with one full-covariance Gaussian per class.

    Args:
        n_classes: number of classes.
    """

    def __init__(self,n_classes):
        self.n_classes = n_classes
        self.eps = 1e-7

    def fit(self,X,counts):
        """Estimate class means, covariances and log-priors.

        Args:
            X: training samples in contiguous per-class blocks (the first
                ``counts[0]`` rows are class 0, and so on).
            counts: number of samples in each class block.
        """
        X = np.asarray(X, dtype=float)
        counts = np.asarray(counts).reshape(-1).astype(int)
        n_features = X.shape[1]

        self.mu = np.zeros((self.n_classes,n_features))
        self.cov = np.zeros((self.n_classes,n_features,n_features))
        self.log_prior = np.log(counts/np.sum(counts))

        starts = np.concatenate(([0], np.cumsum(counts)[:-1]))
        for i, (st_idx, size) in enumerate(zip(starts, counts)):
            curr_X = X[st_idx:st_idx+size]
            self.mu[i] = np.mean(curr_X,axis = 0)
            centered = curr_X - self.mu[i]
            # Divide the scatter matrix by the class size to get a covariance
            # (the original kept the raw scatter, so the likelihood scale
            # depended on the class size). The eps*I ridge keeps the log-det
            # finite when the covariance is singular.
            self.cov[i] = np.dot(centered.T,centered)/max(size, 1) + self.eps*np.eye(n_features)

    def log_likelihood(self,X, mu, cov):
        """Gaussian log-density of one sample, up to an additive constant."""
        sign,log_det = np.linalg.slogdet(cov)
        return -0.5*log_det -0.5*np.dot(np.dot((X-mu).T,np.linalg.pinv(cov)),X-mu)

    def predict(self,x_test):
        """Return the class with the highest log-posterior for each sample.

        The log-determinant and pseudo-inverse are computed once per class
        rather than once per (sample, class) pair. Ties go to the lowest
        class index.

        Returns:
            np.ndarray: float array of class indices, one per sample.
        """
        x_test = np.asarray(x_test, dtype=float)
        log_posterior = np.empty((self.n_classes, x_test.shape[0]))
        for j in range(self.n_classes):
            sign,log_det = np.linalg.slogdet(self.cov[j])
            precision = np.linalg.pinv(self.cov[j])
            centered = x_test - self.mu[j]
            mahalanobis = np.sum(np.dot(centered,precision)*centered,axis = 1)
            log_posterior[j] = -0.5*log_det - 0.5*mahalanobis + self.log_prior[j]
        # Float dtype matches the np.zeros buffer the original returned.
        return np.argmax(log_posterior,axis = 0).astype(float)
    
           

In [None]:
class Multi_LDA():
    """Multi-class linear discriminant analysis with nearest-projected-mean classification."""

    def __init__(self,n_features,n_classes):
        self.n_features = n_features
        self.n_classes = n_classes
        self.S_w = np.zeros((n_features,n_features))   # within-class scatter
        self.S_b = np.zeros((n_features,n_features))   # between-class scatter
        self.mu = np.zeros(n_features)
        self.mu_class = np.zeros((self.n_classes,self.n_features))

    def fit(self,X,counts):
        """Accumulate the within- and between-class scatter matrices.

        Parameters
        ----------
        X : 2-D array whose rows are grouped by class (first ``counts[0]`` rows
            are read as class 0, and so on).
        counts : number of rows per class, in class order.
        """
        counts = np.asarray(counts, dtype=int)
        # Start from zero so that calling fit() again does not add to old scatter.
        self.S_w = np.zeros((self.n_features,self.n_features))
        self.S_b = np.zeros((self.n_features,self.n_features))

        self.mu = np.mean(X,axis = 0)
        starts = np.concatenate(([0], np.cumsum(counts)[:-1]))
        for i in range(self.n_classes):
            curr_X = X[starts[i]:starts[i]+counts[i]]
            self.mu_class[i] = np.mean(curr_X,axis = 0)
            centered = curr_X - self.mu_class[i]
            self.S_w = self.S_w + np.dot(centered.T, centered)
            gap = self.mu_class[i] - self.mu
            # Outer product: np.dot on two 1-D vectors is a scalar, which
            # would be broadcast onto every entry of S_b.
            self.S_b = self.S_b + counts[i]*np.outer(gap, gap)

    def predict(self,x_test,n_components):
        """Project onto the leading ``n_components`` discriminant directions and
        return, for each row of ``x_test``, the index (as float) of the class
        whose projected mean is closest in Euclidean distance.
        """
        V = np.dot(np.linalg.pinv(self.S_w),self.S_b)
        eigenvalues, eigenvectors = np.linalg.eig(V)
        # eig may return complex values; rank by real part and keep the real
        # part of the vectors. Stable sort keeps the original order on ties.
        order = np.argsort(-eigenvalues.real, kind = 'stable')
        # Fancy indexing copies the selected columns, so no selected vector can
        # be overwritten while the others are still being read.
        directions = eigenvectors[:, order[:n_components]].real.T

        # Project the data and the class means onto the chosen directions
        projected_X = np.dot(x_test,directions.T)
        projected_mu = np.dot(self.mu_class,directions.T)

        # Distance of every projected sample to every projected class mean.
        dists = np.linalg.norm(projected_X[:, np.newaxis, :] - projected_mu[np.newaxis, :, :], axis = 2)
        return np.argmin(dists, axis = 1).astype(float)
    

In [None]:
class KNearestNeighbor():
    """Brute-force k-nearest-neighbour classifier using Euclidean distance."""

    def __init__(self,x_train,y_train,K):
        self.X = x_train
        self.y = y_train
        self.k = K

    def get_Euclidean_distance(self,x):
        """Return the Euclidean distance from ``x`` to every stored training row."""
        offsets = self.X - x
        squared = offsets**2
        return np.sqrt(squared.sum(axis=1))

    def predict(self,x_test):
        """Return, for each row of ``x_test``, the most frequent label among its k nearest training rows."""
        n_queries = x_test.shape[0]
        pred = np.zeros(n_queries)

        for row, query in enumerate(x_test):
            ranking = self.get_Euclidean_distance(query).argsort()
            neighbour_labels = self.y[ranking[0:self.k]]
            # np.unique returns labels in ascending order, so argmax resolves
            # vote ties in favour of the smallest label.
            labels, votes = np.unique(neighbour_labels, return_counts = True)
            pred[row] = labels[np.argmax(votes)]

        return pred
    

In [None]:
class GMM():
    """Gaussian mixture model fitted with expectation-maximisation.

    Densities are evaluated in the log domain so that they do not underflow
    to zero for high-dimensional inputs.
    """

    def __init__(self, n_components, n_iters, tol):
        self.n_components = n_components
        self.n_iters = n_iters
        self.tol = tol
        # Guards the divisions in Maximization against empty components.
        self.eps = 1e-7

    def fit(self, X):
        """Run EM on ``X`` (2-D array, one sample per row) and return ``self``.

        Iteration stops after ``n_iters`` rounds or as soon as the total
        log-likelihood changes by at most ``tol`` between two rounds.
        """
        # data's dimensionality and responsibility vector
        n_row, n_col = X.shape
        self.resp = np.zeros((n_row, self.n_components))

        # initialize parameters: means are distinct random rows of X, weights
        # are uniform, every covariance starts as the covariance of all data
        np.random.seed(4)
        chosen = np.random.choice(n_row, self.n_components, replace = False)
        self.means = X[chosen]
        self.weights = np.full(self.n_components, 1 / self.n_components)

        shape = self.n_components, n_col, n_col
        self.covs = np.full(shape, np.cov(X, rowvar = False))

        # -inf (not 0) so the first iteration can never look converged merely
        # because its log-likelihood happens to be close to zero.
        log_likelihood = -np.inf
        self.converged = False
        self.log_likelihood_trace = []

        for i in range(self.n_iters):
            log_likelihood_new = self.Expectation(X)
            self.Maximization(X)

            # Record before testing so the trace includes the final value.
            self.log_likelihood_trace.append(log_likelihood_new)
            if abs(log_likelihood_new - log_likelihood) <= self.tol:
                self.converged = True
                break

            log_likelihood = log_likelihood_new

        return self

    def _weighted_log_prob(self, X):
        """Return an (n_samples, n_components) array of log(weight_k * N(x | mean_k, cov_k)).

        Does not touch ``self.resp``, so it is safe to call on data whose row
        count differs from the training set.
        """
        # Local import: the notebook only imports this name in a later cell.
        from scipy.stats import multivariate_normal

        X = np.asarray(X)
        self.reg_cov = 1e-6*np.identity(X.shape[1])
        assert(np.shape(self.covs[0])==np.shape(self.reg_cov))

        log_prob = np.empty((X.shape[0], self.n_components))
        with np.errstate(divide = 'ignore'):
            # A component with zero weight contributes log(0) = -inf.
            log_weights = np.log(self.weights)
        for k in range(self.n_components):
            component = multivariate_normal(self.means[k], self.covs[k]+self.reg_cov)
            log_prob[:, k] = log_weights[k] + component.logpdf(X)

        return log_prob

    def Expectation(self, X):
        """E-step: store normalised responsibilities in ``self.resp`` and return the total log-likelihood of ``X``."""
        from scipy.special import logsumexp

        log_prob = self._weighted_log_prob(X)
        log_norm = logsumexp(log_prob, axis = 1)
        self.resp = np.exp(log_prob - log_norm[:, np.newaxis])

        return np.sum(log_norm)

    def _compute_log_likelihood(self, X):
        """Store the unnormalised weight_k * pdf_k(x) values for ``X`` in ``self.resp`` and return ``self``."""
        self.resp = np.exp(self._weighted_log_prob(X))

        return self

    def Maximization(self, X):
        """M-step: re-estimate weights, means and covariances from ``self.resp``; returns ``self``."""
        resp_weights = self.resp.sum(axis = 0)
        self.weights = resp_weights / X.shape[0]
        weighted_sum = np.dot(self.resp.T, X)
        self.means = weighted_sum / (resp_weights.reshape(-1, 1) + self.eps)

        for k in range(self.n_components):
            diff = (X - self.means[k]).T
            weighted_sum = np.dot(self.resp[:, k] * diff, diff.T)
            self.covs[k] = weighted_sum / (resp_weights[k]+self.eps)

        return self

    def predict(self,X_test):
        """Return the mixture log-density of every row of ``X_test``.

        ``X_test`` may have any number of rows; the training responsibilities
        in ``self.resp`` are left untouched.
        """
        from scipy.special import logsumexp

        return logsumexp(self._weighted_log_prob(X_test), axis = 1)
        

In [None]:
class GMM_classification():
    """Classifier that fits one GMM per class and picks the class with the highest mixture log-density."""

    def __init__(self,n_classes,n_clusters):
        self.n_classes = n_classes
        self.n_clusters = n_clusters

    def fit_predict(self,X_train,X_test,counts):
        """Fit a GMM on each class block of ``X_train`` and classify ``X_test``.

        Parameters
        ----------
        X_train : 2-D array whose rows are grouped by class (first ``counts[0]``
            rows are read as class 0, and so on).
        X_test : 2-D array of samples to classify.
        counts : number of training rows per class, in class order.

        Returns
        -------
        1-D integer array with the arg-max class index for each test row.
        """
        counts = np.asarray(counts, dtype=int)
        # -inf so that a column no GMM writes to can never win the argmax.
        pred = np.full((X_test.shape[0],self.n_classes), -np.inf)
        starts = np.concatenate(([0], np.cumsum(counts)[:-1]))
        for i in range(len(counts)):
            curr_X = X_train[starts[i]:starts[i]+counts[i]]
            # Use the configured number of mixture components, not a fixed 3.
            clf = GMM(n_components = self.n_clusters, n_iters = 50, tol = 1e-4).fit(curr_X)
            pred[:,i] = clf.predict(X_test)
        return np.argmax(pred,axis = 1)
            
    
    

In [None]:
class GMM_generate():
    """Gaussian mixture model used as a generative model (EM fit, sampling, FID score).

    All parameters live in the ``self.clusters`` dict under the keys
    'pi' (mixing weights), 'mu' (means), 'cov' (covariances),
    'inv_cov' (inverses of the ridge-regularised covariances) and
    'logdet' (their log-determinants).
    """

    def __init__(self, n_features, n_clusters):
        self.clusters = {}
        self.n_clusters = n_clusters
        self.n_features = n_features
        # Lower bound for a cluster's total responsibility in the M-step.
        self.eps = 1e-10

        # The value is unused; the draw is kept so the global RNG stream, and
        # therefore the initial means below, are unchanged for a given seed.
        np.random.rand(n_clusters)
        self.clusters['pi'] = np.ones(n_clusters) / n_clusters

        print('self.clusters[pi]', self.clusters['pi'])
        self.clusters['logdet'] = np.zeros(n_clusters, dtype=np.float64)
        # Means start uniformly in [-1, 1).
        self.clusters['mu'] = np.random.rand(self.n_clusters, self.n_features) * 2 - 1

        # Identity covariance (and inverse) for every cluster; two separate arrays.
        identity_stack = [np.eye(self.n_features) for _ in range(n_clusters)]
        self.clusters['inv_cov'] = np.array(identity_stack)
        self.clusters['cov'] = np.array(identity_stack)

        print('pi shape', self.clusters['pi'].shape)
        print('mu shape', self.clusters['mu'].shape)
        print('det shape', self.clusters['logdet'].shape)
        print('inv_cov shape', self.clusters['inv_cov'].shape)

    def check_power(self, power):
        """Print a diagnostic when the largest exponent in ``power`` is 710 or more."""
        if np.max(power) >= 710:
            print(f'High Exponent Error  min:{str(np.min(power))}  max:{str(np.max(power))}  new:{str(np.max(power) - 700)}')

    def expectation_step(self, X):
        """E-step: compute responsibilities ``self.resp`` and ``self.log_likelihood``.

        ``self.exp`` holds the unnormalised log joint
        log(pi_k) - (logdet_k + mahalanobis) / 2 for every sample and cluster.
        The log-likelihood is reported on that same scale, i.e. without the
        constant -d/2 * log(2*pi) per sample.
        """
        print('E step')
        # calculate responsibility for each data point, for each class
        self.resp = np.zeros((self.N, self.n_clusters), dtype=np.float64)
        self.exp = np.zeros((self.N, self.n_clusters), dtype=np.float64)
        self.log_likelihood = 0

        for k in range(self.n_clusters):
            diff = X - self.clusters['mu'][k]
            # Row-wise quadratic form diff_n^T * inv_cov_k * diff_n.
            mahalanobis = np.sum(np.dot(diff, self.clusters['inv_cov'][k]) * diff, axis=1)
            self.exp[:, k] = np.log(self.clusters['pi'][k]) - (self.clusters['logdet'][k] + mahalanobis) / 2

        validate(exp_n_k = self.exp)
        validate(clusters_pi = self.clusters['pi'])
        validate(clusters_det = self.clusters['logdet'])

        # Subtract each row's maximum before exponentiating so the largest
        # term is exp(0) = 1 and nothing overflows.
        shift = np.max(self.exp, axis=1, keepdims=True)
        self.resp = np.exp(self.exp - shift)

        validate(resp_n_k = self.resp)

        resp_sum = np.sum(self.resp, axis=1, keepdims=True)
        # Add the shift back: log(sum exp(a)) = shift + log(sum exp(a - shift)).
        self.log_likelihood = float(np.sum(np.log(resp_sum) + shift))
        self.resp = self.resp / resp_sum

    def maximization_step(self, X):
        """M-step: update weights, means, covariances, log-determinants and inverses.

        Raises
        ------
        ValueError
            If a ridge-regularised covariance is not positive definite
            (non-positive determinant sign or non-finite log-determinant).
        """
        # Local import so the name is in scope regardless of how scipy was imported.
        from scipy import linalg
        print('M step')

        N_k = np.sum(self.resp, axis=0) # get class responsibility
        print('N_k', N_k)
        ridge = 1e-6 * np.eye(self.n_features)
        for k in range(self.n_clusters):
            # Avoid dividing by zero when a cluster has lost all its mass.
            mass = max(N_k[k], self.eps)

            # find pi for each cluster
            self.clusters['pi'][k] = N_k[k] / self.N

            # find new mean for each cluster
            self.clusters['mu'][k] = np.matmul(self.resp[:, k], X) / mass

            # find covariance matrix for each cluster (responsibility-weighted)
            diff = X - self.clusters['mu'][k]
            cov = np.dot((self.resp[:, k, np.newaxis] * diff).T, diff) / mass

            # The stored covariance is the plain estimate; the ridge is only
            # applied to the copy used for the determinant and the inverse.
            self.clusters['cov'][k] = cov
            cov = cov + ridge
            (sign, lgdet) = np.linalg.slogdet(cov)

            # A singular matrix gives sign == 0 and lgdet == -inf; testing the
            # product sign * lgdet for falsiness misses that case (it is NaN)
            # and instead fires when the determinant is exactly 1.
            if sign <= 0 or not np.isfinite(lgdet):
                print(cov)
                raise ValueError('Determinant has zero ' + str(lgdet))
            self.clusters['logdet'][k] = lgdet

            try:
                self.clusters['inv_cov'][k] = linalg.inv(cov)
            except ValueError as err:
                # Do not keep a stale inverse silently: report and fall back
                # to the pseudo-inverse.
                print('inverse failed for cluster', k, '-', err, '- using pseudo-inverse')
                self.clusters['inv_cov'][k] = np.linalg.pinv(cov)

    def sample(self, n_samples, cluster=None):
        """Draw ``n_samples`` points from the mixture.

        Parameters
        ----------
        n_samples : number of points to draw.
        cluster : optional cluster index; when given, every point is drawn from
            that cluster. When None, clusters are chosen according to 'pi'.

        Returns
        -------
        Array with one sample per row.
        """
        rng = np.random.default_rng()
        if cluster is None:
            probs = np.cumsum(self.clusters['pi'])
            unif = np.random.uniform(0,1,n_samples)
            # First index whose cumulative weight reaches the draw; clipped
            # because rounding can leave the last cumulative value below 1.
            picks = np.minimum(np.searchsorted(probs, unif, side='left'), self.n_clusters - 1)
        else:
            picks = np.full(n_samples, cluster, dtype=int)

        samples = [rng.multivariate_normal(self.clusters['mu'][k], self.clusters['cov'][k]) for k in picks]

        return np.array(samples)

    def fit(self, X, n_epochs):
        """Run ``n_epochs`` rounds of EM on ``X`` (one sample per row), printing progress."""
        self.N = X.shape[0]
        for i in range(n_epochs):
            self.expectation_step(X)
            self.maximization_step(X)
            print('Epoch: ', i + 1, 'log_likelihood: ', self.log_likelihood)
            disp_resp(self.resp, self.n_clusters)

        print('\n\nPI values', self.clusters['pi'])

    def calculate_fid(self, mu1, sigma1, mu2, sigma2):
        """Frechet distance between two Gaussians N(mu1, sigma1) and N(mu2, sigma2).

        Computed as ||mu1 - mu2||^2 + trace(sigma1 + sigma2 - 2 * sqrtm(sigma1 @ sigma2)).
        """
        # Imported here: a class-level import is not visible inside methods.
        from scipy import linalg

        # calculate sum squared difference between means
        ssdiff = np.sum((mu1 - mu2)**2.0)

        # calculate sqrt of product between cov
        covmean = linalg.sqrtm(sigma1.dot(sigma2))

        # check and correct imaginary numbers from sqrt
        if np.iscomplexobj(covmean):
            covmean = covmean.real

        # calculate score
        fid = ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)
        return fid

In [None]:

def _group_rows_by_label(x_train, y_train, n_labels):
    """Reorder samples so that all rows of label 0 come first, then label 1, ...

    Returns ``(X, y)`` where ``X`` is a float array with the shape of
    ``x_train`` and ``y`` an int array of the matching labels. Rows whose
    label is outside ``range(n_labels)`` are not copied (the tail stays zero).
    """
    X = np.zeros(x_train.shape)
    y = np.zeros(y_train.shape[0],dtype = int)
    idx = 0
    for label in range(n_labels):
        rows = np.where(y_train==label)[0]
        X[idx:idx+rows.size] = x_train[rows]
        y[idx:idx+rows.size] = label
        idx += rows.size
    return X, y


if __name__ == '__main__':

    # ---- Q1: binary classification on PneumoniaMNIST ----
    dataset = Q1DataLoader('../Assignment 1/data/pneumoniamnist.npz')
    x_train,y_train,x_test,y_test,x_val,y_val = dataset.train_test_split()
    #x_train = dataset.normalize_input(x_train)
    unique,counts = np.unique(y_train,return_counts= True)
    # The experiments receive rows grouped by class together with per-class counts.
    X, y = _group_rows_by_label(x_train, y_train, 2)

    q1_experiments = Experiments()
    q1_experiments.binary_experiments(X,y,x_test,y_test,counts,dataset)

    # ---- Q2: 8-class classification on BloodMNIST ----
    dataset = Q2DataLoader('../Assignment 1/data/bloodmnist.npz')
    x_train,y_train,x_test,y_test,x_val,y_val = dataset.train_test_split()
    unique, counts = np.unique(y_train, return_counts=True)
    X, y = _group_rows_by_label(x_train, y_train, 8)

    q2_experiments = Experiments()
    q2_experiments.multi_class_experiments(X,y,x_test,y_test,counts,dataset)

    # ---- Q3: regression on the road-sign detection data ----
    ann_path = '../Assignment 1/data/Road Sign Detection/annotations/'
    image_path = '../Assignment 1/data/Road Sign Detection/images/'
    dataset = Q3DataLoader(ann_path,image_path)
    X_train, X_test, y_train, y_test = dataset.train_test_split()
    q3_experiments = Experiments()
    # Pass this split's X_test; lowercase x_test still holds the Q2 test set.
    q3_experiments.Regression_experiments(X_train,y_train,X_test,y_test,dataset)

    # ---- Q4: binary classification on audio features ----
    path = "../Assignment 1/data/data/"
    AudioFeatures = Q4DataLoader()
    x_train,y_train,x_test,y_test = AudioFeatures.preprocessing_audio(path)
    unique,counts = np.unique(y_train,return_counts= True)
    X, y = _group_rows_by_label(x_train, y_train, 2)

    q4_experiments = Experiments()
    # NOTE(review): the Q1 call passes a sixth `dataset` argument and this one
    # does not - confirm binary_experiments has a default for it.
    q4_experiments.binary_experiments(X,y,x_test,y_test,counts)

    # ---- Q5: GMM-based generation and FID on Tiny ImageNet ----
    # NOTE(review): absolute Colab Drive path - adjust when running elsewhere.
    data = Q5DataLoader('/content/drive/MyDrive/IISc/PRNN/tiny-imagenet-200.zip')
    X = data.extract()
    NO_CLUSTERS = 5
    NO_EPOCHS = 10
    gmm = GMM_generate(X.shape[1], NO_CLUSTERS)
    gmm.fit(X, NO_EPOCHS)
    samples = gmm.sample(1000)
    mu1, sigma1 = X.mean(axis=0), np.cov(X, rowvar=False)
    # Statistics over ALL generated samples; samples[0] is a single vector.
    mu2, sigma2 = samples.mean(axis=0), np.cov(samples, rowvar=False)
    fid = gmm.calculate_fid(mu1, sigma1, mu2, sigma2)
    print(f'fid score: {fid}')

100%|████████████████████████████████████████████████████████████████████████████████| 624/624 [26:40<00:00,  2.56s/it]


metrics of my Gaussian MLE = (0.375, 0, 0.5)


  return 1/(1+np.exp(-x))
100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:15<00:00, 130.17it/s]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


metrics of Elastic Net Logistic Regression = (0.8253205128205128, 0.8757126567844925, 0.7722222222222223)


  return 1/(1+np.exp(-x))
100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:18<00:00, 110.62it/s]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


metrics of L1 Logistic Regression = (0.8589743589743589, 0.895734597156398, 0.8222222222222222)


  return 1/(1+np.exp(-x))
100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:22<00:00, 87.49it/s]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


metrics of L1 Logistic Regression = (0.8269230769230769, 0.876993166287016, 0.7735042735042735)


  return 1/(1+np.exp(-x))
100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:21<00:00, 91.14it/s]
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


metrics of L2 Logistic Regression = (0.8525641025641025, 0.8925233644859814, 0.8102564102564103)
metrics of LDA = (0.8365384615384616, 0.8819444444444443, 0.7897435897435897)
metrics of Multiclass Gaussian Naive Bayes = (0.8365384615384616, 0.8725, 0.8170940170940171)
metrics of K nearest neighbor = (0.8381410256410257, 0.8829663962920046, 0.7918803418803418)


 45%|████████████████████████████████████▎                                           | 283/624 [29:00<34:57,  6.15s/it]


KeyboardInterrupt: 

In [None]:
# Sanity check: labels present in the Q1 training split and their relative frequencies.
dataset = Q1DataLoader('../Assignment 1/data/pneumoniamnist.npz')
(x_train, y_train,
 x_test, y_test,
 x_val, y_val) = dataset.train_test_split()
unique, counts = np.unique(y_train, return_counts=True)
class_fractions = counts / counts.sum()
print(unique, class_fractions, type(counts))


[0 1] [0.25785896 0.74214104] <class 'numpy.ndarray'>


In [None]:
# Kept in case other cells rely on this global name.
from scipy.stats import multivariate_normal
from scipy.special import logsumexp


def parzen_log_density(train, queries):
    """Log Parzen-window density of every row of ``queries``.

    The kernel is a Gaussian with identity covariance centred on each row of
    ``train``; the estimate is the mean kernel value over the training rows.
    Working in the log domain avoids the underflow to 0 that the raw pdf
    suffers when the feature dimension is large.
    """
    train = np.asarray(train, dtype=np.float64)
    queries = np.asarray(queries, dtype=np.float64)
    n_train, n_features = train.shape
    # log of the Gaussian normaliser (2*pi)^(-d/2) and of the 1/n averaging.
    log_norm = -0.5 * n_features * np.log(2 * np.pi) - np.log(n_train)

    log_density = np.empty(queries.shape[0])
    for i, query in enumerate(queries):
        sq_dist = np.sum((train - query) ** 2, axis=1)
        log_density[i] = logsumexp(-0.5 * sq_dist) + log_norm
    return log_density


# NOTE(review): x0 / x1 are not defined in any visible cell - presumably the
# class-0 / class-1 training rows; confirm and define them before this cell.
# These now hold LOG densities; the comparison below is unaffected.
test_pdf0 = parzen_log_density(x0, x_test)
test_pdf1 = parzen_log_density(x1, x_test)

pred_Parzen = []
correct_preds = 0

for i in range(x_test.shape[0]):
    if test_pdf1[i]>test_pdf0[i]:
        pred_Parzen.append(1)
        if y_test[i] == 1:
            correct_preds += 1
    else:
        pred_Parzen.append(0)
        if y_test[i] == 0:
            # A correct class-0 prediction counts as well.
            correct_preds += 1
print(correct_preds,pred_Parzen)


# 