In [1]:
!pip install seaborn



In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random as ra
import matplotlib.pyplot as plt
import seaborn as sn

In [3]:
#function to perform K-Means clustering, and obtain mean vectors
def KMeans(features,K,length,number):
    #creating empty arrays
    similar_count = 0
    
    #initialising mean representatives with random x's
    means = initialise_means(features.copy(),number,length,K)
    #looping until a consistent set of mean vectors and hard clusters are obtained
    while True:
        resp_mat = cluster_label(means, features.copy(),K,number)
        new_means = recompute_mean(resp_mat,features.copy(),K,length,number)
    
        if np.array_equal(means,new_means):
            similar_count += 1
        
        else:
            similar_count = 0
        
        #if not changing for more than 9 iterations, then stop the cycle, and return the consistent set of clustered responsibilities.
        if (similar_count == 10):
            return resp_mat
        
        #until then, constantly updating the mean vectors
        means = new_means

In [4]:
def maximisation(resp_mat,features,K,length,number,covar_type,tied_covar):
    means = recompute_mean(resp_mat,features.copy(),K,length,number)
    feat = np.empty((K,number,length))
    for i in range(K):
        feat[i,:,:] = np.subtract(features,means[i])
    covar_mat = covariance(feat,resp_mat,K,length,number,covar_type,tied_covar)
    weight = weight_params(resp_mat,number)
    return covar_mat, weight, feat, means

In [5]:
def weight_params(resp_mat,number):
    effective_egs = np.sum(resp_mat,axis = 0)
    weight_array = np.divide(effective_egs,number)
    return weight_array

In [6]:
def initialise_means(feature_v,number,length,K):
    rng = np.random.default_rng()
    means = rng.choice(feature_v,K,replace=False)
    return means

In [7]:
def cluster_label(means,features,K,number):
    resp_mat = np.zeros((number,K))
    count = 0
    for i in features:
        distances = np.zeros(0)
        resp_v = np.zeros(K)
        #calculate distances between mean representatives and the point
        for j in means:
            euc_dist = np.linalg.norm(i - j)
            distances = np.append(distances,euc_dist)
        #assign class label of which has minimum euclidean distance
        class_label = np.argmin(distances,axis = -1)
        resp_mat[count,class_label] = 1
        count += 1
    return resp_mat

In [8]:
def covariance(feat, resp_mat, K, length, number,covar_type,tied_covar):
    covar_mat = np.zeros((K, length, length))

    for i in range(K):
        # Initialize a weighted sum for the covariance matrix
        weighted_sum = np.zeros((length, length))

        # Calculate the weighted sum for this component's covariance matrix
        for j in range(number):
            weighted_sum += resp_mat[j, i] * np.outer(feat[i, j, :], feat[i, j, :])
    
            #constructing a diagonal covariance matrix, if prompted accordingly
            if covar_type:
                weighted_sum = np.diag(np.diag(weighted_sum))
        # Normalize the covariance matrix by dividing by the sum of responsibilities for the component
        sum_of_resp = np.sum(resp_mat[:, i])
        if sum_of_resp != 0:  # Avoid division by zero
            covar_mat[i, :, :] = weighted_sum / sum_of_resp       
    return covar_mat

In [9]:
def expectation(feat,resp_mat,covar_mat,weights,K,length,number):
    likehd_mat = np.zeros(number)
    for i in range(0,K):
        resp_mat[:,i] = evaluate_gaussian(covar_mat[i,:,:],feat[i,:,:],weights[i],length,number)    
    
    for i in range(0,number):
        likehd_mat[i] = np.sum(resp_mat[i,:])
        resp_mat[i,:] = np.divide(resp_mat[i,:],likehd_mat[i])
    return resp_mat,likehd_mat.T

In [10]:
def evaluate_gaussian(covar_mat,feat,weight,length,number):
    resps = np.zeros(number)
    for i in range(0,number):
        resps[i] = np.divide(weight*np.exp(np.multiply(feat[i] @ np.linalg.inv(covar_mat) @ feat[i].T,-1/2)),(np.sqrt((2*np.pi)**length*(np.linalg.det(covar_mat)))))
    return resps.T

In [11]:
def recompute_mean(resp_mat, features, K, length, number):
    resp_avg = np.zeros((K, length))
    
    for i in range(K):
        for j in range(number):
            resp_avg[i] += resp_mat[j, i] * features[j]
        
        # Normalize the mean by dividing by the sum of responsibilities for the cluster  
        sum_of_resp = np.sum(resp_mat[:, i])
        if sum_of_resp != 0:  #This part avoid division by zero       
            resp_avg[i] /= sum_of_resp
    return resp_avg

In [12]:
def func_main(features,tied_covar,K):
    number = np.shape(features)[0]
    length = np.shape(features)[1]
    
    log_likelihood = 0
    count = 0
    exit_count = 7
    i = 1

    resp_mat = KMeans(features,K,length,number)
    while (count <= exit_count):
        covar_mat,weights,feat,means = maximisation(resp_mat,features,K,length,number,covar_type,tied_covar)
        resp_mat,likehd_mat = expectation(feat,resp_mat,covar_mat,weights,K,length,number)
        new_log_likelihood = np.sum(np.log(likehd_mat))
        change_likehd = new_log_likelihood-log_likelihood
        if (np.abs(change_likehd) < 1e-2):
            count += 1
        i += 1
        log_likelihood = new_log_likelihood
    #print("Log Likelihood on the final epoch #",i,"is ",new_log_likelihood)
    return means,covar_mat,weights

In [13]:
def matrix_plot(actual_label_array,predicted_label_array,num_class,class_numbers,covar_type,tied_covar,K,phrase):
    mistakes = 0
    plot_matrix = np.zeros((num_class,num_class))
    for j in range(0,np.sum(class_numbers)):
        actual_class = int(actual_label_array[j])
        predicted_class = int(predicted_label_array[j])
        if (actual_class != predicted_class):
            mistakes += 1
        plot_matrix[predicted_class][actual_class] += 1
    sn.set(font_scale=1.5)
    mat = sn.heatmap(plot_matrix,annot=True,annot_kws={"size":10},cmap='crest',linecolor='black')
    mat.set(xlabel = 'True Class',ylabel = 'Predicted Class')
    plt.savefig("/kaggle/working/matrix_{}_{}_{}_{}.pdf".format(K,covar_type,tied_covar,phrase))
    return mistakes

import numpy as np
from sklearn.mixture import GaussianMixture
gm = GaussianMixture(n_components=5, random_state=0,covariance_type='diag').fit(needed_data)
print(gm.covariances_)
print(gm.means_)
print(gm.weights_)
#gm.predict([[0, 0], [12, 3]])


In [14]:
import pandas as pd
import numpy as np

covar_type = bool(input("Enter 1 for diagonal covariance matrix, and 0 for full covariance matrix"))
tied_covar = bool(input("Enter 1 for tied covariance matrices (if the covariance matrix for each class is to be same)"))

print("\\begin{table}")
print("\t\\centering")
print("\t\\caption{Classification Accuracy on Datasets, with GMM, full covariance matrices}")
print("\t\\begin{tabular}{|c|c|c|c|c|c|}")
print("\t\hline")
print("\tK & Train Accuracy & Val Accuracy & Test Accuracy & Params Per Class \\\\")
print("\t\hline")
print("\t\\end{tabular}")
print("\\end{table}")
for K in range(2,20):
    train_data = preprocess("/kaggle/input/assn2assn2/Dataset-1b/Train-10.csv")
    class_means,class_covars,class_weights,class_prior_probs,num_class = train(train_data,covar_type,tied_covar,K)

    test_data = preprocess("/kaggle/input/assn2assn2/Dataset-1b/Test-10.csv")
    #test_data = preprocess("/kaggle/input/assn2assn2/Dataset-1a/Val-35.csv")
    #test_data = preprocess("/kaggle/input/assn2assn2/Dataset-1a/Test-35.csv")

    features_array,class_numbers,length,actual_labels = extract_data(test_data)
    total_egs = np.sum(class_numbers)
    predicted_labels = predict(features_array,total_egs,length,K,class_means,class_covars,class_weights,class_prior_probs,num_class)
    classify_acc1 = plot(predicted_labels,actual_labels,num_class,class_numbers,features_array,class_means,class_covars,K,"train")
    
    #test_data = preprocess("/kaggle/input/assn2assn2/Dataset-1a/Train-35.csv")
    test_data = preprocess("/kaggle/input/assn2assn2/Dataset-1b/Val-10.csv")
    #test_data = preprocess("/kaggle/input/assn2assn2/Dataset-1a/Test-35.csv")

    features_array,class_numbers,length,actual_labels = extract_data(test_data)
    total_egs = np.sum(class_numbers)
    predicted_labels = predict(features_array,total_egs,length,K,class_means,class_covars,class_weights,class_prior_probs,num_class)
    classify_acc2 = plot(predicted_labels,actual_labels,num_class,class_numbers,features_array,class_means,class_covars,K,"val")
    print("\t{} & {} & {} & {} & {} \\\\ \\hline".format(K,int(classify_acc1*100000)/100000,int(classify_acc2*100000)/100000," ",6*K))

StdinNotImplementedError: raw_input was called, but this frontend does not support input requests.

In [None]:
 K = int(input("Enter the number of Gaussians to use for this Gaussian Mixture Model (Enter 1 for a single Gaussian distribution)"))

train_data = preprocess("/kaggle/input/assn2assn2/Dataset-1b/Train-10.csv")
class_means,class_covars,class_weights,class_prior_probs,num_class = train(train_data,covar_type,tied_covar,K)

#test_data = preprocess("/kaggle/input/assn2assn2/Dataset-1a/Train-35.csv")
#test_data = preprocess("/kaggle/input/assn2assn2/Dataset-1a/Val-35.csv")
test_data = preprocess("/kaggle/input/assn2assn2/Dataset-1b/Test-10.csv")

features_array,class_numbers,length,actual_labels = extract_data(test_data)
total_egs = np.sum(class_numbers)
predicted_labels = predict(features_array,total_egs,length,K,class_means,class_covars,class_weights,class_prior_probs,num_class)
classify_acc = plot(predicted_labels,actual_labels,num_class,class_numbers,features_array,class_means,class_covars,K,"test_gmm_full")
print(int(classify_acc*100000)/100000)

plt.clf()
test_data = preprocess("/kaggle/input/assn2assn2/Dataset-1b/Train-10.csv")
#test_data = preprocess("/kaggle/input/assn2assn2/Dataset-1a/Val-35.csv")
#test_data = preprocess("/kaggle/input/assn2assn2/Dataset-1a/Test-35.csv")

features_array,class_numbers,length,actual_labels = extract_data(test_data)
total_egs = np.sum(class_numbers)
predicted_labels = predict(features_array,total_egs,length,K,class_means,class_covars,class_weights,class_prior_probs,num_class)
classify_acc1 = plot1(predicted_labels,actual_labels,num_class,class_numbers,features_array,class_means,class_covars,K,"train_gmm_full")

In [None]:
preprocess("/kaggle/input/assn2assn2/Dataset-1b/Train-10.csv")

In [None]:
def preprocess(file_path,file_path_1=[]):
    # Creating a dataframe from the given input file
    file = pd.read_csv(file_path)
    #file = pd.read_csv(file_path,header = None)
    df = pd.DataFrame(file)
    try:
        file1 = pd.read_csv(file_path_1,header = None)
        df1 = pd.DataFrame(file1)
        df1.rename(columns = {df1.columns[0]:"label"},inplace = True)
        
        df = pd.concat([df,df1],axis = 1)
    except:
        pass
    
    # Group by class label
    data = df.groupby('label')
    #data = df.groupby('output')
    return data

In [None]:
def train(data,covar_type,tied_covar,K):
    class_means = []
    class_covars = []
    class_weights = []
    no_class_egs = []

    class_count = 0
    for indx, grp in data:
        col_data = np.array(grp.values)
        class_features = col_data[:,1:-1]
        means,covar_mat,weights = func_main(class_features,covar_type,K)
        class_weights.append(weights)
        class_covars.append(covar_mat)
        class_means.append(means)
        no_class_egs.append(np.shape(class_features)[0])
        class_count += 1
    
    if tied_covar:
        class_covars = covar_tie(class_covars,class_count)
    
    no_class_egs = np.array(no_class_egs)
    class_prior_probs = np.divide(no_class_egs,np.sum(no_class_egs))
    return class_means,class_covars,class_weights,class_prior_probs,class_count

In [None]:
def predict(features_array,total_egs,length,K,class_means,class_covars,class_weights,class_prior_probs,num_class):    
    predicted_labels = []
    posterior_mat = np.zeros((total_egs,num_class))
    dummy_array = np.zeros((total_egs,K))
        
    for j in range(0,num_class):
        means = class_means[j]
        covar_mat = class_covars[j]
        weights = class_weights[j]
        
        feat = np.empty((K,total_egs,length))
        for i in range(K):
            feat[i,:,:] = np.subtract(features_array,means[i])
        dummy_array,posterior_mat[:,j] = expectation(feat,dummy_array,covar_mat,weights,K,length,total_egs)
        posterior_mat[:,j] = np.multiply(posterior_mat[:,j],class_prior_probs[j])
    predicted_labels = np.argmax(posterior_mat,axis=1)
    return predicted_labels

In [None]:
def extract_data(data):
    features_array = None
    class_numbers = []
    actual_labels = []
    for indx,grp in data:
        col_data = np.array(grp.values)
        class_features = col_data[:,1:-1]
        act_class_labels = col_data[:,-1]

        if features_array is None:
            features_array = class_features
        
        else:
            features_array = np.concatenate((features_array,class_features),axis=0)
        actual_labels = np.concatenate((actual_labels,act_class_labels),axis=0)
        
        number = np.shape(class_features)[0]
        length = np.shape(class_features)[1]
        class_numbers.append(number)
    return features_array,class_numbers,length,actual_labels

In [None]:
def plot(predicted_labels,actual_labels,num_class,class_numbers,features_array,class_means,class_covars,K,phrase):
    misclass_egs = matrix_plot(actual_labels,predicted_labels,num_class,class_numbers,covar_type,tied_covar,K,phrase)
    total_egs = np.sum(class_numbers)
    classify_acc = (1-(misclass_egs/total_egs))*100
    #if (length <= 2):
    #    x_min,x_max = features_array[:,0].min()-0.5 , features_array[:,0].max()+0.5
    #    y_min,y_max = features_array[:,1].min()-0.5 , features_array[:,1].max()+0.5
    #    
    #    plt.figure()
    #
    #    level_curve_plot(class_means,class_covars,K,num_class,x_min,x_max,y_min,y_max)
    #    scatter_plot(features_array,actual_labels,total_egs)
    #    dec_region_plot(x_min,x_max,y_min,y_max,K,covar_type,tied_covar,phrase)
    #print("The Classification accuracy of the Model is {}".format(classify_acc))
    return classify_acc 

In [None]:
def plot1(predicted_labels,actual_labels,num_class,class_numbers,features_array,class_means,class_covars,K,phrase):
    misclass_egs = matrix_plot(actual_labels,predicted_labels,num_class,class_numbers,covar_type,tied_covar,K,phrase)
    total_egs = np.sum(class_numbers)
    classify_acc = (1-(misclass_egs/total_egs))*100
    if (length <= 2):
        x_min,x_max = features_array[:,0].min()-0.5 , features_array[:,0].max()+0.5
        y_min,y_max = features_array[:,1].min()-0.5 , features_array[:,1].max()+0.5
        
        plt.figure()
    
        level_curve_plot(class_means,class_covars,K,num_class,x_min,x_max,y_min,y_max)
        scatter_plot(features_array,actual_labels,total_egs)
        dec_region_plot(x_min,x_max,y_min,y_max,K,covar_type,tied_covar,phrase)
    #print("The Classification accuracy of the Model is {}".format(classify_acc))
    return classify_acc 

In [None]:
def dec_region_plot(x_min,x_max,y_min,y_max,K,covar_type,tied_covar,phrase):
    class_colors = {0: 'red', 1: 'blue'}
    total_egs = 45 ** 2
    
    xx,yy = np.meshgrid(np.linspace(x_min,x_max,45),np.linspace(y_min,y_max,45))
    feature_space = np.c_[xx.ravel(),yy.ravel()]
    feature_sp_labels = predict(feature_space,total_egs,length,K,class_means,class_covars,class_weights,class_prior_probs,num_class)
    feature_sp_labels = feature_sp_labels.reshape(np.shape(xx))
    plt.contourf(xx,yy,feature_sp_labels,cmap = plt.cm.RdYlBu, alpha = 0.4)
    plt.savefig("/kaggle/working/region_{}_{}_{}_{}.pdf".format(K,covar_type,tied_covar,phrase))


In [None]:
def scatter_plot(class_features,actual_labels,total_egs):
    class_colors = {0: 'red', 1: 'blue'}
    k = 0
    for j in range(0,total_egs):
        plt.scatter(class_features[j][0],class_features[j][1], c = class_colors[actual_labels[k]])
        k += 1

In [None]:
def level_curve_plot(class_means,class_covars,K,num_class,x_min,x_max,y_min,y_max):
    class_count = 0
    class_colors = {0: 'red', 1: 'blue'}
    plt.xlabel("feat-1")
    plt.ylabel("feat-2")
    plt.title("Decision Region for K = {}".format(K))
    
    for i in range(0,num_class):
        for j in range(0,K):
            data = np.random.multivariate_normal(class_means[i][j],class_covars[i][j],size=200)
            sn.kdeplot(x=data[:, 0], y=data[:, 1], color = class_colors[1-class_count], levels=[0.5],legend = True,label = "Class {}".format(class_colors))
        class_count += 1
    legend_labels = [plt.Line2D([0],[0],color = class_colors ,label = "Class {}".format(class_labels)) for class_labels,class_colors in class_colors.items()]
    plt.legend
    ax = plt.gca()
    ax.set_xlim(x_min,x_max)
    ax.set_ylim(y_min,y_max)

In [None]:
def covar_tie(class_covars,class_count):
    accum_mat = np.zeros(np.shape(class_covars[0]))
    for i in class_covars:
        accum_mat += i
    tied_cov = np.divide(accum_mat,class_count)
    for i in range(0,class_count):
        class_covars[i] = tied_cov
    return class_covars