In [None]:
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

def purity_score(y_true, y_pred, axis):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=axis)) / np.sum(contingency_matrix) 


def evaluate_k_means(kmeans, name, data, labels):
  estimator = make_pipeline(StandardScaler(), kmeans).fit(data)
  results = [name, estimator[-1].inertia_]
  score = metrics.homogeneity_score(labels, estimator[-1].labels_)
  purityX = purity_score(labels, estimator[-1].labels_, 0)
  purityY = purity_score(labels,estimator[-1].labels_, 1)

  print("Name: ", results[0])
  print("Objective (i.e. inertia): ", results[1])
  print("Gini Score: ", score)
  print("Row Purity: ", purityX)
  print("Column Purity: ", purityY)
  print("\n")


In [None]:
from sklearn.cluster import KMeans

#MNIST
dataPath = "sample_data/mnist_train_small.csv"
dataFile = open(dataPath, 'r')
trainDF = pd.read_csv(dataPath)
trainFeatures = trainDF.iloc[:,1:]
trainLabels = trainDF.iloc[:,:1].values.flatten().astype(np.int32)

kmeans = KMeans(n_clusters=5,random_state=0).fit(trainFeatures)
evaluate_k_means(kmeans, "Mnist K = 5", trainFeatures, trainLabels)

kmeans = KMeans(n_clusters=10,random_state=0).fit(trainFeatures)
evaluate_k_means(kmeans, "Mnist K = 10", trainFeatures, trainLabels)

kmeans = KMeans(n_clusters=20,random_state=0).fit(trainFeatures)
evaluate_k_means(kmeans, "Mnist K = 20", trainFeatures, trainLabels)

Name:  Mnist K = 5
Objective (i.e. inertia):  12434543.486521289
Gini Score:  0.3297481445651872
Row Purity:  0.41012050602530126
Column Purity:  0.7098854942747137


Name:  Mnist K = 10
Objective (i.e. inertia):  11613063.544049602
Gini Score:  0.4233432910668482
Row Purity:  0.5288764438221911
Column Purity:  0.5720286014300715


Name:  Mnist K = 20
Objective (i.e. inertia):  10756085.039449332
Gini Score:  0.5125123104171879
Row Purity:  0.6093804690234512
Column Purity:  0.40852042602130106




In [None]:
#Load Fashion
from sklearn.cluster import KMeans

dataPath = "/content/drive/MyDrive/DataMining/Fashion/fashion-mnist_train.csv"
dataFile = open(dataPath, 'r')

#load the data
trainDF = pd.read_csv(dataPath)

labels = trainDF.iloc[:,:1].values.flatten()
features = trainDF.iloc[:,1:]

kmeans = KMeans(n_clusters=5,random_state=0).fit(features)
evaluate_k_means(kmeans, "Fashion K = 5", features, labels)

kmeans = KMeans(n_clusters=10,random_state=0).fit(features)
evaluate_k_means(kmeans, "Fashion K = 10", features, labels)

kmeans = KMeans(n_clusters=20,random_state=0).fit(features)
evaluate_k_means(kmeans, "Fashion K = 20", features, labels)


Name:  Fashion K = 5
Objective (i.e. inertia):  31124733.130651943
Gini Score:  0.3642661390898533
Row Purity:  0.3777
Column Purity:  0.69165


Name:  Fashion K = 10
Objective (i.e. inertia):  26154004.41527849
Gini Score:  0.4948062577384306
Row Purity:  0.5547166666666666
Column Purity:  0.5968333333333333


Name:  Fashion K = 20
Objective (i.e. inertia):  22488430.322118796
Gini Score:  0.6066827784650357
Row Purity:  0.6613166666666667
Column Purity:  0.45025




In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
import numpy as np


dataset = fetch_20newsgroups(
    remove=("headers", "footers", "quotes"),
    subset="all",
    shuffle=True,
    random_state=42,
)

labels = dataset.target
unique_labels, category_sizes = np.unique(labels, return_counts=True)
true_k = unique_labels.shape[0]

vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=5,
    stop_words="english",
)
X_tfidf = vectorizer.fit_transform(dataset.data)


def evaluate_k_means(kmeans, name, data, labels):
  estimator = make_pipeline(StandardScaler(with_mean=False), kmeans).fit(data)
  results = [name, estimator[-1].inertia_]
  score = metrics.homogeneity_score(labels, estimator[-1].labels_)
  purityX = purity_score(labels, estimator[-1].labels_, 0)
  purityY = purity_score(labels,estimator[-1].labels_, 1)

  print("Name: ", results[0])
  print("Objective (i.e. inertia): ", results[1])
  print("Gini Score: ", score)
  print("Row Purity: ", purityX)
  print("Column Purity: ", purityY)
  print("\n")


kmeans = KMeans(
        n_clusters=20,
        random_state=0,
        ).fit(X_tfidf)
evaluate_k_means(kmeans, "20NG K = 20", X_tfidf, labels)

kmeans = KMeans(
        n_clusters=40,
        random_state=0,
        ).fit(X_tfidf)
evaluate_k_means(kmeans, "20NG K = 40", X_tfidf, labels)

kmeans = KMeans(
        n_clusters=10,
        random_state=0,
        ).fit(X_tfidf)
evaluate_k_means(kmeans, "20NG K = 10", X_tfidf, labels)


Name:  20NG K = 20
Objective (i.e. inertia):  441939139.3143358
Gini Score:  0.11939048079216033
Row Purity:  0.10267430754536772
Column Purity:  0.8877215324206729


Name:  20NG K = 40
Objective (i.e. inertia):  436141449.87681043
Gini Score:  0.004720371855807601
Row Purity:  0.057624960203756764
Column Purity:  0.9952244508118434


Name:  20NG K = 10
Objective (i.e. inertia):  445831593.4926965
Gini Score:  0.002183698799859606
Row Purity:  0.05518412395203226
Column Purity:  0.9975061020906293




In [None]:
import numpy as np
from scipy.stats import multivariate_normal

class GMM:
  def __init__(self, k, max_iter=5):
    self.k = k
    self.max_iter = int(max_iter)

  def initialize(self, X):
    self.shape = X.shape #store the shape of the given data matrix. N points each with M features.
    self.n, self.m = self.shape

    self.phi = np.full(shape=self.k, fill_value=1/self.k) #the probability of each class. 
    self.weights = np.full( shape=self.shape, fill_value=1/self.k)  #the likelihood that a point belongs to cluster k. initially, all clusters are equally likely. 
    
    random_row = np.random.randint(low=0, high=self.n, size=self.k) 
    self.mu = [  X[row_index,:] for row_index in random_row ] #set the mean of each cluster to be the value of a random point in the dataset. 
    self.sigma = [ np.cov(X.T) for _ in range(self.k) ] #set the sigma of each cluster to be the covariance of that row

  def e_step(self, X):
    #Given a mu and sigma, update the liklihood that point i came from cluster k.
    self.weights = self.predict_proba(X) #each row is a datapoint, each column is the likelihood that that datapoint is in cluster k
    #the overall probability of a random point coming from cluster k is the average of all of the points coming from cluster k. 
    self.phi = self.weights.mean(axis=0)

  def m_step(self, X):
    #Given the likehiood that point i came from cluster k, determine new mu and sigma for each cluster k.
    for i in range(self.k):
      weight = self.weights[:,[i]] #th
      total_weight = weight.sum()
      #the mean of cluster i is the average of all points in that cluster.
      #Each "point" is actually a fraction of a point determined by its weight to be in that cluster. 
      #So, to get the number of "points" in the cluster, we multiply every n by its weight and then sum them up.
      #then, we divide by the total_weight to average them. 
      self.mu[i] = (X * weight).sum(axis=0) / total_weight
      self.sigma[i] = np.cov(X.T, aweights=(weight/total_weight).flatten(), bias=True)

  def fit(self, X):
    self.initialize(X)
    for iteration in range(self.max_iter):
      self.e_step(X)
      self.m_step(X)
  
  def predict_proba(self, X):
    likelihood = np.zeros((self.n, self.k))
    for i in range(self.k): #for each cluster
      distribution = multivariate_normal( #make a distribution for that cluster given our current mu and sigma values 
          mean=self.mu[i],
          cov=self.sigma[i])
      likelihood[:,i] = distribution.pdf(X) #each i,j in likelihood is the likelihood that point i is in cluster j. 
    
    numerator = likelihood * self.phi #bayes
    denominator = numerator.sum(axis=1)[:,np.newaxis]
    weights = numerator / denominator
    return weights

  def predict(self, X):
    weights = self.predict_proba(X) #each row is a datapoint, each column is the likelihood that that datapoint is in cluster k
    return np.argmax(weights,axis=1) #to get the class index of a point, find the class index it is most likely to be in and return that.

  
  def print(self):
    print("GMM Model with k=", self.k, " and data matrix with shape ", self.shape, ":\n")
    for i in range(self.k):
      print("Mean for cluster ", i, ":\n ", self.mu[i])
      print("Cov for cluster ", i, ":\n ", self.sigma[i])
      print("Number of points in cluster ", i ,":\n ", np.sum(self.weights[:,i]), "\n")
      print("\n")





In [None]:

import pandas as pd
from scipy.stats import mode
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

path2 = "/content/drive/MyDrive/DataMining/2gaussian.txt"
path3 = "/content/drive/MyDrive/DataMining/3gaussian.txt"

df = pd.read_table(path2, delimiter = ' ')

X = df.values
print(X.shape)

gmm = GMM(k=2, max_iter=1000)
gmm.fit(X)
gmm.print()


df = pd.read_table(path3, delimiter = ' ')

X = df.values

gmm2 = GMM(k=3, max_iter=1000)
gmm2.fit(X)
gmm2.print()


(5999, 2)
GMM Model with k= 2  and data matrix with shape  (5999, 2) :

Mean for cluster  0 :
  [7.01295317 3.98321625]
Cov for cluster  0 :
  [[0.97503308 0.4977206 ]
 [0.4977206  1.00138972]]
Number of points in cluster  0 :
  3990.3243929123364 



Mean for cluster  1 :
  [2.99404367 3.05211315]
Cov for cluster  1 :
  [[1.01010905 0.02721332]
 [0.02721332 2.93789732]]
Number of points in cluster  1 :
  2008.6756070876638 



GMM Model with k= 3  and data matrix with shape  (9999, 2) :

Mean for cluster  0 :
  [7.02158184 4.01547592]
Cov for cluster  0 :
  [[0.99039767 0.50094769]
 [0.50094769 0.9956332 ]]
Number of points in cluster  0 :
  2984.324455881726 



Mean for cluster  1 :
  [5.01170274 7.00144464]
Cov for cluster  1 :
  [[0.97975418 0.18521522]
 [0.18521522 0.97458008]]
Number of points in cluster  1 :
  4959.684040771087 



Mean for cluster  2 :
  [3.03982973 3.04846728]
Cov for cluster  2 :
  [[1.02920341 0.02697752]
 [0.02697752 3.38659663]]
Number of points in cluste

In [3]:
#Question 4

import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture as gm
from sklearn import metrics as metrics

#Load the fashion dataset
dataPath = "/content/drive/MyDrive/DataMining/Fashion/fashion-mnist_train.csv"
df = pd.read_csv(dataPath)

#Get labels and features
labels = df.iloc[:,:1].values.flatten()
features = df.iloc[:,1:]
features=(features-features.mean())/features.std()

print("Labaels shape: ", labels.shape)
print("Features shape: ", features.shape)
X = features.values


gmm = gm(n_components = 10, init_params='kmeans',
           n_init = 5, max_iter = 5000, covariance_type = 'diag')

gmm.fit(X)

predictions = gmm.predict(X)

print ("gmm: silhouttte: ", metrics.silhouette_score(X, predictions))

Labaels shape:  (60000,)
Features shape:  (60000, 784)
gmm: silhouttte:  0.08464173957612584


In [14]:

import pandas as pd
from scipy.stats import mode
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import sklearn.preprocessing as preprocessing

path = "/content/drive/MyDrive/DataMining/spambase.data"


df = pd.read_csv(path)
labels = df.iloc[: , -1]
X = df.iloc[:,0:-1].values

X = df.values #returns a numpy array


gmm = gm(n_components = 2, init_params='kmeans',
           n_init = 5, max_iter = 5000, covariance_type = 'diag')

gmm.fit(X)
predictions = gmm.predict(X)

silhouette = metrics.silhouette_score(X, labels)
score = metrics.homogeneity_score(predictions, labels)

print("Spambase Results:")
print("Silhouette: ", silhouette)
print("Homogenity: ", score)

Spambase Results:
Silhouette:  0.1976264243774234
Homogenity:  0.25742583510935213


# New Section