In [167]:
import numpy as np
from sklearn.datasets import load_iris 
from scipy.stats import multivariate_normal
from scipy.stats import mode
import random
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import warnings

In [168]:
iris = load_iris()
data = iris.data
target = iris.target

# Performing Clustering using Gaussian Mixture Models 
*   in the iris dataset
*   matching the resulted clusters with the original labels with the help of adjusted Rand score

In [169]:
#Initialization
k = 3
max_itr = 5
n, m = data.shape
phi = np.zeros(k)
weights = np.zeros(data.shape)


# initializing phi and weights
for i in range(k):
  phi[i] = 1/k
for i in range(n):
  for j in range(m):
    weights[i][j] = 1/k

# initializing centroids
initial_centroids = random.sample(range(0,149), k)
# initializing mean and covariance matrix
mu = [data[row_index,:] for row_index in initial_centroids]
sigma = [np.cov(np.transpose(data)) for _ in range(k)]

In [170]:
# Probability function for each class ϕ, by averaging over all examples in the training set
def probability_predict(data):
  likelihood = np.zeros((n, k))
  for i in range(k):
    gauss_dist = multivariate_normal(mean=mu[i], cov=sigma[i])
    likelihood[:,i] = gauss_dist.pdf(data)

  numerator = likelihood * phi
  denominator = numerator.sum(axis=1)[:, np.newaxis]
  weights = numerator / denominator
  return weights

# Returning index of maximum probability
def predict(data):
  return np.argmax(probability_predict(data), axis=1)

In [171]:
# Updating weights, phi,  mean and covariance matrix
for i in range(max_itr):
  weights = probability_predict(data)
  phi = weights.mean(axis=0)

  for j in range(k):
    weight = weights[:, [j]]
    total_weight = weight.sum()
    mu[j] = (data * weight).sum(axis=0) / total_weight
    sigma[j] = np.cov(np.transpose(data), aweights=(weight/total_weight).flatten())

In [172]:
labels = predict(data)

In [173]:
# Calculating adjusted rand score
score = adjusted_rand_score(labels, target)
print("Adjusted Rand score:", score)

Adjusted Rand score: 0.5586216751698382


In [174]:
# Finding row indices of target and predicted labels for jaccard similiarity later
target_rows = [[0 for i in range(0)] for j in range(k)]
for i in range(len(target)):
  target_rows[target[i]].append(i)
print("Row indices of the target labels -->", target_rows)

pred_rows = [[0 for i in range(0)] for j in range(k)]
for i in range(len(labels)):
  pred_rows[labels[i]].append(i)
print("Row indices of predicted labels --->", pred_rows)

Row indices of the target labels --> [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]]
Row indices of predicted labels ---> [[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 97, 98, 99, 101, 102, 103, 104, 105, 106, 107, 108, 111, 113, 116, 117, 118, 119, 122, 124, 125, 127,

In [175]:
# Jaccard similiarity for accuracy
gmm_jaccard = [0,0,0]

for i in range(len(pred_rows)):
  sim = [0, 0, 0]
  # finds the no of similar elements and updates the sim matrix accordingly
  for j in range(len(pred_rows[i])):
    for l in range(3):
      if pred_rows[i][j] in target_rows[l]:
        sim[l] += 1
  # for finding out the most similar set between target and predicted labels
  max_sim = sim.index(np.amax(sim))
  u_list = list(set(pred_rows[i]) | set(target_rows[max_sim]))
  i_list = list(set(pred_rows[i]) & set(target_rows[max_sim]))

  gmm_jaccard[i] = len(i_list)/len(u_list)

accuracy = (gmm_jaccard[0]*len(pred_rows[0]))/len(data) + (gmm_jaccard[1]*len(pred_rows[1]))/len(data) + (gmm_jaccard[2]*len(pred_rows[2]))/len(data)

In [176]:
print("Accuracy of Gaussian Mixture Model  -->",round(accuracy*100,2))
print("Adjusted Rand score using sk-learn -->", round(score,2))

Accuracy of Gaussian Mixture Model  --> 68.83
Adjusted Rand score using sk-learn --> 0.56


# Performing in-built logistic regression on training data and classify test data.
*   using IRIS data to generate training data and test data
*   choosing 40 random data points from each class for training data
*   using the remaining 10 data points from each class as test data

In [177]:
warnings.filterwarnings('ignore' )
X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size = 0.2)

lr_classifier = LogisticRegression(random_state=0).fit(X_train, Y_train)
lr_classifier.predict(X_test)
lr_accuracy = clf.score(X_train, Y_train)

print("Accuracy of Logictic Regression using sk-learn  -->",round(lr_accuracy*100,2))

Accuracy of Logictic Regression using sk-learn  --> 97.5
