# K-means Clustering
K-means clustering is one of the simplest and most popular unsupervised machine learning algorithms.

![](http://www.learnbymarketing.com/wp-content/uploads/2015/01/method-k-means-steps-example.png)

In [None]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Build a toy 2-D dataset: 100 points drawn from (-2, 0] in each coordinate,
# then overwrite the second half with 50 points drawn from [1, 3) so that two
# well-separated groups exist.
X = -2 * np.random.rand(100, 2)
X1 = 1 + 2 * np.random.rand(50, 2)
X[50:] = X1
plt.scatter(X[:, 0], X[:, 1], s=50, c='b')
plt.show()

# Fit k-means with two clusters (fit() returns the estimator itself).
Kmean = KMeans(n_clusters=2).fit(X)

# Coordinates of the two fitted centroids.
print(Kmean.cluster_centers_)

# Redraw the data and mark centroid 0 in green and centroid 1 in red.
plt.scatter(X[:, 0], X[:, 1], s=50, c='b')
for centre, colour in zip(Kmean.cluster_centers_, ('g', 'r')):
    plt.scatter(centre[0], centre[1], s=200, c=colour, marker='s')
plt.show()

# Cluster index assigned to every training point.
print(Kmean.labels_)

# Predict the cluster of a new point; predict() expects a 2-D array, so the
# single sample is given a leading axis of length one.
sample_test = np.array([-3.0, -3.0])
second_test = sample_test[np.newaxis, :]
print(Kmean.predict(second_test))

# K-means on Geyser’s Eruptions Segmentation

In [None]:
# Modules
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.image import imread
import pandas as pd
import seaborn as sns
# Import the generators from the public `sklearn.datasets` package: the
# `sklearn.datasets.samples_generator` module path is not available in
# current scikit-learn releases, while this path works on old and new ones.
from sklearn.datasets import make_blobs, make_circles, make_moons
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score

plt.style.use('fivethirtyeight')
from warnings import filterwarnings
filterwarnings('ignore')

# Import the data and keep only the two columns used for clustering
df = pd.read_csv('../input/old-faithful/faithful.csv')
df = df[["eruptions", "waiting"]]

# Plot the raw data (original units)
plt.figure(figsize=(6, 6))
plt.scatter(df.iloc[:, 0], df.iloc[:, 1])
plt.xlabel('Eruption time in mins')
plt.ylabel('Waiting time to next eruption')
plt.title('Visualization of raw data');

# Standardize the data so both columns contribute equally to the distances
X_std = StandardScaler().fit_transform(df)

# Fit scikit-learn's KMeans with two clusters
km = KMeans(n_clusters=2, max_iter=20, random_state=20)
km.fit(X_std)
centroids = km.cluster_centers_

# Plot the clustered data; everything below is in standardized units, so the
# axis labels say so instead of claiming minutes.
fig, ax = plt.subplots(figsize=(6, 6))
plt.scatter(X_std[km.labels_ == 0, 0], X_std[km.labels_ == 0, 1],
            c='green', label='cluster 1')
plt.scatter(X_std[km.labels_ == 1, 0], X_std[km.labels_ == 1, 1],
            c='blue', label='cluster 2')
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=300,
            c='r', label='centroid')
plt.legend()
plt.xlim([-2, 2])
plt.ylim([-2, 2])
plt.xlabel('Eruption time (standardized)')
plt.ylabel('Waiting time to next eruption (standardized)')
plt.title('Visualization of clustered data', fontweight='bold')
ax.set_aspect('equal');

## Checkpoint 1
Implement k-means on Iris dataset with k=3

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

# Load Iris; only the feature matrix is used in this cell.
dataset = load_iris()

# Standardize the features before clustering.
x = StandardScaler().fit_transform(dataset.data)

# Fit scikit-learn's KMeans with three clusters (fit() returns the estimator).
km = KMeans(n_clusters=3, max_iter=20, random_state=20).fit(x)
centroids = km.cluster_centers_
print(centroids)

# Cluster index assigned to each sample.
print(km.labels_)

# Principal Component Analysis (PCA) as Feature Extraction

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

dataset = load_iris()

x = dataset.data
# OneHotEncoder expects a 2-D input, hence the column-vector reshape.
y = dataset.target.reshape(-1, 1)

# Standardizing the features
# NOTE(review): the scaler and PCA below are fitted on all samples before the
# train/test split, so the test rows influence the extracted features.
x = StandardScaler().fit_transform(x)

# PCA: a float in (0, 1) keeps as many components as are needed to explain
# at least that fraction of the variance.
#pca = PCA(n_components=2)
pca = PCA(0.7)
features = pca.fit_transform(x)

print(features.shape)

print(pca.explained_variance_ratio_)

# classification
# OneHotEncoder returns a SciPy sparse matrix by default; convert it to a
# dense array so Keras receives plain NumPy targets.
encoder = OneHotEncoder()
targets = encoder.fit_transform(y).toarray()

train_features, test_features, train_targets, test_targets = train_test_split(
    features, targets, test_size=0.2, random_state=100)

model = Sequential()
# first parameter is output dimension; only the first layer needs the input
# size, later layers infer it from the previous layer
model.add(Dense(10, input_dim=features.shape[1], activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(3, activation='softmax'))

# we can define the loss function MSE or negative log likelihood
# optimizer will find the right adjustments for the weights: SGD, Adagrad, ADAM ...
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

model.fit(train_features, train_targets, epochs=20, batch_size=20, verbose=2)

loss, accuracy = model.evaluate(test_features, test_targets)

print("Accuracy on the test dataset: %.2f" % accuracy)



## Checkpoint 2
Vary the number of PCA components from 1 to 4 and observe how the results change.

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

dataset = load_iris()

x = dataset.data
# OneHotEncoder expects a 2-D input, hence the column-vector reshape.
y = dataset.target.reshape(-1, 1)

# Standardizing the features
# NOTE(review): the scaler and PCA below are fitted on all samples before the
# train/test split, so the test rows influence the extracted features.
x = StandardScaler().fit_transform(x)

# PCA: keep a fixed number of components (change this value for the checkpoint)
pca = PCA(n_components=1)
# pca = PCA(0.7)
features = pca.fit_transform(x)

print(features.shape)

print(pca.explained_variance_ratio_)

# classification
# OneHotEncoder returns a SciPy sparse matrix by default; convert it to a
# dense array so Keras receives plain NumPy targets.
encoder = OneHotEncoder()
targets = encoder.fit_transform(y).toarray()

train_features, test_features, train_targets, test_targets = train_test_split(
    features, targets, test_size=0.2, random_state=100)

model = Sequential()
# first parameter is output dimension; only the first layer needs the input
# size, later layers infer it from the previous layer
model.add(Dense(10, input_dim=features.shape[1], activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(3, activation='softmax'))

# we can define the loss function MSE or negative log likelihood
# optimizer will find the right adjustments for the weights: SGD, Adagrad, ADAM ...
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

model.fit(train_features, train_targets, epochs=20, batch_size=20, verbose=2)

loss, accuracy = model.evaluate(test_features, test_targets)

print("Accuracy on the test dataset: %.2f" % accuracy)

## Checkpoint 3
Implement PCA on MNIST data and then use that data to classify output.

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from keras.datasets import mnist
from keras.utils import to_categorical
from keras.optimizers import SGD

(x_train, y_train), (x_test, y_test) = mnist.load_data()

print("X_train: {},{}".format(x_train.shape,x_train.dtype))
print("Y_train: {}".format(y_train.shape))
print("X_test: {}".format(x_test.shape))
print("Y_test: {}".format(y_test.shape))

# Flatten every image into one row: samples x (height * width)
nTrain = x_train.shape[0]
nDimTrain = x_train.shape[1]*x_train.shape[2]
nTest = x_test.shape[0]
print("#Train: {},#Test:{}, nDim:{}".format(nTrain,nTest,nDimTrain))

x_train = x_train.reshape(nTrain,nDimTrain)
x_test = x_test.reshape(nTest,nDimTrain)
print("# reshape")
print("X_train: {}".format(x_train.shape))
print("Y_train: {}".format(y_train.shape))
print("X_test: {}".format(x_test.shape))
print("Y_test: {}".format(y_test.shape))

# One-hot encode the ten digit labels
y_train = to_categorical(y_train,10)
y_test = to_categorical(y_test,10)
print("# reshape")
print("X_train: {}".format(x_train.shape))
print("Y_train: {}".format(y_train.shape))
print("X_test: {}".format(x_test.shape))
print("Y_test: {}".format(y_test.shape))

# Normalization: scale pixel values by 1/255
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

# PCA
# Fit the projection on the training set only and apply that same projection
# to the test set. Re-fitting on the test set would produce a different set
# of components, so the classifier would be evaluated on features it was
# never trained on.
pca = PCA(n_components=26)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

# Multi-layer Perceptron
model = Sequential()
model.add(Dense(units=15,activation="relu",input_shape=(x_train.shape[1],)))
model.add(Dense(units=10,activation="softmax"))
# Compile the model
epochs = 25
lrate = 0.01
decay = lrate/epochs
sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])
print(model.summary())

# Train the model
model.fit(x_train, y_train,
              batch_size=128,
              shuffle=True,
              epochs=epochs)

# Evaluate the model
scores = model.evaluate(x_test, y_test)

print('Loss: %.3f' % scores[0])
print('Accuracy: %.3f' % scores[1])

# Linear Discriminant Analysis (LDA)

To implement the Linear Discriminant Analysis algorithm, we can use the predefined `LinearDiscriminantAnalysis` class provided by the scikit-learn library.

In [None]:
from sklearn.datasets import load_wine
import pandas as pd
import numpy as np
np.set_printoptions(precision=4)
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Load the wine data as a feature DataFrame plus a categorical label vector.
wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = pd.Categorical.from_codes(wine.target, wine.target_names)

# Single DataFrame holding both the features and the class column.
df = X.join(pd.Series(y, name='class'))

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Fit LDA on the labelled data, then project the same data.
lda = LinearDiscriminantAnalysis()
X_lda = lda.fit(X, y).transform(X)

lda.explained_variance_ratio_

# Encode the class names as integers so they can drive the colour map.
le = LabelEncoder()
y = le.fit_transform(df['class'])

# Scatter plot of the first two discriminant components, coloured by class.
plt.scatter(X_lda[:, 0], X_lda[:, 1],
            c=y, cmap='rainbow', alpha=0.7, edgecolors='b')
plt.xlabel('LD1')
plt.ylabel('LD2')

# Train a decision tree on the projected features and compare predictions
# with the held-out labels.
X_train, X_test, y_train, y_test = train_test_split(X_lda, y, random_state=1)

dt = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = dt.predict(X_test)
confusion_matrix(y_test, y_pred)

## Checkpoint 4
Implement LDA on MNIST data and then use that data to classify output.

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from keras.datasets import mnist
from keras.utils import to_categorical
from keras.optimizers import SGD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

(x_train, y_train), (x_test, y_test) = mnist.load_data()

print("X_train: {},{}".format(x_train.shape,x_train.dtype))
print("Y_train: {}".format(y_train.shape))
print("X_test: {}".format(x_test.shape))
print("Y_test: {}".format(y_test.shape))

# Flatten every image into one row: samples x (height * width)
nTrain = x_train.shape[0]
nDimTrain = x_train.shape[1]*x_train.shape[2]
nTest = x_test.shape[0]
print("#Train: {},#Test:{}, nDim:{}".format(nTrain,nTest,nDimTrain))

x_train = x_train.reshape(nTrain,nDimTrain)
x_test = x_test.reshape(nTest,nDimTrain)
print("# reshape")
print("X_train: {}".format(x_train.shape))
print("Y_train: {}".format(y_train.shape))
print("X_test: {}".format(x_test.shape))
print("Y_test: {}".format(y_test.shape))

# Normalization: scale pixel values by 1/255
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

# LDA
# LDA can produce at most (n_classes - 1) components. The labels are encoded
# into 10 classes below, so the maximum is 9; a larger value is rejected by
# current scikit-learn releases.
lda = LinearDiscriminantAnalysis(n_components=9)
# Fit on the training set (with its integer labels) only, then apply the same
# projection to the test set. Re-fitting on the test set would use the test
# labels to build the features, which leaks the answers into the evaluation.
x_train = lda.fit_transform(x_train, y_train)
x_test = lda.transform(x_test)

# One-hot encode the labels only after LDA, which needs the integer labels
y_train = to_categorical(y_train,10)
y_test = to_categorical(y_test,10)
print("# reshape")
print("X_train: {}".format(x_train.shape))
print("Y_train: {}".format(y_train.shape))
print("X_test: {}".format(x_test.shape))
print("Y_test: {}".format(y_test.shape))

# Multi-layer Perceptron
model = Sequential()
model.add(Dense(units=15,activation="relu",input_shape=(x_train.shape[1],)))
model.add(Dense(units=10,activation="softmax"))
# Compile the model
epochs = 25
lrate = 0.01
decay = lrate/epochs
sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])
print(model.summary())

# Train the model
model.fit(x_train, y_train,
              batch_size=128,
              shuffle=True,
              epochs=epochs)

# Evaluate the model
scores = model.evaluate(x_test, y_test)

print('Loss: %.3f' % scores[0])
print('Accuracy: %.3f' % scores[1])