### Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  
import plotly.graph_objs as go
from plotly.graph_objs import Figure, Data
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans 

#### reading iris dataset csv file

In [2]:
# Load the Iris CSV into a DataFrame (path is relative to the notebook).
dataset = pd.read_csv(filepath_or_buffer='../input/iris-flower-dataset/IRIS.csv')

In [3]:
# Preview the first five rows of the raw data.
dataset.head(n=5)

#### Dropping rows with missing values and removing the species column

In [4]:
# Drop rows containing missing values, then remove the `species` label column
# so the label is not passed to the scaler, PCA or k-means further down.
# Re-assigning instead of mutating in place, together with errors='ignore',
# keeps the cell re-runnable: the in-place version raised KeyError on a second
# execution because `species` had already been removed.
dataset = dataset.dropna().drop(columns='species', errors='ignore')

In [5]:
# Confirm that the `species` column is gone.
dataset.head(n=5)

#### plotting all the features in a histogram

In [6]:
# One histogram trace per measurement column, drawn on a shared axis so the
# feature distributions can be compared directly.
fig = go.Figure()
for feature in ('sepal_length', 'sepal_width', 'petal_length', 'petal_width'):
    fig.add_trace(go.Histogram(x=dataset[feature], name=feature))

# Draw the histograms on top of each other instead of side by side.
fig.layout.update(barmode='overlay')
# Make the bars translucent so overlapping histograms stay visible.
fig.update_traces(opacity=0.75)
fig.show()

##### Standardizing the dataset to bring all features onto the same scale

In [7]:
# Scaler that centres each feature and scales it to unit variance
# (both options spelled out; they are the library defaults).
sd = StandardScaler(with_mean=True, with_std=True)

In [8]:
# Learn the per-column mean/std from the data, then apply the scaling to it.
sd.fit(dataset)
scaled_data = sd.transform(dataset)

In [9]:
# Wrap the scaled array back into a DataFrame, re-using the feature names.
# From here on `dataset` holds the standardized values, not the raw ones.
dataset = pd.DataFrame(data=scaled_data, columns=dataset.columns)

In [10]:
# Preview the standardized features.
dataset.head(n=5)

In [11]:
# Independent snapshot of the standardized frame (DataFrame.copy is deep by default).
df = dataset.copy()

#### Using PCA to reduce the dimensionality of the data

In [12]:
# PCA that keeps every component, used to inspect how the variance is distributed.
pca = PCA(n_components=None)

In [13]:
# Fit the full PCA on the standardized features and project the data onto it.
decomposed_data = pca.fit_transform(X=dataset)

#### We can obtain the explained variance of each principal component from scikit-learn and plot the cumulative result

In [14]:
# Plot the cumulative explained variance against the number of components kept.
# Two fixes over the earlier version:
#   * the x values start at 1 (the cumulative-sum array is 0-indexed, so
#     plotting it bare labelled "1 component" as x = 0);
#   * the ratios are multiplied by 100 so the axis really shows percent.
cumulative_pct = np.cumsum(pca.explained_variance_ratio_) * 100
n_components = np.arange(1, len(cumulative_pct) + 1)

plt.figure()
plt.plot(n_components, cumulative_pct, marker='o')
plt.xticks(n_components)  # integer ticks only: component counts are whole numbers
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance (%)')
plt.title('Iris Dataset Explained Variance')
plt.show()

This plot suggests that by selecting 2 components we can preserve around 98.8% or 99% of the total variance of the data (confirm the exact figure against the `pca.explained_variance_ratio_` values printed further below). It makes sense not to keep 100% of the variance: that would require all of the components, and we want only the principal ones.

In [15]:
# Re-create the PCA, this time keeping only the first two components.
pca = PCA(n_components=2)

In [16]:
# Fit the two-component PCA and project the standardized data onto it.
decomposed_data = pca.fit_transform(X=dataset)

In [17]:
# Column labels for the two projected components: component#0, component#1.
columns = ['component#{:d}'.format(idx) for idx in (0, 1)]

In [18]:
# Tabular view of the PCA projection, one column per retained component.
decomposed_df = pd.DataFrame(data=decomposed_data, columns=columns)

In [19]:
# Preview the projected data.
decomposed_df.head(n=5)

In [20]:
# Scatter plot of the observations projected onto the two principal components.

# Placeholder colour values: one per plotted point, drawn from a seeded
# generator so the figure is identical on every run. (Previously 500 unseeded
# values were generated regardless of how many rows were plotted.) The colours
# carry no information about the data; they only exercise the colour scale.
point_colors = np.random.RandomState(0).randn(len(decomposed_df))

trace = go.Scatter(
    x=decomposed_df['component#0'],
    y=decomposed_df['component#1'],
    mode='markers',
    marker=dict(
        size=16,
        color=point_colors,
        colorscale='Viridis',  # one of the built-in Plotly colour scales
        showscale=True,
    ),
)
layout = {
    # Title font is set through `title.font`; the top-level `titlefont`
    # layout key is deprecated.
    "title": {"text": "Scatter plot", "font": {"size": 24}},
    "xaxis": {
        "showgrid": True,
        "zeroline": False,
        "showticklabels": False
    },
    "yaxis": {
        "showgrid": True,
        "zeroline": False,
        "showticklabels": False
    },
    "legend": {"font": {"size": 16}},
}
data = [trace]
fig = Figure(data=data, layout=layout)
fig.show()

In [21]:
# Share of the total variance captured by each of the retained components.
pca.explained_variance_ratio_

#### Using the elbow method to choose the number of clusters

In [22]:
# Elbow method: fit k-means for k = 1..39 and record the within-cluster sum of
# squares (`inertia_`) for each k.
# NOTE(review): this curve is computed on the standardized 4-feature frame,
# while the final model below is fitted on the 2-component PCA frame -
# confirm that choosing k from this curve is intended.
wcss = []      # inertia for each k, in the same order as `clusters`
clusters = []  # the k values tried
for k in range(1, 40):
    # A fixed random_state makes the curve reproducible between runs.
    kmeans = KMeans(n_clusters=k, init="k-means++", n_init=10,
                    max_iter=300, random_state=0)
    kmeans.fit(dataset)  # only the fitted inertia is needed, not the labels
    wcss.append(kmeans.inertia_)
    clusters.append(k)

#### using plotly method to plot elbow curve

In [23]:
# Elbow curve: within-cluster sum of squares against the number of clusters.
# The traces are passed as a plain list; the `Data` wrapper class from
# plotly.graph_objs is deprecated in favour of a list of traces.
data = [go.Scatter(x=clusters, y=wcss)]
layout = go.Layout(
    title="Computing WCSS for KMeans++",
    yaxis=dict(title='Sum of squared errors'),
    xaxis=dict(title='Number of clusters'),
    # Fully transparent backgrounds so the plot blends into the page.
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)'
)

fig = Figure(data=data, layout=layout)
fig.show()

As per the graph above, 5 clusters seems a good choice to proceed with.

In [24]:
# Final model with the chosen k = 5. A fixed random_state keeps the cluster
# numbering stable between runs, so observations such as "which cluster is
# largest" stay valid when the notebook is re-executed.
kmeans = KMeans(n_clusters=5, init="k-means++", n_init=10,
                max_iter=300, random_state=0)

In [25]:
# Display the estimator's configuration (not yet fitted when cells run top to bottom).
kmeans

In [26]:
# Fit on the PCA components only. Dropping `clusters` if it is present keeps
# this cell safe to re-run after the label column has been inserted into
# `decomposed_df` further down; otherwise the labels would become a feature.
kmeans.fit(decomposed_df.drop(columns='clusters', errors='ignore'))

In [27]:
# Cluster index assigned to each row the model was fitted on.
kmeans.labels_

#### Adding the cluster labels to the standardized dataframe

In [28]:
# Attach the cluster label of every row as the first column of `dataset`.
# The Series is built on the frame's own index so labels are matched to rows
# by position rather than by an implicit 0..n-1 index.
cluster_labels = pd.Series(kmeans.labels_, index=dataset.index).astype(int)
if 'clusters' in dataset.columns:
    # Cell re-run: overwrite, because DataFrame.insert raises on a duplicate column.
    dataset['clusters'] = cluster_labels
else:
    dataset.insert(loc=0, column='clusters', value=cluster_labels)

In [29]:
# Preview the first ten rows, now including the `clusters` column.
dataset.head(n=10)

#### Adding the clusters column to the decomposed dataframe for plotting the clusters

In [30]:
# Attach the cluster label of every row as the FIRST column of `decomposed_df`;
# later cells select the component columns with `decomposed_df.columns[1:]`,
# so the position matters. The Series is built on the frame's own index so
# labels are matched to rows by position.
cluster_labels = pd.Series(kmeans.labels_, index=decomposed_df.index).astype(int)
if 'clusters' in decomposed_df.columns:
    # Cell re-run: overwrite, because DataFrame.insert raises on a duplicate column.
    decomposed_df['clusters'] = cluster_labels
else:
    decomposed_df.insert(loc=0, column='clusters', value=cluster_labels)

In [31]:
# Number of observations assigned to each cluster, largest first.
dataset.loc[:, 'clusters'].value_counts()

* Cluster 1 appears to have the most data points (cluster numbering can change between runs unless the k-means `random_state` is fixed)

In [32]:
# Coordinates of the fitted cluster centres, one row per cluster.
kmeans.cluster_centers_

#### creating dataframe for centers

In [33]:
# Centroids as a DataFrame labelled with the component column names.
# `columns[1:]` skips the `clusters` column sitting at position 0.
centroid_df = pd.DataFrame(
    data=kmeans.cluster_centers_,
    columns=decomposed_df.columns[1:],
)

In [34]:
# Preview the centroid coordinates.
centroid_df.head(n=5)

In [35]:
# Preview the standardized features together with their cluster labels.
dataset.head(n=5)

In [36]:
# Show the column labels of the decomposed frame (display only).
decomposed_df.columns

In [37]:
# Plot every cluster's points together with its centroid in the 2-D PCA space.

# Sorted so the traces and legend entries appear as cluster0, cluster1, ...
# instead of in the order the labels first occur in the frame.
clusters = np.sort(decomposed_df['clusters'].unique())

# Component columns; relies on the `clusters` column being at position 0.
columns = decomposed_df.columns[1:]
x_col, y_col = columns[0], columns[1]

# Marker symbol and colour per cluster, indexed by cluster label
# (supports at most 10 clusters).
# NOTE(review): 'diamond-tall-down' may not be a valid Plotly marker symbol -
# confirm before running with 10 clusters.
symbol = ('circle', 'square', 'triangle-up', 'diamond', 'cross', 'x',
                  'triangle-down', 'asterisk', 'octagon', 'diamond-tall-down')
color = ('yellow','blue',  'magenta', 'green', 'teal', 'navy','peru',
                 'lightslategrey', 'red', 'olive')

plot_data = []

for c in clusters:
    # Filter the frame once per cluster and reuse it for both axes.
    members = decomposed_df.loc[decomposed_df['clusters'] == c]

    # Points belonging to cluster `c`.
    plot_data.append(go.Scatter(
        mode="markers",
        name="cluster" + str(c),
        x=members[x_col],
        y=members[y_col],
        marker={
            "line": {
                "color": "navy",
                "width": 0.5
            },
            "size": 12,
            "color": color[c],
            "symbol": symbol[c]
        },
    ))

    # Centroid of cluster `c`: row `c` of `centroid_df`, read with a single
    # .loc lookup instead of chained indexing.
    plot_data.append(go.Scatter(
        name="centroid" + str(c),
        x=[centroid_df.loc[c, x_col]],
        y=[centroid_df.loc[c, y_col]],
        marker={
            "color": "rgb(200,10,10)",
            "symbol": symbol[c]
        },
    ))

# A plain list of traces replaces the deprecated `Data` wrapper.
data = plot_data
layout = {
    # Title font is set through `title.font`; the top-level `titlefont`
    # layout key is deprecated.
    "title": {
        "text": "K-Means Clustering (k=%s)" % len(clusters),
        "font": {"size": 24}
    },
    "xaxis": {
        "showgrid": True,
        "zeroline": False,
        "showticklabels": False
    },
    "yaxis": {
        "showgrid": True,
        "zeroline": False,
        "showticklabels": False
    },
    "legend": {"font": {"size": 16}},
}
fig = Figure(data=data, layout=layout)
fig.show()