In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Unsupervised learning finds patterns in data by clustering or compressing data (dimension reduction) without a specific prediction task in mind**

# ***k-means clustering***

- finds clusters of samples
- number of clusters must be specified

In [None]:
# Import the iris loader and load the dataset into `data`.

from sklearn.datasets import load_iris

data = load_iris()

# Tip: uncomment the next line to print the full dataset description.
# print(data.DESCR)

# List the fields available on the loaded dataset object.
print(data.keys())

In [None]:
# Display the target label names stored on the dataset object.
data.target_names

In [None]:
# Summarise the feature matrix, then the target vector. Each group is followed
# by the same blank lines the separate print("\n") calls used to emit.
print(
    f"feature data shape: {data.data.shape}",
    f"feature column headings: {data.feature_names}",
    "\n",
    sep="\n",
)
print(
    f"target data shape: {data.target.shape}",
    f"target data labels: {data.target_names}",
    "\n",
    sep="\n",
)


In [None]:
# Convert the feature array to a DataFrame. The column labels come straight
# from the dataset (data.feature_names) instead of a hand-typed copy, so they
# cannot drift from the data they describe.

df = pd.DataFrame(data.data, columns=data.feature_names)

In [None]:
# Of the 150 rows, keep the first 140 for fitting and hold the last 10 back
# as "new" samples to be assigned to clusters later.

n_train = 140
samples, new_samples = df.iloc[:n_train], df.iloc[n_train:]

In [None]:
from sklearn.cluster import KMeans

# k-means starts from randomly chosen centroids, so the seed (and the number
# of restarts) is pinned to give the same cluster labels on every run.
model = KMeans(n_clusters=3, n_init=10, random_state=42)
model.fit(samples)

In [None]:
# Assign each fitted sample to a cluster and show the resulting label array.
labels = model.predict(samples)
print(labels)

# ***cluster labels for new samples***

- new samples can be assigned to existing clusters
- k-means remembers the mean of each cluster (the "centroids")
- finds nearest centroid to each new sample


In [None]:
# Assign the held-out samples to the clusters of the already-fitted model.
new_labels = model.predict(new_samples)
print(new_labels)

# ***Scatter plots***
- scatter plot of sepal length vs petal length, color indicates cluster labels



In [None]:
import matplotlib.pyplot as plt

# Two of the four measurements are enough for a 2-D view of the clusters.
xs = samples['sepal length (cm)']
ys = samples['petal length (cm)']

# Colour each point by its cluster label, and label the axes from the column
# names so the figure can be read on its own.
plt.scatter(xs, ys, c=labels)
plt.xlabel(xs.name)
plt.ylabel(ys.name)
plt.title('k-means cluster labels')
plt.show()

# ***Measuring clustering quality***
- cross tabulation : only helpful if data labelled
- if data unlabelled then use inertia
- inertia measures how spread out clusters are (lower = better)
- computed from the distance of each sample to the centroid of its cluster (the sum of the squared distances)
- after fit(), available as attribute inertia_
- k-means attempts to minimise the inertia when choosing clusters

In [None]:
# Fit a 3-cluster model and report its inertia. The seed and restart count are
# pinned so the printed value is the same on every run.
model = KMeans(n_clusters=3, n_init=10, random_state=42)
model.fit(samples)
print(model.inertia_)

**The number of clusters**

- more clusters = less inertia, but too many clusters is not helpful
- how to choose? Choose elbow in inertia plot where inertia begins to decrease slowly
- eg: for iris dataset, 3 clusters is a good choice (see plot below)

In [None]:
# inertia plot of iris dataset with different numbers of clusters

def plot_clusters_vs_inertia(clusters, data=None, random_state=42):
    """
    Plot k-means inertia (y-axis) against the number of clusters (x-axis).

    Parameters
    ----------
    clusters : int
        Largest number of clusters to try; one model is fitted for each
        k in 1, 2, ..., clusters.
    data : array-like, optional
        Samples to cluster. When omitted, the notebook-level variable
        'samples' is used, so it must be assigned before the call.
    random_state : int, optional
        Seed passed to every KMeans model so the curve is reproducible.

    Raises
    ------
    ValueError
        If clusters is smaller than 1 (there would be nothing to plot).
    """
    if clusters < 1:
        raise ValueError("clusters must be at least 1")

    # Fall back to the global at call time, which keeps the original
    # behaviour for callers that pass only `clusters`.
    if data is None:
        data = samples

    num_clusters = range(1, clusters + 1)
    inertias = []

    for num in num_clusters:
        model = KMeans(n_clusters=num, n_init=10, random_state=random_state)
        model.fit(data)
        inertias.append(model.inertia_)

    # plot num_clusters vs inertia
    plt.plot(num_clusters, inertias, '-o')
    plt.xlabel('number of clusters')
    plt.ylabel('inertia')
    plt.xticks(num_clusters)  # only shows whole numbers on x-axis
    plt.show()  # the original referenced plt.show without calling it
    

**Exercise: evaluating grain clustering**

- seeds dataset from the UCI Machine Learning Repository

In [None]:
# Read the seeds (grains) dataset from the Kaggle input directory.
grains_path = '../input/seeds-grains-data-set/grains_2.csv'
grains_df = pd.read_csv(grains_path)

In [None]:
# Preview the first rows of the grains table.
grains_df.head()

In [None]:
# Separate the "Class" column (the varieties, kept for cross-tabulation)
# from the remaining columns, which become the samples to cluster.

varieties = grains_df['Class']
samples = grains_df.drop(columns='Class')

In [None]:
# Count how many samples belong to each variety.
varieties.value_counts()

In [None]:
# Create a KMeans model with 3 clusters: model
# (seed and restart count are pinned so the crosstab is the same on every run)
model = KMeans(n_clusters=3, n_init=10, random_state=42)

# Use fit_predict to fit model and obtain cluster labels: labels
# (using .fit_predict() is the same as using .fit() followed by .predict())
labels = model.fit_predict(samples)

# Create a DataFrame with labels and varieties as columns: ct_df
# (named ct_df, as in the later crosstab cells, so that `df` is not reused
# for a different kind of table)
ct_df = pd.DataFrame({'labels': labels, 'varieties': varieties})

# Create crosstab: ct
ct = pd.crosstab(ct_df['labels'], ct_df['varieties'])

# Display ct
print(ct)


In [None]:
# Inertia plot for 1 to 10 clusters; the helper reads the current `samples`.
plot_clusters_vs_inertia(10)

# ***Transforming features for better clusterings***


In [None]:
# Piedmont wines dataset:
# - 178 samples from 3 distinct varieties of red wine: Barolo, Grignolino and Barbera
# - features measure chemical composition (e.g. alcohol content) and visual
#   properties such as "color intensity"
# Source: https://archive.ics.uci.edu/ml/datasets/Wine1

wines_path = '../input/piedmont-wines-dataset/piedmont_wines.csv'
df = pd.read_csv(wines_path)

In [None]:
# Preview the first rows of the wines table.
df.head()

In [None]:
# Keep the variety names for cross-tabulation; every column except the two
# class columns becomes the samples to cluster.
label_columns = ['class_label', 'class_name']
varieties = df['class_name']
samples = df.drop(columns=label_columns)

In [None]:
# cluster wines (seed and restart count pinned so the labels, and the
# crosstab built from them, are the same on every run)
model = KMeans(n_clusters=3, n_init=10, random_state=42)
labels = model.fit_predict(samples)

In [None]:
# Pair every cluster label with its wine variety, count the combinations,
# and print the table under its heading.
ct_df = pd.DataFrame({
    'labels': labels,
    'varieties': varieties,
})
ct = pd.crosstab(index=ct_df['labels'], columns=ct_df['varieties'])
print('Without feature standardization:\n', ct, sep='\n')

- as can be seen by above cross tabulation output, the sample data does not cluster well into distinct varieties (this is due to features having very different variances)
- in KMeans : feature variance = feature influence

In [None]:
# Calculate the variance of every column (the variance of a feature measures
# the spread of its values).
samples.var()

**Standard Scaler**
- StandardScaler transforms each feature to have mean 0 and variance 1

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Set up the pipeline: standardize every feature, then cluster.
# The KMeans seed and restart count are pinned so the labels are reproducible.
steps = [('scaler', StandardScaler()),
         ('kmeans', KMeans(n_clusters=3, n_init=10, random_state=42))]

pipeline = Pipeline(steps)

# Fit scaler and k-means on the samples, then assign each sample to a cluster.
pipeline.fit(samples)
labels_scaled = pipeline.predict(samples)

In [None]:
# Cross-tabulate the labels from the scaled pipeline against the varieties
# and print the table under its heading.
ct_df = pd.DataFrame({
    'labels': labels_scaled,
    'varieties': varieties,
})
ct = pd.crosstab(index=ct_df['labels'], columns=ct_df['varieties'])
print('With feature standardization:\n', ct, sep='\n')

**Exercise 1: scaling and clustering fish data**

In [None]:
# Fish measurements dataset
# Source: http://jse.amstat.org/datasets/fishcatch.txt

fish_path = '../input/fish-measurements-dataset/fish_measurements.csv'
df = pd.read_csv(fish_path)

In [None]:
# Preview the first rows of the fish table.
df.head()

In [None]:
# Keep the species column for cross-tabulation; the remaining columns are the
# samples to cluster.
species = df['species']
samples = df.drop(columns='species')

In [None]:
# clustering and cross tabulation without standardization
# (seed and restart count pinned so the printed table is reproducible)
model = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = model.fit_predict(samples)

# Pair each cluster label with its species and count the combinations.
ct_df = pd.DataFrame({'labels': labels, 'species': species})
ct = pd.crosstab(ct_df['labels'], ct_df['species'])
print('Without feature standardization:\n')
print(ct)

In [None]:
# scaling and clustering pipeline

# Set up the pipeline: standardize every feature, then cluster.
# The KMeans seed and restart count are pinned so the labels are reproducible.
steps = [('scaler', StandardScaler()),
         ('kmeans', KMeans(n_clusters=4, n_init=10, random_state=42))]

pipeline = Pipeline(steps)

# Fit scaler and k-means on the samples, then assign each sample to a cluster.
pipeline.fit(samples)
labels = pipeline.predict(samples)

In [None]:
# Cross-tabulate the pipeline's cluster labels against the species and print
# the table under its heading.

ct_df = pd.DataFrame({
    'labels': labels,
    'species': species,
})
ct = pd.crosstab(index=ct_df['labels'], columns=ct_df['species'])
print('With feature standardization:\n', ct, sep='\n')

**Exercise 2: clustering stocks using KMeans**

In [None]:
# Stock movements dataset: daily price movements from 2010 to 2015 (obtained
# from Yahoo! Finance). Each row corresponds to a company and each column to a
# trading day.

stocks_path = '../input/stock-movement-dataset/stock_movement.csv'
df = pd.read_csv(stocks_path)

In [None]:
# Preview the first rows of the stock movements table.
df.head()

In [None]:
# Company names are kept aside as labels; all other columns are the samples.
company = df['Company']
samples = df.drop(columns='Company')

- Some stocks are more expensive than others. 
- The Normalizer will separately transform each company's stock price to a relative scale before the clustering begins.
- Note that Normalizer() is different to StandardScaler(). While StandardScaler() standardizes features by removing the mean and scaling to unit variance, Normalizer() rescales each sample - here, each company's stock price movements - independently of the others.

In [None]:
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

# Set up the pipeline: rescale each row (company) independently, then cluster.
# The KMeans seed and restart count are pinned so the groupings are reproducible.
steps = [('scaler', Normalizer()),
         ('kmeans', KMeans(n_clusters=10, n_init=10, random_state=42))]

pipeline = Pipeline(steps)

# Fit on the samples, then assign each company to a cluster.
pipeline.fit(samples)
labels = pipeline.predict(samples)

In [None]:
# List every company next to its cluster label, ordered by label so that
# companies in the same cluster appear together.
ct_df = pd.DataFrame({'labels': labels, 'company': company})
by_cluster = ct_df.sort_values(by='labels')
print(by_cluster)