In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# ***Hierarchical clustering***

- SciPy's ***linkage()*** function performs hierarchical clustering on an array of samples.
- Use the ***linkage()*** function to obtain a hierarchical clustering
- Use ***dendrogram()*** to visualize the result.

**Hierarchical clustering of grain data**

In [None]:
# Read the grains CSV from the Kaggle input directory and preview the first rows.
grains_csv_path = '../input/seeds-grains-data-set/grains_2.csv'
grains_df = pd.read_csv(grains_csv_path)
grains_df.head()

In [None]:
# Separate the "Class" column (the grain variety) from the remaining columns,
# so cluster labels can later be cross-tabulated against the varieties.
varieties = grains_df['Class']
samples = grains_df.drop(columns='Class')

In [None]:
# Convert the pandas Series of varieties to a plain Python list; the list is
# what gets passed to dendrogram() as its leaf labels.
var_list = varieties.to_list()

In [None]:
# Perform the necessary imports

from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Build the complete-linkage merge hierarchy of the grain samples: mergings
mergings = linkage(samples, 'complete')

# Draw the hierarchy on a wide figure with one leaf per sample, labelled by variety.
# `labels` receives the plain list (var_list) rather than the pandas Series.
plt.figure(figsize=(24, 10))
dendrogram(
    mergings,
    labels=var_list,
    leaf_font_size=6,
    leaf_rotation=90,
)
plt.show()

**Hierarchical clustering of stocks**

In [None]:
# Read the stock-movement CSV from the Kaggle input directory and preview the first rows.
stocks_csv_path = '../input/stock-movement-dataset/stock_movement.csv'
stocks_df = pd.read_csv(stocks_csv_path)
stocks_df.head()

In [None]:
# The 'Company' column supplies the labels (kept both as a Series and as a
# plain list); every other column is kept as the sample data to cluster.
companies = stocks_df['Company']
companies_list = companies.to_list()
movements = stocks_df.drop(columns='Company')

In [None]:
from sklearn.preprocessing import normalize # hierarchical clustering doesn't fit into a sklearn pipeline, so you'll need to use normalize()

# Pass the movements through normalize() before clustering: normalized_movements
normalized_movements = normalize(movements)

# Complete-linkage merge hierarchy of the normalized movements: mergings
mergings = linkage(normalized_movements, 'complete')

# Dendrogram with one leaf per company, labelled by company name
plt.figure(figsize=(24, 10))
dendrogram(
    mergings,
    labels=companies_list,
    leaf_font_size=10,
    leaf_rotation=90,
)
plt.show()


**Different linkage, different hierarchical clustering!**

- linkage method defines how the distance between clusters is measured
- In ***complete*** linkage, the distance between clusters is the distance between the ***furthest*** points of the clusters
- In ***single*** linkage, the distance between clusters is the distance between the ***closest*** points of the clusters

In [None]:
# Cluster the same normalized movements again, this time with single linkage,
# to compare against the complete-linkage dendrogram.
mergings = linkage(normalized_movements, 'single')

# Dendrogram with one leaf per company, labelled by company name
plt.figure(figsize=(24, 10))
dendrogram(
    mergings,
    labels=companies_list,
    leaf_font_size=10,
    leaf_rotation=90,
)
plt.show()

**Extracting the cluster labels**
- use ***fcluster()*** function to extract the cluster labels for intermediate clustering
- returns np.array of cluster labels

In [None]:
# using the fcluster() function to extract the cluster labels for intermediate clustering, and compare the labels with the grain varieties using a cross-tabulation.
from scipy.cluster.hierarchy import fcluster

# Recompute the complete-linkage hierarchy of the grain samples: mergings
# (the name `mergings` was last assigned from the stock data in an earlier cell).
mergings = linkage(samples, 'complete')

# Cut the hierarchy at a distance threshold of 6 to get flat cluster labels: labels
labels = fcluster(mergings, t=6, criterion='distance')

# Put each sample's cluster label next to its grain variety: df
df = pd.DataFrame({'labels': labels, 'varieties': varieties})

# Count how often each cluster label occurs together with each variety: ct
ct = pd.crosstab(df['labels'], df['varieties'])

print(ct)

In [None]:
# Redraw the grain dendrogram so it can be compared with the cross-tabulation above.
plt.figure(figsize=(24, 10))
dendrogram(
    mergings,
    labels=var_list,  # the plain list, not the pandas Series
    leaf_font_size=6,
    leaf_rotation=90,
)
plt.show()

# ***t-SNE for 2-dimensional maps***
- t-SNE = "t-distributed stochastic neighbor embedding" 
- Maps samples to 2D space (or 3D)
- Map approximately preserves nearness of samples
- Great for inspecting datasets
- Can't extend the map to include new data samples (Must start over each time!)
- Learning rate: try values between 50 and 200 (with a wrong choice, the points bunch together)
- t-SNE features are different every time




In [None]:
# Display the "Class" column (the grain varieties) before it is encoded as numbers below.
varieties


In [None]:
# Encode each grain sample's variety as a number, giving one entry per sample:
# 'Kama' -> 0, 'Rosa' -> 1, any other variety -> 2.
variety_codes = {'Kama': 0, 'Rosa': 1}
variety_numbers = [variety_codes.get(name, 2) for name in varieties]
    

In [None]:
#  apply t-SNE to the grain samples data and inspect the resulting t-SNE features using a scatter plot.

# Import TSNE
from sklearn.manifold import TSNE

# Create a TSNE instance: model
# random_state is pinned so that re-running the notebook redraws the same map;
# left unseeded, t-SNE produces different features on every run.
model = TSNE(learning_rate=200, random_state=42)

# Apply fit_transform to samples: tsne_features
tsne_features = model.fit_transform(samples)

# Select the 0th feature (used as the x coordinate): xs
xs = tsne_features[:, 0]

# Select the 1st feature (used as the y coordinate): ys
ys = tsne_features[:, 1]

# Scatter plot, coloring each point by its entry in variety_numbers
plt.figure(figsize=(10, 6))
plt.scatter(xs, ys, c=variety_numbers)
plt.show()

**A t-SNE map of the stock market**
- t-SNE provides great visualizations when the individual samples can be labeled.

In [None]:
# Apply t-SNE to the company stock price data. A scatter plot of the resulting t-SNE features, labeled by the company names, gives you a map of the stock market

# Import TSNE
from sklearn.manifold import TSNE

# Create a TSNE instance: model
# random_state is pinned so that re-running the notebook redraws the same map;
# left unseeded, t-SNE produces different features on every run.
model = TSNE(learning_rate=50, random_state=42)

# Apply fit_transform to normalized_movements: tsne_features
# (the original line assigned the name twice: `tsne_features = tsne_features = ...`)
tsne_features = model.fit_transform(normalized_movements)

# Select the 0th feature (used as the x coordinate): xs
xs = tsne_features[:, 0]

# Select the 1st feature (used as the y coordinate): ys
ys = tsne_features[:, 1]

# Scatter plot; points are semi-transparent so the annotations stay readable
plt.figure(figsize=(15, 10))
plt.scatter(xs, ys, alpha=0.5)

# Annotate each point with its company name
for x, y, company in zip(xs, ys, companies):
    plt.annotate(company, (x, y), fontsize=5, alpha=0.75)
plt.show()
