# Welcome
Welcome to clustering GPS data using visualization techniques.
Robert de Munter, Ahmet Taspinar and Rob Romijnders collaborated to make this repo.

## Goal

  * We aim to visualize and understand messy data such as GPS trajectories. The data consist of 18000 trajectories in GPS coordinates, and we need a way to gain insight into them.
  * As a second goal, we aim to understand dimensionality reduction. Techniques such as PCA and tSNE allow for 2D visualizations of high dimensional data.
  
## Target
We assume little knowledge of ML or Python in this notebook. Coding challenges require at most two lines of code. Theoretical questions require insight, not mathematics. If you already know PCA/tSNE, you may be better off joining another group. If data excites you and you want to learn visualization techniques, then this is the ideal notebook for you!
  
## Approach

  * __First__ take time to read the different functions. Run the standard implementations and discuss with groupmates what you see/expect.
  * __Second__ we have some assignments to challenge your understanding. 

## Assignments

  * Discuss with your group the differences between PCA and tSNE: 
    * What do they do? 
    * When to choose one over the other?
  * Look at the PCA plots: 
    * What insight do you gain on the data?
  * Consider the code for PCA. Explain to your groupmates how it works: 
    * What does the abbreviation _SVD_ mean? 
    * Why are we using SVD? (bonus question: and why truncated SVD?)
  * Look at the tSNE plots: 
    * What insight do you gain on the data?
  * Consider the code for tSNE. Explain to your groupmates how it works: 
    * What does the _perplexity_ parameter do?
    * Bonus question: why do we define the _gradient norm_ and _number of iterations_?
  * Coding challenge
    * Pick either tSNE or PCA and improve the plot: What story does the data tell you? How can you clarify the story?
    * Tune the hyperparameters of the techniques. For PCA, play with plotting different _principal components_. For tSNE play with the _perplexity_ and _angle_.

[data here](https://www.dropbox.com/s/xm5vtriolvmnw4h/03_cluster_data.tar.gz?dl=0) or [here](https://www.dropbox.com/s/p1gdt070a6hl67u/03_cluster_data.zip?dl=0)

### Import libraries and define some globally useful parameters

In [7]:
import sys
sys.path.append('../cluster')
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from scipy.stats import zscore
from util_cluster import some_cluster
import glob
import pandas as pd
from collections import OrderedDict

# Folder that munge() scans for ``*_metadata.csv`` files.
INPUT_FOLDER = '../processed_data/'
# Folder the plotting cells save their figures to.
IM_FOLDER = '../cluster/im/'

# One-hot vector per transport mode. Insertion order matters: it fixes both
# the column order of the label matrix and the colour assigned to each mode.
HOT = OrderedDict()
HOT['walk'] = np.array([1, 0, 0, 0, 0, 0])
HOT['train'] = np.array([0, 1, 0, 0, 0, 0])
HOT['subway'] = np.array([0, 0, 1, 0, 0, 0])
HOT['taxi'] = np.array([0, 0, 0, 1, 0, 0])
HOT['bus'] = np.array([0, 0, 0, 0, 1, 0])
HOT['bike'] = np.array([0, 0, 0, 0, 0, 1])
# 'UNK' is the all-zero vector, so it never contributes to a label row.
HOT['UNK'] = np.array([0, 0, 0, 0, 0, 0])


colors = ['r', 'g', 'b', 'y', 'm', 'c', 'k']
# Materialise the pairs: a bare zip() is a one-shot iterator in Python 3 and
# would be empty for anything that reads it after the title is built.
lbl_clr = list(zip(HOT.keys(), colors))
# Legend string such as "walk=r,train=g,..."; it is reused as the plot title,
# so it is built without a trailing comma.
title = ','.join(lbl + '=' + clr for lbl, clr in lbl_clr)
print(title)

ImportError: No module named 'util_cluster'

### Functions to munge the data


In [8]:
def munge(input_folder=None, hot=None):
    """Load all trajectory metadata files and build the feature/label arrays.

    Every ``*_metadata.csv`` file in the input folder is read, the frames are
    concatenated, and rows missing any of the eight velocity/acceleration
    statistics are dropped.

    Args:
        input_folder: Folder prefix to search. It is concatenated directly
            with the file pattern, so it must end with a path separator.
            Defaults to the module-level ``INPUT_FOLDER``.
        hot: Ordered mapping from label name to its one-hot vector.
            Defaults to the module-level ``HOT``.

    Returns:
        Tuple ``(X, Y)``. ``X`` holds the eight feature columns, one row per
        trajectory. ``Y`` is an int16 array with one row per trajectory and
        one column per one-hot position; labels are not mutually exclusive,
        so a row may contain several ones, or none when the label is missing.

    Raises:
        ValueError: If no metadata file is found in the input folder.
    """
    if input_folder is None:
        input_folder = INPUT_FOLDER
    if hot is None:
        hot = HOT

    features = ['v_ave', 'v_med', 'v_max', 'v_std',
                'a_ave', 'a_med', 'a_max', 'a_std']

    # Sort the paths so the row order does not depend on directory listing order.
    paths = sorted(glob.glob(input_folder + "*_metadata.csv"))
    if not paths:
        raise ValueError('No *_metadata.csv files found in %r' % input_folder)

    frames = [pd.read_csv(path, index_col=0) for path in paths]
    df_metadata = pd.concat(frames).dropna(subset=features)

    # DataFrame.as_matrix() no longer exists in current pandas; .values is
    # available in both old and new releases.
    X = df_metadata[features].values
    y = df_metadata['labels'].values

    N = X.shape[0]                         # number of rows (trajectories)
    D = len(next(iter(hot.values())))      # number of columns (one-hot width)

    # Rewrite the label strings to hot-vectors.
    # Note that labels are not mutually exclusive!
    Y = np.zeros((N, D), dtype=np.int16)
    for iy, lbl in enumerate(y):
        # pandas represents a missing label as a float NaN; only real strings
        # can name a transport mode.
        if not isinstance(lbl, str):
            continue
        for key, value in hot.items():
            # Substring match: one label string may list several modes.
            if key in lbl:
                Y[iy] += value
    return X, Y

### Functions to preprocess data


In [9]:
def remove_outliers(X, y, max_std=2.0):
    """Drop NaN rows and outlier rows, then standardise the remaining features.

    A row is discarded when its sum is NaN or when any of its features lies
    more than ``max_std`` standard deviations from that feature's mean. The
    surviving rows are z-scored column-wise.

    Args:
        X: 2-D array of features, one row per sample.
        y: Array of labels aligned with the rows of ``X``.
        max_std: Largest allowed absolute deviation, in standard deviations.

    Returns:
        Tuple ``(X, y)`` with the kept, z-scored rows of ``X`` and the
        matching entries of ``y``.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)

    # Use NaN-aware statistics: with plain mean/std a single NaN entry makes
    # the whole column's statistics NaN, and then no outlier is ever detected.
    mean = np.nanmean(X, axis=0)
    std = np.nanstd(X, axis=0)

    # A zero-variance column yields inf/NaN deviations here; NaN comparisons
    # are False, so such a column never marks a row as an outlier.
    with np.errstate(divide='ignore', invalid='ignore'):
        is_outlier = (np.abs((X - mean) / std) > max_std).any(axis=1)
        has_nan = np.isnan(X.sum(axis=1))

    keep = ~(has_nan | is_outlier)
    X = X[keep]
    y = y[keep]
    X = zscore(X)
    return X, y

### The workhorses: PCA and tSNE

In [10]:
def PCA(X,y):
    """Reduce X to two components with a truncated SVD.

    The labels ``y`` are returned unchanged so the call mirrors tSNE().
    """
    svd = TruncatedSVD(n_components=2)
    return svd.fit_transform(X), y

def tSNE(X, y, perp=30, max_samples=10000):
    """Embed a random subsample of X in 2-D with t-SNE.

    The data is subsampled first to prevent memory errors.

    Args:
        X: 2-D array of features, one row per sample.
        y: Array of labels aligned with the rows of ``X``.
        perp: The perplexity for the t-SNE algorithm.
        max_samples: Upper bound on the number of rows passed to t-SNE.

    Returns:
        Tuple ``(reduced_tsne, y)`` with the embedding and the labels of the
        rows that were sampled.
    """
    N = X.shape[0]
    # Sample without replacement and never more rows than exist; sampling
    # with replacement would feed duplicate points to t-SNE and would
    # upsample instead of subsample when N < max_samples.
    ind = np.random.choice(N, min(N, max_samples), replace=False)
    X = X[ind]
    y = y[ind]

    # NOTE(review): recent scikit-learn releases appear to rename ``n_iter``
    # to ``max_iter`` -- confirm against the installed version.
    tSNE_model = TSNE(verbose=2, perplexity=perp, min_grad_norm=1E-07,
                      n_iter=300, angle=0.6)
    reduced_tsne = tSNE_model.fit_transform(X)
    return reduced_tsne, y

### Our cockpit
Here we load and preprocess the data.
Next, we run the algorithms and plot the results.

In [11]:
# Load the metadata, drop NaN/outlier rows and standardise the features.
print('----Munge the data----')
X, y = munge()
X, y = remove_outliers(X, y)
N, D = y.shape  # N samples, D label columns

# PCA: one scatter plot of the 2-D projection.
print('----Start PCA----')
X_red, y_red = PCA(X, y)
plt.figure()
# presumably some_cluster maps the label rows to per-point colours and marker
# sizes -- it lives in util_cluster; verify there.
y_color, y_s = some_cluster(y_red, colors)
plt.scatter(X_red[:, 0], X_red[:, 1], c=y_color, s=y_s, marker='o', linewidths=0)
plt.title(title)
plt.savefig(IM_FOLDER+'pca.png')
plt.show(block=True)
# If the plot doesnt show after above line, then change the MATPLOTLIB settings for your interpreter

# One panel per label column, coloured by that column's 0/1 value.
# plt.subplots creates its own figure, so no separate plt.figure() call is
# made here (it would only leave an empty figure behind). squeeze=False keeps
# the axes indexable even when D == 1.
f4, ax4 = plt.subplots(1, D, squeeze=False)
ax4 = ax4[0]
for d in range(D):
    ax4[d].scatter(X_red[:, 0], X_red[:, 1], c=y_red[:, d], marker='*', linewidths=0)
    ax4[d].set_title('Mode%3i'%d)
plt.setp([a.get_xticklabels() for a in ax4], visible=False)
plt.setp([a.get_yticklabels() for a in ax4], visible=False)
plt.savefig(IM_FOLDER+'pca_table.png')

----Munge the data----


NameError: name 'glob' is not defined

In [6]:
# tSNE: repeat the embedding and the plots for several perplexities.
print('----Start tSNE----')
for per in [300,400,500]:
    print('-'*10+'tSNE on perplexity %i'%per + '-'*10)
    X_red, y_red = tSNE(X, y, per)
    # presumably some_cluster maps the label rows to per-point colours and
    # marker sizes -- it lives in util_cluster; verify there.
    y_color, y_s = some_cluster(y_red, colors)

    # Colour version.
    plt.figure()
    plt.scatter(X_red[:, 0], X_red[:, 1], c=y_color, s=y_s, marker='o',
                linewidths=0)
    plt.title(title)
    plt.savefig(IM_FOLDER+'tSNE%i.png' % per)

    # Black-white version on its own figure, so the black markers are not
    # painted on top of the coloured ones.
    plt.figure()
    plt.scatter(X_red[:, 0], X_red[:, 1], c='k', s=y_s, marker='o',
                linewidths=0)
    plt.title(title)
    plt.savefig(IM_FOLDER+'tSNE_bw%i.png'%per)

    # One panel per label column. plt.subplots creates its own figure, so no
    # separate plt.figure() call is made; squeeze=False keeps the axes
    # indexable even when D == 1.
    f3, ax3 = plt.subplots(1, D, squeeze=False)
    ax3 = ax3[0]
    for d in range(D):
        ax3[d].scatter(X_red[:, 0], X_red[:, 1], c=y_red[:, d], marker='*', linewidths=0)
        ax3[d].set_title('Mode%3i'%d)
    plt.setp([a.get_xticklabels() for a in ax3], visible=False)
    plt.setp([a.get_yticklabels() for a in ax3], visible=False)
    # Include the perplexity in the file name; a fixed name would be
    # overwritten on every iteration of the loop.
    plt.savefig(IM_FOLDER+'tsne_table%i.png' % per)

----Start tSNE----
----------tSNE on perplexity 300----------


NameError: name 'X' is not defined