# Creating Customer Segments

**Project Status: In Progress (Updated: 3/14/2016)**

This project analyzes a dataset of annual spending amounts to uncover its internal structure and to understand the variation in the different types of customers that a wholesale distributor interacts with.

In [None]:
# Import libraries: NumPy, pandas, matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

#%matplotlib inline

In [None]:
# Read the dataset into the notebook-level `data` frame used by every later cell.
data = pd.read_csv("wholesale-customers.csv")
# Single-argument, parenthesized print behaves the same on Python 2 and 3.
print("Dataset has {} rows, {} columns".format(*data.shape))

In [None]:
# Preview the first rows only; displaying the whole frame floods the output.
data.head()

In [None]:
# Per-column summary statistics (pandas `describe`) as a quick sanity check on the data.
data.describe()

In [None]:
# Pairwise correlation between the columns (pandas `corr`, default method).
data.corr()

### PCA

In [None]:
# Apply PCA with the same number of dimensions as variables in the dataset
def doPCA(n_components=None):
    """Fit a PCA model to the notebook-level ``data`` frame.

    Args:
        n_components: Number of principal components to keep. When None
            (the default), one component per column of ``data`` is used, so
            the decomposition has as many dimensions as the dataset has
            variables.

    Returns:
        The fitted ``sklearn.decomposition.PCA`` estimator.
    """
    if n_components is None:
        # Derive the count from the data instead of hard-coding it, so the
        # cell stays correct whatever the number of feature columns is.
        n_components = data.shape[1]
    pca = PCA(n_components=n_components)
    pca.fit(data)
    return pca

# Print the components and the amount of variance in the data contained in each dimension
pca = doPCA()
# Labels '1', '2', ... follow the number of fitted components rather than a fixed list.
pc_labels = [str(i) for i in range(1, len(pca.components_) + 1)]
df = pd.DataFrame(pca.components_, columns=data.columns, index=pc_labels)
df.index.names = ['PC']
print("Principal Component Analysis:")
print(df)

ex_var = pca.explained_variance_ratio_
df_var = pd.Series(ex_var, index=pc_labels)
df_var.index.names = ['PC']
print("\nExplained Variance of Each Component:")
print(df_var)

# Cumulative explained variance: how much variance the first k components retain together.
pl.figure(figsize=(8, 5))
x = np.arange(1, len(ex_var) + 1)
y = np.cumsum(ex_var)
pl.plot(x, y, marker="o", mfc='#780000', color='#CC0000')
pl.xlabel("No. of PC Components", fontsize=14)
pl.ylabel("Cumulative Explained Variance Ratio", fontsize=14)
# Title now matches the y-axis: this curve is the cumulative ratio, not the per-component one.
pl.title("No. of PC Components vs Cumulative Explained Variance Ratio", fontsize=16)
pl.show()

# Per-component explained variance.
pl.figure(figsize=(8, 5))
pl.plot(x, ex_var, marker="o", mfc='#9900CC', color='#990099')
pl.xlabel("No. of PC Components", fontsize=14)
pl.ylabel("Explained Variance Ratio", fontsize=14)
pl.title("No. of PC Components vs Explained Variance Ratio", fontsize=16)
pl.show()

In [None]:
# Loadings of the two leading principal components found by the PCA cell.
first_pc = pca.components_[0]
second_pc = pca.components_[1]
print("First PC: {}".format(first_pc))
print("Second PC: {}".format(second_pc))

# Raw 'Fresh' vs 'Milk' spending, for comparison with the projected view below.
data.plot.scatter(x='Fresh', y='Milk', xlim=[-1000, 100000], ylim=[-1000, 100000], color="b", figsize=(8, 5))
pl.title("Scatter Plot of Features", fontsize=16)

transformed_data = pca.transform(data)
pl.figure(figsize=(8, 5))
# Plot every sample. The previous loop zipped the scores with `data`, and
# iterating a DataFrame yields its column labels, so the loop stopped after
# as many samples as there are columns.
pc1_scores = transformed_data[:, 0]
pc2_scores = transformed_data[:, 1]
# Each sample's score along a component, scaled by that component's first
# two loadings, gives its position along the component in the plane of the
# first two features.
pl.scatter(first_pc[0] * pc1_scores, first_pc[1] * pc1_scores, color="r")
pl.scatter(second_pc[0] * pc2_scores, second_pc[1] * pc2_scores, color="g")
pl.title("Principal Component Analysis", fontsize=16)

pl.show()

### ICA

In [None]:
# Fit an ICA model to the data
# Note: the data is adjusted to have its center at the origin first!
from sklearn.decomposition import FastICA

# Subtract each column's mean so every feature is centered at zero.
centered_data = data - data.mean()

# Fixed random_state makes the (randomly initialized) unmixing matrix
# reproducible across Restart & Run All.
ica = FastICA(random_state=0)
ica.fit(centered_data)

# Print the independent components
print(ica.components_)

## Clustering

Choose either K-Means clustering or Gaussian Mixture Model (GMM) clustering, which implements expectation-maximization. Then 
sample elements from the clusters to understand their significance.

[source](http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html): from the sklearn documentation.

In [None]:
# Import clustering modules
from sklearn.cluster import KMeans
from sklearn.mixture import GMM

In [None]:
# First we reduce the data to two dimensions using PCA to capture variation.
# fit_transform returns a NumPy array of shape (n_samples, 2), which is what
# the `reduced_data[:, 0]` / `reduced_data[:, 1]` indexing in the visualizer needs.
reduced_data = PCA(n_components=2).fit_transform(data)
print(reduced_data[:10])  # print up to 10 elements

In [None]:
# Fit the clustering algorithm to the reduced data for visualization.
# The visualizer below assumes the clustering object is named 'clusters'.

# NOTE(review): the number of clusters is an analyst choice; 3 is only a
# starting point. Tune it by inspecting the decision-boundary plot.
n_clusters = 3

# Fixed random_state keeps the centroid initialization reproducible.
clusters = KMeans(n_clusters=n_clusters, random_state=0).fit(reduced_data)
print(clusters)

In [None]:
# Build a dense mesh over the reduced feature space so the fitted model can
# be evaluated everywhere and its decision regions drawn.
# Pad the observed range by one unit on each side of both axes.
x_min = reduced_data[:, 0].min() - 1
x_max = reduced_data[:, 0].max() + 1
y_min = reduced_data[:, 1].min() - 1
y_max = reduced_data[:, 1].max() + 1

# Step sizes that split each padded range into 1000 intervals.
hx = (x_max - x_min) / 1000.
hy = (y_max - y_min) / 1000.

grid_x = np.arange(x_min, x_max, hx)
grid_y = np.arange(y_min, y_max, hy)
xx, yy = np.meshgrid(grid_x, grid_y)

# Label every mesh point with the last trained model; each row of
# `mesh_points` is one (x, y) coordinate pair.
mesh_points = np.c_[xx.ravel(), yy.ravel()]
Z = clusters.predict(mesh_points)

In [None]:
# Find the centroids for KMeans or the cluster means for GMM.
# KMeans exposes them as `cluster_centers_`; GMM exposes them as `means_`.
if hasattr(clusters, "cluster_centers_"):
    centroids = clusters.cluster_centers_
else:
    centroids = clusters.means_
print(centroids)

In [None]:
# Put the result into a color plot.
# pyplot is imported as `pl` in this notebook; the previous `plt` references
# raised NameError.
Z = Z.reshape(xx.shape)  # back to the mesh's 2-D shape so it can be drawn as an image
pl.figure(1)
pl.clf()
pl.imshow(Z, interpolation='nearest',
          extent=(xx.min(), xx.max(), yy.min(), yy.max()),
          cmap=pl.cm.Paired,
          aspect='auto', origin='lower')

# Overlay the samples (small black dots) and the cluster centers (white crosses).
pl.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
pl.scatter(centroids[:, 0], centroids[:, 1],
           marker='x', s=169, linewidths=3,
           color='w', zorder=10)
pl.title('Clustering on the wholesale grocery dataset (PCA-reduced data)\n'
         'Centroids are marked with white cross')
pl.xlim(x_min, x_max)
pl.ylim(y_min, y_max)
# Empty tick tuples hide the axis ticks.
pl.xticks(())
pl.yticks(())
pl.show()