# Creating Customer Segments

**Project Status:** In Progress (Updated: 3/14/2016)

This project analyzes a dataset of annual spending amounts, looking for internal structure that explains the variation among the different types of customers that a wholesale distributor interacts with.

In [None]:
# Import libraries: NumPy, pandas, matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA

%matplotlib inline

In [None]:
# Read dataset
# The CSV is loaded by relative path, so it must sit in the working directory.
data = pd.read_csv("wholesale-customers.csv")

# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3, unlike the bare print statement.
print("Dataset has {} rows, {} columns".format(*data.shape))

# Per-column spread of the raw (unscaled, uncentred) values.
print("\nStandard Deviation:")
print(data.std())

In [None]:
# Preview the first rows of the raw data as the cell output.
data.head()

In [None]:
# Histogram of every column's values (30 bins, drawn in red).
data.hist(bins = 30, color='r', figsize=(15,12))

In [None]:
# Summary statistics for each column.
data.describe()

In [None]:
# Pairwise correlation between the columns.
data.corr()

In [None]:
# Pairwise covariance between the columns.
data.cov()

### PCA

In [None]:
# TODO: Apply PCA with the same number of dimensions as variables in the dataset
def doPCA(n_components=5):
    """Fit a PCA model on the module-level ``data`` frame.

    Args:
        n_components: Number of principal components to keep. Defaults to 5,
            the value that used to be hard-coded, so existing ``doPCA()``
            calls behave exactly as before.

    Returns:
        The fitted ``PCA`` estimator.
    """
    pca = PCA(n_components=n_components)
    pca.fit(data)
    return pca

# Print the components and the amount of variance in the data contained in each dimension
pca = doPCA()

# One label per fitted component, so both tables stay valid if the number of
# components changes.
pc_labels = [str(i) for i in range(1, len(pca.components_) + 1)]

# Take the feature names from the data itself: the `columns` variable is only
# assigned further down, in the ICA cell, so using it here raises a NameError
# on a fresh top-to-bottom run.
df_pca = pd.DataFrame(pca.components_, columns=data.columns, index=pc_labels)
df_pca.index.names = ['PC']
print("Principal Component Analysis:")
print(df_pca)

ex_var = pca.explained_variance_ratio_
# Series.sort() (in-place) no longer exists in pandas; sort_values() returns
# the sorted Series instead.
df_var = pd.Series(ex_var, index=pc_labels).sort_values(ascending=False)

df_var.index.names = ['PC']
print("\nExplained Variance of Each Component:")
print(df_var)

In [None]:
# Cumulative share of variance captured as components are added.
pl.figure(figsize=(8,5))
y = np.cumsum(ex_var)
# Number the components from 1 and size the axis from the data, so the plot
# keeps working when a different number of components has been fitted.
x = np.arange(1, len(y) + 1)
pl.plot(x, y, marker="o", mfc='#780000', color='#CC0000')
pl.xlabel("No. of PC Components", fontsize=14)
pl.ylabel("Cumulative Explained Variance Ratio", fontsize=14)
pl.title("No. of PC Components vs Explained Variance Ratio", fontsize=16)
pl.show()



In [None]:
# Bar chart of the variance ratio explained by each individual component.
pl.figure(figsize=(10,6))
# Bar positions are derived from ex_var so they always match its length.
pl.bar(range(1, len(ex_var) + 1), ex_var, alpha=.8, align='center',
       label='Individual Explained Variance', color='blue')
pl.ylabel('Explained Variance Ratio', fontsize=14)
pl.xlabel('Principal Components', fontsize=14)
pl.title('Explained Variance', fontsize=16)
# show has to be *called*; the bare attribute reference did nothing.
pl.show()

#Slope:
# Slope of the explained-variance curve between consecutive components.
# The ratios are read from `ex_var` rather than pasted in by hand, so the
# slopes always describe the model that was actually fitted. This needs at
# least four fitted components.
X1, X2, X3, X4 = 1, 2, 3, 4
Y1, Y2, Y3, Y4 = ex_var[:4]
slope1 = (Y2 - Y1) / (X2 - X1)
slope2 = (Y3 - Y2) / (X3 - X2)
slope3 = (Y4 - Y3) / (X4 - X3)
# Formatting into one string keeps the "label value" layout while letting
# the same line run under both Python 2 and Python 3.
print("Slope1: {}".format(slope1))
print("Slope2: {}".format(slope2))
print("Slope3: {}".format(slope3))

In [None]:
# Directions of the first two principal components in feature space.
first_pc = pca.components_[0]
second_pc = pca.components_[1]

transformed_data = pca.transform(data)
pl.figure(figsize=(8,5))

# Each sample's coordinate along the first and second component.
along_first = transformed_data[:, 0]
along_second = transformed_data[:, 1]

# Scale each component direction by every sample's coordinate along it and
# draw the result using the first two feature dimensions of that direction.
# All samples are plotted in one call per component: the earlier loop zipped
# the samples with the DataFrame, and iterating a DataFrame yields its column
# labels, so only as many points as there are columns were ever drawn.
pl.plot(first_pc[0] * along_first, first_pc[1] * along_first,
        linestyle="None", marker="o", color="r")
pl.plot(second_pc[0] * along_second, second_pc[1] * along_second,
        linestyle="None", marker="o", color="g")
pl.title("Principal Component Analysis", fontsize=16)

pl.show()


# Biplot

In [None]:
def biplot(df):
    """Draw a two-component PCA biplot of ``df`` and return its axes.

    A whitened, two-component PCA is fitted on ``df``. The projected samples
    are shown as a scatter plot, and every column of ``df`` is overlaid as an
    arrow from the origin to that column's weights on the two components.
    """
    model = PCA(n_components=2, whiten=True)
    model.fit(df)

    # Scatter of the samples in component space.
    projected = pd.DataFrame(model.transform(df), columns=['PC1', 'PC2'])
    ax = projected.plot(kind='scatter', x='PC1', y='PC2',
                        figsize=(10, 8), color='g', s=5)

    # One arrow and one text label per original column.
    weights_pc1 = model.components_[0]
    weights_pc2 = model.components_[1]
    for feature, dx, dy in zip(df.columns, weights_pc1, weights_pc2):
        ax.arrow(0, 0, dx, dy, width=0.001, fc='r', ec='r')
        ax.annotate(feature, (dx, dy), size=12)

    return ax

# Draw the biplot for the full dataset, then title, label and crop it.
ax = biplot(data)
ax.set_title("Biplot", fontsize=20)
ax.set_xlabel("Principal Component: 1")
ax.set_ylabel("Principal Component: 2")
# Play around with the ranges for scaling the plot
ax.set_xlim(-1.5, .5)
ax.set_ylim(-.3, 1.3)

### ICA

In [None]:
#Adjust the data to have center at the origin:
columns = data.columns
mean = data.mean()

# Turn the per-column means into a single-row frame keyed by column name.
df_m = mean.to_frame()
df_mean = df_m.T

# Subtract the row of means from every sample (NumPy broadcasting) and wrap
# the result back into a frame with the original column names.
df_center = pd.DataFrame(
    data[columns].values - df_mean[columns].values,
    columns=columns,
)

def doICA(n_components=5, random_state=None):
    """Fit FastICA on the centred frame ``df_center``.

    Args:
        n_components: Number of independent components to extract. Defaults
            to 5, the value that used to be hard-coded.
        random_state: Optional seed forwarded to ``FastICA``. The default of
            ``None`` leaves the estimator unseeded, as before; pass an int to
            make repeated runs return the same components.

    Returns:
        The fitted ``FastICA`` estimator.
    """
    ica = FastICA(n_components=n_components, random_state=random_state)
    # Only the fitted estimator is returned, so fit() is enough; the
    # transformed sources that fit_transform() also produced were discarded.
    ica.fit(df_center)
    return ica

# Print the independent components
ica = doICA()

# One label per extracted component, so the table matches whatever doICA fitted.
ic_labels = [str(i) for i in range(1, len(ica.components_) + 1)]
df_ica = pd.DataFrame(ica.components_, columns=columns, index=ic_labels)
# Name the index on df_ica itself: the earlier line used `df`, which is not
# assigned anywhere in the visible notebook and raised a NameError. The rows
# are independent components, hence 'IC'.
df_ica.index.names = ['IC']
print("Independent Component Analysis:")
print(df_ica)

# A single figure for the heatmap; the extra 20x20 figure that used to be
# opened first was never drawn on and only produced an empty plot.
pl.figure(figsize=(11, 5))
sns.heatmap(df_ica, annot=True)

## Clustering

Choose either K-Means clustering or Gaussian Mixture Model (GMM) clustering, which implements expectation-maximization. Then sample elements from the clusters to understand their significance.

[source](http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html): from the sklearn documentation.

In [None]:
# Import clustering modules
from sklearn.cluster import KMeans
from sklearn.mixture import GMM

In [None]:
# TODO: Reduce to Two dimensions using PCA to capture variation
# Project the centred data onto its first two principal components.
reduced_data = PCA(n_components=2).fit_transform(df_center)

# Show the first 10 projected rows. Single-argument print(...) gives the same
# output on Python 2 and also parses on Python 3.
print("Reducing to 10 Elements")
print(reduced_data[:10])

In [None]:
# TODO: Implement clustering algorithm and fit it to the reduced data for visualization
def cluster(clusterer):
    """Fit ``clusterer`` on the module-level ``reduced_data`` and return it.

    Args:
        clusterer: An unfitted clustering estimator exposing ``fit``.

    Returns:
        The same estimator object, fitted in place.
    """
    clusterer.fit(reduced_data)
    # Echo the estimator (its repr shows the parameters in use). The
    # parenthesised form prints identically on Python 2 and parses on Python 3.
    print(clusterer)
    return clusterer

### Decision Boundary 
 ###### Building Mesh Grid to Populate a Graph

In [None]:
def boundary(clusters):
    """Predict a cluster label for every point of a mesh over ``reduced_data``.

    The mesh spans the range of the two columns of the module-level
    ``reduced_data``, padded by 1 on every side, with a step equal to one
    thousandth of the padded range along each axis.

    Args:
        clusters: Fitted model exposing ``predict``.

    Returns:
        Tuple ``(Z, xx, yy, x_min, x_max, y_min, y_max)``: the flat array of
        predicted labels, the two mesh coordinate grids, and the padded
        axis limits.
    """
    pad = 1
    first_axis = reduced_data[:, 0]
    second_axis = reduced_data[:, 1]
    x_min, x_max = first_axis.min() - pad, first_axis.max() + pad
    y_min, y_max = second_axis.min() - pad, second_axis.max() + pad

    step_x = (x_max - x_min) / 1000.
    step_y = (y_max - y_min) / 1000.
    ticks_x = np.arange(x_min, x_max, step_x)
    ticks_y = np.arange(y_min, y_max, step_y)
    xx, yy = np.meshgrid(ticks_x, ticks_y)

    # Flatten the grid into (x, y) rows so the model can label each mesh point.
    mesh_points = np.c_[xx.ravel(), yy.ravel()]
    Z = clusters.predict(mesh_points)
    return Z, xx, yy, x_min, x_max, y_min, y_max

In [None]:
# TODO: Find the centroids for KMeans or the cluster means for GMM 
def cluster_means(clusters, func_name):
    """Look up, print and return an attribute of a fitted clustering model.

    Args:
        clusters: The fitted model to read from.
        func_name: Name of the attribute that holds the cluster centres,
            e.g. ``'cluster_centers_'``.

    Returns:
        The attribute value, unchanged.

    Raises:
        AttributeError: If ``clusters`` has no attribute named ``func_name``.
    """
    centroids = getattr(clusters, func_name)
    # Parenthesised single-argument print: same output on Python 2, and it
    # also parses on Python 3.
    print(centroids)
    return centroids

In [None]:
# Put the result into a color plot
def colorplot(clusters, Z, xx, yy, x_min, x_max, y_min, y_max, centroids):
    """Render the cluster regions, the samples and the cluster centres.

    Args:
        clusters: Accepted for signature compatibility; not used here.
        Z: Flat array of predicted labels, one per mesh point.
        xx, yy: Mesh coordinate grids; ``Z`` is reshaped to ``xx.shape``.
        x_min, x_max, y_min, y_max: Axis limits applied to the plot.
        centroids: Array whose first two columns are the centre coordinates.
    """
    # Open a new 12x10 figure, then select figure number 1 and clear it
    # before drawing.
    pl.figure(figsize=(12, 10))
    pl.figure(1)
    pl.clf()

    # Background: the predicted label of every mesh point, shown as an image.
    label_grid = Z.reshape(xx.shape)
    mesh_extent = (xx.min(), xx.max(), yy.min(), yy.max())
    pl.imshow(label_grid,
              interpolation='nearest',
              extent=mesh_extent,
              cmap=pl.cm.Paired,
              aspect='auto',
              origin='lower')

    # Foreground: the samples as small black dots, the centres as white crosses.
    sample_x = reduced_data[:, 0]
    sample_y = reduced_data[:, 1]
    pl.plot(sample_x, sample_y, 'k.', markersize=2)
    pl.scatter(centroids[:, 0], centroids[:, 1],
               marker='x', s=169, linewidths=3,
               color='w', zorder=10)

    heading = ('Clustering on the wholesale grocery dataset (PCA-reduced data)\n'
               'Centroids are marked with white cross')
    pl.title(heading, fontsize=18)

    pl.xlim(x_min, x_max)
    pl.ylim(y_min, y_max)
    # Hide the tick marks on both axes.
    pl.xticks(())
    pl.yticks(())
    pl.show()

In [None]:
def clusterplot(clusterer,func_name):
    """Fit ``clusterer`` and plot its regions and centres.

    Args:
        clusterer: Unfitted clustering estimator handed to ``cluster``.
        func_name: Name of the attribute holding the cluster centres,
            passed through to ``cluster_means``.
    """
    fitted = cluster(clusterer)
    # boundary() returns (Z, xx, yy, x_min, x_max, y_min, y_max), which is
    # exactly the run of positional arguments colorplot() expects after the
    # model itself.
    mesh = boundary(fitted)
    centres = cluster_means(fitted, func_name)
    colorplot(fitted, *mesh, centroids=centres)

In [None]:
# Run the whole pipeline with a 4-cluster K-Means model; its centres are
# read from the `cluster_centers_` attribute.
clusterplot(KMeans(n_clusters=4),'cluster_centers_')