# Unsupervised Learning Exercise Solution

In [None]:
# Suppress every warning raised from here on; the filter is installed before
# the library imports below are executed.
import warnings
warnings.filterwarnings('ignore')

# Path of the red-wine quality CSV used in Q.1.
myfile='~/Dropbox/March onwards/Python Data Science/Data/winequality-red.csv'

import pandas as pd
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import numpy as np
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# The file is parsed with ';' as the field separator.
wine=pd.read_csv(myfile,sep=";")

# Q.1

In [None]:
# Keep only the three columns that are clustered in Q.1.
wine = wine.loc[:, ["sulphates", "alcohol", "pH"]]

# Standardize the selected columns with sklearn's `scale` and wrap the result
# back into a DataFrame that carries the original column names.
wine_std = pd.DataFrame(data=scale(wine), columns=wine.columns.tolist())

In [None]:
# Candidate cluster counts 2, 3, ..., 15 (14 evenly spaced float values).
Ks = np.linspace(start=2, stop=15, num=14)

In [None]:
# Scan over the candidate cluster counts in Ks: fit k-means for each k and
# record the within-cluster sum of squares (inertia) together with the
# silhouette score, then plot inertia against k (elbow plot).
#
# Only the three standardized feature columns are used. wine_std later gains
# a "cluster" column, and selecting the features explicitly keeps this cell
# correct when it is re-run after that point.
wine_features = wine_std[["sulphates", "alcohol", "pH"]]

ssw = []
for k in Ks:
    # Ks holds floats (np.linspace) while n_clusters must be an int.
    # A fixed random_state makes the centroid initialisation, and therefore
    # the printed scores, repeatable across runs.
    kmeans = KMeans(n_clusters=int(k), random_state=42)
    kmeans.fit(wine_features)
    sil_score = silhouette_score(wine_features, kmeans.labels_)
    print("for inertia:" ,kmeans.inertia_ ,"and silhouette score:",sil_score,"number of clusters are:", int(k))
    ssw.append(kmeans.inertia_)

# Label the axes first so the plot call stays the cell's last expression.
plt.xlabel("number of clusters (k)")
plt.ylabel("inertia (within-cluster sum of squares)")
plt.plot(Ks, ssw)

In [None]:
# Here selected number of cluster = 6
k = 6

# Fixed random_state so the same cluster assignment is produced on every run.
kmeans = KMeans(n_clusters=k, random_state=42)

# Fit on the three feature columns only: a later cell adds a "cluster" column
# to wine_std, and re-running this cell must not treat those labels as a
# feature.
kmeans.fit(wine_std[["sulphates", "alcohol", "pH"]])

In [None]:
# Cluster id assigned by the fitted k-means model to each row.
labels = kmeans.labels_

# Store the ids next to the standardized features so the plots can colour by
# cluster.
wine_std["cluster"] = labels

In [None]:
from ggplot import *

In [None]:
# Scatter of sulphates vs alcohol, coloured by the "cluster" column.
(
    ggplot(wine_std, aes(x='sulphates', y='alcohol'))
    + geom_point(aes(color='cluster'), size=50)
)

In [None]:
# Scatter of sulphates vs pH, coloured by the "cluster" column.
(
    ggplot(wine_std, aes(x='sulphates', y='pH'))
    + geom_point(aes(color='cluster'), size=50)
)

In [None]:
# Scatter of alcohol vs pH, coloured by the "cluster" column.
(
    ggplot(wine_std, aes(x='alcohol', y='pH'))
    + geom_point(aes(color='cluster'), size=50)
)

The overlap we see here is not actually an overlap: each plot is a 2D view of clusters that were formed in the 3-dimensional feature space (sulphates, alcohol, pH). The 6 clusters seem to overlap when only two features are shown at a time, but in the full feature space k-means assigns every point to exactly one cluster, so the clusters never overlap.

# Q.2

DBSCAN doesn't have a good quality measure of its own: the internal measures are either suited to k-means or need labeled data.
You need to take a subjective decision on whether the clusters given by DBSCAN make business sense or not. When you do have labeled data, however, DBSCAN can be used to build distance-based similarity features and to check whether the labelling is consistent with the data.

# Q.3

In [None]:
# Path of the wholesale-customers CSV used in Q.3 (rebinds `myfile`).
myfile = '~/Dropbox/March onwards/Python Data Science/Data/Wholesale customers data.csv'

# Load the file and keep only the two columns that are clustered.
groc = pd.read_csv(myfile)
groc = groc.loc[:, ["Milk", "Grocery"]]

# Standardize both columns and keep their names on the resulting frame.
groc_std = pd.DataFrame(data=scale(groc), columns=groc.columns.tolist())

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics

In [None]:
# Scan DBSCAN over a range of eps values and report, for each one, how many
# clusters were found and what share of the points was labelled as noise.
r = np.linspace(0.5, 5)  # np.linspace defaults to 50 evenly spaced values
for epsilon in r:
    # Fit on the two feature columns only: a later cell adds a "cluster"
    # column to groc_std, and re-running this scan must not feed it back in.
    db = DBSCAN(eps=epsilon, min_samples=20, metric='euclidean').fit(
        groc_std[["Milk", "Grocery"]]
    )
    labels = db.labels_

    # DBSCAN marks noise points with the label -1. Only discount it from the
    # cluster count when it is actually present; otherwise a run without any
    # noise would be reported with one cluster too few.
    n_clust = len(set(labels)) - (1 if -1 in labels else 0)
    outlier = np.round(np.count_nonzero(labels == -1) / len(labels) * 100, 2)

    # Apply the format with `%`; passing n_clust as a second print argument
    # would emit the literal "%d".
    print('Estimated number of clusters: %d' % n_clust)
    print("For epsilon =", epsilon ,", percentage of outliers is: ",outlier)
   


Choose the epsilon value that flags at least 5% of the customers as outliers, i.e. customers who are very different in terms of purchase patterns.

In [None]:
# Final DBSCAN fit with the chosen eps. Only the two feature columns are
# passed in, so re-running this cell after the "cluster" column exists does
# not cluster on the previous labels.
# NOTE(review): the eps scan uses min_samples=20 but this fit uses
# min_samples=10, so the outlier percentages printed by the scan do not
# describe this model exactly. Confirm which value is intended.
db = DBSCAN(eps=0.77, min_samples=10, metric='euclidean').fit(
    groc_std[["Milk", "Grocery"]]
)

# Store the labels as strings.
groc_std['cluster'] = [str(x) for x in db.labels_]

In [None]:
# Scatter of Milk vs Grocery, coloured by the DBSCAN "cluster" label.
(
    ggplot(groc_std, aes(x='Milk', y='Grocery', color='cluster'))
    + geom_point()
)

# Q.4

In [None]:
from sklearn.decomposition import FactorAnalysis

In [None]:
# Path of the cars CSV used in Q.4.
data_file = '~/Dropbox/March onwards/Python Data Science/Data/cars.csv'

# Read the file with pandas' default (comma) separator.
cars = pd.read_csv(data_file)

In [None]:
# Remove the 'Name' column; the remaining columns are the inputs for the
# factor analysis. The axis is given through the `columns=` keyword because
# pandas >= 2.0 no longer accepts `axis` as a positional argument of
# DataFrame.drop.
X_cars = cars.drop(columns=['Name'])

In [None]:
# Standardize every column of X_cars and keep the column labels.
X_cars = pd.DataFrame(data=scale(X_cars), columns=X_cars.columns)

In [None]:
# Factor-analysis model with 4 factors and up to 1000 iterations.
fa = FactorAnalysis(
    n_components=4,
    max_iter=1000,
)

In [None]:
# Fit the factor model on the standardized car features.
fa.fit(X=X_cars)

In [None]:
# Noise variance estimated by the fitted factor model; the next cell pairs
# these values with X_cars.columns.
nvar = fa.noise_variance_

# Plot the values against their position in the array.
plt.plot(nvar)

In [None]:
# Print one (column name, noise variance) pair per input column.
column_noise_pairs = zip(X_cars.columns, nvar)
print(*column_noise_pairs)

# a.

We will remove, one by one, the variables for which the noise variance is higher than 0.1.

In [None]:
# Drop 'Width' and refit the 4-factor model on the remaining columns.
# `columns=` replaces the positional axis argument, which pandas >= 2.0
# rejects in DataFrame.drop.
X_cars = X_cars.drop(columns=['Width'])

fa = FactorAnalysis(n_components=4, max_iter=1000)

fit = fa.fit(X_cars)
nvar = fa.noise_variance_

# Show the (column name, noise variance) pairs after the refit, then plot the
# noise variances.
print(*zip(X_cars.columns, nvar))
plt.plot(nvar)

In [None]:
# Drop 'Length' and refit the 4-factor model on the remaining columns.
# `columns=` replaces the positional axis argument, which pandas >= 2.0
# rejects in DataFrame.drop.
X_cars = X_cars.drop(columns=['Length'])

fa = FactorAnalysis(n_components=4, max_iter=1000)

fit = fa.fit(X_cars)
nvar = fa.noise_variance_

# Show the (column name, noise variance) pairs after the refit, then plot the
# noise variances.
print(*zip(X_cars.columns, nvar))
plt.plot(nvar)

In [None]:
# Drop 'Wheelbase' and refit the 4-factor model on the remaining columns.
# `columns=` replaces the positional axis argument, which pandas >= 2.0
# rejects in DataFrame.drop.
X_cars = X_cars.drop(columns=['Wheelbase'])

fa = FactorAnalysis(n_components=4, max_iter=1000)

fit = fa.fit(X_cars)
nvar = fa.noise_variance_

# Show the (column name, noise variance) pairs after the refit, then plot the
# noise variances.
print(*zip(X_cars.columns, nvar))
plt.plot(nvar)

In [None]:
# Drop 'Horsepower' and refit the 4-factor model on the remaining columns.
# `columns=` replaces the positional axis argument, which pandas >= 2.0
# rejects in DataFrame.drop.
X_cars = X_cars.drop(columns=['Horsepower'])

fa = FactorAnalysis(n_components=4, max_iter=1000)

fit = fa.fit(X_cars)
nvar = fa.noise_variance_

# Show the (column name, noise variance) pairs after the refit, then plot the
# noise variances.
print(*zip(X_cars.columns, nvar))
plt.plot(nvar)

In [None]:
# Component matrix of the last fitted factor model; the cells below read one
# row per factor and pair it with X_cars.columns.
loadings = fa.components_

# Display the full matrix as the cell output.
loadings

# b.

In [None]:
# (column name, loading) pairs for the first factor (row 0 of `loadings`).
first_factor = loadings[0]
print(*zip(X_cars.columns, first_factor))

Loadings for the price variables [Retail & Dealer] are quite high; they are the dominant contributors to this factor. The rest of the variables, except the mileage ones, also contribute positively to it. We can consider this factor to be the vehicle's perceived value. Higher mileage points towards a lower-value vehicle according to this value indicator.

In [None]:
# (column name, loading) pairs for the second factor (row 1 of `loadings`).
second_factor = loadings[1]
print(*zip(X_cars.columns, second_factor))

Here we can see that more weightage is given to Weight, Engine and Cylinders, and that the relation is inverse in nature,
which tells us that all these variables tend to lower the mileage of the car. We can label this factor as fuel efficiency.

In [None]:
# (column name, loading) pairs for the third factor (row 2 of `loadings`).
third_factor = loadings[2]
print(*zip(X_cars.columns, third_factor))

Here we can see that all variables are given positive weights, so we can label this factor as specs.

In [None]:
# (column name, loading) pairs for the fourth factor (row 3 of `loadings`).
fourth_factor = loadings[3]
print(*zip(X_cars.columns, fourth_factor))

Here we can see that more weightage is given to CityMPG, Weight and Cylinder. We can label this factor as torque, which decreases with an increase in Cylinders and increases with an increase in mileage and weight.