In [11]:
%pylab inline
import pandas as pd
import numpy as np
df = pd.read_csv('/home/sophie/projects/LearnDataScience-master/datasets/UN.csv')

print('-----------')
# raw column information plus summary header
print(df[0:10])
print('-----------')
# look at the types of each column explicitly
print('Individual columns - Python data types')
[(x, type(df[x][0])) for x in df.columns]

Populating the interactive namespace from numpy and matplotlib
-----------
          country   region   tfr  contraception  educationMale  \
0     Afghanistan     Asia  6.90            NaN            NaN   
1         Albania   Europe  2.60            NaN            NaN   
2         Algeria   Africa  3.81           52.0           11.1   
3  American.Samoa     Asia   NaN            NaN            NaN   
4         Andorra   Europe   NaN            NaN            NaN   
5          Angola   Africa  6.69            NaN            NaN   
6         Antigua  America   NaN           53.0            NaN   
7       Argentina  America  2.62            NaN            NaN   
8         Armenia   Europe  1.70           22.0            NaN   
9       Australia  Oceania  1.89           76.0           16.3   

   educationFemale  lifeMale  lifeFemale  infantMortality  GDPperCapita  \
0              NaN      45.0        46.0            154.0        2848.0   
1              NaN      68.0        74.0        

[('country', str),
 ('region', str),
 ('tfr', numpy.float64),
 ('contraception', numpy.float64),
 ('educationMale', numpy.float64),
 ('educationFemale', numpy.float64),
 ('lifeMale', numpy.float64),
 ('lifeFemale', numpy.float64),
 ('infantMortality', numpy.float64),
 ('GDPperCapita', numpy.float64),
 ('economicActivityMale', numpy.float64),
 ('economicActivityFemale', numpy.float64),
 ('illiteracyMale', numpy.float64),
 ('illiteracyFemale', numpy.float64)]

In [24]:
# The key point is that in the for loop below we are looking at the first VALUE in each of the columns 
# to see what datatypes they are. df.dtypes will simply say "object".
print (type(df['country'][0]))

# make a list which looks at the data type from
[(x, type(df[x][0])) for x in df.columns]

<class 'str'>


[('country', str),
 ('region', str),
 ('tfr', numpy.float64),
 ('contraception', numpy.float64),
 ('educationMale', numpy.float64),
 ('educationFemale', numpy.float64),
 ('lifeMale', numpy.float64),
 ('lifeFemale', numpy.float64),
 ('infantMortality', numpy.float64),
 ('GDPperCapita', numpy.float64),
 ('economicActivityMale', numpy.float64),
 ('economicActivityFemale', numpy.float64),
 ('illiteracyMale', numpy.float64),
 ('illiteracyFemale', numpy.float64)]

In [18]:
# This prints out the format of the COLUMN. the first two are Series objects.
# The others are numpy float arrays.
print (df.dtypes)


country                    object
region                     object
tfr                       float64
contraception             float64
educationMale             float64
educationFemale           float64
lifeMale                  float64
lifeFemale                float64
infantMortality           float64
GDPperCapita              float64
economicActivityMale      float64
economicActivityFemale    float64
illiteracyMale            float64
illiteracyFemale          float64
dtype: object


In [27]:
# How many non null values are there in each column
df.count()

country                   207
region                    207
tfr                       197
contraception             144
educationMale              76
educationFemale            76
lifeMale                  196
lifeFemale                196
infantMortality           201
GDPperCapita              197
economicActivityMale      165
economicActivityFemale    165
illiteracyMale            160
illiteracyFemale          160
dtype: int64

#### What does the above tell us?

- 14 columns, 207 countries
- We want to focus on columns with the least null values - tfr, lifeMale,lifeFemale,GDP and InfantMortality
- We suspect a clustering of lifeMale, lifeFemale and infantMortality according to GDP - will use K-means to analyse 
in detail
- We don't know how many clusters there might be
- Will use analytics too, to help decide the right number of clusters might be (possible 1 to 10)
- Each field will be converted to a scientific float needed by the numerical algorithms (are numpy floats not right?)

## Analysis

### Methods

We want to find out how many natural clusters there are. This is tested by appling k-means clustering using a different number of clusters (from 1, to 10). When we have more clusters, they will be more tightly packed and least squares error will be small. Less clusters and each cluster will have a bigger spread. 

However, each successive increase in K will not give the same drop in tightness. 

The point at which the improvement starts to level off is "the elbow". We will use the value of k at this point.

**steps**
- generate elbow curve, decide on k, run clustering over the data
- plot the clusters to visualise exactly how the clusters appear
- interpret the results

In [29]:
%pylab inline
# using the code from 
# http://stackoverflow.com/questions/6645895/calculating-the-percentage-of-variance-measure-for-k-means
# we will generate plots from the UN dataset transformed into 4 numeric columns
# lifeMale, lifeFemale, infantMortality and GDPperCapita

fName = ('/home/sophie/projects/LearnDataScience-master/datasets/UN4col.csv')
fp = open(fName)
X = np.loadtxt(fp)
fp.close()
         

Populating the interactive namespace from numpy and matplotlib


In [38]:
# A little query on the numpy array.
print (X.shape)
print (type(X))
print (X.ndim)
print (X.size)

(188, 4)
<class 'numpy.ndarray'>
2
752


In [58]:
import numpy as mp
from scipy.cluster.vq import kmeans,vq
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

#### cluster into K=1..10 clusters #####
# K, KM, centroids, D_k, cIDx, dist, avgWithinSS = kmeans.run_kmeans(X,10)

K = range(1,10)

# apply kmeans 1 to 10. kmeans(obs, k_or_guess[, iter, thresh, ...]). 
# returns a codebook(A k by N array of k centroids) and 
# distortion (a float describing the distortion between the observations passed and the centroids generated)
# KM is a list of codebooks and distortion for different numbers of clusters, from 1 to 10.
KM = [kmeans(X,k) for k in K]

# centroids is a list of codebooks for each K. cent is the codebook, a numpy array, 
# for 1 cluster, 2 clusters, 3 clusters etc..
centroids = [cent for (cent,var) in KM]

What information is in a codebook?
It maps centroids to codes and vice versa. The format is [n_clusters, n_features]. See below.

In [74]:
# This is one way to look at each element in a list
print(centroids[0].shape)
print(centroids[1].shape)
print(centroids[2].shape)
print(centroids[3].shape)

# This is much better
[s.shape for s in centroids]  # list the shape of each code book.

(1, 4)
(2, 4)
(3, 4)
(4, 4)


[(1, 4), (2, 4), (3, 4), (4, 4), (5, 4), (6, 4), (7, 4), (8, 4), (9, 4)]

In [83]:
# cdist computes the euclidean distance between two points.
# Every value in the array is tested against each code book. 
# cdist(XA, XB, "metric"). 
# Where XA = mA by n array of mA original observations in an n-dimensional space
# and XB = An mB by n array of mB(no of clusters) original observations in an n-dimensional space. 
# Both must have the same number of columns.
D_k = [cdist(X, cent, 'euclidean') for cent in centroids]

Don't quite understand what cdist does. I think D_k is a list matrices, each of which contain a vector for each value of X, for each cluster set. 
We end up with a list of mA by mB distance matrices. i.e. number of rows in X by number of clusters. 

What is the difference between a centroid and a cluster center?

In [111]:
print(X.shape)
print(len(D_k))

# if D_k is a class object of list, what attributes does that have?
print(type(D_k))

# This is how to access individual bits of the matrix.
print(min(D_k[0][3:6])) # Selecting 3:6 from the first matrix. i.e. the matrix made for only one cluster.

(188, 4)
9
<class 'list'>
[ 2164.54439528]


In [123]:
X.shape[0]

188

In [95]:
cIdx = [np.argmin(D, axis=1) for D in D_k]
dist = [np.min(D, axis=1) for D in D_k]
avgWithinSS = [sum(d)/X.shape[0] for d in dist]

What is the difference between the argmin (cIdx) and min(dist)? cIdx is a list of indices for the minimum values. dist is those values.

cIdx = for each distance matrix we have an array of indices of the min values along an axis.          
dist = for each distance matrix we have an array of the min values along an axis            
avgWithinSS = for each of the matrices sum(min values)/188 (what is the average min value in each of the different cluster numbers i.e. with 1 cluster the average min value is avgWithinSS[0])

In [118]:
print ([D.shape for D in D_k]) # each one of these has a array of indices which have the min values.

[(188, 1),
 (188, 2),
 (188, 3),
 (188, 4),
 (188, 5),
 (188, 6),
 (188, 7),
 (188, 8),
 (188, 9)]

In [119]:
print(cIdx[0:3])

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0]), array([1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,

In [136]:
print(dist[0][0:3]) # first three points in the first matrix for one cluster. 
print(dist[6][0:3]) 
# As you increase the number of clusters these distances decrease as the clusters become more compact.

[ 3044.71049474  5027.61602297  4359.59802141]
[ 980.43054626  425.17523451  343.95436068]


In [127]:
print(len(dist))

9


In [128]:
print(avgWithinSS)

[6534.9809626620172, 2790.2101193300145, 1890.9166153060171, 1438.7793254224123, 1122.2625658294346, 972.53807558001813, 740.45942949866037, 645.91915410445324, 648.92982126597178]
