## Python Clustering

This notebook walks through a clustering example using `clustering_code.py`, a file of functions taken from Joel Grus's _Data Science from Scratch_. I recommend reading chapter 19 of that book as you work through this notebook.

In [None]:
import clustering_code
from collections import defaultdict
from pprint import pprint

# Path to the survey data. The loading cell reads it as tab-separated text
# and skips the first line as a header.
input_file = "survey_responses.txt"

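Now let's read in the data and store it in a `defaultdict`, keyed by the student (the second column of each row), with the remaining columns as that student's list of responses.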

In [None]:
# Map each student (the second column of a row) to the list of the
# remaining columns on that row.
student_data = defaultdict(list)
with open(input_file,'r') as ifile :
    # Skip the header row; the default keeps an empty file from raising StopIteration.
    next(ifile, None)
    # Iterate over the file lazily rather than loading every line with readlines().
    for line in ifile :
        row = line.strip().split("\t")
        if len(row) < 2 :
            # Blank or truncated line: there is no student column to key on.
            continue
        this_student = row[1]
        student_data[this_student] = row[2:]


We need numerical data for clustering, so we'll convert the Yes/No responses to numbers (Yes becomes 1, No becomes 0) and make every value a float.

In [None]:
# Encode the survey answers numerically: "No" becomes 0 and "Yes" becomes 1.
yes_no_codes = {"No": 0, "Yes": 1}

for student in student_data :
    # Swap each Yes/No answer for its code (anything else passes through
    # unchanged), cast every entry to float, and store the new list back
    # under the same student.
    student_data[student] = [
        float(yes_no_codes.get(answer, answer))
        for answer in student_data[student]
    ]
            

In [None]:
# Pretty-print the whole student -> responses mapping so the values
# are easier to inspect.
pprint(student_data)

In [None]:
# Explore the clusters: change k and re-run this cell to compare the
# groupings that come out.

k = 3
assignments, means = clustering_code.train_dict(student_data, k)

# Order the (student, cluster) pairs by cluster so that members of the
# same cluster are printed together.
# NOTE(review): assumes `assignments` is a dict of student -> cluster.
s_assign = sorted(assignments.items(), key=lambda pair : pair[1])

print("{}-means:".format(k))
for student, cluster in s_assign :
    print(" : ".join([str(cluster), student]))

print(means)
    

In [None]:
# Let's re-scale the two mileage columns (positions 0 and 1 of each vector)
# so that they're in the range of 0 - 1. Both columns share a single min/max,
# so they stay on the same scale as each other.
mileage_columns = (0, 1)

miles = [vec[col] for vec in student_data.values() for col in mileage_columns]

max_miles = max(miles)
min_miles = min(miles)
mile_range = max_miles - min_miles  # computed once instead of on every line

# The vectors are updated in place, so later cells see the re-scaled values.
for vec in student_data.values() :
    for col in mileage_columns :
        if mile_range :
            vec[col] = (vec[col] - min_miles)/mile_range
        else :
            # Every mileage is identical, so there is no spread to scale by;
            # use 0.0 rather than dividing by zero.
            vec[col] = 0.0



In [None]:
# Let's make a function that prints the means in a nice way.

def pprint_means(the_means, var_labels=None) :
    """Print each cluster's mean vector as labelled values rounded to 2 places.

    Args:
        the_means: iterable of cluster means, each an iterable of numbers.
        var_labels: optional list of labels, one per position in a mean
            vector. Defaults to the six survey variable names.

    Positions beyond the end of ``var_labels`` are printed with a generic
    "Var <index>" label instead of raising IndexError.
    """
    if var_labels is None :
        var_labels = ["Birth Dist","Age 15 Dist",
                      "Post-Secondary","Mkt Major",
                      "Biz Major","HH Size"]

    for idx, cluster_mean in enumerate(the_means) :
        print("--- Printing Cluster " + str(idx) + " ---")

        for idx2, item in enumerate(cluster_mean) :
            # Fall back to a generic label when the vector is longer than the label list.
            if idx2 < len(var_labels) :
                label = var_labels[idx2]
            else :
                label = "Var " + str(idx2)
            print(": ".join([label,str(round(item,2))]))

        print("----------------------\n")
            

In [None]:
# Re-run the clustering with more clusters on the current student_data.
k = 5
assignments, means = clustering_code.train_dict(student_data, k)

# Order the (student, cluster) pairs by cluster so that members of the
# same cluster are printed together.
# NOTE(review): assumes `assignments` is a dict of student -> cluster.
s_assign = sorted(assignments.items(), key=lambda pair : pair[1])

print("{}-means:".format(k))
for student, cluster in s_assign :
    print(" : ".join([str(cluster), student]))



In [None]:
# Print the cluster means from the most recent train_dict call, one labelled block per cluster.
pprint_means(means)