In [54]:
# -*- coding: utf-8 -*-

from __future__ import division

import csv

import matplotlib.pyplot as plt
import numpy as np
import sklearn.neighbors
from scipy import cluster
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [55]:
# Path of the input CSV that the loading cell reads (header row first, then
# one row per zone). The original note lists two candidate files,
# Zonas_kmeans.csv and Zonas_dbscan.csv -- presumably the outputs of two
# different clustering runs (TODO confirm).
file_name = '../Data/Zonas_dbscan.csv'

In [56]:
# Load the zones table: the first row holds the column headers and every
# following row is one zone whose fields are all parsed as integers.
# The file is opened in a `with` block so the handle is closed even if a row
# fails to parse (the previous version never closed it).
with open(file_name, 'r') as f:
    csvfile = csv.reader(f)
    # Keep only the feature names; the first column is skipped here in the
    # same way the later cells skip the first field of every zone.
    caracteristicas = next(csvfile)[1:]
    # Build real lists (not lazy map objects) because later cells slice and
    # index every zone row.
    zonas = [[int(valor) for valor in zona] for zona in csvfile]

In [57]:
# Min-max scale the feature columns of every zone (everything after the first
# field) and project the scaled data onto two principal components.
min_max_scaler = MinMaxScaler()
zonas_features = [zona[1:] for zona in zonas]
zonas_norm = min_max_scaler.fit_transform(zonas_features)

pca_estimator = PCA(n_components=2)
X_pca = pca_estimator.fit_transform(zonas_norm)

# Report how much variance each retained component explains, and their total.
varianza = pca_estimator.explained_variance_ratio_
print('Varianza explicada por cada varaible: ' + str(varianza))
print('Varianza total explicada: ' + str(sum(varianza)))

Varianza explicada por cada varaible: [ 0.80022241  0.1217984 ]
Varianza total explicada: 0.922020811566


In [58]:
# Quick look at every zone in the plane of the two principal components
# (column 0 on the x axis, column 1 on the y axis).
plt.scatter(X_pca[:, 0], X_pca[:, 1])
plt.show()

In [59]:
# Same projection as the scatter plot, but each zone is drawn as its row
# index so individual zones can be identified by eye.
fig, ax = plt.subplots()
ax.set_xlim(-0.2, 0.55)
ax.set_ylim(-0.2, 1)
ax.grid(True)

N = len(X_pca)
numbers = np.arange(N)

# Write the index of every zone at its (PC1, PC2) position.
for numero, punto in zip(numbers, X_pca):
    ax.text(punto[0], punto[1], numero)

plt.show()

In [60]:
# Set one zone aside before clustering and compute the pairwise Euclidean
# distances between the remaining zones in PCA space.
#
# The index is defined once and reused everywhere (it used to be repeated as
# a literal), so changing the excluded zone only requires editing this line.
# Presumably this zone is the visual outlier of the labelled plot -- TODO
# confirm.
indice_out = 11
zona_peligrosa = zonas[indice_out]

# NOTE: `zonas` and `X_pca` are reassigned in place, so running this cell a
# second time without re-running the cells above removes another zone.
zonas = zonas[:indice_out] + zonas[indice_out + 1:]
X_pca = np.delete(X_pca, indice_out, axis=0)

# Square matrix of Euclidean distances between every pair of remaining zones.
dist = sklearn.neighbors.DistanceMetric.get_metric('euclidean')
matsim = dist.pairwise(X_pca)

In [61]:
# Build the hierarchical clustering with the 'centroid' method and draw its
# dendrogram, colouring the branches below a distance of 1.3.
#
# NOTE(review): `matsim` is a square pairwise-distance matrix, while SciPy
# documents the input of linkage() as either a condensed distance vector or
# a matrix of observations. The rows of `matsim` are therefore presumably
# treated as observation vectors. The 1.3 threshold and the hand-written
# group labels further down were chosen against the current result, so
# confirm before changing this call.
hierarchy = cluster.hierarchy
clusters = hierarchy.linkage(matsim, method='centroid')
hierarchy.dendrogram(clusters, color_threshold=1.3)
plt.show()

In [62]:
# Cut the hierarchy at a fixed distance to obtain flat cluster labels.
cut = 1.3
labels = cluster.hierarchy.fcluster(clusters, cut, criterion='distance')
unique_labels = set(labels)
# The extra 1 presumably accounts for the zone that was set aside before
# clustering, which later gets a group of its own.
print('Número de clusters %d' % (len(unique_labels) + 1))

# Long repeating palette so that any cluster label maps to a colour code.
colors = np.tile(np.array(list('bgrcmykbgrcmykbgrcmykbgrcmyk')), 20)

fig, ax = plt.subplots()
ax.set_xlim(-0.2, 0.55)
ax.set_ylim(-0.2, 1)
ax.grid(True)

# Draw every remaining zone as its original row number (indices at or after
# the removed zone are shifted by one), coloured by cluster label.
for i in range(len(zonas)):
    numero = i + 1 if i >= indice_out else i
    ax.text(X_pca[i][0], X_pca[i][1], numero, color=colors[labels[i]])

fig.tight_layout()
plt.show()

Número de clusters 6


In [63]:
# Group the zone rows by their cluster label: {label: [zone, zone, ...]}.
zone_groups = {}
for i, zona in enumerate(zonas):
    zone_groups.setdefault(labels[i], []).append(zona)

# The zone that was set aside before clustering gets a group of its own,
# stored under a key one past the number of distinct cluster labels.
zone_groups[len(unique_labels) + 1] = [zona_peligrosa]

In [64]:
# Summarise every group of zones as one flat row containing, for each feature
# column (index 1 onwards; index 0 is skipped), the maximum, minimum and mean
# over the zones in the group, in that order.
features = []

# Iterate the groups in ascending key order. The export cell pairs row i of
# `features` with the i-th hand-written label, so the order must be
# deterministic; plain dict iteration order is not guaranteed to be sorted.
for key in sorted(zone_groups):
    grupo = zone_groups[key]
    zone_feature = []
    for i in range(1, len(grupo[0])):
        feature = [zone[i] for zone in grupo]
        # True division (the notebook enables `from __future__ import division`).
        aver = sum(feature) / len(feature)
        zone_feature.extend([max(feature), min(feature), round(aver, 2)])
    features.append(zone_feature)

In [65]:
# Export the per-group summary to CSV: a header row followed by one row per
# group, each starting with a human-readable danger label.
file_name_out = '../Data/Grupos_zonas.csv'

# Hand-written labels, paired by position with the rows of `features`.
zonas_labels = ['Zona de peligro bajo',
                'Zona de peligro medio',
                'Zona de peligro alto (Concentracion de accidentes de camiones)',
                'Zona de peligro bajo (Concentracion de accidentes de camiones)',
                'Zona de peligro alto',
                'Zona de peligro muy alto']

# Suffixes for the three statistics stored per feature, in storage order.
etiquetas = [' (max)', ' (min)', ' (average)']

# One header per (feature, statistic) pair, after the label column.
headers = ['Zona'] + [c + e for c in caracteristicas for e in etiquetas]

# Build the output rows as new lists instead of inserting the label into
# `features` in place: the cell can then be re-run without prepending the
# label a second time. The rows are built before the file is opened, so a
# label/row count mismatch (IndexError) does not leave a half-written file.
rows = [[zonas_labels[i]] + list(zone_feature)
        for i, zone_feature in enumerate(features)]

with open(file_name_out, 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(headers)
    writer.writerows(rows)