# DBScan: Cluster Analysis of every user's location

Using DBScan, find spatial clusters relevant to each user, typically home, work, or school

In [271]:
import threading, json, os, imp
import numpy as np
from dbscan_python import dbscan
imp.reload(dbscan)

<module 'dbscan_python.dbscan' from '/home/anderstj/VulnerableLocations/GeoProcessing/dbscan_python/dbscan.py'>

In [272]:
userList = os.listdir('../zone_a_twitterers/')
len(userList)

22042

In [269]:
class UserClustering(threading.Thread):
    def __init__(self, file_name):
        threading.Thread.__init__(self)
        self.file_name = file_name
        self.tweets = {}
        
    def get_clusters(self):
        points = [tweet['geometry']['coordinates'] for tweet in self.tweets]
        m = np.array([ [p[1] for p in points], [p[0] for p in points] ])
        eps = 100     #Max. Distance for points in the cluster...                                                                                                                   
        min_points = 5  
        self.clusters = dbscan.dbscan(m, eps, min_points)
        
    def enrich_tweet_properties_with_cluster(self):
        for idx, tweet in enumerate(self.tweets):
            tweet['properties']['cluster'] = self.clusters[idx]
            del tweet['properties']['gnip']
            
    def write_clustered_geojson(self):
        with open('../zone_a_clustered_5pts/'+self.file_name,'w') as outFile:
            outFile.write(json.dumps({"type":"FeatureCollection", "features":self.tweets}))
    
    def run(self):
        with open('../zone_a_twitterers/'+self.file_name,'r') as inFile:
            self.tweets = json.loads(inFile.read())['features']
        self.get_clusters()
        self.enrich_tweet_properties_with_cluster()
        self.write_clustered_geojson()

In [None]:
res = []
for user in userList:
    obj = UserClustering(user)
    res.append(obj)
    obj.start()

In [261]:
top_users = sorted(res, key=lambda u: len(u.tweets), reverse=True)
len(top_users)
for u in top_users[:10]:
    print(u.file_name, len(u.tweets))

..Devindra.geojson 78
pawoot.geojson 59
MatthewLaBonne.geojson 58
AliMacAttack.geojson 26
tequilatiff_tif.geojson 5
K_Vander.geojson 3
Fuel_Nike.geojson 3
egzuck.geojson 3
GQswaGGG.geojson 2
hollywooddiva99.geojson 2


In [219]:
top_users[0].tweets[0]

{'geometry': {'coordinates': [-73.77554969, 40.59305065], 'type': 'Point'},
 'properties': {'cluster': None,
  'gnip': {'language': {'value': 'sv'},
   'matching_rules': [{'tag': '[-73.81456689610107,40.60326795092426]',
     'value': 'bounding_box:[-73.85948266030705 40.56916637212157 -73.76965113189509 40.63736952972695]'}]},
  'source': {'displayName': 'Twitter for iPhone',
   'link': 'http://twitter.com/download/iphone'},
  'text': '@CelesteJolie morning, momma!',
  'time': '2012-10-08T14:24:11.000Z',
  'u_utc': '-18000',
  'user': 'RobGPerez',
  'verb': 'post'},
 'type': 'Feature'}

In [221]:
json.dumps({
        'type':'FeatureCollection',
        'features':[
            {'type':'Feature',
             'properties': {
                    'cluster': t['properties']['cluster']
                    },
             'geometry':t['geometry']
             } for t in top_users[0].tweets]})

'{"features": [{"properties": {"cluster": null}, "geometry": {"coordinates": [-73.77554969, 40.59305065], "type": "Point"}, "type": "Feature"}, {"properties": {"cluster": null}, "geometry": {"coordinates": [-73.78201079, 40.59232932], "type": "Point"}, "type": "Feature"}, {"properties": {"cluster": null}, "geometry": {"coordinates": [-73.83673322, 40.67217017], "type": "Point"}, "type": "Feature"}, {"properties": {"cluster": null}, "geometry": {"coordinates": [-73.81565283, 40.60534512], "type": "Point"}, "type": "Feature"}, {"properties": {"cluster": 1}, "geometry": {"coordinates": [-73.98209328, 40.7747305], "type": "Point"}, "type": "Feature"}, {"properties": {"cluster": null}, "geometry": {"coordinates": [-73.84241003, 40.68035255], "type": "Point"}, "type": "Feature"}, {"properties": {"cluster": null}, "geometry": {"coordinates": [-73.83036826, 40.66025853], "type": "Point"}, "type": "Feature"}, {"properties": {"cluster": 1}, "geometry": {"coordinates": [-73.98209328, 40.7747305],

In [174]:
geodistance.distanceHaversine(p1[1],p1[0],p2[1],p2[0])[0]*1000

22.984696090086025

In [192]:
points[:10]

[[-74.13763004, 40.62469093],
 [-74.13790189, 40.62467849],
 [-74.13736593, 40.62543101],
 [-74.13609801, 40.62804978],
 [-74.13605684, 40.62822958],
 [-74.13592377, 40.6282247],
 [-74.1361719, 40.62794196],
 [-74.13615935, 40.62794937],
 [-74.13597703, 40.62821393],
 [-74.13598018, 40.62815867]]

In [196]:
np.array([ [p[1] for p in points[:10]], [p[0] for p in points[:10]]])

array([[ 40.62469093,  40.62467849,  40.62543101,  40.62804978,
         40.62822958,  40.6282247 ,  40.62794196,  40.62794937,
         40.62821393,  40.62815867],
       [-74.13763004, -74.13790189, -74.13736593, -74.13609801,
        -74.13605684, -74.13592377, -74.1361719 , -74.13615935,
        -74.13597703, -74.13598018]])

In [198]:
imp.reload(dbscan)
m = np.array([ [p[1] for p in points[:10]], [p[0] for p in points[:10]]])                                                           
eps = 50                                                                                                                      
min_points = 2                                                                                                                 
dbscan.dbscan(m, eps, min_points)

[ 40.62469093 -74.13763004] [ 40.62469093 -74.13763004]
40.62469093 40.62469093
[ 40.62469093 -74.13763004] [ 40.62467849 -74.13790189]
40.62469093 40.62467849
[ 40.62469093 -74.13763004] [ 40.62543101 -74.13736593]
40.62469093 40.62543101
[ 40.62469093 -74.13763004] [ 40.62804978 -74.13609801]
40.62469093 40.62804978
[ 40.62469093 -74.13763004] [ 40.62822958 -74.13605684]
40.62469093 40.62822958
[ 40.62469093 -74.13763004] [ 40.6282247  -74.13592377]
40.62469093 40.6282247
[ 40.62469093 -74.13763004] [ 40.62794196 -74.1361719 ]
40.62469093 40.62794196
[ 40.62469093 -74.13763004] [ 40.62794937 -74.13615935]
40.62469093 40.62794937
[ 40.62469093 -74.13763004] [ 40.62821393 -74.13597703]
40.62469093 40.62821393
[ 40.62469093 -74.13763004] [ 40.62815867 -74.13598018]
40.62469093 40.62815867
[ 40.62469093 -74.13763004] [ 40.62469093 -74.13763004]
40.62469093 40.62469093
[ 40.62469093 -74.13763004] [ 40.62467849 -74.13790189]
40.62469093 40.62467849
[ 40.62469093 -74.13763004] [ 40.62543101

[1, 1, 1, 2, 2, 2, 2, 2, 2, 2]

In [182]:
m

array([[-74.13763004,  40.62469093],
       [-74.13790189,  40.62467849],
       [-74.13736593,  40.62543101],
       ..., 
       [-74.35589489,  40.44075727],
       [-74.3549489 ,  40.44064217],
       [-74.1171932 ,  40.56543909]])