In [1]:
%pylab inline

%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib


## Libraries

In [2]:
import numpy as np
from numpy.random import randn

import pandas as pd

#time
from datetime import datetime
from datetime import timedelta

#counting
from collections import Counter

In [3]:
# good old matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt

#high-level based on matplotlib
import seaborn as sns

#dynamic and interactive
import highcharts
from highcharts.charts import chart

## Init

In [4]:
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 4)})
np.random.seed(1234)

In [5]:
#loading highcharts javascript
highcharts.init()

## Reading data

### Checkins

In [7]:
dfc = pd.read_csv('../../datasets/loc-gowalla_totalCheckins.txt', sep='\t', header=False)
dfc.columns = ['uid','utc','lat','lon','vid']

dfc['utc'] = dfc['utc'].astype('datetime64[ms]')

dfc.head()

Unnamed: 0,uid,utc,lat,lon,vid
0,0,2010-10-18 22:17:43,30.269103,-97.749395,420315
1,0,2010-10-17 23:42:03,30.255731,-97.763386,316637
2,0,2010-10-17 19:26:05,30.263418,-97.757597,16516
3,0,2010-10-16 18:50:42,30.274292,-97.740523,5535878
4,0,2010-10-12 23:58:03,30.261599,-97.758581,15372


### Data munging: cleanup, time parsing, etc.

New York, New York!
 - bounding box: (40.4774, -74.2589), (40.9176, -73.7004)

In [8]:
ny = (dfc['lat']>=40.4774) & (dfc['lat']<=40.9176) & (dfc['lon']>=-74.2589) & (dfc['lon']<=-73.7004)
dfc = dfc[ny]

Time columns

In [9]:
dfc['year']     = dfc['utc'].apply(lambda x: x.date().year)
dfc['month']    = dfc['utc'].apply(lambda x: x.date().month)
dfc['day']      = dfc['utc'].apply(lambda x: x.date().day)

dfc['date']     = dfc['utc'].apply(lambda x: x.date())
dfc['time']     = dfc['utc'].apply(lambda x: x.time())

dfc['isotime']  = dfc['utc'].apply(lambda x: x.isoformat()+'Z')

dfc = dfc[dfc['date']<=datetime(2010,10,19).date()]
dfc[['uid','utc','lat','lon','vid']].head()

Unnamed: 0,uid,utc,lat,lon,vid
9,0,2010-10-12 00:21:28,40.643885,-73.782806,23261
10,0,2010-10-11 20:21:20,40.741374,-73.988105,16907
11,0,2010-10-11 20:20:42,40.741388,-73.989455,12973
12,0,2010-10-11 00:06:30,40.72491,-73.994621,341255
13,0,2010-10-10 22:00:37,40.729768,-73.998535,260957


### Venues

In [11]:
dfv = pd.read_csv('../../datasets/spots.txt', sep='\t', header=False)
dfv.columns = ['vid','name','loc']

coords = dfv['loc'].replace('[^0-9. -]+', '',regex=True)
coords = coords.apply(lambda x: x.split())

dfv['v_lat'] = coords.apply(lambda x: x[1])
dfv['v_lon'] = coords.apply(lambda x: x[0])

dfv = dfv.drop('loc', 1)
dfv.head()

Unnamed: 0,vid,name,v_lat,v_lon
0,1391604,Conference House Park,40.5017589436,-74.2523431778
1,1391611,Almer G. Russell Pavilion,40.5022647413,-74.2542636395
2,3612422,Conference House,40.5000644614,-74.2490418254
3,3612431,Billop House,40.5000644614,-74.2490418254
4,1391499,Biddle House,40.5054829933,-74.2541456223


### Merge venue names and checkins

In [12]:
df = pd.merge(dfc, dfv[['vid', 'name']], how='left', on='vid')

Missing venue names on the available checkins:

In [13]:
print "missing venue names: {}% of available checkins".format(len(df[pd.isnull(df['name'])])*100 / len(df))

missing venue names: 19% of available checkins


## Data Exploration

In [14]:
d = df.groupby('date').size()

In [15]:
chart({
            'chart': {
                'type': 'line',
                'marginRight': 30,
                'marginBottom': 50
            },
            'title': {
                'text':'#checkins per day in New York City'
            },
            'yAxis': {
                'type': 'linear',
                'title': {'text':'#checkins'},
                
            },
            'xAxis': {
                'categories': [str(x) for x in d.index.tolist()]
            },
            'series': [{
                'name': 'date',
                'data': d.tolist()
            }]
})

## Clustering

In [16]:
from sklearn.cluster import KMeans

In [17]:
start = datetime(2010,1,1,0,0) 
dfw = df[df['utc']>start]

In [18]:
cl = min(50, len(dfw)/8)
ml = KMeans(n_clusters=cl)
ml.fit(dfw[['lat', 'lon']])

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=50, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [19]:
labels = Counter(ml.labels_)
clusters = [ [ ml.cluster_centers_[x][1], ml.cluster_centers_[x][0], labels[x] ] for x in labels]
clusters

[[-74.028759144718549, 40.786033036363747, 480],
 [-73.954997734611197, 40.717775764575833, 3625],
 [-74.179811824538874, 40.691531765560633, 2210],
 [-73.784133202672095, 40.643429900691714, 3355],
 [-73.972033072872222, 40.757542139092955, 9832],
 [-73.988105928795804, 40.72711000704335, 9437],
 [-73.93296329449845, 40.823003772969727, 1351],
 [-73.871195935667828, 40.773744853461359, 2580],
 [-74.226380851177694, 40.883050138923743, 139],
 [-73.9912035197802, 40.67406253981278, 2324],
 [-74.160135369494967, 40.588651986014526, 358],
 [-73.81869331799831, 40.824708585451788, 531],
 [-73.983291481418945, 40.761263163035117, 15008],
 [-74.148165246936074, 40.824503066136757, 438],
 [-74.025411232710425, 40.623038911924667, 681],
 [-74.034107225088249, 40.731961366395666, 2563],
 [-73.991585926997189, 40.698408992483955, 3129],
 [-74.050484116147018, 40.910032785624601, 504],
 [-74.166500408258514, 40.740343192709901, 1121],
 [-73.79884338826777, 40.710376004573966, 630],
 [-73.97428418

In [20]:
delta=0.15
chart({
            'chart': {
                'type': 'bubble',
                'zoomType': 'xy',
                'width':800,
                'height':800,
                'plotBackgroundImage':'https://maps.googleapis.com/maps/api/staticmap?center=40.8,-74.0&zoom=11&size=800x800&maptype=roadmap'
            },
            'plotOptions': {
              'bubble': {
                'maxSize':'5%',
                'minSize':'1%'
               }
             },
            'legend': { 
              'enabled': False
            },
            'yAxis': {
              'min':40.8-delta,
              'max':40.8+delta
            },
            'xAxis': {
              'min':-74.0-delta,
              'max':-74.0+delta
            },
            'series': [{'data': clusters, 'color':"#FF0000"}]
}, '800px', '800px')

### Write data to csv file

####checkins

In [21]:
cols = ['year','month','day','isotime','uid','lat','lon','vid']
dfc[cols].to_csv('../../datasets/checkins.csv', header=False, index=False)

####venues

In [22]:
dfv[['vid','name','v_lat','v_lon']].to_csv('../../datasets/venues.csv', header=False, index=False)