In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import MeanShift

In [2]:
# Location of the pipe-delimited check-ins dump (relative path).
dat_file = "checkins.dat"

In [3]:
# Parse the pipe-delimited dump into `df`.
#
# The first line of the file supplies the column names. A line counts as a data
# row only when it starts with a digit; every other line (blank lines included)
# is skipped. Fields 3 and 4 are read as latitude and longitude and converted to
# float; rows where either one is empty are dropped.
#
# Rows are collected in a plain list and the DataFrame is built once at the end.
# Assigning rows one at a time with `df.loc[i] = ...` into a preallocated frame
# is extremely slow for a file of this size.
with open(dat_file, 'r') as f:
    columns = [c.strip() for c in f.readline().split('|')]
    records = []
    for raw_line in f:
        line = raw_line.strip()
        # Guard the empty string first: indexing line[0] on a blank line would
        # raise IndexError.
        if not line or not line[0].isdigit():
            continue
        values = [v.strip() for v in line.split('|')]
        if len(values) != len(columns) or len(values) < 5:
            # Wrong number of fields: report the offending line and move on.
            print(line)
            continue
        lat, lon = values[3], values[4]
        # A row without coordinates is useless for clustering.
        if not lat or not lon:
            continue
        try:
            lat, lon = float(lat), float(lon)
        except ValueError:
            # Only a malformed number is expected here; any other exception
            # should surface instead of being silently swallowed.
            print(line)
            continue
        records.append(values[:3] + [lat, lon] + values[5:])

df = pd.DataFrame(records, columns=columns)




In [4]:
# Remove every row that contains a missing value (mutates df in place),
# then report how many rows are left.
df.dropna(axis=0, how='any', inplace=True)
print(df.shape[0])

396634


In [9]:
# Keep only the first `limit` rows by position.
# Note: this is a head slice, not a random sample.
limit = 100_000
df_sampled = df.head(limit)

In [10]:
# Row count of the subset.
df_sampled.shape[0]

100000

In [12]:
# Restrict the clustering input to the two coordinate columns.
coordinate_columns = ['latitude', 'longitude']
df_to_cluster = df_sampled[coordinate_columns]

In [13]:
# Mean-shift clustering of the raw coordinate array with bandwidth 0.1.
# fit() trains the estimator in place, so the fitted object stays bound to `clustering`.
clustering = MeanShift(bandwidth=0.1)
clustering.fit(df_to_cluster.values)

In [30]:
# (number of cluster centres, number of labelled points)
(clustering.cluster_centers_.shape[0], clustering.labels_.shape[0])

(3231, 100000)

In [36]:
# preview the first six cluster centres; each row follows the column order
# passed to fit (latitude, longitude). No size filtering happens in this cell.
clustering.cluster_centers_[:6]

array([[  40.7177164 ,  -73.99183542],
       [  33.44943805, -112.00213969],
       [  33.44638027, -111.90188756],
       [  41.87824378,  -87.62984336],
       [  37.68868157, -122.40933037],
       [  38.88616522,  -77.04878333]])

In [33]:
# Distinct cluster labels in ascending order.
sorted({label for label in clustering.labels_})

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [22]:
from collections import Counter

In [24]:
# Tally how many points carry each label, then keep the labels whose cluster
# holds more than 15 points (order follows first appearance in labels_).
counts = Counter(clustering.labels_)
frequent_clusters = [label for label, size in counts.items() if size > 15]

In [37]:
# (how many labels passed the size filter, the labels themselves)
(len(frequent_clusters), frequent_clusters)

(592,
 [5,
  7,
  30,
  65,
  1,
  23,
  0,
  2,
  8,
  137,
  237,
  11,
  507,
  45,
  22,
  32,
  47,
  4,
  51,
  48,
  3,
  403,
  38,
  6,
  80,
  146,
  10,
  264,
  14,
  352,
  12,
  189,
  28,
  20,
  405,
  190,
  59,
  209,
  133,
  93,
  151,
  475,
  371,
  148,
  353,
  259,
  519,
  79,
  416,
  162,
  13,
  170,
  17,
  311,
  122,
  423,
  184,
  367,
  39,
  104,
  60,
  366,
  69,
  9,
  109,
  61,
  522,
  168,
  68,
  16,
  485,
  150,
  128,
  284,
  100,
  446,
  66,
  21,
  147,
  41,
  19,
  236,
  37,
  31,
  204,
  509,
  281,
  49,
  26,
  292,
  427,
  233,
  448,
  42,
  50,
  81,
  186,
  194,
  44,
  72,
  1350,
  15,
  205,
  18,
  116,
  308,
  27,
  246,
  267,
  551,
  302,
  84,
  92,
  239,
  25,
  554,
  269,
  276,
  58,
  330,
  24,
  339,
  213,
  200,
  291,
  223,
  166,
  36,
  199,
  46,
  43,
  144,
  124,
  262,
  159,
  55,
  78,
  518,
  289,
  420,
  167,
  468,
  254,
  242,
  365,
  131,
  35,
  268,
  295,
  218,
  165,
  331,
  24

In [41]:
# Centre coordinates of the retained clusters: select rows by label, keep all columns.
clusters_to_use = clustering.cluster_centers_[frequent_clusters, :]

In [42]:
# Display the selected cluster centres.
clusters_to_use

array([[  38.88616522,  -77.04878333],
       [  33.76663623,  -84.39328918],
       [  45.52348321, -122.67628042],
       ...,
       [  50.1115118 ,    8.6805059 ],
       [  42.0166667 ,  -94.3766667 ],
       [  37.2046429 ,  -80.4126892 ]])

In [58]:
# Number of retained cluster centres.
clusters_to_use.shape[0]

592

In [27]:
# Office coordinates as (lat, lon) pairs, one pair per office.
office_coords = [
    (33.751277, -118.188740),
    (25.867736, -80.324116),
    (51.503016, -0.075479),
    (52.378894, 4.885084),
    (39.366487, 117.036146),
    (-33.868457, 151.205134),
]

# Split the pairs into the column-oriented mapping the DataFrame is built from.
latitudes, longitudes = zip(*office_coords)
d = {'lat': list(latitudes), 'lon': list(longitudes)}

offices = pd.DataFrame(data=d)

In [28]:
# Display the office coordinates table.
offices

Unnamed: 0,lat,lon
0,33.751277,-118.18874
1,25.867736,-80.324116
2,51.503016,-0.075479
3,52.378894,4.885084
4,39.366487,117.036146
5,-33.868457,151.205134


In [59]:
# Empty accumulator for (distance, centroid, office) tuples.
distances = list()

In [60]:
# Build one (distance, centroid, office) tuple for every centroid/office pair.
#
# The list is rebuilt from scratch inside this cell, so re-running the cell
# cannot append a second copy of every pair to a list created elsewhere.
#
# The distance is the plain Euclidean norm of the (lat, lon) difference, so it
# is expressed in degrees rather than as a ground distance.
# Iteration order: centroids in the outer loop, offices in the inner loop.
distances = [
    (np.linalg.norm(office - cluster_centroid), cluster_centroid, office)
    for cluster_centroid in clusters_to_use
    for _office_id, office in offices.iterrows()
]

In [61]:
# Number of (distance, centroid, office) tuples collected.
len(distances)

3552

In [63]:
# Sanity check: tuples per office, which should equal len(clusters_to_use).
# Derived from the live objects instead of numbers copied out of earlier
# outputs, so the check stays valid when the data or the filter changes.
len(distances) / len(offices)

592.0

In [65]:
# Order the tuples by ascending distance. Only the first element is used as the
# sort key, so the centroid arrays and office rows are never compared.
distances_sorted = list(distances)
distances_sorted.sort(key=lambda entry: entry[0])

In [66]:
# Display every tuple, closest centroid/office pair first.
distances_sorted

[(0.007834758163107856,
  array([-33.86063043, 151.20477593]),
  lat    -33.868457
  lon    151.205134
  Name: 5, dtype: float64),
 (0.009353316185992226,
  array([52.37296399,  4.89231722]),
  lat    52.378894
  lon     4.885084
  Name: 3, dtype: float64),
 (0.022674066158385495,
  array([ 25.84567226, -80.3188906 ]),
  lat    25.867736
  lon   -80.324116
  Name: 1, dtype: float64),
 (0.05005829482278787,
  array([51.50299126, -0.12553729]),
  lat    51.503016
  lon    -0.075479
  Name: 2, dtype: float64),
 (0.07084773242719973,
  array([  33.80987796, -118.14892381]),
  lat     33.751277
  lon   -118.188740
  Name: 0, dtype: float64),
 (0.13410903336184654,
  array([ 25.78581242, -80.21793804]),
  lat    25.867736
  lon   -80.324116
  Name: 1, dtype: float64),
 (0.1674059642503429,
  array([ 25.70534972, -80.28342874]),
  lat    25.867736
  lon   -80.324116
  Name: 1, dtype: float64),
 (0.18887596060185083,
  array([ 26.01009825, -80.19999059]),
  lat    25.867736
  lon   -80.324116
