# PRÁCTICA RETO:
## POR ALBERTO PICO LARA Y KARLA ALEJANDRA MONTER BENITEZ.

In [1]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 500)

data_path = '../../data/modulo3/Citibike NY/'
frames = [f for f in os.listdir(data_path) if '.csv' in f]
df = pd.DataFrame()

for f in frames:
    df = pd.concat([df, pd.read_csv(data_path + f)])

In [2]:
tr = df[['start station id', 'start station name', 'start station latitude', 'start station longitude', 'tripduration']].copy()
tr.columns = ['id', 'name', 'slat', 'slong', 'trip_d']

In [3]:
stations = (tr[[c for c in tr.columns if c != 'id']]
            .groupby('name').agg({'slat':pd.Series.mode,
                                  'slong':pd.Series.mode}))

In [4]:
stations.head(5)

Unnamed: 0_level_0,slat,slong
name,Unnamed: 1_level_1,Unnamed: 2_level_1
5 Corners Library,40.734961,-74.059503
Astor Place,40.719282,-74.071262
Baldwin at Montgomery,40.723659,-74.064194
Bergen Ave,40.722104,-74.071455
Brunswick & 6th,40.726012,-74.050389


In [5]:
stations.reset_index(inplace=True, level=['name'])

In [6]:
stations.head(5)

Unnamed: 0,name,slat,slong
0,5 Corners Library,40.734961,-74.059503
1,Astor Place,40.719282,-74.071262
2,Baldwin at Montgomery,40.723659,-74.064194
3,Bergen Ave,40.722104,-74.071455
4,Brunswick & 6th,40.726012,-74.050389


In [7]:
def media(x:list):
    return(sum(x)/len(x))

In [8]:
def mapeo(S:list, m:int, n:int, f=media):
    Z = []
    for i in range(n):
        Z.append(f([S[j][i] for j in range(m)]))
    return(Z)

In [9]:
# Función de la distancia. 

from math import radians, cos, sin, asin, sqrt
import pandas as pd

def lat_long_to_distance(latitudes_1, latitudes_2, longitudes_1, longitudes_2,
                        units = 'Kilometers', vector=False):
    
    '''This function obtains the distance between two Earth points using Haversine
        formula it has as inputs the series of data.
        
        Args:
        1. latitudes_1: an array containing the start latitudes.
        2. latitudes_2: an array containing the end latitudes.
        3. longitudes_1: an array containing the start longitudes.
        4. longitudes_2: an array containing the end longitudes.
        5. units: the unit for the result. Possible values 'Kilometers' 'Miles'
        
        Returns:
        1. A pandas series with the distance in kilometers.'''
    
    if vector:
        la1 = len(latitudes_1)
        la2 = len(latitudes_2)
        lg1 = len(longitudes_1)
        lg2 = len(longitudes_2)
    else:
        la1=la2=lg1=lg2=0
    
    if (la1 == la2 == lg1 == lg2):
        
        #Radians conversion:
        lat1 = pd.Series(latitudes_1).map(radians)
        lat2 = pd.Series(latitudes_2).map(radians)
        lon1 = pd.Series(longitudes_1).map(radians)
        lon2 = pd.Series(longitudes_2).map(radians)
        
        #Haversine:
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        #a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
        a = (dlat.map(lambda x:sin(x/2)**2) + lat1.map(cos) * lat2.map(cos) * 
             dlon.map(lambda x:sin(x/2)**2))
        
        #c = 2 * asin(sqrt(a))
        c = a.map(sqrt).map(asin) * 2
        
        #Earth Radius:
        if units == 'Kilometers':
            r = 6371
        elif units == 'Miles':
            r = 3956
        else: return('Invalid units.')
        
        if vector:return(c*r)
        else:
            result = c*r
            return(result[0])
        
    else:
        
        print(la1, la2, lg1, lg2)
        return('Invalid dimensions.')

In [80]:
%%time
import itertools
combinaciones = list(itertools.combinations(np.arange(0, 53), 2))

CPU times: user 229 µs, sys: 16 µs, total: 245 µs
Wall time: 220 µs


In [82]:
%%time
names = list(stations['name'])
distances = []
for i in combinaciones:
    v1 = list(stations.loc[i[0], ['slat', 'slong']])
    v2 = list(stations.loc[i[1], ['slat', 'slong']])
    distances.append(    lat_long_to_distance(v1[0], v2[0], v1[1], v2[1]))

CPU times: user 3.39 s, sys: 866 µs, total: 3.39 s
Wall time: 3.4 s


In [83]:
%%time
names = pd.DataFrame(names, columns =['station_name'])
com = pd.DataFrame(combinaciones, columns =['index_1', 'index_2'])
ke = com.index[com['index_1']].tolist()
ke2 = com.index[com['index_2']].tolist()
names = pd.DataFrame(names, columns =['station_name'])
mmm = []
for i in ke:
    mmm.append(names['station_name'][i])

CPU times: user 13.9 ms, sys: 45 µs, total: 13.9 ms
Wall time: 13.3 ms


In [84]:
%%time
mmm2 = []
for i in ke2:
    mmm2.append(names['station_name'][i])
com['name_1'] = mmm
com['name_2'] = mmm2
com['Vectores'] = (com.name_1.astype(str) + ',' + com.name_2.astype(str))
com['distances'] = distances
com.loc[com['distances'] == com['distances'].min()].values

CPU times: user 13.9 ms, sys: 1.12 ms, total: 15.1 ms
Wall time: 13.7 ms


array([[8, 14, 'Columbus Dr at Exchange Pl', 'Exchange Place',
        'Columbus Dr at Exchange Pl,Exchange Place', 0.0882636199205624]],
      dtype=object)

In [85]:
com

Unnamed: 0,index_1,index_2,name_1,name_2,Vectores,distances
0,0,1,5 Corners Library,Astor Place,"5 Corners Library,Astor Place",2.005315
1,0,2,5 Corners Library,Baldwin at Montgomery,"5 Corners Library,Baldwin at Montgomery",1.317442
2,0,3,5 Corners Library,Bergen Ave,"5 Corners Library,Bergen Ave",1.748778
3,0,4,5 Corners Library,Brunswick & 6th,"5 Corners Library,Brunswick & 6th",1.256998
4,0,5,5 Corners Library,Brunswick St,"5 Corners Library,Brunswick St",1.411998
...,...,...,...,...,...,...
1373,49,51,Van Vorst Park,Washington St,"Van Vorst Park,Washington St",1.217123
1374,49,52,Van Vorst Park,York St,"Van Vorst Park,York St",0.605005
1375,50,51,Warren St,Washington St,"Warren St,Washington St",0.413682
1376,50,52,Warren St,York St,"Warren St,York St",0.573888


1. Cada observación es un cluster - 53 clusters
2. Encontrar los más cercanos, formar un cluster entre ellos - 52 clusters
3. Encontrar el cluster más cercano al cluster del paso 2 - 51 clusters
4. Repetir paso 3 hasta que el conteo de clusters = 1

In [186]:
cluster = stations.copy()

In [187]:
# Cada observación es un cluster - 53 clusters
stations.shape[0]

53

In [188]:
%%time
# 2. Encontrar los más cercanos, formar un cluster entre ellos - 52 cluster
minimo = com.loc[com['distances'] == com['distances'].min()]
v = minimo.values
index_1 = v[0][0]
index_2 = v[0][1]
cluster0 = mapeo([list(stations.loc[index_1, ['slat', 'slong']]) , list(stations.loc[index_2, ['slat', 'slong']])], 2, 2)

CPU times: user 3.26 ms, sys: 10 µs, total: 3.27 ms
Wall time: 3.14 ms


In [189]:
cluster0 

[40.71655845, -74.0331344]

In [190]:
cluster.drop([index_1, index_2], inplace=True)

In [191]:
cluster.shape

(51, 3)

In [192]:
cluster0.insert(0, 'c0')

In [193]:
cluster = cluster.append(pd.Series(cluster0, index = ['name', 'slat', 'slong']), ignore_index=True)
cluster.shape # 

(52, 3)

In [194]:
%%time
# 3. Encontrar el cluster más cercano al cluster del paso 2 - 51 clusters
distancias = []
for i in list(cluster.index):
    v = cluster.loc[[i], ['slat', 'slong']].values
    distancias.append(lat_long_to_distance(cluster0[1], v[0][0], cluster0[2], v[0][1]))
    pass

CPU times: user 134 ms, sys: 5.09 ms, total: 139 ms
Wall time: 136 ms


In [195]:
cluster['distance'] = distancias

In [196]:
cluster.shape

(52, 4)

In [197]:
aux = cluster[:-1]

In [198]:
%%time
minimo = aux.loc[aux['distance'] == aux['distance'].min()]

cluster0 = mapeo([list(cluster.iloc[-1])[1:3] , list(aux.loc[minimo.index.values[0], ['slat', 'slong']])], 2, 2)

CPU times: user 2.57 ms, sys: 26 µs, total: 2.59 ms
Wall time: 2.41 ms


In [199]:
cluster0.insert(0, 'c0')

In [200]:
cluster.drop(minimo.index, inplace=True)

In [201]:
cluster.shape

(51, 4)

In [202]:
cluster = cluster.append(pd.Series(cluster0, index = ['name', 'slat', 'slong']), ignore_index=True)

In [203]:
cluster.shape

(52, 4)

In [204]:
aux.shape

(51, 4)

In [205]:
aux.loc[[1], ['slat', 'slong']].values

array([[ 40.7192822 , -74.07126188]])

In [206]:
%%time
# repetir 3 hasta que haya un solo cluster
cont = 1
while len(cluster['name'].unique()) > 1:
    distancias = []
    for i in list(cluster.index):
        v = cluster.loc[[i], ['slat', 'slong']].values
        distancias.append(lat_long_to_distance(cluster0[1], v[0][0], cluster0[2], v[0][1]))
        pass
    cluster['distance'] = distancias
    aux = cluster[:-cont]
    minimo = aux.loc[aux['distance'] == aux['distance'].min()]
    cluster0 = mapeo([list(cluster.iloc[-1])[1:3] , list(aux.loc[minimo.index.values[0], ['slat', 'slong']])], 2, 2)
    cluster0.insert(0, 'c0')
    cluster.drop(minimo.index, inplace=True)
    cluster = cluster.append(pd.Series(cluster0, index = ['name', 'slat', 'slong']), ignore_index=True)
    cont += 1

CPU times: user 6.86 s, sys: 16.2 ms, total: 6.88 s
Wall time: 6.87 s


In [207]:
cluster

Unnamed: 0,name,slat,slong,distance
0,c0,40.715352,-74.033343,2.112744
1,c0,40.715955,-74.033239,2.129793
2,c0,40.717603,-74.033736,2.12208
3,c0,40.716391,-74.03571,1.93184
4,c0,40.717373,-74.037312,1.823244
5,c0,40.716931,-74.039181,1.65849
6,c0,40.715758,-74.040999,1.4807
7,c0,40.716745,-74.042422,1.38935
8,c0,40.718166,-74.04277,1.413996
9,c0,40.719908,-74.042827,1.494879
