In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np

# Weather

In [2]:
# Data from NYC - Central Park https://www.weather.gov/wrh/Climate?wfo=okx
df_weather = pd.read_csv('../data/external/weather_data.csv', delimiter=';')

In [3]:
df_weather = df_weather.sort_values(by=['Type', 'Month', 'Day'], ascending=[True, True, True])
df_weather.Type.value_counts()

Type
Avg Temperature Normal                 372
Daily Precipitation Normal (inches)    372
Max Temperature Normal                 372
Min Temperature Normal                 372
Name: count, dtype: int64

In [4]:
# This works because dataframe is ordered
df_weather_unify = df_weather[df_weather['Type'] == 'Daily Precipitation Normal (inches)'][['Day', 'Month', 'Value']].rename(columns={'Value' : 'daily_preciptation_normal_inches'}).reset_index(drop=True)
        
df_weather_unify['max_temperature_normal_f'] = df_weather[df_weather['Type'] == 'Max Temperature Normal']['Value'].values

df_weather_unify['min_temperature_normal_f'] = df_weather[df_weather['Type'] == 'Min Temperature Normal']['Value'].values

df_weather_unify['avg_temperature_normal_f'] = df_weather[df_weather['Type'] == 'Avg Temperature Normal']['Value'].values

df_weather_unify['Year'] = 2016

In [5]:
df_weather_unify = df_weather_unify.replace('-', '0')

In [11]:
df_weather_unify.to_csv('../data/external/weather_unify.csv')

# Streets Address

In [2]:
df = pd.read_csv('../data/external/Centerline.csv', dtype=str)

In [3]:
df['list_coords'] = df['the_geom'].str.replace('MULTILINESTRING ((', '').str.replace('))', '').str.split(',')

In [4]:
def extract_coords(X):
    np_arr_lst_coords = X['list_coords'].to_numpy(dtype=object)

    lst_lat_long = []
    for index, coords in enumerate(tqdm(np_arr_lst_coords)):
        lst_lat_long += list(zip(coords, [index]*len(coords)))

    return lst_lat_long

In [5]:
lst_ = extract_coords(df)

  0%|          | 0/121932 [00:00<?, ?it/s]

100%|██████████| 121932/121932 [00:00<00:00, 357126.56it/s]


In [6]:
df_coords = pd.DataFrame(lst_, columns=['lat_long', 'index'])

In [7]:
df_coords[['latitude', 'longitude']] = pd.DataFrame([[lat, long] for lat, long in df_coords['lat_long'].str.strip().str.split(' ')]).astype(np.float64)

In [8]:
df_coords.head()

Unnamed: 0,lat_long,index,latitude,longitude
0,-74.01793061274537 40.70617486563979,0,-74.017931,40.706175
1,-74.01786933958108 40.70634871855227,0,-74.017869,40.706349
2,-74.01778897787625 40.70651868456784,0,-74.017789,40.706519
3,-74.01769028387064 40.70668296244674,0,-74.01769,40.706683
4,-74.01757437049282 40.70683986715218,0,-74.017574,40.70684


In [9]:
df = df_coords.merge(df.reset_index(),on='index' , how='right').drop(columns=['list_coords', 'index', 'the_geom'])

In [10]:
df.head()

Unnamed: 0,lat_long,latitude,longitude,PHYSICALID,L_LOW_HN,L_HIGH_HN,R_LOW_HN,R_HIGH_HN,L_ZIP,R_ZIP,...,PRE_DIRECT,PRE_TYPE,POST_TYPE,POST_DIREC,POST_MODIF,FULL_STREE,ST_NAME,BIKE_TRAFD,SEGMENT_TY,SHAPE_Leng
0,-74.01793061274537 40.70617486563979,-74.017931,40.706175,3,50,64,51,63,10280,10280,...,,,PL,,,BATTERY PL,BATTERY,,U,262.77781228
1,-74.01786933958108 40.70634871855227,-74.017869,40.706349,3,50,64,51,63,10280,10280,...,,,PL,,,BATTERY PL,BATTERY,,U,262.77781228
2,-74.01778897787625 40.70651868456784,-74.017789,40.706519,3,50,64,51,63,10280,10280,...,,,PL,,,BATTERY PL,BATTERY,,U,262.77781228
3,-74.01769028387064 40.70668296244674,-74.01769,40.706683,3,50,64,51,63,10280,10280,...,,,PL,,,BATTERY PL,BATTERY,,U,262.77781228
4,-74.01757437049282 40.70683986715218,-74.017574,40.70684,3,50,64,51,63,10280,10280,...,,,PL,,,BATTERY PL,BATTERY,,U,262.77781228


In [11]:
df.to_parquet('../data/external/Centerline_unify.parquet.gzip', compression='gzip', index=False)