In [1]:
import numpy as np
import pandas as pd

# Notebook display limits: show up to 50 columns and cap displayed cell width at 50 characters.
# Both options are addressed by their fully-qualified names; a shorthand such as
# "max_colwidth" only resolves while it matches exactly one registered option.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 50)

In [2]:
# Read the comma-separated file data_100000.csv into the working DataFrame `df_trees`.
df_trees = pd.read_csv("data_100000.csv", sep=',')

In [3]:
# Convert 'created_at' to datetime64[ns]; this column is later renamed to
# 'creation_date' and used as the sort key.
df_trees['created_at'] = df_trees['created_at'].astype('datetime64[ns]')

In [4]:
# Display the row index of the loaded frame.
df_trees.index

RangeIndex(start=0, stop=100000, step=1)

In [5]:
# Replace the abbreviated source column names with longer, readable ones.
# NOTE(review): 'census_track' may be a misspelling of 'census_tract'; it becomes an
# output column header, so confirm with consumers before changing it.
df_trees = df_trees.rename(
    columns={
        'created_at': 'creation_date',
        'tree_dbh': 'breast_diameter',
        'stump_diam': 'stump_diameter',
        'curb_loc': 'curb_location',
        'spc_latin': 'latin_name',
        'spc_common': 'common_name',
        'trnk_wire': 'trunk_wire',
        'trnk_light': 'trunk_light',
        'trnk_other': 'trunk_other',
        'brnch_ligh': 'branch_light',
        'brnch_shoe': 'branch_shoe',
        'brnch_othe': 'branch_other',
        'cb_num': 'com_board',
        'cncldist': 'council_distr',
        'st_assem': 'state_assem',
        'st_senate': 'state_senate',
        'nta': 'nta_code',
        'boro_ct': 'census_track',
        'x_sp': 'X_sp_coord',
        'y_sp': 'Y_sp_coord',
    }
)

In [6]:
def fill_na(column_name:str):
    """Replace missing values in one column of the global ``df_trees`` with ``'unknown'``.

    The column is overwritten on ``df_trees`` itself; the stored column is also
    returned so that callers may assign it back.

    Parameters
    ----------
    column_name : str
        Label of the ``df_trees`` column to fill.

    Returns
    -------
    pandas.Series
        The column as held in ``df_trees`` after filling.
    """
    filled_column = df_trees[column_name].fillna('unknown')
    df_trees[column_name] = filled_column
    return df_trees[column_name]

# Fill missing values in each of these columns with the 'unknown' placeholder via fill_na.
for column in ('health', 'latin_name', 'common_name', 'steward', 'guards', 'sidewalk', 'problems'):
    df_trees[column] = fill_na(column)

In [7]:
# Order rows by 'creation_date' (ascending, the sort_values default).
df_trees = df_trees.sort_values(by='creation_date')

In [8]:
# Discard the old index labels (shuffled by the sort) and renumber rows 0..n-1 in the new order.
df_trees = df_trees.reset_index(drop=True)

In [9]:
def values_to_bool(column_name:str):
    """Convert a ``'Yes'``/``'No'`` column of the global ``df_trees`` to booleans.

    ``'Yes'`` becomes ``True`` and ``'No'`` becomes ``False``. The column is
    overwritten on ``df_trees`` and the stored column is returned so callers
    may assign it back.

    Missing or unrecognised values are kept as ``<NA>`` rather than being
    coerced: a plain ``astype('bool')`` treats NaN (and any non-empty string)
    as truthy, which would silently report them as ``True``.

    Parameters
    ----------
    column_name : str
        Label of the ``df_trees`` column to convert.

    Returns
    -------
    pandas.Series
        ``bool`` dtype when every value was ``'Yes'``/``'No'`` (or the column
        was already boolean); nullable ``boolean`` dtype when some values
        could not be mapped.
    """
    column = df_trees[column_name]
    # An already-boolean column (e.g. the cell was re-run) needs no conversion.
    if not pd.api.types.is_bool_dtype(column):
        # Values absent from the mapping come back as NaN.
        flags = column.map({'Yes': True, 'No': False})
        if flags.notna().all():
            column = flags.astype('bool')
        else:
            # Nullable dtype keeps the gaps visible instead of turning them into True.
            column = flags.astype('boolean')
    df_trees[column_name] = column
    return df_trees[column_name]

# Convert each of these 'Yes'/'No' flag columns to booleans via values_to_bool.
for column in (
    'root_stone', 'root_grate', 'root_other',
    'trunk_wire', 'trunk_light', 'trunk_other',
    'branch_light', 'branch_shoe', 'branch_other',
):
    df_trees[column] = values_to_bool(column)

In [10]:
def to_lower(column_name:str):
    """Lower-case the text in one column of the global ``df_trees``.

    The column is overwritten on ``df_trees`` and the stored column is
    returned so callers may assign it back.

    Parameters
    ----------
    column_name : str
        Label of the ``df_trees`` text column to lower-case.

    Returns
    -------
    pandas.Series
        The column as held in ``df_trees`` after lower-casing; missing
        values stay missing.
    """
    # The vectorised .str accessor passes missing values through unchanged,
    # whereas calling x.lower() on each element raises AttributeError on NaN.
    df_trees[column_name] = df_trees[column_name].str.lower()
    return df_trees[column_name]

# Lower-case each of these text columns via to_lower.
for column in (
    'common_name', 'latin_name', 'curb_location', 'status', 'health', 'steward',
    'guards', 'sidewalk', 'user_type', 'address', 'state',
):
    df_trees[column] = to_lower(column)

In [11]:
# Build the "<common name> - <latin name>" label with vectorised string concatenation
# rather than a Python-level join per row, and assign through [] instead of attribute
# access. Both inputs are expected to be fully populated strings here; a missing value
# on either side yields NaN for that row.
df_trees['common_name'] = df_trees['common_name'] + ' - ' + df_trees['latin_name']

In [12]:
# 'common_name' now holds the combined "<common> - <latin>" label, so rename it to 'tree_name'.
df_trees = df_trees.rename(columns={'common_name': 'tree_name'})

In [13]:
# Remove these six columns before export ('latin_name' is already folded into the
# combined name column). The result is reassigned rather than dropped with
# inplace=True, matching how the other cells in this notebook update df_trees.
df_trees = df_trees.drop(
    columns=['zip_city', 'boroname', 'nta_name', 'latin_name', 'the_geom', 'problems']
)

In [14]:
# Write the cleaned frame to nyc_trees_dataset.csv. With default arguments to_csv also
# writes the 0..n-1 row index as an unnamed first column.
# NOTE(review): pass index=False if downstream readers do not need that column — confirm.
df_trees.to_csv('nyc_trees_dataset.csv')