In [35]:
import numpy as np
import pandas as pd
from typing import List

# Display settings for rendered frames: show up to 50 columns and truncate
# cell text after 50 characters.  Both options are addressed by their full
# dotted names; abbreviated names are resolved by pattern matching and can
# become ambiguous if pandas later adds an option with a similar name.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 50)

In [36]:
# Load the raw tree records; the file is comma-separated, which is already
# read_csv's default delimiter.
df_trees = pd.read_csv("data_100000.csv")

In [37]:
# Cast the creation timestamps to datetime64[ns]; every other column keeps
# its dtype.
df_trees = df_trees.astype({'created_at': 'datetime64[ns]'})

In [38]:
# Display the frame's index to check how the rows are labelled after loading.
df_trees.index

RangeIndex(start=0, stop=100000, step=1)

In [39]:
# Replace the abbreviated source column names with descriptive ones
# (source name -> notebook name).
df_trees = df_trees.rename(
    columns={
        'created_at': 'creation_date',
        'tree_dbh': 'breast_diameter',
        'stump_diam': 'stump_diameter',
        'curb_loc': 'curb_location',
        'spc_latin': 'latin_name',
        'spc_common': 'common_name',
        'trnk_wire': 'trunk_wire',
        'trnk_light': 'trunk_light',
        'trnk_other': 'trunk_other',
        'brnch_ligh': 'branch_light',
        'brnch_shoe': 'branch_shoe',
        'brnch_othe': 'branch_other',
        'cb_num': 'com_board',
        'cncldist': 'council_distr',
        'st_assem': 'state_assem',
        'st_senate': 'state_senate',
        'nta': 'nta_code',
        'boro_ct': 'census_track',
        'x_sp': 'X_sp_coord',
        'y_sp': 'Y_sp_coord',
    }
)

In [40]:
# Columns whose missing entries get replaced with the placeholder 'unknown'.
fill_list = [
    'health',
    'latin_name',
    'common_name',
    'steward',
    'guards',
    'sidewalk',
    'problems',
]

def fill_na(column_name: str):
    """Replace missing values in one ``df_trees`` column with 'unknown'.

    The filled column is written back into the module-level ``df_trees``
    frame and then returned.

    Parameters
    ----------
    column_name : str
        Label of the ``df_trees`` column to fill.

    Returns
    -------
    pandas.Series
        The filled column, read back from ``df_trees``.
    """
    filled = df_trees[column_name].fillna('unknown')
    df_trees[column_name] = filled
    return df_trees[column_name]

# fill_na() stores the filled column in df_trees itself, so calling it is
# enough; its return value does not need to be assigned again.
for column_name in fill_list:
    fill_na(column_name)

In [41]:
# Order the records chronologically by creation date (ascending).
df_trees = df_trees.sort_values('creation_date')

In [42]:
# Discard the old index labels (drop=True keeps them from becoming a column)
# so the rows are numbered consecutively from 0 in their current order.
df_trees = df_trees.reset_index(drop=True)

In [43]:
# 'Yes'/'No' flag columns that are converted to booleans.
bool_list = [
    'root_stone',
    'root_grate',
    'root_other',
    'trunk_wire',
    'trunk_light',
    'trunk_other',
    'branch_light',
    'branch_shoe',
    'branch_other',
]

def values_to_bool(column_name:str):
    """Convert a 'Yes'/'No' column of ``df_trees`` into a ``bool`` column.

    'Yes' becomes True; every other value (including 'No', missing values
    and unexpected text) becomes False.  A column that already has a boolean
    dtype is kept as it is, so re-running the cell is harmless.

    The previous ``replace`` + ``astype('bool')`` chain silently turned NaN
    and any other non-empty string into True, and relied on implicit
    object-to-bool downcasting inside ``replace``, which recent pandas
    versions deprecate.

    Parameters
    ----------
    column_name : str
        Label of the ``df_trees`` column to convert.

    Returns
    -------
    pandas.Series
        The converted column (dtype ``bool``), read back from ``df_trees``.
    """
    column = df_trees[column_name]
    if not pd.api.types.is_bool_dtype(column):
        # Explicit comparison: only the literal 'Yes' counts as True, and
        # missing entries compare unequal, i.e. end up False.
        column = column.eq('Yes').fillna(False)
    df_trees[column_name] = column.astype('bool')
    return df_trees[column_name]

# values_to_bool() stores the converted column in df_trees itself, so its
# return value does not need to be assigned again.
for column_name in bool_list:
    values_to_bool(column_name)

In [44]:
# Columns whose text is normalised to lower case.
lower_list = [
    'common_name',
    'latin_name',
    'curb_location',
    'status',
    'health',
    'steward',
    'guards',
    'sidewalk',
    'user_type',
    'address',
    'state',
]

def to_lower(column_name:str):
    """Lower-case the text in one ``df_trees`` column.

    Uses the vectorised ``.str`` accessor instead of a per-element lambda.
    The lambda raised ``AttributeError`` on the first missing value (NaN is
    a float and has no ``.lower()``); here missing and other non-string
    entries are left unchanged.

    Parameters
    ----------
    column_name : str
        Label of the ``df_trees`` column to normalise.

    Returns
    -------
    pandas.Series
        The lower-cased column, read back from ``df_trees``.
    """
    column = df_trees[column_name]
    lowered = column.str.lower()
    # .str.lower() yields NaN for anything that is not a string; fall back to
    # the original entry there so missing and non-text values stay untouched.
    df_trees[column_name] = lowered.where(lowered.notna(), column)
    return df_trees[column_name]

# to_lower() stores the lower-cased column in df_trees itself, so its
# return value does not need to be assigned again.
for column_name in lower_list:
    to_lower(column_name)

In [45]:
# Build "<common name> - <latin name>" labels with vectorised string
# concatenation instead of a row-wise agg(join), which runs a Python-level
# call for every row.  Bracket assignment is used because attribute-style
# assignment only updates a column when that column already exists.
df_trees['common_name'] = df_trees['common_name'] + ' - ' + df_trees['latin_name']

In [46]:
# The column now holds the combined common/latin label, so rename it.
df_trees = df_trees.rename({'common_name': 'tree_name'}, axis='columns')

In [47]:
# Remove these columns from the frame before it is exported.
df_trees = df_trees.drop(
    columns=['zip_city', 'boroname', 'nta_name', 'latin_name', 'the_geom', 'problems']
)

In [48]:
# Write the cleaned frame to a CSV file in the current working directory.
# NOTE(review): to_csv() also writes the row index as an unnamed first column
# by default; consider index=False if downstream readers do not expect that
# column — confirm with consumers before changing the file layout.
df_trees.to_csv('nyc_trees_dataset.csv')

In [50]:
# Preview the first rows of the cleaned frame.
df_trees.head()

Unnamed: 0,creation_date,tree_id,block_id,breast_diameter,stump_diameter,curb_location,status,health,tree_name,steward,guards,sidewalk,user_type,root_stone,root_grate,root_other,trunk_wire,trunk_light,trunk_other,branch_light,branch_shoe,branch_other,address,zipcode,com_board,borocode,council_distr,state_assem,state_senate,nta_code,census_track,state,latitude,longitude,X_sp_coord,Y_sp_coord
0,2015-05-19,311,501897,4,0,oncurb,alive,good,siberian elm - ulmus pumila,none,helpful,damage,treescount staff,False,False,False,False,False,False,False,False,False,1038 rogers place,10459,202,2,17,79,32,BX33,2013100,new york,40.82438,-73.897406,1012644.0,239637.705814
1,2015-05-19,317,501897,5,0,oncurb,alive,fair,amur maackia - maackia amurensis,none,none,nodamage,treescount staff,False,False,False,False,False,False,False,False,False,1066 hall place,10459,202,2,17,79,32,BX33,2013100,new york,40.825308,-73.897495,1012619.0,239975.852119
2,2015-05-19,306,501897,5,0,oncurb,alive,good,siberian elm - ulmus pumila,none,helpful,nodamage,treescount staff,False,False,False,False,False,False,False,False,False,1010 rogers place,10459,202,2,17,79,32,BX33,2013100,new york,40.823933,-73.897177,1012708.0,239474.904302
3,2015-05-19,347,501893,5,0,oncurb,alive,fair,american hornbeam - carpinus caroliniana,none,none,damage,treescount staff,False,False,False,False,False,False,False,False,False,941 simpson street,10459,202,2,17,85,32,BX27,2015900,new york,40.821445,-73.892916,1013888.0,238569.793579
4,2015-05-19,7,501911,3,0,oncurb,alive,poor,silver birch - betula pendula,3or4,none,nodamage,treescount staff,False,False,False,False,False,False,False,False,False,1242 spofford avenue,10474,202,2,17,84,34,BX27,2009300,new york,40.814092,-73.889123,1014942.0,235892.23299


In [51]:
# Summarise column dtypes and non-null counts of the cleaned frame.
df_trees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 36 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   creation_date    100000 non-null  datetime64[ns]
 1   tree_id          100000 non-null  int64         
 2   block_id         100000 non-null  int64         
 3   breast_diameter  100000 non-null  int64         
 4   stump_diameter   100000 non-null  int64         
 5   curb_location    100000 non-null  object        
 6   status           100000 non-null  object        
 7   health           100000 non-null  object        
 8   tree_name        100000 non-null  object        
 9   steward          100000 non-null  object        
 10  guards           100000 non-null  object        
 11  sidewalk         100000 non-null  object        
 12  user_type        100000 non-null  object        
 13  root_stone       100000 non-null  bool          
 14  root_grate       1000