In [1]:
import pandas as pd
import numpy as np
import glob
import json
import csv 
import requests

In [2]:
# Get the city names from the dataset directory for setting up the dropdown.

def city_name_from_path(path):
    """Return the city prefix of a dataset file path.

    Dataset files are named ``<City>_Final_<date>.csv``. The city is
    everything in the file name before the first underscore.

    Parameters
    ----------
    path : str
        Path as returned by ``glob.glob``; may use ``/`` or ``\\`` (or a mix
        of both) as the directory separator.

    Returns
    -------
    str
        The file-name part before the first ``_``.
    """
    # glob yields '\\' separators on Windows and '/' elsewhere, so normalise
    # first and isolate the base name *before* splitting on '_'. That way an
    # underscore in a directory name cannot truncate the result.
    basename = path.replace('\\', '/').rsplit('/', 1)[-1]
    return basename.split('_')[0]


# sorted() makes the dropdown order independent of the OS directory order.
FullFilenames = sorted(glob.glob("../../Dataset/*.csv"))
OnlyFilenames = [city_name_from_path(path) for path in FullFilenames]
print(OnlyFilenames)

['Albuquerque', 'Anaheim', 'Arlington', 'Atlanta', 'Aurora', 'Austin', 'Baltimore', 'Boston', 'Buffalo', 'CapeCoral', 'ColoradoSprings', 'Columbus', 'Dallas', 'Denver', 'DesMoines', 'Detroit', 'Durham', 'Fresno', 'GardenGrove', 'GrandRapids', 'Greensboro', 'Honolulu', 'Houston', 'HuntingtonBeach', 'Indianapolis', 'Irvine', 'Jerseycity', 'Knoxville', 'LasVegas', 'LosAngeles', 'Louisville', 'Madison', 'Miami', 'Milwaukee', 'Minneapolis', 'Nashville', 'NewOrleans', 'NewYork', 'Oakland', 'OklahomaCity', 'Ontario', 'Orlando', 'OverlandPark', 'Phoenix', 'Pittsburgh', 'Plano', 'Portland', 'Providence', 'RanchoCucamonga', 'Richmond', 'Rochester', 'Sacramento', 'SanDiego', 'SanFrancisco', 'SanJose', 'SantaRosa', 'Seattle', 'SiouxFalls', 'StLouis', 'Stockton', 'Tampa', 'WashingtonDC', 'Worcester']


In [3]:
# Names of the columns to extract from every dataset file. These "interesting"
# columns are the ones whose data gets visualised. A data file is skipped when
# it has no row with a value in every one of these columns.

IntrestingColumns = [
    'scientific_name',
    'common_name',
    'city',
    'state',
]
print(IntrestingColumns)

['scientific_name', 'common_name', 'city', 'state']


In [4]:
# API call to get the family name by scientific name
def getPlantFamily(plantName, timeout=10):
    """Look up the taxonomic family of a plant through the GBIF species API.

    Parameters
    ----------
    plantName : str
        Scientific name to search for. It is sent as the ``name`` query
        parameter, so spaces and reserved characters are URL-encoded.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    str or None
        The ``family`` value of the first result that has one, or ``None``
        when the response has no results or none of them carries a family.

    Raises
    ------
    requests.exceptions.RequestException
        On connection errors, timeouts or a non-success HTTP status.
    ValueError
        If the response body is not valid JSON.
    """
    response = requests.get(
        "https://api.gbif.org/v1/species",
        params={"name": plantName},
        timeout=timeout,
    )
    # Surface HTTP failures here instead of as a confusing parse/KeyError later.
    response.raise_for_status()
    results = response.json().get('results', [])
    for record in results:
        if 'family' in record:
            return record['family']
    return None


In [5]:
# Map every state to the list of city labels (spaces removed) found in its
# dataset files, then save the list of states for the dropdown.
State_city_dictionary = dict()
Cities = glob.glob("../../Dataset/*.csv")
for city in Cities:
    # Use the nrows parameter of read_csv to limit the data if this takes too long.
    data = pd.read_csv(city, usecols=IntrestingColumns).dropna(how='any')

    # Skip files that have no row with every interesting column filled in.
    if data.empty:
        continue

    # City and state are taken from the first complete row of the file.
    city_state_info = data.iloc[0][['city', 'state']].to_dict()
    print(city_state_info['city'], city)
    State_city_dictionary.setdefault(city_state_info['state'], []).append(
        city_state_info['city'].replace(" ", "")
    )

states = list(State_city_dictionary)
# NOTE(review): the list is serialised twice (dumps, then dump), so the file
# holds a JSON string that itself contains the JSON array. Confirm what the
# dropdown consumer expects before changing this.
json_states = json.dumps(states)
with open('StatesDropdown.json', 'w') as f:
    json.dump(json_states, f)



Albuquerque ../../Dataset\Albuquerque_Final_2022-06-18.csv
Anaheim ../../Dataset\Anaheim_Final_2022-06-18.csv
Aurora ../../Dataset\Aurora_Final_2022-06-18.csv
Austin ../../Dataset\Austin_Final_2022-06-18.csv
Baltimore ../../Dataset\Baltimore_Final_2022-06-18.csv
Boston ../../Dataset\Boston_Final_2022-06-18.csv
Buffalo ../../Dataset\Buffalo_Final_2022-06-18.csv
Cape Coral ../../Dataset\CapeCoral_Final_2022-06-18.csv
Colorado Springs ../../Dataset\ColoradoSprings_Final_2022-06-18.csv
Columbus ../../Dataset\Columbus_Final_2022-06-18.csv
Dallas ../../Dataset\Dallas_Final_2022-06-18.csv
Denver ../../Dataset\Denver_Final_2022-06-18.csv
De Soto ../../Dataset\DesMoines_Final_2022-06-18.csv
Detroit ../../Dataset\Detroit_Final_2022-06-18.csv
Durham ../../Dataset\Durham_Final_2022-06-18.csv
Fresno ../../Dataset\Fresno_Final_2022-06-18.csv
Garden Grove ../../Dataset\GardenGrove_Final_2022-06-18.csv
Grand Rapids ../../Dataset\GrandRapids_Final_2022-06-18.csv
Greensboro ../../Dataset\Greensboro_Fina

In [6]:
# Build one edge list (source, target, value) per state and save it as
# "<state>.csv". Three kinds of edges are written:
#   state  -> city     with value 1
#   city   -> family   with the summed tree count of that family
#   family -> species  with the tree count of that species
# Only the most common species of each city are included.

# City labels stored in State_city_dictionary that do not match the prefix of
# their dataset file name; maps label -> file-name prefix.
CityFilePrefixOverrides = {
    'DeSoto': 'DesMoines',
    'Kailua': 'Honolulu',
    'Cedarburg': 'Milwaukee',
    'St.Louis': 'StLouis',
}
EdgeColumns = ['source', 'target', 'value']
TopSpeciesPerCity = 5
# scientific name -> family; the same species shows up in many cities, so each
# name is looked up over the network only once.
FamilyBySpecies = {}

for state, cities in State_city_dictionary.items():
    # One (city -> family frame, family -> species frame) pair per city,
    # in processing order.
    CityFrames = []
    for city in cities:
        FilePrefix = CityFilePrefixOverrides.get(city, city)
        Filename = glob.glob("../../Dataset/" + FilePrefix + "*.csv")
        print("city: ",city, " state: ",state," filename: ",Filename)
        if not Filename:
            # Fail with a message that names the city instead of an IndexError.
            raise FileNotFoundError(
                "No dataset file matches ../../Dataset/" + FilePrefix
                + "*.csv (city " + repr(city) + ", state " + repr(state) + ")"
            )
        # Use the nrows parameter of read_csv to limit the data if this takes too long.
        data = pd.read_csv(Filename[0], usecols=IntrestingColumns).dropna(how='any')
        if data.empty:
            continue

        # value_counts() is already sorted by descending count, so the head
        # is the set of most common species.
        top_trees = data['scientific_name'].value_counts().head(TopSpeciesPerCity)

        species_records = []
        for species, count in top_trees.items():
            if species not in FamilyBySpecies:
                FamilyBySpecies[species] = getPlantFamily(species)
            species_records.append([FamilyBySpecies[species], species, int(count)])
        # Emit least-common-first so the saved row order stays the same as in
        # earlier runs of this notebook.
        species_records.reverse()
        family_plant_count = pd.DataFrame(species_records, columns=EdgeColumns)

        # Species whose family lookup returned None are left out of the
        # per-family totals (groupby drops missing keys).
        groupBySource = family_plant_count.groupby('source').agg({"value": "sum"}).reset_index()
        city_family_count = pd.DataFrame({
            'source': city,
            'target': groupBySource['source'],
            'value': groupBySource['value'],
        })
        CityFrames.append((city_family_count, family_plant_count))

    # State -> city edges come first, last city first; then the blocks of each
    # city, also last city first.
    StateCityDataframe = pd.DataFrame(
        [[state, city_label, 1] for city_label in reversed(cities)],
        columns=EdgeColumns,
    )
    frames = [StateCityDataframe]
    for city_family_count, family_plant_count in reversed(CityFrames):
        frames.extend([city_family_count, family_plant_count])

    # Build the frame once; empty pieces are skipped so concat never has to
    # guess a dtype from an empty frame.
    StateData = pd.concat([frame for frame in frames if not frame.empty], ignore_index=True)
    # Drops family -> species rows whose family could not be resolved.
    StateData = StateData.dropna(how='any')
    StateData.to_csv(state+'.csv', index = False)

city:  Albuquerque  state:  New Mexico  filename:  ['../../Dataset\\Albuquerque_Final_2022-06-18.csv']
city:  Anaheim  state:  California  filename:  ['../../Dataset\\Anaheim_Final_2022-06-18.csv']
city:  Fresno  state:  California  filename:  ['../../Dataset\\Fresno_Final_2022-06-18.csv']
city:  GardenGrove  state:  California  filename:  ['../../Dataset\\GardenGrove_Final_2022-06-18.csv']
city:  HuntingtonBeach  state:  California  filename:  ['../../Dataset\\HuntingtonBeach_Final_2022-06-18.csv']
city:  Irvine  state:  California  filename:  ['../../Dataset\\Irvine_Final_2022-06-18.csv']
city:  LosAngeles  state:  California  filename:  ['../../Dataset\\LosAngeles_Final_2022-06-18.csv']
city:  Ontario  state:  California  filename:  ['../../Dataset\\Ontario_Final_2022-06-18.csv']
city:  RanchoCucamonga  state:  California  filename:  ['../../Dataset\\RanchoCucamonga_Final_2022-06-18.csv']
city:  Sacramento  state:  California  filename:  ['../../Dataset\\Sacramento_Final_2022-06-18.