In [1]:
import csv
import glob
import json
import os

import numpy as np
import pandas as pd
import requests

In [2]:
#Get filenames from the directory for setting up the dropdown.

FullFilenames = (glob.glob("../../Dataset/*.csv"))
OnlyFilenames = []
for i in FullFilenames:
    SplitOnUnderscores = i.split('_')
    SplitOnSlash = SplitOnUnderscores[0].split('\\')
    filename = SplitOnSlash[1]
    OnlyFilenames.append(filename)
print(OnlyFilenames)

['Albuquerque', 'Anaheim', 'Arlington', 'Atlanta', 'Aurora', 'Austin', 'Baltimore', 'Boston', 'Buffalo', 'CapeCoral', 'ColoradoSprings', 'Columbus', 'Dallas', 'Denver', 'DesMoines', 'Detroit', 'Durham', 'Fresno', 'GardenGrove', 'GrandRapids', 'Greensboro', 'Honolulu', 'Houston', 'HuntingtonBeach', 'Indianapolis', 'Irvine', 'Jerseycity', 'Knoxville', 'LasVegas', 'LosAngeles', 'Louisville', 'Madison', 'Miami', 'Milwaukee', 'Minneapolis', 'Nashville', 'NewOrleans', 'NewYork', 'Oakland', 'OklahomaCity', 'Ontario', 'Orlando', 'OverlandPark', 'Phoenix', 'Pittsburgh', 'Plano', 'Portland', 'Providence', 'RanchoCucamonga', 'Richmond', 'Rochester', 'Sacramento', 'SanDiego', 'SanFrancisco', 'SanJose', 'SantaRosa', 'Seattle', 'SiouxFalls', 'StLouis', 'Stockton', 'Tampa', 'WashingtonDC', 'Worcester']


In [3]:
# insert the names of columns you wish to extract Intresting columns are 
# the ones for which we will visualise the data. If all rows of any intresting columns have null values we will drop the data file.

IntrestingColumns = ['scientific_name','common_name','city','state'] 
print(IntrestingColumns)

['scientific_name', 'common_name', 'city', 'state']


In [4]:
# Api Call to get the family name by scientific name
def getPlantFamily(plantName):
    url = "https://api.gbif.org/v1/species?name="+plantName
    myResponse = requests.get(url)
    Data = json.loads(myResponse.content)
    results = Data['results']
    for i in results:
        if 'family' in i:
            return i['family']


In [None]:
# Generate the csv which will be used for sankey diagram development
def processFile():
    StateData = pd.DataFrame(columns=['source','target','value'])
    State_city_dictionary = dict()
    Cities = (glob.glob("../../Dataset/*.csv"))
    for city in Cities:
        data = pd.read_csv(city, usecols=IntrestingColumns) #use nrows paramter to limit the data if it consumes too much time to process
        data.dropna(how='any', inplace=True)
        
        if(not(data.empty)):
            # Extract city and state information
            city_state_info = data.iloc[0][['city', 'state']].to_dict()
            if city_state_info['state'] not in State_city_dictionary:
                    State_city_dictionary[city_state_info['state']] = [city_state_info['city'].replace(" ", "")]
            else:
                State_city_dictionary[city_state_info['state']].append(city_state_info['city'].replace(" ", ""))
                
    for state, cities in State_city_dictionary.items():
        for city in cities:
            Filename = (glob.glob("../../Dataset/" + city + "*.csv"))
            if(city =='DeSoto'):
                Filename = (glob.glob("../../Dataset/DesMoines*.csv"))
            
            data = pd.read_csv(Filename[0], usecols=IntrestingColumns) #use nrows paramter to limit the data if it consumes too much time to process
            data.dropna(how='any', inplace=True)
            
            if(not(data.empty)):
                # Group data by scientific_name and get the count
                tree_counts = data['scientific_name'].value_counts().to_dict()
        
                # Sort the dictionary by values in descending order
                sorted_tree_counts = dict(sorted(tree_counts.items(), key=lambda item: item[1], reverse=True))
        
                # Get the top 5 items
                top_5_trees = dict(list(sorted_tree_counts.items())[:5])
                
                
                family_plant_count = pd.DataFrame(columns=['source','target','value'])
                for key,value in top_5_trees.items():
                    familyName = getPlantFamily(key)
                    family_plant_count = pd.concat([pd.DataFrame([[familyName,key,value]], columns=family_plant_count.columns), family_plant_count], ignore_index=True)
        
                groupBySource = family_plant_count.groupby('source').agg( {"value":"sum"} ).reset_index()
                
                city_family_count = pd.DataFrame({
                    'source': city,
                    'target':  groupBySource['source'], 
                    'value': groupBySource['value']
                })
                StateData = pd.concat([family_plant_count, StateData], ignore_index=True)
                StateData = pd.concat([city_family_count, StateData], ignore_index=True)
                
        
        StateCityDataframe = pd.DataFrame(columns=['source','target','value'])
        for key, val in State_city_dictionary.items():
            if len(val) >1 :
                for i in val:
                    StateCityDataframe = pd.concat([pd.DataFrame([[key,i,1]], columns=StateCityDataframe.columns), StateCityDataframe], ignore_index=True)
            StateCityDataframe = pd.concat([pd.DataFrame([[key,val[0],1]], columns=StateCityDataframe.columns), StateCityDataframe], ignore_index=True)
    
        StateData = pd.concat([StateCityDataframe, StateData], ignore_index=True)
        StateData.dropna(how='any', inplace=True)       
        StateData.to_csv(state+'.csv', index = False)
processFile()

{'New Mexico': ['Albuquerque'], 'California': ['Anaheim', 'Fresno', 'GardenGrove', 'HuntingtonBeach', 'Irvine', 'LosAngeles', 'Ontario', 'RanchoCucamonga', 'Sacramento', 'SanDiego', 'SanFrancisco', 'SantaRosa', 'Stockton'], 'Colorado': ['Aurora', 'ColoradoSprings', 'Denver'], 'Texas': ['Austin', 'Dallas', 'Houston', 'Plano'], 'Maryland': ['Baltimore'], 'Massachusetts': ['Boston'], 'New York': ['Buffalo', 'NewYork', 'Rochester'], 'Florida': ['CapeCoral', 'Miami', 'Orlando', 'Tampa'], 'Ohio': ['Columbus'], 'Iowa': ['DeSoto'], 'Michigan': ['Detroit', 'GrandRapids'], 'North Carolina': ['Durham', 'Greensboro'], 'Hawaii': ['Kailua'], 'Tennessee': ['Knoxville', 'Nashville'], 'Nevada': ['LasVegas'], 'Kentucky': ['Louisville'], 'Wisconsin': ['Madison', 'Cedarburg'], 'Louisiana': ['NewOrleans'], 'Oklahoma': ['OklahomaCity'], 'Kansas': ['OverlandPark'], 'Arizona': ['Phoenix'], 'Pennsylvania': ['Pittsburgh'], 'Oregon': ['Portland'], 'Virginia': ['Richmond'], 'Washington': ['Seattle'], 'South Dakot