Import needed libaries

In [1]:
from dotenv import dotenv_values
import sqlalchemy
import pandas as pd
import sql_functions as sf
import matplotlib.pyplot as plt

Define schema & engine to call df from Database

In [2]:
schema = 'capstone_wildfire'
engine = sf.get_engine()

Define the table & query 

In [3]:
sql_query = f'select * from {schema}.fires_data_v1_rdy_to_clean_rows;'

Get datatable as dataframe

In [4]:
wild_fire_df = sf.get_dataframe(sql_query)

In [52]:
def create_trend_df(wildfire_df, input_year):

  # Convert date column to datetime 
  wildfire_df['combined_discovery_date'] = pd.to_datetime(wildfire_df['combined_discovery_date'])

  # Get total fires per state per year
  yearly_counts = wildfire_df.groupby(['state_name', wildfire_df['combined_discovery_date'].dt.year])['unique_id'].count().reset_index(name='unique_id')

  # Get top 10 states by average fires per year
  state_avg = yearly_counts.groupby('state_name')['unique_id'].mean().sort_values(ascending=False)
  
  # Get top state names
  top_states = state_avg.index

  # Create output DataFrame
  output = pd.DataFrame({
    'category': ['20 Year AVG']*len(top_states),
    'state': top_states,
    'total_avg_count': state_avg.values
  })

  return output

In [53]:
create_trend_df(wild_fire_df,2015)

Unnamed: 0,category,state,total_avg_count
0,20 Year AVG,California,7703.809524
1,20 Year AVG,Georgia,6986.333333
2,20 Year AVG,Texas,6594.619048
3,20 Year AVG,North Carolina,4525.428571
4,20 Year AVG,New York,3812.666667
5,20 Year AVG,Florida,3712.380952
6,20 Year AVG,Mississippi,3208.619048
7,20 Year AVG,South Carolina,3159.190476
8,20 Year AVG,Alabama,2994.571429
9,20 Year AVG,Arizona,2806.142857


In [54]:
import pandas as pd

def create_trend_df_avgburn(wildfire_df, input_year):

  yearly_data = wildfire_df.groupby(['state_name', wildfire_df['combined_discovery_date'].dt.year]).agg({'burning_time':'sum', 'unique_id':'count'}).reset_index()

  yearly_data['nonnull_count'] = yearly_data['burning_time'].notnull().sum()

  yearly_data['avg_burning_time'] = yearly_data['burning_time'] / yearly_data['nonnull_count']

  state_avg = yearly_data.groupby('state_name')['avg_burning_time'].mean().sort_values(ascending=False)

  # Create output DataFrame
  output = pd.DataFrame({
    'category': ['20 Year AVG']*len(state_avg), 
    'state': state_avg.index,
    'total_avg_burn': state_avg.round(2).values
  })
  return output

In [55]:
create_trend_df_avgburn(wild_fire_df,2015)

Unnamed: 0,category,state,total_avg_burn
0,20 Year AVG,New York,8.46
1,20 Year AVG,Idaho,5.41
2,20 Year AVG,Washington,4.48
3,20 Year AVG,Alaska,3.72
4,20 Year AVG,Oregon,2.91
5,20 Year AVG,California,2.78
6,20 Year AVG,Montana,2.68
7,20 Year AVG,Arizona,2.51
8,20 Year AVG,New Mexico,1.77
9,20 Year AVG,Utah,1.49


In [20]:
import pandas as pd

def create_trend_df_avgsize(wildfire_df):

  # Calculate averages only for fire_size
  yearly_data = wildfire_df.groupby(['state_name', wildfire_df['combined_discovery_date'].dt.year]).agg({'fire_size':'sum', 'unique_id':'count'}).reset_index()

  yearly_data['avg_fire_size'] = yearly_data['fire_size'] / yearly_data['unique_id']

  state_avg_size = yearly_data.groupby('state_name')['avg_fire_size'].mean().sort_values(ascending=False)

  # Construct output DataFrame
  output = pd.DataFrame({
    'category': ['20 Year AVG Size']*len(state_avg_size), 
    'state': state_avg_size.index,
    'total_avg_size': state_avg_size.values
  })

  return output

In [21]:
create_trend_df_avgsize(wild_fire_df)

Unnamed: 0,category,state,total_avg_size
0,20 Year AVG Size,Alaska,2460.209963
1,20 Year AVG Size,Kansas,606.498086
2,20 Year AVG Size,Nevada,477.3921
3,20 Year AVG Size,Idaho,399.42341
4,20 Year AVG Size,Nebraska,277.029829
5,20 Year AVG Size,Hawaii,233.709699
6,20 Year AVG Size,New Mexico,168.18785
7,20 Year AVG Size,Oregon,161.329295
8,20 Year AVG Size,Wyoming,160.93752
9,20 Year AVG Size,Washington,145.016463


In [56]:
import pandas as pd

df_size = create_trend_df_avgsize(wild_fire_df)
df_burn = create_trend_df_avgburn(wild_fire_df, 2015) 
df_total = create_trend_df(wild_fire_df, 2015) 

# Get ranks per dataframe
ranks1 = {state:rank for rank, state in enumerate(df_size.sort_values('total_avg_size')['state'])} 
ranks2 = {state:rank for rank, state in enumerate(df_total.sort_values('total_avg_count')['state'])}
ranks3 = {state:rank for rank, state in enumerate(df_burn.sort_values('total_avg_burn')['state'])}

# Initialize state ranks dict
state_ranks = {}
for state in set(df_size['state']).union(df_burn['state'], df_total['state']):
    state_ranks[state] = 0

# Sum ranks for each state   
for state in state_ranks:
    if state in ranks1:
        state_ranks[state] += ranks1[state]
    if state in ranks2:
        state_ranks[state] += ranks2[state] 
    if state in ranks3:
        state_ranks[state] += ranks3[state]

# Print ranks        
for state, rank in sorted(state_ranks.items(), key=lambda x: x[1], reverse=True):
    print(state, rank)

# Top 5 states   
print("\nTop 5:")
print(sorted(state_ranks, key=state_ranks.get, reverse=True)[:5])

California 134
Oregon 132
Idaho 132
New Mexico 125
Arizona 125
Montana 124
Washington 121
Texas 118
Florida 116
Alaska 116
Oklahoma 113
Utah 109
Nevada 108
Colorado 108
New York 104
Wyoming 102
Minnesota 97
Georgia 96
Mississippi 91
Alabama 89
Tennessee 88
Kentucky 88
South Dakota 85
North Carolina 84
Louisiana 83
Hawaii 82
South Carolina 79
Kansas 78
Arkansas 77
Nebraska 75
Virginia 73
West Virginia 72
New Jersey 72
North Dakota 66
Missouri 66
Puerto Rico 63
Michigan 59
Maryland 57
Wisconsin 50
Iowa 43
Delaware 41
Maine 38
Pennsylvania 35
Massachusetts 33
Ohio 32
Illinois 31
Indiana 21
Connecticut 20
New Hampshire 11
Vermont 11
Rhode Island 5
District of Columbia 0

Top 5:
['California', 'Oregon', 'Idaho', 'New Mexico', 'Arizona']
