Import the required libraries

In [1]:
from dotenv import dotenv_values
import sqlalchemy
import pandas as pd
import sql_functions as sf
import matplotlib.pyplot as plt

Define the schema and the engine used to load the DataFrame from the database

In [2]:
# Database schema that holds the wildfire tables; it is interpolated into the SQL query below.
schema = 'capstone_wildfire'
# NOTE(review): presumably a SQLAlchemy engine built by the project's sql_functions helper — confirm.
# `engine` is not referenced by any other cell visible in this notebook.
engine = sf.get_engine()

Define the table & query 

In [3]:
# Select every column and row of the fires table in the configured schema.
sql_query = f'select * from {schema}.fires_data_v1_rdy_to_clean_rows;'

Get datatable as dataframe

In [4]:
# Run the query through the project helper. The cells below treat the result as a pandas
# DataFrame whose 'combined_discovery_date' column is datetime-like (they call `.dt.year` on it).
wild_fire_df = sf.get_dataframe(sql_query)

Get the count of fires per year per state 

In [6]:
# Add the discovery year as its own column (later cells also read wild_fire_df['year']).
wild_fire_df['year'] = wild_fire_df['combined_discovery_date'].dt.year

# Number of distinct fires (unique_id) for every state / year combination,
# flattened back into a regular frame with a descriptive count column.
fire_counts = (
    wild_fire_df
    .groupby(['state_name', 'year'])['unique_id']
    .nunique()
    .reset_index()
    .rename(columns={'unique_id': 'fire_count'})
)

# Show the aggregated table.
print(fire_counts)


     state_name  year  fire_count
0       Alabama  1995        4230
1       Alabama  1996        4048
2       Alabama  1997        2591
3       Alabama  1998        3566
4       Alabama  1999        4501
...         ...   ...         ...
1034    Wyoming  2011         640
1035    Wyoming  2012        1045
1036    Wyoming  2013         581
1037    Wyoming  2014         595
1038    Wyoming  2015         632

[1039 rows x 3 columns]


Get the count of fires per year per category 

In [7]:
# Add the discovery year as its own column (recomputed so this cell can run on its own).
wild_fire_df['year'] = wild_fire_df['combined_discovery_date'].dt.year

# Number of distinct fires (unique_id) for every cause / year combination,
# flattened back into a regular frame with a descriptive count column.
fire_counts_category = (
    wild_fire_df
    .groupby(['cause_descr', 'year'])['unique_id']
    .nunique()
    .reset_index()
    .rename(columns={'unique_id': 'fire_count_category'})
)

# Show the aggregated table.
print(fire_counts_category)

    cause_descr  year  fire_count_category
0         Arson  1995                12648
1         Arson  1996                10205
2         Arson  1997                10936
3         Arson  1998                14102
4         Arson  1999                19282
..          ...   ...                  ...
268   Structure  2011                  240
269   Structure  2012                  357
270   Structure  2013                  237
271   Structure  2014                  284
272   Structure  2015                  335

[273 rows x 3 columns]


Avg ha burned per category per year 

In [None]:
# Add the discovery year as its own column (recomputed so this cell can run on its own).
wild_fire_df['year'] = wild_fire_df['combined_discovery_date'].dt.year

# Mean 'fire_size' for every cause / year combination, flattened back into a
# regular frame with the mean stored under 'avg_burned_ha'.
avg_burned_ha_per_category_per_year = (
    wild_fire_df
    .groupby(['cause_descr', 'year'])['fire_size']
    .mean()
    .reset_index()
    .rename(columns={'fire_size': 'avg_burned_ha'})
)

# Show the aggregated table.
print(avg_burned_ha_per_category_per_year)

Class 'A' fires by year: total fire size, number of fires, and mean fire size

In [111]:
# Keep only the fires whose size class is 'A'.
# Relies on the 'year' column that earlier cells add to wild_fire_df.
filtered_data = wild_fire_df.loc[wild_fire_df['fire_size_class'] == 'A']

# Both yearly aggregates come from the same class-'A' grouping.
class_a_by_year = filtered_data.groupby('year')

# Total 'fire_size' of class-'A' fires per year.
yearly_sum = class_a_by_year['fire_size'].sum().reset_index()

# Number of class-'A' rows per year (non-null 'fire_size_class' entries).
yearly_A_count = class_a_by_year['fire_size_class'].count().reset_index()

# One row per year holding both the total and the count.
result = yearly_sum.merge(yearly_A_count, on='year')

# Total size divided by number of fires, per year.
result['result'] = result['fire_size'].div(result['fire_size_class'])

# Show the combined table.
print(result)


    year    fire_size  fire_size_class    result
0   1995  2907.145000            23912  0.121577
1   1996  2968.250000            25119  0.118168
2   1997  2597.470000            21662  0.119909
3   1998  2836.700000            23406  0.121195
4   1999  3399.385300            27560  0.123345
5   2000  3841.495000            33060  0.116198
6   2001  3735.989000            31129  0.120016
7   2002  3282.973759            27476  0.119485
8   2003  3276.026580            28261  0.115920
9   2004  3231.868990            27354  0.118150
10  2005  3422.344000            28483  0.120154
11  2006  4126.296600            34049  0.121187
12  2007  4024.919000            33839  0.118943
13  2008  3387.711960            27848  0.121650
14  2009  3215.613280            27165  0.118373
15  2010  3413.650890            28714  0.118885
16  2011  3333.277500            28677  0.116235
17  2012  3272.380000            27346  0.119666
18  2013  3322.297010            28004  0.118637
19  2014  3202.81370

Combine all df into one: top 3 states by number of fires in 2015, compared with 2014

In [52]:
import pandas as pd
import matplotlib.pyplot as plt

# Split the fires into the year under study (2015) and the year before it (2014).
# Relies on the 'year' column that earlier cells add to wild_fire_df.
wild_fire_2015 = wild_fire_df[wild_fire_df['year'] == 2015]
wild_fire_2014 = wild_fire_df[wild_fire_df['year'] == 2014]

# Number of fires per state for each of the two years, largest first.
state_counts_2015 = wild_fire_2015['state_name'].value_counts().sort_values(ascending=False)
state_counts_2014 = wild_fire_2014['state_name'].value_counts().sort_values(ascending=False)

# The (up to) three states with the most fires in 2015.
top_3_states_2015 = state_counts_2015.head(3)

# Baseline for the comparison: the single state with the most fires in 2014.
# Guarded so that an empty 2014 slice yields NaN percentages instead of an IndexError.
if state_counts_2014.empty:
    highest_count_state_2014 = None
    highest_count_2014 = float('nan')
else:
    highest_count_state_2014 = state_counts_2014.index[0]
    highest_count_2014 = state_counts_2014.iloc[0]

# Build the summary frame. The 'category' column is sized from the data rather than
# hard-coded to 3, so the cell also works when fewer than three states have fires.
top_3_states_df = pd.DataFrame({
    'category': ['Current Year'] * len(top_3_states_2015),
    'state': top_3_states_2015.index.tolist(),
    'Total Fires (Current Year)': top_3_states_2015.values.tolist()
})

# Percentage difference of each 2015 count relative to the 2014 baseline.
# NOTE(review): every state is compared with the 2014 count of the *top 2014 state*,
# not with its own 2014 count — confirm this is the intended comparison.
top_3_states_df['Percentage Difference'] = [
    ((count_2015 - highest_count_2014) / highest_count_2014) * 100
    for count_2015 in top_3_states_df['Total Fires (Current Year)']
]

# Display the summary frame.
top_3_states_df


Unnamed: 0,category,state,Total Fires (Current Year),Percentage Difference
0,Current Year,Texas,8304,-2.740689
1,Current Year,California,7365,-13.73858
2,Current Year,Kansas,5887,-31.049426


In [105]:
import pandas as pd

def create_df(wildfire_df, input_year):
  """Stack the top-3 states by fire count for `input_year` and the year before it.

  Args:
    wildfire_df: DataFrame with a datetime 'combined_discovery_date' column plus
      'state_name' and 'unique_id' columns.
    input_year: Year treated as the "current" year; `input_year - 1` is the
      "previous" year.

  Returns:
    DataFrame indexed by state name with columns 'state_name', 'fire_count'
    (non-null 'unique_id' values per state) and 'category' ('Current Year' or
    'Previous Year'); current-year rows come first.
  """
  discovery_year = wildfire_df['combined_discovery_date'].dt.year

  yearly_frames = []
  for year, label in ((input_year, 'Current Year'), (input_year - 1, 'Previous Year')):
    fires_in_year = wildfire_df[discovery_year == year]
    per_state = fires_in_year.groupby('state_name')['unique_id'].count()
    top_three = per_state.sort_values(ascending=False).head(3)
    # The state name is kept both as the index and as a regular column.
    yearly_frames.append(pd.DataFrame({
      'state_name': top_three.index,
      'fire_count': top_three,
      'category': label,
    }))

  return pd.concat(yearly_frames)

In [106]:
# Top-3 states by fire count with 2014 as the current year and 2013 as the previous year.
df = create_df(wild_fire_df, input_year=2014,)

# Display the combined frame.
df

Unnamed: 0_level_0,state_name,fire_count,category
state_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Texas,Texas,8538,Current Year
California,California,6494,Current Year
North Carolina,North Carolina,4587,Current Year
Texas,Texas,9735,Previous Year
California,California,8723,Previous Year
Colorado,Colorado,3894,Previous Year


Add the percentage difference of the current year relative to the previous year

In [107]:
import pandas as pd

def create_df(wildfire_df, input_year, top_n=3):
  """Top states by fire count for `input_year` and the year before, with % change.

  Args:
    wildfire_df: DataFrame with a datetime 'combined_discovery_date' column plus
      'state_name' and 'unique_id' columns.
    input_year: Year treated as the "current" year; `input_year - 1` is the
      "previous" year.
    top_n: How many top states to keep per year (default 3).

  Returns:
    DataFrame indexed by 'state_name' with columns 'fire_count', 'category'
    ('Current Year' / 'Previous Year') and 'perc_diff'. Current-year rows come
    first; 'perc_diff' is NaN on previous-year rows and on current-year rows
    that have no previous-year row at the same rank.

  NOTE(review): 'perc_diff' pairs rows by rank (current #1 vs previous #1, ...),
  not by state, so a row may be compared with a different state's count —
  confirm this is the intended comparison.
  """
  discovery_year = wildfire_df['combined_discovery_date'].dt.year

  def _top_states(year, label):
    # Fire count per state for one year, largest first, trimmed to top_n.
    counts = (
      wildfire_df[discovery_year == year]
      .groupby('state_name')['unique_id']
      .count()
      .sort_values(ascending=False)
      .head(top_n)
    )
    frame = pd.DataFrame({'fire_count': counts, 'category': label})
    frame.index.name = 'state_name'
    return frame

  current_year = _top_states(input_year, 'Current Year')
  previous_year = _top_states(input_year - 1, 'Previous Year')

  # Only ranks present in BOTH years can be compared; a fixed range(3) would raise
  # IndexError whenever either year has fewer than three states.
  paired = min(len(current_year), len(previous_year))
  current_year['perc_diff'] = float('nan')
  if paired:
    current_counts = current_year['fire_count'].to_numpy(dtype=float)[:paired]
    previous_counts = previous_year['fire_count'].to_numpy(dtype=float)[:paired]
    perc_col = current_year.columns.get_loc('perc_diff')
    current_year.iloc[:paired, perc_col] = (
      (current_counts - previous_counts) / previous_counts * 100
    )

  return pd.concat([current_year, previous_year])

In [108]:
# Fire-count comparison with 2015 as the current year and 2014 as the previous year.
create_df(wild_fire_df, 2015)

Unnamed: 0_level_0,fire_count,category,perc_diff
state_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Texas,8304,Current Year,-2.740689
California,7365,Current Year,13.412381
Kansas,5887,Current Year,28.340964
Texas,8538,Previous Year,
California,6494,Previous Year,
North Carolina,4587,Previous Year,


Calculate the burned acres per state for the current and the previous year

In [109]:
import pandas as pd 

def create_df(wildfire_df, input_year, top_n=3):
  """Top states by total fire size for `input_year` and the year before, with % change.

  Args:
    wildfire_df: DataFrame with a datetime 'combined_discovery_date' column plus
      'state_name' and 'fire_size' columns.
    input_year: Year treated as the "current" year; `input_year - 1` is the
      "previous" year.
    top_n: How many top states to keep per year (default 3).

  Returns:
    DataFrame indexed by 'state_name' with columns 'burned_acres' (sum of
    'fire_size' per state), 'category' ('Current Year' / 'Previous Year') and
    'perc_diff'. Current-year rows come first; 'perc_diff' is NaN on
    previous-year rows and on current-year rows that have no previous-year row
    at the same rank.

  NOTE(review): 'perc_diff' pairs rows by rank (current #1 vs previous #1, ...),
  not by state — confirm this is the intended comparison.
  NOTE(review): the column is labelled acres, while another cell in this notebook
  stores the mean of the same 'fire_size' column as 'avg_burned_ha' — confirm the unit.
  """
  discovery_year = wildfire_df['combined_discovery_date'].dt.year

  def _top_states(year, label):
    # Summed fire size per state for one year, largest first, trimmed to top_n.
    totals = (
      wildfire_df[discovery_year == year]
      .groupby('state_name')['fire_size']
      .sum()
      .sort_values(ascending=False)
      .head(top_n)
    )
    frame = pd.DataFrame({'burned_acres': totals, 'category': label})
    frame.index.name = 'state_name'
    return frame

  current_year = _top_states(input_year, 'Current Year')
  previous_year = _top_states(input_year - 1, 'Previous Year')

  # Only ranks present in BOTH years can be compared; a fixed range(3) would raise
  # IndexError whenever either year has fewer than three states.
  paired = min(len(current_year), len(previous_year))
  current_year['perc_diff'] = float('nan')
  if paired:
    current_totals = current_year['burned_acres'].to_numpy(dtype=float)[:paired]
    previous_totals = previous_year['burned_acres'].to_numpy(dtype=float)[:paired]
    perc_col = current_year.columns.get_loc('perc_diff')
    current_year.iloc[:paired, perc_col] = (
      (current_totals - previous_totals) / previous_totals * 100
    )

  return pd.concat([current_year, previous_year])

In [110]:
# Summed fire-size comparison with 2015 as the current year and 2014 as the previous year.
create_df(wild_fire_df, 2015)

Unnamed: 0_level_0,burned_acres,category,perc_diff
state_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Washington,1161646.38,Current Year,24.36283
California,848728.61,Current Year,55.195016
Idaho,791026.11,Current Year,75.822292
Oregon,934078.44,Previous Year,
California,546878.78,Previous Year,
Washington,449900.92,Previous Year,
