In [1]:
import sqlite3
import pandas as pd
import sqlalchemy
import psycopg2
from sql_functions import get_engine
from sql_functions import get_sql_config
con = sqlite3.connect('data/FPA_FOD_20170508.sqlite')
cursor = con.cursor()

In [None]:
fires_raw = pd.read_sql_query('SELECT * FROM FIRES', con)

In [None]:
fires_shape_raw = pd.read_sql_query('SELECT * FROM IDX_FIRES_SHAPE', con)

In [None]:
fires_raw.head(10)

In [None]:
fires_shape_raw.head(10)

In [None]:
fires_shape_raw.rename(columns={"xmin":"x_min", "xmax":"x_max", "ymin":"y_min", "ymax":"y_max"}, inplace=True)

In [None]:
fires_raw['STATE'].value_counts()

In [None]:
fires_raw['STATE'].nunique()

In [None]:
fires_raw.info()

In [None]:
get_sql_config()

In [2]:
engine = get_engine()

In [3]:
table_name = 'fires_shape_raw'
schema = 'capstone_wildfire'

In [None]:
if engine!=None:
    try:
        fires_raw.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schmea that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

In [None]:
if engine!=None:
    try:
        fires_shape_raw.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schmea that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

In [None]:
from sql_functions import get_data
get_data(f'SELECT COUNT(*) FROM {schema}.fires_raw')

## preparing clean tables

In [4]:
#creating a dataset to drop unnecessary columns 
fires_clean = pd.read_sql_query('SELECT * FROM FIRES', con)

In [5]:
#pythonize the column names, lowercase all column titles
fires_clean.columns = fires_clean.columns.str.lower()

In [6]:
#create a new specific dataframe only with relevant columns
fires_specific = fires_clean[['fod_id', 'source_reporting_unit_name', 'fire_code', 'fire_name', 'fire_year', 'discovery_date', 'discovery_doy', 'discovery_time', 'stat_cause_code', 'stat_cause_descr', 'cont_date', 'cont_doy', 'cont_time', 'fire_size', 'fire_size_class', 'latitude', 'longitude', 'state', 'county', 'shape']]

In [7]:
#check for new dataframe
fires_specific.head(10)

Unnamed: 0,fod_id,source_reporting_unit_name,fire_code,fire_name,fire_year,discovery_date,discovery_doy,discovery_time,stat_cause_code,stat_cause_descr,cont_date,cont_doy,cont_time,fire_size,fire_size_class,latitude,longitude,state,county,shape
0,1,Plumas National Forest,BJ8K,FOUNTAIN,2005,2453403.5,33,1300,9.0,Miscellaneous,2453403.5,33.0,1730,0.1,A,40.036944,-121.005833,CA,63.0,b'\x00\x01\xad\x10\x00\x00\xe8d\xc2\x92_@^\xc0...
1,2,Eldorado National Forest,AAC0,PIGEON,2004,2453137.5,133,845,1.0,Lightning,2453137.5,133.0,1530,0.25,A,38.933056,-120.404444,CA,61.0,b'\x00\x01\xad\x10\x00\x00T\xb6\xeej\xe2\x19^\...
2,3,Eldorado National Forest,A32W,SLACK,2004,2453156.5,152,1921,5.0,Debris Burning,2453156.5,152.0,2024,0.1,A,38.984167,-120.735556,CA,17.0,b'\x00\x01\xad\x10\x00\x00\xd0\xa5\xa0W\x13/^\...
3,4,Eldorado National Forest,,DEER,2004,2453184.5,180,1600,1.0,Lightning,2453189.5,185.0,1400,0.1,A,38.559167,-119.913333,CA,3.0,b'\x00\x01\xad\x10\x00\x00\x94\xac\xa3\rt\xfa]...
4,5,Eldorado National Forest,,STEVENOT,2004,2453184.5,180,1600,1.0,Lightning,2453189.5,185.0,1200,0.1,A,38.559167,-119.933056,CA,3.0,b'\x00\x01\xad\x10\x00\x00@\xe3\xaa.\xb7\xfb]\...
5,6,Eldorado National Forest,,HIDDEN,2004,2453186.5,182,1800,1.0,Lightning,2453187.5,183.0,1600,0.1,A,38.635278,-120.103611,CA,5.0,b'\x00\x01\xad\x10\x00\x00\xf0<~\x90\xa1\x06^\...
6,7,Eldorado National Forest,,FORK,2004,2453187.5,183,1800,1.0,Lightning,2453188.5,184.0,1400,0.1,A,38.688333,-120.153333,CA,17.0,b'\x00\x01\xad\x10\x00\x00$o\x996\xd0\t^\xc0h\...
7,8,Shasta-Trinity National Forest,BK5X,SLATE,2005,2453437.5,67,1300,5.0,Debris Burning,2453437.5,67.0,1600,0.8,B,40.968056,-122.433889,CA,,b'\x00\x01\xad\x10\x00\x00t)\xe8\xd5\xc4\x9b^\...
8,9,Shasta-Trinity National Forest,BLPQ,SHASTA,2005,2453444.5,74,1200,5.0,Debris Burning,2453444.5,74.0,1700,1.0,B,41.233611,-122.283333,CA,,"b'\x00\x01\xad\x10\x00\x00\xdc\x8d\x1e""""\x92^\..."
9,10,Eldorado National Forest,,TANGLEFOOT,2004,2453187.5,183,1800,1.0,Lightning,2453188.5,184.0,1800,0.1,A,38.548333,-120.149167,CA,5.0,b'\x00\x01\xad\x10\x00\x00dS\\\xf2\x8b\t^\xc0\...


In [8]:
#renaming columns for better understanding
fires_specific.rename(columns={"fod_id":"unique_id", "fire_code":"gov_firecode", "stat_cause_code":"cause_code", "stat_cause_descr":"cause_descr", "cont_date":"control_date", "cont_doy":"control_doy", "cont_time":"control_time", "shape":"shape_hex"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fires_specific.rename(columns={"fod_id":"unique_id", "fire_code":"gov_firecode", "stat_cause_code":"cause_code", "stat_cause_descr":"cause_descr", "cont_date":"control_date", "cont_doy":"control_doy", "cont_time":"control_time", "shape":"shape_hex"}, inplace=True)


In [9]:
#checking result
print(fires_specific.columns)

Index(['unique_id', 'source_reporting_unit_name', 'gov_firecode', 'fire_name',
       'fire_year', 'discovery_date', 'discovery_doy', 'discovery_time',
       'cause_code', 'cause_descr', 'control_date', 'control_doy',
       'control_time', 'fire_size', 'fire_size_class', 'latitude', 'longitude',
       'state', 'county', 'shape_hex'],
      dtype='object')


In [10]:
#converting Julian calendar date from discovery_date and control_date to Gregorian calendar
epoch = pd.to_datetime(0, unit='s').to_julian_date()
fires_specific['discovery_date'] = pd.to_datetime(fires_specific['discovery_date'] - epoch, unit='D')
fires_specific['control_date'] = pd.to_datetime(fires_specific['control_date'] - epoch, unit='D')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fires_specific['discovery_date'] = pd.to_datetime(fires_specific['discovery_date'] - epoch, unit='D')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fires_specific['control_date'] = pd.to_datetime(fires_specific['control_date'] - epoch, unit='D')


In [11]:
fires_specific.head(10)

Unnamed: 0,unique_id,source_reporting_unit_name,gov_firecode,fire_name,fire_year,discovery_date,discovery_doy,discovery_time,cause_code,cause_descr,control_date,control_doy,control_time,fire_size,fire_size_class,latitude,longitude,state,county,shape_hex
0,1,Plumas National Forest,BJ8K,FOUNTAIN,2005,2005-02-02,33,1300,9.0,Miscellaneous,2005-02-02,33.0,1730,0.1,A,40.036944,-121.005833,CA,63.0,b'\x00\x01\xad\x10\x00\x00\xe8d\xc2\x92_@^\xc0...
1,2,Eldorado National Forest,AAC0,PIGEON,2004,2004-05-12,133,845,1.0,Lightning,2004-05-12,133.0,1530,0.25,A,38.933056,-120.404444,CA,61.0,b'\x00\x01\xad\x10\x00\x00T\xb6\xeej\xe2\x19^\...
2,3,Eldorado National Forest,A32W,SLACK,2004,2004-05-31,152,1921,5.0,Debris Burning,2004-05-31,152.0,2024,0.1,A,38.984167,-120.735556,CA,17.0,b'\x00\x01\xad\x10\x00\x00\xd0\xa5\xa0W\x13/^\...
3,4,Eldorado National Forest,,DEER,2004,2004-06-28,180,1600,1.0,Lightning,2004-07-03,185.0,1400,0.1,A,38.559167,-119.913333,CA,3.0,b'\x00\x01\xad\x10\x00\x00\x94\xac\xa3\rt\xfa]...
4,5,Eldorado National Forest,,STEVENOT,2004,2004-06-28,180,1600,1.0,Lightning,2004-07-03,185.0,1200,0.1,A,38.559167,-119.933056,CA,3.0,b'\x00\x01\xad\x10\x00\x00@\xe3\xaa.\xb7\xfb]\...
5,6,Eldorado National Forest,,HIDDEN,2004,2004-06-30,182,1800,1.0,Lightning,2004-07-01,183.0,1600,0.1,A,38.635278,-120.103611,CA,5.0,b'\x00\x01\xad\x10\x00\x00\xf0<~\x90\xa1\x06^\...
6,7,Eldorado National Forest,,FORK,2004,2004-07-01,183,1800,1.0,Lightning,2004-07-02,184.0,1400,0.1,A,38.688333,-120.153333,CA,17.0,b'\x00\x01\xad\x10\x00\x00$o\x996\xd0\t^\xc0h\...
7,8,Shasta-Trinity National Forest,BK5X,SLATE,2005,2005-03-08,67,1300,5.0,Debris Burning,2005-03-08,67.0,1600,0.8,B,40.968056,-122.433889,CA,,b'\x00\x01\xad\x10\x00\x00t)\xe8\xd5\xc4\x9b^\...
8,9,Shasta-Trinity National Forest,BLPQ,SHASTA,2005,2005-03-15,74,1200,5.0,Debris Burning,2005-03-15,74.0,1700,1.0,B,41.233611,-122.283333,CA,,"b'\x00\x01\xad\x10\x00\x00\xdc\x8d\x1e""""\x92^\..."
9,10,Eldorado National Forest,,TANGLEFOOT,2004,2004-07-01,183,1800,1.0,Lightning,2004-07-02,184.0,1800,0.1,A,38.548333,-120.149167,CA,5.0,b'\x00\x01\xad\x10\x00\x00dS\\\xf2\x8b\t^\xc0\...


In [12]:
#changing discovery_time and control_time into readable time
fires_specific['discovery_time'] = pd.to_datetime(fires_specific.discovery_time, format='%H%M').dt.time
fires_specific['control_time'] = pd.to_datetime(fires_specific.control_time, format='%H%M').dt.time

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fires_specific['discovery_time'] = pd.to_datetime(fires_specific.discovery_time, format='%H%M').dt.time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fires_specific['control_time'] = pd.to_datetime(fires_specific.control_time, format='%H%M').dt.time


In [13]:
#converting control_doy into int result in error as there are null values
fires_specific['control_doy'] = fires_specific['control_doy'].astype(int)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [14]:
#converting cause_code into int
fires_specific['cause_code'] = fires_specific['cause_code'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fires_specific['cause_code'] = fires_specific['cause_code'].astype(int)


In [15]:
#pushing final table into SQL database
if engine!=None:
    try:
        fires_specific.to_sql(name='fires_data', # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schmea that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The fires_data table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The fires_data table was imported successfully.
