In [9]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [11]:
# ignores deprecation warnings
import warnings
warnings.filterwarnings("ignore")

In [12]:
import matplotlib.pyplot as plt
import matplotlib.style as style

In [13]:
# Seaborn library: imported for the statistical plots used later on.
# set_style("darkgrid") selects seaborn's "darkgrid" look for the figures
# created after this cell runs.
import seaborn as sns
sns.set_style("darkgrid")

In [14]:
# Load the wildfire incidents CSV from the Kaggle input directory into a
# pandas DataFrame; `df` is the raw, unmodified table used as the starting point.
incidents_csv = '../input/california-wildfire-incidents-20132020/California_Fire_Incidents.csv'
df = pd.read_csv(incidents_csv)

# Understanding the dataset

In [15]:
# Summary statistics of the raw data (describe() with its default arguments).
df.describe()

In [19]:
# Preview the first rows instead of echoing the entire frame as cell output.
df.head()

In [20]:
# List the column labels available in the raw data.
df.columns

In [21]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

Let us check whether there are any missing values in the dataset.

In [24]:
# Count the missing (NaN) entries in every column of the raw data and show
# the resulting per-column totals.
MV = df.isna().sum(axis=0)
MV


In [32]:
# Columns carried over into the working frame used for the rest of the analysis.
Selected_Columns = [
    'AcresBurned',
    'AdminUnit',
    'ArchiveYear',
    'Counties',
    'Extinguished',
    'Fatalities',
    'Latitude',
    'Longitude',
    'MajorIncident',
    'Name',
    'PersonnelInvolved',
    'Started',
    'WaterTenders',
]

In [35]:
# Build the working frame: an independent copy of `df` restricted to the
# selected columns, so later cleaning steps do not modify the raw data.
fire_df = df.loc[:, Selected_Columns].copy()
fire_df

Compute how long each fire incident lasted, in days (time extinguished minus time started).

In [36]:
# Fire duration in days: time extinguished minus time started.
# `.dt.total_seconds()` replaces `.astype('timedelta64[h]')`, which pandas >= 2.0
# rejects (only s/ms/us/ns timedelta resolutions are supported there).
# The floor division keeps the whole-hour resolution the original cast asked for;
# rows with a missing start or end time yield NaN.
elapsed = pd.to_datetime(fire_df.Extinguished) - pd.to_datetime(fire_df.Started)
elapsed_hours = elapsed.dt.total_seconds() // 3600  # 3600 seconds per hour
fire_df['fire_duration'] = elapsed_hours / 24

In [37]:
# Force fire_duration to a numeric dtype; with errors='coerce', any value that
# cannot be converted becomes NaN instead of raising.
# NOTE(review): the column is presumably already numeric after the duration
# computation, which would make this a safeguard only — confirm.
fire_df['fire_duration'] = pd.to_numeric(fire_df.fire_duration,errors = 'coerce')

In [38]:
# Line plot of the computed durations in row order. Title and axis labels are
# added so the figure can be read on its own when the notebook is skimmed.
ax = fire_df['fire_duration'].plot()
ax.set_title('Fire duration per incident')
ax.set_xlabel('Row index')
ax.set_ylabel('Duration (days)')
plt.show()

In [39]:
# Summary statistics of the working frame (describe() with its default arguments).
fire_df.describe()

We observe that the calculated fire_duration spans from -17052 to 17900 days. To decide whether these values are data-entry errors, we need to find out how many fire incidents have a negative duration, and whether we should drop those incidents or take the absolute value of their durations. This is what we explore next.


In [40]:
# Select the durations below zero and count how often each distinct value occurs.
negative_durations = fire_df.loc[fire_df.fire_duration < 0, 'fire_duration']
negative_durations.value_counts()

Dropping invalid data


In [42]:
# Remove incidents whose duration is negative or longer than the cutoff below.
# 0-day durations (fire extinguished the same day) are valid and stay in the dataset.
# Rows with a NaN duration are also kept: NaN fails both comparisons, exactly as
# it did with the original drop().
MAX_FIRE_DURATION_DAYS = 500  # cutoff used by this analysis for "invalid" durations
implausible_duration = (fire_df.fire_duration < 0) | (fire_df.fire_duration > MAX_FIRE_DURATION_DAYS)
# Rebind to a fresh copy instead of dropping in place, so re-running the cell is
# safe and the result is an independent frame.
fire_df = fire_df.loc[~implausible_duration].copy()
# The former np.abs() step is gone: every negative duration has just been
# removed, so taking absolute values changed nothing.

In [43]:
# Discard rows whose coordinates fall outside the accepted ranges:
# latitude must lie within [-90, 90]; longitude must lie within [-180, 0).
# Rows with a missing coordinate fail every comparison and are therefore kept.
bad_latitude = (fire_df.Latitude < -90) | (fire_df.Latitude > 90)
bad_longitude = (fire_df.Longitude < -180) | (fire_df.Longitude >= 0)
fire_df.drop(index=fire_df.index[bad_latitude | bad_longitude], inplace=True)

Exploratory data analysis

Top 20 Affected counties

In [68]:
# Bar chart of the 20 most frequent values in the Counties column.
fig, ax = plt.subplots(figsize=(16, 4))
fire_df.Counties.value_counts().head(20).plot(kind='bar', ax=ax)
ax.set_title('Top 20 affected Counties')
ax.set_xlabel('County')
ax.set_ylabel('Number of incidents')
# A bare plt.grid() toggles the grid; with the "darkgrid" style the grid is
# already on, so that call switched it off. Request it explicitly instead.
ax.grid(True)
plt.show()

Top 20 admin units


In [77]:
# Horizontal bar chart of the 20 most frequent values in the AdminUnit column.
fig, ax = plt.subplots(figsize=(4, 10))
fire_df.AdminUnit.value_counts().head(20).plot(kind='barh', color='red', ax=ax)
ax.set_title('Top 20 Admin Units')
ax.set_xlabel('Number of incidents')
ax.set_ylabel('Admin unit')
# A bare plt.grid() toggles the grid; with the "darkgrid" style the grid is
# already on, so that call switched it off. Request it explicitly instead.
ax.grid(True)
plt.show()

Area burned

In [78]:
# Scatter of acres burned per incident against archive year; the low alpha
# makes overlapping points visible as darker regions.
fig, ax = plt.subplots(figsize=(16, 4))
ax.scatter(fire_df.ArchiveYear, fire_df.AcresBurned, color='blue', alpha=0.25)
ax.set_title('Acres burned per incident by archive year')
ax.set_xlabel('Archive Year')
ax.set_ylabel('AcresBurned')
# A bare plt.grid() toggles the grid; with the "darkgrid" style the grid is
# already on, so that call switched it off. Request it explicitly instead.
ax.grid(True)
plt.show()

In [79]:
# Violin plot: distribution of AcresBurned within each ArchiveYear.
fig, ax = plt.subplots(figsize=(16, 4))
sns.violinplot(x='ArchiveYear', y='AcresBurned', data=fire_df, ax=ax)
ax.set_title('Acres Burned vs Year')
# A bare plt.grid() toggles the grid; with the "darkgrid" style the grid is
# already on, so that call switched it off. Request it explicitly instead.
ax.grid(True)
plt.show()

Acres burned by year

In [80]:
# Total AcresBurned for each ArchiveYear, computed from the raw `df`
# (not from the cleaned `fire_df`).
acres_sum = df.groupby('ArchiveYear')['AcresBurned'].sum()
acres_sum

In [89]:
# Histogram of crews involved (if not NaN), taken from the raw `df`.
fig, ax = plt.subplots()
df.CrewsInvolved.plot(kind='hist', color='purple', ax=ax)
ax.set_title('Crews Involved')
ax.set_xlabel('Crews involved')
# A bare plt.grid() toggles the grid; with the "darkgrid" style the grid is
# already on, so that call switched it off. Request it explicitly instead.
ax.grid(True)
plt.show()

In [88]:
# Scatter of the yearly AcresBurned totals held in `acres_sum` (index = year).
fig, ax = plt.subplots()
ax.scatter(acres_sum.index, acres_sum)
ax.set_title('Acres Burned sum per year')
ax.set_xlabel('Archive Year')
ax.set_ylabel('Total acres burned')
# A bare plt.grid() toggles the grid; with the "darkgrid" style the grid is
# already on, so that call switched it off. Request it explicitly instead.
ax.grid(True)
plt.show()

In [90]:
# Histogram of water tenders (if not NaN), taken from the raw `df`.
fig, ax = plt.subplots()
df.WaterTenders.plot(kind='hist', ax=ax)
ax.set_title('Water Tenders')
ax.set_xlabel('Water tenders')
# A bare plt.grid() toggles the grid; with the "darkgrid" style the grid is
# already on, so that call switched it off. Request it explicitly instead.
ax.grid(True)
plt.show()