In [180]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

# Data Processing and Feature Engineering

In [2]:
df = pd.read_csv('California Wild Fire.csv')
df.head()

Unnamed: 0,latitude,longitude,acq_date,satellite,instrument,confidence,year,month
0,32.35334,-114.76826,2022-03-23,Aqua,MODIS,80,2022,3
1,32.35882,-114.76273,2022-03-24,N,VIIRS,50,2022,3
2,32.35924,-114.76539,2022-03-24,N,VIIRS,50,2022,3
3,32.36003,-114.75967,2022-03-23,N,VIIRS,50,2022,3
4,32.36036,-114.75935,2022-03-23,N,VIIRS,50,2022,3



Confidence 

    This value is based on a collection of intermediate algorithm quantities used in the detection process. 
    
    It is intended to help users gauge the quality of individual hotspot/fire pixels. 
    
    Confidence estimates range between 0 and 100% and are assigned one of the three fire classes 
    (low-confidence fire, nominal-confidence fire, or high-confidence fire).
    

Latitude

    Center of 1 km fire pixel, but not necessarily the actual location of the fire as one or more fires can be detected within the 1km pixel.


Longitude
    
    Center of 1 km fire pixel, but not necessarily the actual location of the fire as one or more fires can be detected within the 1 km pixel.

In [3]:
df.describe()

Unnamed: 0,latitude,longitude,confidence,year,month
count,1116861.0,1116861.0,1116861.0,1116861.0,1116861.0
mean,38.59871,-120.6445,60.10158,2017.01,7.994844
std,2.377151,2.234325,18.51666,4.331892,2.00785
min,32.35334,-123.9997,50.0,2000.0,1.0
25%,36.88822,-122.7088,50.0,2015.0,8.0
50%,39.14958,-121.0172,50.0,2018.0,8.0
75%,40.528,-119.1556,58.0,2020.0,9.0
max,41.99997,-114.0001,100.0,2022.0,12.0


In [17]:
print(df['acq_date'].min())
print(df['acq_date'].max())

2000-11-01
2022-03-25


In [6]:
df['satellite'].unique()

array(['Aqua', 'N', '1', 'Terra'], dtype=object)

In [13]:
df['instrument'].unique()

array(['MODIS', 'VIIRS'], dtype=object)

In [6]:
df

Unnamed: 0,latitude,longitude,acq_date,satellite,instrument,confidence,year,month
0,32.4,-114.8,2022-03-23,Aqua,MODIS,80,2022,3
1,32.4,-114.8,2022-03-24,N,VIIRS,50,2022,3
2,32.4,-114.8,2022-03-24,N,VIIRS,50,2022,3
3,32.4,-114.8,2022-03-23,N,VIIRS,50,2022,3
4,32.4,-114.8,2022-03-23,N,VIIRS,50,2022,3
...,...,...,...,...,...,...,...,...
1116856,42.0,-120.7,2012-08-14,N,VIIRS,50,2012,8
1116857,42.0,-123.6,2020-09-20,N,VIIRS,50,2020,9
1116858,42.0,-122.6,2018-07-08,N,VIIRS,50,2018,7
1116859,42.0,-120.6,2012-08-14,N,VIIRS,50,2012,8


## Feature Engineering

In [18]:
df.head(4)

Unnamed: 0,latitude,longitude,acq_date,satellite,instrument,confidence,year,month
0,32.35334,-114.76826,2022-03-23,Aqua,MODIS,80,2022,3
1,32.35882,-114.76273,2022-03-24,N,VIIRS,50,2022,3
2,32.35924,-114.76539,2022-03-24,N,VIIRS,50,2022,3
3,32.36003,-114.75967,2022-03-23,N,VIIRS,50,2022,3


In [4]:
df.latitude = df.latitude.round(1)
df.longitude = df.longitude.round(1)

In [4]:
df.head()

Unnamed: 0,latitude,longitude,acq_date,satellite,instrument,confidence,year,month
0,32.4,-114.8,2022-03-23,Aqua,MODIS,80,2022,3
1,32.4,-114.8,2022-03-24,N,VIIRS,50,2022,3
2,32.4,-114.8,2022-03-24,N,VIIRS,50,2022,3
3,32.4,-114.8,2022-03-23,N,VIIRS,50,2022,3
4,32.4,-114.8,2022-03-23,N,VIIRS,50,2022,3


### Fire count

In [5]:
total_fires = df.groupby(['latitude','longitude','year','month']).size().reset_index()
total_fires.rename(columns={0:'fire count'},inplace=True)
total_fires

Unnamed: 0,latitude,longitude,year,month,fire count
0,32.4,-117.1,2022,1,3
1,32.4,-117.1,2022,3,14
2,32.4,-116.9,2022,1,5
3,32.4,-116.9,2022,2,9
4,32.4,-116.9,2022,3,2
...,...,...,...,...,...
58026,42.0,-114.1,2012,8,7
58027,42.0,-114.1,2019,8,3
58028,42.0,-114.0,2007,7,5
58029,42.0,-114.0,2013,7,4


In [6]:
total_fires.nunique()

latitude       97
longitude     101
year           23
month          12
fire count    866
dtype: int64

In [12]:
total_fires.isna().sum()

latitude      0
longitude     0
year          0
month         0
fire count    0
dtype: int64

### For each coordinates making year and month and seting it to zero if there are no fire incidents. 

In [13]:
fire_coordinate = total_fires[['latitude','longitude']].drop_duplicates()
fire_coordinate.head()

Unnamed: 0,latitude,longitude
0,32.4,-117.1
2,32.4,-116.9
5,32.4,-114.9
7,32.4,-114.8
10,32.4,-114.6


In [14]:
fire_coordinate.count()

latitude     4914
longitude    4914
dtype: int64

In [15]:
fire_times = total_fires[['year','month']].drop_duplicates()
fire_times

Unnamed: 0,year,month
0,2022,1
1,2022,3
3,2022,2
10,2014,6
14,2003,1
...,...,...
3450,2009,9
3764,2001,11
4085,2001,5
4168,2010,10


In [16]:
#we are creating a new column called 'feat'. This column will be used to merge two dataframes
fire_coordinate['feat'] = 1 
fire_times['feat'] = 1 

In [17]:
fire_times.head()

Unnamed: 0,year,month,feat
0,2022,1,1
1,2022,3,1
3,2022,2,1
10,2014,6,1
14,2003,1,1


In [18]:
df2 = pd.merge(fire_coordinate,fire_times,how='outer',on='feat')
df2

Unnamed: 0,latitude,longitude,feat,year,month
0,32.4,-117.1,1,2022,1
1,32.4,-117.1,1,2022,3
2,32.4,-117.1,1,2022,2
3,32.4,-117.1,1,2014,6
4,32.4,-117.1,1,2003,1
...,...,...,...,...,...
1262893,42.0,-114.0,1,2009,9
1262894,42.0,-114.0,1,2001,11
1262895,42.0,-114.0,1,2001,5
1262896,42.0,-114.0,1,2010,10


In [19]:
df2.loc[(df2['year'] == 2021) & (df2['latitude'] == 32.4) &(df2['longitude'] == -117.1)]

Unnamed: 0,latitude,longitude,feat,year,month
41,32.4,-117.1,1,2021,8
50,32.4,-117.1,1,2021,1
51,32.4,-117.1,1,2021,6
129,32.4,-117.1,1,2021,2
130,32.4,-117.1,1,2021,3
131,32.4,-117.1,1,2021,4
132,32.4,-117.1,1,2021,5
133,32.4,-117.1,1,2021,7
134,32.4,-117.1,1,2021,9
135,32.4,-117.1,1,2021,10


In [35]:
total_fires.head()

Unnamed: 0,latitude,longitude,year,month,fire count
0,32.4,-117.1,2022,1,3
1,32.4,-117.1,2022,3,14
2,32.4,-116.9,2022,1,5
3,32.4,-116.9,2022,2,9
4,32.4,-116.9,2022,3,2


In [20]:
history = df2.merge(total_fires,how='left', on =['latitude','longitude','year','month'])
history

Unnamed: 0,latitude,longitude,feat,year,month,fire count
0,32.4,-117.1,1,2022,1,3.0
1,32.4,-117.1,1,2022,3,14.0
2,32.4,-117.1,1,2022,2,
3,32.4,-117.1,1,2014,6,
4,32.4,-117.1,1,2003,1,
...,...,...,...,...,...,...
1262893,42.0,-114.0,1,2009,9,
1262894,42.0,-114.0,1,2001,11,
1262895,42.0,-114.0,1,2001,5,
1262896,42.0,-114.0,1,2010,10,


In [21]:
history['fire count'].isna().sum()

1204867

In [22]:
history['fire count'].fillna(0,inplace=True)

In [23]:
history['fire count'].isna().sum()

0

In [24]:
history['fire count'].value_counts().head()

0.0    1204867
1.0      21714
2.0      10234
3.0       5382
4.0       3615
Name: fire count, dtype: int64

### Creating a column where we it would be 1 if firecount of the given latitude, longitude on given year and month 

In [25]:
history['fire'] = (history['fire count']>=1)
history['fire'] 

0           True
1           True
2          False
3          False
4          False
           ...  
1262893    False
1262894    False
1262895    False
1262896    False
1262897    False
Name: fire, Length: 1262898, dtype: bool

In [26]:
history.loc[history['fire'] == True,'fire'] = 1 
history.loc[history['fire'] == False,'fire'] = 0 

In [27]:
history.head()

Unnamed: 0,latitude,longitude,feat,year,month,fire count,fire
0,32.4,-117.1,1,2022,1,3.0,1
1,32.4,-117.1,1,2022,3,14.0,1
2,32.4,-117.1,1,2022,2,0.0,0
3,32.4,-117.1,1,2014,6,0.0,0
4,32.4,-117.1,1,2003,1,0.0,0


In [145]:
history['fire'].value_counts()

0    1204867
1      58031
Name: fire, dtype: int64

In [171]:
history.count()

latitude      1262898
longitude     1262898
feat          1262898
year          1262898
month         1262898
fire count    1262898
fire          1262898
dtype: int64

### Yearly 

**we are creating a table to extract columns from it to get the wether there was fire last year and count of last year fire**

In [91]:
yearly1 = history.groupby(['latitude', 'longitude', 'year'])[['fire']].mean().reset_index()
yearly2 = history.groupby(['latitude', 'longitude', 'year'])[['fire count']].mean().reset_index()

yearly = pd.merge(yearly1,yearly2,on=['latitude', 'longitude', 'year'])

yearly.head()

Unnamed: 0,latitude,longitude,year,fire,fire count
0,32.4,-117.1,2000,0.0,0.0
1,32.4,-117.1,2001,0.0,0.0
2,32.4,-117.1,2002,0.0,0.0
3,32.4,-117.1,2003,0.0,0.0
4,32.4,-117.1,2004,0.0,0.0


In [71]:
# history.groupby(['latitude', 'longitude', 'year'])[['fire']].mean().reset_index()
yearly1.loc[yearly1['fire']>0]

Unnamed: 0,latitude,longitude,year,fire
22,32.4,-117.1,2022,0.666667
45,32.4,-116.9,2022,1.000000
68,32.4,-114.9,2022,0.666667
91,32.4,-114.8,2022,1.000000
106,32.4,-114.6,2014,0.083333
...,...,...,...,...
112988,42.0,-114.1,2012,0.083333
112995,42.0,-114.1,2019,0.083333
113006,42.0,-114.0,2007,0.083333
113012,42.0,-114.0,2013,0.083333


In [74]:
# history.loc

history.loc[(history['latitude'] == 32.4) & (history['longitude'] == -117.1) & (history['year'] == 2022)]

Unnamed: 0,latitude,longitude,feat,year,month,fire count,fire
0,32.4,-117.1,1,2022,1,3.0,1
1,32.4,-117.1,1,2022,3,14.0,1
2,32.4,-117.1,1,2022,2,0.0,0


In [92]:
last_year = yearly.copy()
last_year.year +=1 

In [93]:
last_year.rename(columns={'fire':'Last_year_fire', 'fire count':'fire_count_last_year'},inplace=True)
last_year

Unnamed: 0,latitude,longitude,year,Last_year_fire,fire_count_last_year
0,32.4,-117.1,2001,0.000000,0.000000
1,32.4,-117.1,2002,0.000000,0.000000
2,32.4,-117.1,2003,0.000000,0.000000
3,32.4,-117.1,2004,0.000000,0.000000
4,32.4,-117.1,2005,0.000000,0.000000
...,...,...,...,...,...
113017,42.0,-114.0,2019,0.083333,0.083333
113018,42.0,-114.0,2020,0.000000,0.000000
113019,42.0,-114.0,2021,0.000000,0.000000
113020,42.0,-114.0,2022,0.000000,0.000000


In [52]:
last_year.loc[(last_year['latitude'] == 42.0) & (last_year['longitude']== -114.0)]

Unnamed: 0,latitude,longitude,year,Last_year_fire,fire_count_last_year
112999,42.0,-114.0,2001,0.0,0.0
113000,42.0,-114.0,2002,0.0,0.0
113001,42.0,-114.0,2003,0.0,0.0
113002,42.0,-114.0,2004,0.0,0.0
113003,42.0,-114.0,2005,0.0,0.0
113004,42.0,-114.0,2006,0.0,0.0
113005,42.0,-114.0,2007,0.0,0.0
113006,42.0,-114.0,2008,0.083333,0.416667
113007,42.0,-114.0,2009,0.0,0.0
113008,42.0,-114.0,2010,0.0,0.0


### Monthly

In [40]:
monthly1 = history.groupby(['latitude', 'longitude', 'year','month'])[['fire']].mean().reset_index()
monthly2 = history.groupby(['latitude', 'longitude', 'year','month'])[['fire count']].mean().reset_index()

monthly = pd.merge(monthly1,monthly2,on=['latitude', 'longitude', 'year','month'])

monthly.head()

Unnamed: 0,latitude,longitude,year,month,fire,fire count
0,32.4,-117.1,2000,11,0.0,0.0
1,32.4,-117.1,2000,12,0.0,0.0
2,32.4,-117.1,2001,1,0.0,0.0
3,32.4,-117.1,2001,2,0.0,0.0
4,32.4,-117.1,2001,3,0.0,0.0


In [75]:
last_year_month = monthly.copy()
last_year_month.year += 1
last_year_month.columns = ['latitude', 'longitude', 'year', 'month', 'last_year_fire_same_month', 'fire_count_last_year_same_month']
last_year_month

Unnamed: 0,latitude,longitude,year,month,last_year_fire_same_month,fire_count_last_year_same_month
0,32.4,-117.1,2001,11,0.0,0.0
1,32.4,-117.1,2001,12,0.0,0.0
2,32.4,-117.1,2002,1,0.0,0.0
3,32.4,-117.1,2002,2,0.0,0.0
4,32.4,-117.1,2002,3,0.0,0.0
...,...,...,...,...,...,...
1262893,42.0,-114.0,2022,11,0.0,0.0
1262894,42.0,-114.0,2022,12,0.0,0.0
1262895,42.0,-114.0,2023,1,0.0,0.0
1262896,42.0,-114.0,2023,2,0.0,0.0


### getting fire data of previous yearss(till 2000) for each latitude and logitude

In [115]:
yearly['feat'] = 1 
yearly

Unnamed: 0,latitude,longitude,year,fire,fire count,feat
0,32.4,-117.1,2000,0.000000,0.000000,1
1,32.4,-117.1,2001,0.000000,0.000000,1
2,32.4,-117.1,2002,0.000000,0.000000,1
3,32.4,-117.1,2003,0.000000,0.000000,1
4,32.4,-117.1,2004,0.000000,0.000000,1
...,...,...,...,...,...,...
113017,42.0,-114.0,2018,0.083333,0.083333,1
113018,42.0,-114.0,2019,0.000000,0.000000,1
113019,42.0,-114.0,2020,0.000000,0.000000,1
113020,42.0,-114.0,2021,0.000000,0.000000,1


In [95]:
history.head()

Unnamed: 0,latitude,longitude,feat,year,month,fire count,fire
0,32.4,-117.1,1,2022,1,3.0,1
1,32.4,-117.1,1,2022,3,14.0,1
2,32.4,-117.1,1,2022,2,0.0,0
3,32.4,-117.1,1,2014,6,0.0,0
4,32.4,-117.1,1,2003,1,0.0,0


In [96]:
history[['latitude', 'longitude', 'year','feat']].drop_duplicates().count()

latitude     113022
longitude    113022
year         113022
feat         113022
dtype: int64

In [97]:
yearly.count()

latitude      113022
longitude     113022
year          113022
fire          113022
fire count    113022
dtype: int64

In [116]:
past_history = history[['latitude', 'longitude', 'year','feat']].drop_duplicates().merge(
                        yearly, on=['latitude', 'longitude', 'feat'])

In [117]:
past_history = past_history[past_history.year_x<past_history.year_y]
past_history

Unnamed: 0,latitude,longitude,year_x,feat,year_y,fire,fire count
38,32.4,-117.1,2014,1,2015,0.000000,0.000000
39,32.4,-117.1,2014,1,2016,0.000000,0.000000
40,32.4,-117.1,2014,1,2017,0.000000,0.000000
41,32.4,-117.1,2014,1,2018,0.000000,0.000000
42,32.4,-117.1,2014,1,2019,0.000000,0.000000
...,...,...,...,...,...,...,...
2599501,42.0,-114.0,2000,1,2018,0.083333,0.083333
2599502,42.0,-114.0,2000,1,2019,0.000000,0.000000
2599503,42.0,-114.0,2000,1,2020,0.000000,0.000000
2599504,42.0,-114.0,2000,1,2021,0.000000,0.000000


In [118]:
np.unique(past_history.loc[past_history['year_y']==2003]['year_x'])

array([2000, 2001, 2002], dtype=int64)

In [123]:
past_history = past_history.groupby(['latitude','longitude','year_y'])[['fire','fire count']].mean().reset_index()
past_history

Unnamed: 0,latitude,longitude,year_y,fire,fire count
0,32.4,-117.1,2001,0.000000,0.000000
1,32.4,-117.1,2002,0.000000,0.000000
2,32.4,-117.1,2003,0.000000,0.000000
3,32.4,-117.1,2004,0.000000,0.000000
4,32.4,-117.1,2005,0.000000,0.000000
...,...,...,...,...,...
108103,42.0,-114.0,2018,0.083333,0.083333
108104,42.0,-114.0,2019,0.000000,0.000000
108105,42.0,-114.0,2020,0.000000,0.000000
108106,42.0,-114.0,2021,0.000000,0.000000


In [126]:
past_history.rename(columns={'fire':'fire_before','fire count':'fire count before','year_y':'year'},inplace=True)
past_history

Unnamed: 0,latitude,longitude,year,fire_before,fire count before
0,32.4,-117.1,2001,0.000000,0.000000
1,32.4,-117.1,2002,0.000000,0.000000
2,32.4,-117.1,2003,0.000000,0.000000
3,32.4,-117.1,2004,0.000000,0.000000
4,32.4,-117.1,2005,0.000000,0.000000
...,...,...,...,...,...
108103,42.0,-114.0,2018,0.083333,0.083333
108104,42.0,-114.0,2019,0.000000,0.000000
108105,42.0,-114.0,2020,0.000000,0.000000
108106,42.0,-114.0,2021,0.000000,0.000000


## Getting all together in one dataframe(Making final Dataframe for model building)

In [141]:
x_data = history.merge(past_history, how='left', on=['latitude', 'longitude', 'year'])
x_data

Unnamed: 0,latitude,longitude,feat,year,month,fire count,fire,fire_before,fire count before
0,32.4,-117.1,1,2022,1,3.0,1,0.666667,5.666667
1,32.4,-117.1,1,2022,3,14.0,1,0.666667,5.666667
2,32.4,-117.1,1,2022,2,0.0,0,0.666667,5.666667
3,32.4,-117.1,1,2014,6,0.0,0,0.000000,0.000000
4,32.4,-117.1,1,2003,1,0.0,0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
1262893,42.0,-114.0,1,2009,9,0.0,0,0.000000,0.000000
1262894,42.0,-114.0,1,2001,11,0.0,0,0.000000,0.000000
1262895,42.0,-114.0,1,2001,5,0.0,0,0.000000,0.000000
1262896,42.0,-114.0,1,2010,10,0.0,0,0.000000,0.000000


In [142]:
x_data = x_data.merge(last_year, how='left', on=['latitude', 'longitude', 'year'])
x_data = x_data.merge(last_year_month, how='left', on=['latitude', 'longitude', 'year','month'])

In [148]:
x_data.head(4)

Unnamed: 0,latitude,longitude,year,month,fire count,fire,fire_before,fire count before,Last_year_fire,fire_count_last_year,last_year_fire_same_month,fire_count_last_year_same_month
0,32.4,-117.1,2022,1,3.0,1,0.666667,5.666667,0.0,0.0,0.0,0.0
1,32.4,-117.1,2022,3,14.0,1,0.666667,5.666667,0.0,0.0,0.0,0.0
2,32.4,-117.1,2022,2,0.0,0,0.666667,5.666667,0.0,0.0,0.0,0.0
3,32.4,-117.1,2014,6,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [145]:
x_data.drop(columns=['feat_y','feat_x'],inplace=True)

In [146]:
x_data

Unnamed: 0,latitude,longitude,year,month,fire count,fire,fire_before,fire count before,Last_year_fire,fire_count_last_year,last_year_fire_same_month,fire_count_last_year_same_month
0,32.4,-117.1,2022,1,3.0,1,0.666667,5.666667,0.0,0.0,0.0,0.0
1,32.4,-117.1,2022,3,14.0,1,0.666667,5.666667,0.0,0.0,0.0,0.0
2,32.4,-117.1,2022,2,0.0,0,0.666667,5.666667,0.0,0.0,0.0,0.0
3,32.4,-117.1,2014,6,0.0,0,0.000000,0.000000,0.0,0.0,0.0,0.0
4,32.4,-117.1,2003,1,0.0,0,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1262893,42.0,-114.0,2009,9,0.0,0,0.000000,0.000000,0.0,0.0,0.0,0.0
1262894,42.0,-114.0,2001,11,0.0,0,0.000000,0.000000,0.0,0.0,0.0,0.0
1262895,42.0,-114.0,2001,5,0.0,0,0.000000,0.000000,0.0,0.0,,
1262896,42.0,-114.0,2010,10,0.0,0,0.000000,0.000000,0.0,0.0,0.0,0.0


### **Columns(attributes information)**

   __Fire__ - 1/0 (1 - there was a fire, 0 - there was no fire)
    
   __Fire count__ - Number of fire accidents in that respective year and month
    
   __fire before__ - average of fire incidents(1/0) in previous years(till 2000)
    
   __fire_count_Before__ - average count of fire incidents in previous years(till 2000) 
    
   __last_year_fire__ - was their any fire accident in last year(1/0)
    
   __fire count last year__ - How many fire accidents happened in last year(if none happened then the value will be 0)
    
   __last year same month__ - Last year but same month was their any fire accident(1/0)
    
   __fire count last year same month__ - How many fire accidents happened in last year but exact same month (if none happened then the value will be 0)

In [151]:
x_data.groupby('year').count()

Unnamed: 0_level_0,latitude,longitude,month,fire count,fire,fire_before,fire count before,Last_year_fire,fire_count_last_year,last_year_fire_same_month,fire_count_last_year_same_month
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000,9828,9828,9828,9828,9828,0,0,0,0,0,0
2001,58968,58968,58968,58968,58968,58968,58968,58968,58968,9828,9828
2002,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2003,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2004,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2005,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2006,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2007,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2008,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2009,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968


### Train & test and validation set 

In [160]:
train = x_data.loc[x_data['year']<2020].dropna()
validate = x_data.loc[(x_data['year']>=2020) & (x_data['year']<=2021)].dropna()
test = x_data.loc[(x_data['year']>=2022) ].dropna()

In [158]:
test = x_data.loc[(x_data['year']>=2022) ].dropna()

Unnamed: 0,latitude,longitude,year,month,fire count,fire,fire_before,fire count before,Last_year_fire,fire_count_last_year,last_year_fire_same_month,fire_count_last_year_same_month
0,32.4,-117.1,2022,1,3.0,1,0.666667,5.666667,0.0,0.0,0.0,0.0
1,32.4,-117.1,2022,3,14.0,1,0.666667,5.666667,0.0,0.0,0.0,0.0
2,32.4,-117.1,2022,2,0.0,0,0.666667,5.666667,0.0,0.0,0.0,0.0
257,32.4,-116.9,2022,1,5.0,1,1.000000,5.333333,0.0,0.0,0.0,0.0
258,32.4,-116.9,2022,3,2.0,1,1.000000,5.333333,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1262385,42.0,-114.1,2022,3,0.0,0,0.000000,0.000000,0.0,0.0,0.0,0.0
1262386,42.0,-114.1,2022,2,0.0,0,0.000000,0.000000,0.0,0.0,0.0,0.0
1262641,42.0,-114.0,2022,1,0.0,0,0.000000,0.000000,0.0,0.0,0.0,0.0
1262642,42.0,-114.0,2022,3,0.0,0,0.000000,0.000000,0.0,0.0,0.0,0.0


In [161]:
train.groupby('year').count()

Unnamed: 0_level_0,latitude,longitude,month,fire count,fire,fire_before,fire count before,Last_year_fire,fire_count_last_year,last_year_fire_same_month,fire_count_last_year_same_month
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2001,9828,9828,9828,9828,9828,9828,9828,9828,9828,9828,9828
2002,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2003,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2004,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2005,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2006,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2007,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2008,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2009,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2010,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968


In [162]:
test.groupby('year').count()

Unnamed: 0_level_0,latitude,longitude,month,fire count,fire,fire_before,fire count before,Last_year_fire,fire_count_last_year,last_year_fire_same_month,fire_count_last_year_same_month
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022,14742,14742,14742,14742,14742,14742,14742,14742,14742,14742,14742


In [163]:
validate.groupby('year').count()

Unnamed: 0_level_0,latitude,longitude,month,fire count,fire,fire_before,fire count before,Last_year_fire,fire_count_last_year,last_year_fire_same_month,fire_count_last_year_same_month
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968
2021,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968,58968


## Saving the date

In [164]:
train.to_csv('ca_fire_data_train.csv',index = False)
test.to_csv('ca_fire_data_test.csv',index = False)
validate.to_csv('ca_fire_data_validate.csv',index = False)