In [1]:
import pandas as pd
import numpy as np
from datetime import date

In [2]:
# Base directories used by the notebook. `import_path` is the folder the CSV
# inputs are read from; `export_path` and `plots_path` are not used in the
# cells shown here (presumably for saved results and figures — TODO confirm).
# NOTE(review): these are absolute Windows paths; adjust them per machine.
import_path = r"C:\Datasets"
export_path = r"C:\Datasets"
plots_path = r'C:\Datasets\plots'

In [3]:
# Load the fire detections CSV from `import_path`, parsing 'acq_date' as datetime.
file, date_fields = 'FireData.csv', ['acq_date']
fire = pd.read_csv('\\'.join([import_path, file]), parse_dates=date_fields)
print(fire.shape)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" to the output.
fire.info()
fire

(8345062, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8345062 entries, 0 to 8345061
Data columns (total 12 columns):
latitude      float64
longitude     float64
brightness    float64
scan          float64
track         float64
acq_date      datetime64[ns]
acq_time      object
confidence    object
bright_t31    float64
frp           float64
daynight      object
type          float64
dtypes: datetime64[ns](1), float64(8), object(3)
memory usage: 764.0+ MB
None


Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,confidence,bright_t31,frp,daynight,type
0,-32.052837,151.245422,331.58,0.36,0.58,2014-01-01,03:01:00.0000000,n,273.95,6.16,D,0.0
1,-32.057903,151.251083,347.15,0.36,0.58,2014-01-01,03:01:00.0000000,n,276.08,8.27,D,0.0
2,-33.106487,150.480713,354.04,0.41,0.61,2014-01-01,03:01:00.0000000,n,274.22,9.60,D,0.0
3,-33.092632,150.425232,334.32,0.42,0.61,2014-01-01,03:01:00.0000000,n,273.75,4.07,D,0.0
4,-32.068897,151.249878,330.37,0.36,0.58,2014-01-01,03:01:00.0000000,n,271.73,3.41,D,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8345057,-32.728120,116.353980,297.20,0.46,0.39,2021-04-20,17:00:00.0000000,n,283.80,0.70,N,0.0
8345058,-32.667190,116.329360,298.70,0.46,0.39,2021-04-20,17:00:00.0000000,n,284.30,1.00,N,0.0
8345059,-30.481630,115.502590,301.70,0.54,0.42,2021-04-20,17:00:00.0000000,n,284.80,1.10,N,0.0
8345060,-30.481580,115.501350,302.20,0.54,0.42,2021-04-20,17:00:00.0000000,n,284.50,1.30,N,0.0


In [4]:
# More recent satellite .csv files do not have 'type' data - NaN values coming
# from these files are filled with 0 (possibly inaccurate).
fire['type'] = fire['type'].fillna(0)

print('Measurement TYPE counts - before clean',fire['type'].value_counts(),'\n',sep = '\n')

# Keep only measurement type 0 (presumed vegetation fire).
fire = fire[fire['type'] == 0]

print('Measurement TYPE counts - after clean',fire['type'].value_counts(),'\n',sep = '\n')

# Keep only 'confidence' values other than 'l' (low); n (nominal) and h (high) remain.
fire = fire.loc[fire['confidence'] != 'l'].copy()

print('Measurement Confidence',fire['confidence'].value_counts(),'\n',sep = '\n')

# Select the columns still needed, in their final order. Everything not listed
# (acq_time, scan, track, daynight, type) is left out by this selection, so no
# separate drop is required.
fire = fire[['acq_date','latitude','longitude','brightness','bright_t31','confidence','frp']].reset_index(drop=True)

# info() prints its own report and returns None, so it is not wrapped in print().
fire.info()

# Trimmed-down frame used as input for reverse geocoding.
fire_filter = fire.filter(["latitude","longitude","acq_date","frp"])

# all data
fire

Measurement TYPE counts - before clean
0.0    8206906
3.0      70439
2.0      67717
Name: type, dtype: int64


Measurement TYPE counts - after clean
0.0    8206906
Name: type, dtype: int64


Measurement Confidence
n    6673402
h     754365
Name: confidence, dtype: int64


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7427767 entries, 0 to 7427766
Data columns (total 7 columns):
acq_date      datetime64[ns]
latitude      float64
longitude     float64
brightness    float64
bright_t31    float64
confidence    object
frp           float64
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 396.7+ MB
None


Unnamed: 0,acq_date,latitude,longitude,brightness,bright_t31,confidence,frp
0,2014-01-01,-32.052837,151.245422,331.58,273.95,n,6.16
1,2014-01-01,-32.057903,151.251083,347.15,276.08,n,8.27
2,2014-01-01,-33.106487,150.480713,354.04,274.22,n,9.60
3,2014-01-01,-33.092632,150.425232,334.32,273.75,n,4.07
4,2014-01-01,-32.068897,151.249878,330.37,271.73,n,3.41
...,...,...,...,...,...,...,...
7427762,2021-04-20,-32.728120,116.353980,297.20,283.80,n,0.70
7427763,2021-04-20,-32.667190,116.329360,298.70,284.30,n,1.00
7427764,2021-04-20,-30.481630,115.502590,301.70,284.80,n,1.10
7427765,2021-04-20,-30.481580,115.501350,302.20,284.50,n,1.30


In [5]:
import reverse_geocoder as rg
import pprint as pp

# Build the list of (latitude, longitude) tuples passed to rg.search.
# Zipping the two columns yields the same pairs as a row-wise
# DataFrame.apply(tuple, axis=1) without constructing a Series for every row,
# which matters for a frame with millions of rows.
reverse_geo = list(zip(fire_filter['latitude'], fire_filter['longitude']))

results = rg.search(reverse_geo)
rg_viirs = pd.DataFrame(results)

# Tidy up the columns: names are assigned by position, then reordered so that
# 'region' comes before 'state'.
rg_viirs.columns = ['lat_suburb','long_suburb','suburb','state','region','country']
rg_viirs = rg_viirs[['lat_suburb','long_suburb','suburb','region','state','country']]
rg_viirs

Loading formatted geocoded file...


Unnamed: 0,lat_suburb,long_suburb,suburb,region,state,country
0,-32.26118,150.89009,Muswellbrook,Muswellbrook,New South Wales,AU
1,-32.26118,150.89009,Muswellbrook,Muswellbrook,New South Wales,AU
2,-33.48247,150.13631,Lithgow,Lithgow,New South Wales,AU
3,-33.48247,150.13631,Lithgow,Lithgow,New South Wales,AU
4,-32.26118,150.89009,Muswellbrook,Muswellbrook,New South Wales,AU
...,...,...,...,...,...,...
7427762,-32.84323,115.92201,Waroona,Waroona,Western Australia,AU
7427763,-32.84323,115.92201,Waroona,Waroona,Western Australia,AU
7427764,-30.30591,115.03825,Jurien Bay,Dandaragan,Western Australia,AU
7427765,-30.30591,115.03825,Jurien Bay,Dandaragan,Western Australia,AU


In [6]:
# Put the reverse-geocoded columns next to the fire detections. Rows are paired
# by index label; ignore_index=True discards the column labels, so they are
# re-assigned in full on the next line.
fire = pd.concat([fire,rg_viirs], axis=1, ignore_index=True)
fire.columns =['acq_date','latitude','longitude','brightness','bright_t31','confidence','frp','lat_suburb','long_suburb','suburb','region','state','country']

# Cast the geocoder coordinates to float by column name. Assigning whole
# columns replaces them together with their dtype, whereas a positional
# `iloc[:, 7:9] = ...` writes into the existing columns and, on recent pandas,
# keeps their original dtype.
coord_cols = ['lat_suburb', 'long_suburb']
fire[coord_cols] = fire[coord_cols].astype('float')

# Keep Australian detections only; 'country' is constant afterwards and is dropped.
fire = fire[fire['country'] == 'AU'].reset_index(drop=True)
fire = fire.drop(['country'], axis=1)

# Keep detections from 2014-09-01 onwards to match the start of the weather
# data. Only this lower bound is applied here.
fire = fire.loc[fire["acq_date"] > "2014-08-31"].reset_index(drop = True)
fire

Unnamed: 0,acq_date,latitude,longitude,brightness,bright_t31,confidence,frp,lat_suburb,long_suburb,suburb,region,state
0,2014-09-01,-36.058708,141.151855,325.21,285.02,n,2.57,-36.30768,140.77167,Bordertown,Tatiara,South Australia
1,2014-09-01,-36.193665,141.106796,334.36,287.15,n,2.68,-36.30768,140.77167,Bordertown,Tatiara,South Australia
2,2014-09-01,-30.645002,152.868011,326.96,297.31,n,1.04,-30.64997,152.85146,Bowraville,Nambucca Shire,New South Wales
3,2014-09-01,-31.571972,148.971619,328.54,297.54,n,5.87,-31.71173,148.66252,Gilgandra,Gilgandra,New South Wales
4,2014-09-01,-31.571371,148.975906,331.45,299.01,n,5.87,-31.71173,148.66252,Gilgandra,Gilgandra,New South Wales
...,...,...,...,...,...,...,...,...,...,...,...,...
6907576,2021-04-20,-32.728120,116.353980,297.20,283.80,n,0.70,-32.84323,115.92201,Waroona,Waroona,Western Australia
6907577,2021-04-20,-32.667190,116.329360,298.70,284.30,n,1.00,-32.84323,115.92201,Waroona,Waroona,Western Australia
6907578,2021-04-20,-30.481630,115.502590,301.70,284.80,n,1.10,-30.30591,115.03825,Jurien Bay,Dandaragan,Western Australia
6907579,2021-04-20,-30.481580,115.501350,302.20,284.50,n,1.30,-30.30591,115.03825,Jurien Bay,Dandaragan,Western Australia


In [7]:
# Per-suburb record counts for fire rows whose 'region' is an empty string.
fire.loc[fire['region'] == ''].groupby('suburb').count()

Unnamed: 0_level_0,acq_date,latitude,longitude,brightness,bright_t31,confidence,frp,lat_suburb,long_suburb,region,state
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Acton,3,3,3,3,3,3,3,3,3,3,3
Alexandria,3,3,3,3,3,3,3,3,3,3,3
Alyangula,355366,355366,355366,355366,355366,355366,355366,355366,355366,355366,355366
Ballarat East,2,2,2,2,2,2,2,2,2,2,2
Belconnen,12,12,12,12,12,12,12,12,12,12,12
Black Hill,9,9,9,9,9,9,9,9,9,9,9
Booker Bay,2,2,2,2,2,2,2,2,2,2,2
Brown Hill,375,375,375,375,375,375,375,375,375,375,375
Canberra,1,1,1,1,1,1,1,1,1,1,1
Chinderah,5,5,5,5,5,5,5,5,5,5,5


In [8]:
import warnings
# NOTE(review): blanket suppression is kept so later cells see the same warning
# filter as before; the assignments below no longer depend on it.
warnings.filterwarnings("ignore")

# Fill in 'region' for locations where it was left blank.
# Assignments go through fire.loc[mask, 'region'] so they always write into
# `fire` itself; the chained form `fire.region[mask] = value` may write to a
# temporary copy and silently leave the frame unchanged.
fire.loc[fire['state'] == 'Australian Capital Territory', 'region'] = 'Australian Capital Territory'

# suburb -> region. Matching is on the suburb name alone (any state) and is
# applied after the ACT assignment above, so a listed suburb takes precedence.
fire_region_fixes = {
    'Ballarat East': 'Ballarat',
    'Black Hill': 'Ballarat',
    'Brown Hill': 'Ballarat',
    'Mount Pleasant': 'Ballarat',
    'Heatherton': 'Kingston',
    'Heidelberg West': 'Banyule',
    'Plenty': 'Nillumbik',
    'Somers': 'Mornington Peninsula',
    'Alexandria': 'City of Sydney',
    'Booker Bay': 'Central Coast',
    'Chinderah': 'Tweed',
    'Alyangula': 'East Arnhem',
    'Port Denison': 'Irwin',
}
for suburb_name, region_name in fire_region_fixes.items():
    fire.loc[fire['suburb'] == suburb_name, 'region'] = region_name

# Suburbs that still have an empty region after the fixes.
fire[fire.region==''].groupby('suburb').count()

Unnamed: 0_level_0,acq_date,latitude,longitude,brightness,bright_t31,confidence,frp,lat_suburb,long_suburb,region,state
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Innamincka,4961,4961,4961,4961,4961,4961,4961,4961,4961,4961,4961
Koomooloo,14,14,14,14,14,14,14,14,14,14,14
The Sill,484,484,484,484,484,484,484,484,484,484,484


In [9]:
# Assign 'nil' to any records where there is no council defined by state government
# (i.e. 'region' is still an empty string).
fire['region'] = fire['region'].replace({'': 'nil'})

# Filter the 'nil' rows once and reuse the result for both the count and the
# per-suburb sample. (A bare groupby expression in the middle of a cell is
# neither displayed nor stored, so none is evaluated here.)
fire_nil_rows = fire[fire.region == 'nil']
temp1 = fire_nil_rows.groupby('suburb')
print('\nCount of records with value of nil for region = ',len(fire_nil_rows.index),'\n')
temp1.first()


Count of records with value of nil for region =  5459 



Unnamed: 0_level_0,acq_date,latitude,longitude,brightness,bright_t31,confidence,frp,lat_suburb,long_suburb,region,state
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Innamincka,2014-09-01,-25.086842,142.110504,354.79,311.94,n,7.08,-27.70728,140.73697,nil,South Australia
Koomooloo,2015-03-04,-33.883419,139.181046,346.75,318.21,n,3.47,-33.5547,139.46805,nil,South Australia
The Sill,2014-09-05,-27.588493,139.580704,295.29,282.9,n,0.48,-28.16048,138.6754,nil,South Australia


In [10]:
# One row per (suburb, state) pair - the last occurrence in `fire` - reduced to
# the location columns and ordered by suburb, then state.
location_cols = ['lat_suburb', 'long_suburb', 'suburb', 'region', 'state']
fire_locats2021 = (
    fire
    .drop_duplicates(subset=['suburb', 'state'], keep='last')
    .reset_index(drop=True)
    .loc[:, location_cols]
    .sort_values(['suburb', 'state'], ascending=[True, True])
    .reset_index(drop=True)
)
fire_locats2021

Unnamed: 0,lat_suburb,long_suburb,suburb,region,state
0,-35.27767,149.11829,Acton,Australian Capital Territory,Australian Capital Territory
1,-34.91119,138.70735,Adelaide Hills,Adelaide Hills,South Australia
2,-35.00310,117.86595,Albany,Albany,Western Australia
3,-34.85925,138.52138,Alberton,Port Adelaide Enfield,South Australia
4,-35.01667,138.73333,Aldgate,Adelaide Hills,South Australia
...,...,...,...,...,...
1101,-23.12683,150.74406,Yeppoon,Rockhampton,Queensland
1102,-31.88809,116.76780,York,York,Western Australia
1103,-16.80278,145.72083,Yorkeys Knob,Cairns,Queensland
1104,-34.31350,148.30107,Young,Young,New South Wales


In [11]:
# Load the weather observations CSV from `import_path`, parsing 'Date' as datetime.
file, date_fields = 'WeatherData.csv', ['Date']
weather = pd.read_csv('\\'.join([import_path, file]), parse_dates=date_fields)
print(weather.shape)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" to the output.
weather.info()
weather

(2237302, 22)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2237302 entries, 0 to 2237301
Data columns (total 22 columns):
Date             datetime64[ns]
MaxTemp          int64
MinTemp          int64
Snow             float64
SunHour          float64
UVindex          int64
Moon             int64
Dewpoint         int64
FeelsLike        int64
HeatIndex        int64
WindChill        int64
WindGust         int64
CloudCover       int64
Humidity         int64
Precipitation    float64
Pressure         int64
Temperature      int64
Visibility       int64
WindDirection    int64
WindSpeed        int64
Latitude         float64
Longitude        float64
dtypes: datetime64[ns](1), float64(5), int64(16)
memory usage: 375.5 MB
None


Unnamed: 0,Date,MaxTemp,MinTemp,Snow,SunHour,UVindex,Moon,Dewpoint,FeelsLike,HeatIndex,...,CloudCover,Humidity,Precipitation,Pressure,Temperature,Visibility,WindDirection,WindSpeed,Latitude,Longitude
0,2016-11-27,26,7,0.0,14.5,5,0,5,17,17,...,3,51,0.0,1015,26,10,251,7,-34.8164,147.196
1,2016-11-28,29,12,0.0,14.5,6,0,5,20,20,...,3,43,0.0,1012,29,10,182,12,-34.8164,147.196
2,2016-11-29,30,10,0.0,14.5,6,0,3,20,20,...,16,37,0.0,1010,30,10,210,9,-34.8164,147.196
3,2016-11-30,33,14,0.0,14.5,6,2,5,22,22,...,12,38,0.0,1007,33,10,201,21,-34.8164,147.196
4,2016-12-01,30,11,0.0,13.4,6,9,4,21,21,...,20,38,0.0,1007,30,10,282,10,-34.8164,147.196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2237297,2016-11-22,33,20,0.0,14.5,7,36,10,26,26,...,20,42,0.1,1013,33,10,154,17,-34.8164,147.196
2237298,2016-11-23,19,10,0.0,12.3,4,28,10,15,15,...,54,72,2.4,1012,19,10,241,19,-34.8164,147.196
2237299,2016-11-24,21,6,0.0,14.5,4,21,4,12,14,...,3,55,0.0,1015,21,10,230,15,-34.8164,147.196
2237300,2016-11-25,22,7,0.0,14.5,4,13,4,14,15,...,1,52,0.0,1015,22,10,231,17,-34.8164,147.196


In [12]:
# Remove the columns that are of no value to fire prediction.
weather = weather.drop(columns=['Snow', 'Moon', 'Temperature'])

# Rename the remaining columns to match previous work. Names are assigned by
# position, so this list must follow the column order of the frame.
weather_column_names = [
    'date', 'maxC', 'minC', 'sunHour', 'uv', 'DewPointC', 'FeelsLikeC', 'HeatIndexC',
    'WindChillC', 'WindGustKmph', 'cloudcover', 'humidity', 'precipMM', 'pressure',
    'visibility', 'windAZI', 'windKmph', 'lat_w', 'long_w',
]
weather.columns = weather_column_names
weather

Unnamed: 0,date,maxC,minC,sunHour,uv,DewPointC,FeelsLikeC,HeatIndexC,WindChillC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,windAZI,windKmph,lat_w,long_w
0,2016-11-27,26,7,14.5,5,5,17,17,17,10,3,51,0.0,1015,10,251,7,-34.8164,147.196
1,2016-11-28,29,12,14.5,6,5,20,20,20,19,3,43,0.0,1012,10,182,12,-34.8164,147.196
2,2016-11-29,30,10,14.5,6,3,20,20,20,14,16,37,0.0,1010,10,210,9,-34.8164,147.196
3,2016-11-30,33,14,14.5,6,5,22,22,23,31,12,38,0.0,1007,10,201,21,-34.8164,147.196
4,2016-12-01,30,11,13.4,6,4,21,21,21,15,20,38,0.0,1007,10,282,10,-34.8164,147.196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2237297,2016-11-22,33,20,14.5,7,10,26,26,25,27,20,42,0.1,1013,10,154,17,-34.8164,147.196
2237298,2016-11-23,19,10,12.3,4,10,15,15,15,27,54,72,2.4,1012,10,241,19,-34.8164,147.196
2237299,2016-11-24,21,6,14.5,4,4,12,14,12,23,3,55,0.0,1015,10,230,15,-34.8164,147.196
2237300,2016-11-25,22,7,14.5,4,4,14,15,14,26,1,52,0.0,1015,10,231,17,-34.8164,147.196


In [13]:
# `rg` (reverse_geocoder) is imported in the cell that geocodes the fire data.

# Build the list of (latitude, longitude) tuples passed to rg.search.
# Zipping the two columns yields the same pairs as a row-wise
# DataFrame.apply(tuple, axis=1) without constructing a Series for every row.
weather_revgeo = list(zip(weather['lat_w'], weather['long_w']))

results1 = rg.search(weather_revgeo)
rg_weather = pd.DataFrame(results1)

# Tidy up the columns: names are assigned by position, then reordered so that
# 'region_w' comes before 'state_w'.
rg_weather.columns = ['lat_w_rg','long_w_rg','suburb_w','state_w','region_w','country']
rg_weather = rg_weather[['lat_w_rg','long_w_rg','suburb_w','region_w','state_w','country']]
# Cast the geocoder coordinates to float.
rg_weather['lat_w_rg'] = rg_weather['lat_w_rg'].astype('float')
rg_weather['long_w_rg'] = rg_weather['long_w_rg'].astype('float')
rg_weather

Unnamed: 0,lat_w_rg,long_w_rg,suburb_w,region_w,state_w,country
0,-34.81641,147.19577,Coolamon,Coolamon,New South Wales,AU
1,-34.81641,147.19577,Coolamon,Coolamon,New South Wales,AU
2,-34.81641,147.19577,Coolamon,Coolamon,New South Wales,AU
3,-34.81641,147.19577,Coolamon,Coolamon,New South Wales,AU
4,-34.81641,147.19577,Coolamon,Coolamon,New South Wales,AU
...,...,...,...,...,...,...
2237297,-34.81641,147.19577,Coolamon,Coolamon,New South Wales,AU
2237298,-34.81641,147.19577,Coolamon,Coolamon,New South Wales,AU
2237299,-34.81641,147.19577,Coolamon,Coolamon,New South Wales,AU
2237300,-34.81641,147.19577,Coolamon,Coolamon,New South Wales,AU


In [14]:
# Place the reverse-geocoded columns beside the weather rows (paired by index
# label). ignore_index=True discards the column labels, so the full list of
# names is assigned again afterwards.
weather = pd.concat([weather, rg_weather], axis=1, ignore_index=True)
weather.columns = [
    'date', 'maxC', 'minC', 'sunHour', 'uv', 'DewPointC', 'FeelsLikeC',
    'HeatIndexC', 'WindChillC', 'WindGustKmph', 'cloudcover', 'humidity',
    'precipMM', 'pressure', 'visibility', 'windAZI', 'windKmph',
    'lat_w', 'long_w', 'lat_w_rg', 'long_w_rg', 'suburb_w', 'region_w', 'state_w', 'country',
]

# Keep Australian rows only, drop the now-constant 'country' column and order
# the result by suburb, then date.
in_australia = weather['country'] == 'AU'
weather = (
    weather[in_australia]
    .reset_index(drop=True)
    .drop(columns='country')
    .sort_values(['suburb_w', 'date'], ascending=[True, True])
    .reset_index(drop=True)
)

weather

Unnamed: 0,date,maxC,minC,sunHour,uv,DewPointC,FeelsLikeC,HeatIndexC,WindChillC,WindGustKmph,...,visibility,windAZI,windKmph,lat_w,long_w,lat_w_rg,long_w_rg,suburb_w,region_w,state_w
0,2020-12-01,31,11,13.4,6,12,23,23,22,18,...,8,216,13,-35.27767,149.11829,-35.27767,149.11829,Acton,,Australian Capital Territory
1,2020-12-02,24,12,14.5,6,7,19,20,19,16,...,10,271,12,-35.27767,149.11829,-35.27767,149.11829,Acton,,Australian Capital Territory
2,2020-12-03,26,12,13.4,5,8,18,19,18,14,...,10,197,11,-35.27767,149.11829,-35.27767,149.11829,Acton,,Australian Capital Territory
3,2020-12-04,26,11,14.5,6,4,20,20,20,16,...,10,245,11,-35.27767,149.11829,-35.27767,149.11829,Acton,,Australian Capital Territory
4,2020-12-05,27,11,10.2,5,11,18,18,18,21,...,9,229,15,-35.27767,149.11829,-35.27767,149.11829,Acton,,Australian Capital Territory
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2237297,2021-04-17,21,18,4.2,4,16,20,20,20,17,...,9,186,12,-27.35591,153.04453,-27.35591,153.04453,Zillmere,Brisbane,Queensland
2237298,2021-04-18,21,15,5.7,4,13,18,18,18,15,...,10,201,10,-27.35591,153.04453,-27.35591,153.04453,Zillmere,Brisbane,Queensland
2237299,2021-04-19,23,15,8.7,5,13,19,19,19,12,...,10,154,8,-27.35591,153.04453,-27.35591,153.04453,Zillmere,Brisbane,Queensland
2237300,2021-04-20,24,17,8.7,6,15,20,21,20,14,...,10,253,10,-27.35591,153.04453,-27.35591,153.04453,Zillmere,Brisbane,Queensland


In [15]:
# Per-suburb record counts for weather rows whose 'region_w' is an empty string.
weather.loc[weather['region_w'] == ''].groupby('suburb_w').count()

Unnamed: 0_level_0,date,maxC,minC,sunHour,uv,DewPointC,FeelsLikeC,HeatIndexC,WindChillC,WindGustKmph,...,pressure,visibility,windAZI,windKmph,lat_w,long_w,lat_w_rg,long_w_rg,region_w,state_w
suburb_w,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Acton,142,142,142,142,142,142,142,142,142,142,...,142,142,142,142,142,142,142,142,142,142
Alexandria,142,142,142,142,142,142,142,142,142,142,...,142,142,142,142,142,142,142,142,142,142
Alyangula,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425,...,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425
Ballarat East,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425,...,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425
Belconnen,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425,...,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425
Black Hill,142,142,142,142,142,142,142,142,142,142,...,142,142,142,142,142,142,142,142,142,142
Booker Bay,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425,...,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425
Brown Hill,142,142,142,142,142,142,142,142,142,142,...,142,142,142,142,142,142,142,142,142,142
Canberra,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425,...,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425
Chinderah,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425,...,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425


In [16]:
import warnings
# NOTE(review): blanket suppression is kept so later cells see the same warning
# filter as before; the assignments below no longer depend on it.
warnings.filterwarnings("ignore")

# Fill in 'region_w' for locations where it was left blank.
# Assignments go through weather.loc[mask, 'region_w'] so they always write
# into `weather` itself; the chained form `weather.region_w[mask] = value` may
# write to a temporary copy and silently leave the frame unchanged.
weather.loc[weather['state_w'] == 'Australian Capital Territory', 'region_w'] = 'Australian Capital Territory'

# suburb -> region. Matching is on the suburb name alone (any state) and is
# applied after the ACT assignment above, so a listed suburb takes precedence.
weather_region_fixes = {
    'Ballarat East': 'Ballarat',
    'Black Hill': 'Ballarat',
    'Brown Hill': 'Ballarat',
    'Mount Pleasant': 'Ballarat',
    'Heatherton': 'Kingston',
    'Heidelberg West': 'Banyule',
    'Plenty': 'Nillumbik',
    'Somers': 'Mornington Peninsula',
    'Alexandria': 'City of Sydney',
    'Booker Bay': 'Central Coast',
    'Chinderah': 'Tweed',
    'Alyangula': 'East Arnhem',
    'Port Denison': 'Irwin',
}
for suburb_name, region_name in weather_region_fixes.items():
    weather.loc[weather['suburb_w'] == suburb_name, 'region_w'] = region_name

# Suburbs that still have an empty region after the fixes.
weather[weather.region_w==''].groupby('suburb_w').count()

Unnamed: 0_level_0,date,maxC,minC,sunHour,uv,DewPointC,FeelsLikeC,HeatIndexC,WindChillC,WindGustKmph,...,pressure,visibility,windAZI,windKmph,lat_w,long_w,lat_w_rg,long_w_rg,region_w,state_w
suburb_w,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Innamincka,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425,...,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425
Koomooloo,142,142,142,142,142,142,142,142,142,142,...,142,142,142,142,142,142,142,142,142,142
The Sill,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425,...,2425,2425,2425,2425,2425,2425,2425,2425,2425,2425


In [17]:
# Assign 'nil' to any records where there is no council defined by state government
# (i.e. 'region_w' is still an empty string).
weather['region_w'] = weather['region_w'].replace({'': 'nil'})

# Filter the 'nil' rows once and reuse the result for both the count and the
# per-suburb sample. (A bare groupby expression in the middle of a cell is
# neither displayed nor stored, so none is evaluated here.)
weather_nil_rows = weather[weather.region_w == 'nil']
temp1 = weather_nil_rows.groupby('suburb_w')
print('\nCount of records with value of nil for region = ',len(weather_nil_rows.index),'\n')
temp1.first()


Count of records with value of nil for region =  4992 



Unnamed: 0_level_0,date,maxC,minC,sunHour,uv,DewPointC,FeelsLikeC,HeatIndexC,WindChillC,WindGustKmph,...,pressure,visibility,windAZI,windKmph,lat_w,long_w,lat_w_rg,long_w_rg,region_w,state_w
suburb_w,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Innamincka,2014-09-01,29,16,11.3,6,-3,22,22,22,37,...,1014,10,258,25,-27.7073,140.737,-27.70728,140.73697,nil,South Australia
Koomooloo,2020-12-01,21,13,12.3,4,6,18,18,18,40,...,1012,10,217,31,-33.5547,139.46805,-33.5547,139.46805,nil,South Australia
The Sill,2014-09-01,35,24,11.6,7,22,32,32,29,13,...,1012,10,241,9,-28.1605,138.675,-28.16048,138.6754,nil,South Australia


In [18]:
# Coordinates of the weather location to exclude.
# NOTE(review): the notebook does not record why this location is removed - TODO confirm.
EXCLUDED_LAT, EXCLUDED_LONG = -33.5475, 151.2191

print('size before remove = ',len(weather.index))
# A row is removed only when BOTH coordinates match. Filtering with
# `(lat != X) & (long != Y)` would also remove every row that shares just the
# latitude or just the longitude with the excluded point.
at_excluded_location = (weather.lat_w == EXCLUDED_LAT) & (weather.long_w == EXCLUDED_LONG)
weather = weather.loc[~at_excluded_location]
print('size after remove = ',len(weather.index))

size before remove =  2237302
size after remove =  2235019


In [19]:
# One row per (suburb_w, state_w) pair - the last occurrence in `weather` -
# reduced to the location columns and ordered by suburb, then state.
weather_location_cols = ['lat_w', 'long_w', 'suburb_w', 'region_w', 'state_w']
weather_locats2021 = (
    weather
    .drop_duplicates(subset=['suburb_w', 'state_w'], keep='last')
    .reset_index(drop=True)
    .loc[:, weather_location_cols]
    .sort_values(['suburb_w', 'state_w'], ascending=[True, True])
    .reset_index(drop=True)
)
weather_locats2021

Unnamed: 0,lat_w,long_w,suburb_w,region_w,state_w
0,-35.27767,149.11829,Acton,Australian Capital Territory,Australian Capital Territory
1,-34.91119,138.70735,Adelaide Hills,Adelaide Hills,South Australia
2,-35.00310,117.86595,Albany,Albany,Western Australia
3,-34.85925,138.52138,Alberton,Port Adelaide Enfield,South Australia
4,-35.28333,138.48333,Aldinga,Onkaparinga,South Australia
...,...,...,...,...,...
1101,-23.12683,150.74406,Yeppoon,Rockhampton,Queensland
1102,-31.88809,116.76780,York,York,Western Australia
1103,-16.80278,145.72083,Yorkeys Knob,Cairns,Queensland
1104,-34.31350,148.30107,Young,Young,New South Wales


In [20]:
# Aggregate 'frp' per acquisition date and reverse-geocoded location, keeping
# the maximum and the mean as 'frp_max' and 'frp_mean'.
location_day_keys = ['acq_date', 'lat_suburb', 'long_suburb', 'suburb', 'state', 'region']
fire_grouped = (
    fire
    .groupby(location_day_keys)['frp']
    .agg(['max', 'mean'])
    .rename(columns={'max': 'frp_max', 'mean': 'frp_mean'})
    .reset_index()
)
fire_grouped

Unnamed: 0,acq_date,lat_suburb,long_suburb,suburb,state,region,frp_max,frp_mean
0,2014-09-01,-41.15780,147.51727,Scottsdale,Tasmania,Dorset,2.81,2.810000
1,2014-09-01,-36.30768,140.77167,Bordertown,South Australia,Tatiara,2.68,2.625000
2,2014-09-01,-34.95778,117.93833,Lower King,Western Australia,Albany,10.23,8.280000
3,2014-09-01,-34.81826,138.96478,Birdwood,South Australia,Adelaide Hills,22.12,16.133333
4,2014-09-01,-34.18551,142.16251,Mildura,Victoria,Mildura Shire,1.13,1.020000
...,...,...,...,...,...,...,...,...
203316,2021-04-20,-15.77813,128.74414,Kununurra,Western Australia,Wyndham-East Kimberley,54.80,8.088586
203317,2021-04-20,-14.46517,132.26347,Katherine,Northern Territory,Katherine,102.20,8.788636
203318,2021-04-20,-13.85413,136.42129,Alyangula,Northern Territory,East Arnhem,6.80,5.100000
203319,2021-04-20,-12.55397,131.11165,McMinns Lagoon,Northern Territory,Litchfield,11.10,4.237500


In [21]:
# Discard weather rows dated after the most recent fire detection.
last_fire_date = fire['acq_date'].max()
weather = weather[weather['date'] <= last_fire_date]
weather['date'].max()

Timestamp('2021-04-20 00:00:00')

In [22]:
# Left-join the daily fire aggregates onto the weather rows by date, suburb and
# state; weather rows without a matching fire record keep NaN/NaT in the fire columns.
weather_fire = weather.merge(
    fire_grouped,
    how='left',
    left_on=['date', 'suburb_w', 'state_w'],
    right_on=['acq_date', 'suburb', 'state'],
)
weather_fire

Unnamed: 0,date,maxC,minC,sunHour,uv,DewPointC,FeelsLikeC,HeatIndexC,WindChillC,WindGustKmph,...,region_w,state_w,acq_date,lat_suburb,long_suburb,suburb,state,region,frp_max,frp_mean
0,2020-12-01,31,11,13.4,6,12,23,23,22,18,...,Australian Capital Territory,Australian Capital Territory,NaT,,,,,,,
1,2020-12-02,24,12,14.5,6,7,19,20,19,16,...,Australian Capital Territory,Australian Capital Territory,NaT,,,,,,,
2,2020-12-03,26,12,13.4,5,8,18,19,18,14,...,Australian Capital Territory,Australian Capital Territory,NaT,,,,,,,
3,2020-12-04,26,11,14.5,6,4,20,20,20,16,...,Australian Capital Territory,Australian Capital Territory,NaT,,,,,,,
4,2020-12-05,27,11,10.2,5,11,18,18,18,21,...,Australian Capital Territory,Australian Capital Territory,NaT,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2233921,2021-04-16,25,18,8.7,6,16,22,22,21,18,...,Brisbane,Queensland,NaT,,,,,,,
2233922,2021-04-17,21,18,4.2,4,16,20,20,20,17,...,Brisbane,Queensland,NaT,,,,,,,
2233923,2021-04-18,21,15,5.7,4,13,18,18,18,15,...,Brisbane,Queensland,NaT,,,,,,,
2233924,2021-04-19,23,15,8.7,5,13,19,19,19,12,...,Brisbane,Queensland,NaT,,,,,,,


In [23]:
# Merged rows that picked up a fire record (frp_mean is present).
weather_fire.loc[weather_fire['frp_mean'].notnull()]

Unnamed: 0,date,maxC,minC,sunHour,uv,DewPointC,FeelsLikeC,HeatIndexC,WindChillC,WindGustKmph,...,region_w,state_w,acq_date,lat_suburb,long_suburb,suburb,state,region,frp_max,frp_mean
143,2020-12-03,26,10,14.5,6,6,18,18,18,12,...,Adelaide Hills,South Australia,2020-12-03,-34.91119,138.70735,Adelaide Hills,South Australia,Adelaide Hills,11.40,6.866667
286,2014-09-05,20,12,8.7,4,12,15,15,15,27,...,Albany,Western Australia,2014-09-05,-35.00310,117.86595,Albany,Western Australia,Albany,0.55,0.550000
300,2014-09-19,20,13,11.6,5,13,15,16,15,18,...,Albany,Western Australia,2014-09-19,-35.00310,117.86595,Albany,Western Australia,Albany,3.53,3.530000
357,2014-11-15,19,8,14.0,5,8,13,14,13,11,...,Albany,Western Australia,2014-11-15,-35.00310,117.86595,Albany,Western Australia,Albany,29.05,14.488333
367,2014-11-25,20,12,14.5,5,10,15,16,15,25,...,Albany,Western Australia,2014-11-25,-35.00310,117.86595,Albany,Western Australia,Albany,5.95,5.950000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2233780,2021-04-16,17,6,8.7,4,3,12,12,12,10,...,Young,New South Wales,2021-04-16,-34.31350,148.30107,Young,New South Wales,Young,27.40,14.150000
2233781,2021-04-17,19,7,8.7,4,4,13,14,13,10,...,Young,New South Wales,2021-04-17,-34.31350,148.30107,Young,New South Wales,Young,30.70,15.245455
2233782,2021-04-18,18,7,8.7,4,4,12,13,12,11,...,Young,New South Wales,2021-04-18,-34.31350,148.30107,Young,New South Wales,Young,14.20,12.466667
2233783,2021-04-19,16,7,8.7,4,3,11,12,11,12,...,Young,New South Wales,2021-04-19,-34.31350,148.30107,Young,New South Wales,Young,43.60,9.542105


In [24]:
# Drop the geocoder coordinates and the fire-side location columns, then strip
# the '_w' suffix from the weather-side location columns.
redundant_cols = ['lat_suburb', 'long_suburb', 'lat_w_rg', 'long_w_rg', 'suburb', 'region', 'state']
final_names = {
    'lat_w': 'lat',
    'long_w': 'long',
    'suburb_w': 'suburb',
    'region_w': 'region',
    'state_w': 'state',
}
weather_fire = weather_fire.drop(columns=redundant_cols).rename(columns=final_names)
weather_fire.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2233926 entries, 0 to 2233925
Data columns (total 25 columns):
date            datetime64[ns]
maxC            int64
minC            int64
sunHour         float64
uv              int64
DewPointC       int64
FeelsLikeC      int64
HeatIndexC      int64
WindChillC      int64
WindGustKmph    int64
cloudcover      int64
humidity        int64
precipMM        float64
pressure        int64
visibility      int64
windAZI         int64
windKmph        int64
lat             float64
long            float64
suburb          object
region          object
state           object
acq_date        datetime64[ns]
frp_max         float64
frp_mean        float64
dtypes: datetime64[ns](2), float64(6), int64(14), object(3)
memory usage: 443.1+ MB


In [55]:
# Rows without a matching fire record have NaT in 'acq_date'; fill those with
# the row's own 'date'. The result is assigned back to the column: calling
# fillna(..., inplace=True) on `weather_fire.acq_date` operates on an
# intermediate Series and may leave the frame untouched.
weather_fire['acq_date'] = weather_fire['acq_date'].fillna(weather_fire['date'])

# Replace missing frp values with 0. Columns are selected by name rather than
# by position so the step does not depend on the current column order.
frp_cols = ['frp_max', 'frp_mean']
weather_fire[frp_cols] = weather_fire[frp_cols].fillna(0)

weather_fire

Unnamed: 0,date,maxC,minC,sunHour,uv,DewPointC,FeelsLikeC,HeatIndexC,WindChillC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,windAZI,windKmph,lat,long,suburb,region,state,acq_date,frp_max,frp_mean
0,2020-12-01,31,11,13.4,6,12,23,23,22,18,32,63,0.2,1009,8,216,13,-35.27767,149.11829,Acton,Australian Capital Territory,Australian Capital Territory,2020-12-01,0.0,0.0
1,2020-12-02,24,12,14.5,6,7,19,20,19,16,24,56,0.0,1018,10,271,12,-35.27767,149.11829,Acton,Australian Capital Territory,Australian Capital Territory,2020-12-02,0.0,0.0
2,2020-12-03,26,12,13.4,5,8,18,19,18,14,29,57,0.0,1019,10,197,11,-35.27767,149.11829,Acton,Australian Capital Territory,Australian Capital Territory,2020-12-03,0.0,0.0
3,2020-12-04,26,11,14.5,6,4,20,20,20,16,1,45,0.0,1015,10,245,11,-35.27767,149.11829,Acton,Australian Capital Territory,Australian Capital Territory,2020-12-04,0.0,0.0
4,2020-12-05,27,11,10.2,5,11,18,18,18,21,66,71,33.6,1007,9,229,15,-35.27767,149.11829,Acton,Australian Capital Territory,Australian Capital Territory,2020-12-05,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2233921,2021-04-16,25,18,8.7,6,16,22,22,21,18,23,71,0.0,1016,10,213,12,-27.35591,153.04453,Zillmere,Brisbane,Queensland,2021-04-16,0.0,0.0
2233922,2021-04-17,21,18,4.2,4,16,20,20,20,17,86,77,6.1,1018,9,186,12,-27.35591,153.04453,Zillmere,Brisbane,Queensland,2021-04-17,0.0,0.0
2233923,2021-04-18,21,15,5.7,4,13,18,18,18,15,51,75,1.0,1016,10,201,10,-27.35591,153.04453,Zillmere,Brisbane,Queensland,2021-04-18,0.0,0.0
2233924,2021-04-19,23,15,8.7,5,13,19,19,19,12,12,70,0.1,1017,10,154,8,-27.35591,153.04453,Zillmere,Brisbane,Queensland,2021-04-19,0.0,0.0


In [56]:
# Summary checks on the merged frame: rows per state and nulls per column.
state_counts = weather_fire['state'].value_counts()
missing_counts = weather_fire.isnull().sum()

print('', 'Records per State', state_counts, sep='\n')
print('', 'Counts of Missing values', missing_counts, sep='\n')


Records per State
New South Wales                 778473
Victoria                        381219
Queensland                      355542
Western Australia               279411
South Australia                 268332
Tasmania                        131460
Northern Territory               33936
Australian Capital Territory      5553
Name: state, dtype: int64

Counts of Missing values
date            0
maxC            0
minC            0
sunHour         0
uv              0
DewPointC       0
FeelsLikeC      0
HeatIndexC      0
WindChillC      0
WindGustKmph    0
cloudcover      0
humidity        0
precipMM        0
pressure        0
visibility      0
windAZI         0
windKmph        0
lat             0
long            0
suburb          0
region          0
state           0
acq_date        0
frp_max         0
frp_mean        0
dtype: int64


In [57]:
# Sanity check: compare the latest date that has any fire activity (frp_mean > 0) with the
# latest date in the weather data. Both values are computed once and reused below.
max_weather_date = weather_fire['date'].max()
max_fire_date = weather_fire.loc[weather_fire['frp_mean'] > 0, 'date'].max()

if pd.isna(max_fire_date):
    # No row has frp_mean > 0, so max() over the empty selection is missing (NaT/NaN).
    # Every ordering/equality comparison against it is False, so without this branch the
    # check would print nothing at all.
    print('\nWARNING - no rows with frp_mean > 0 were found, the date ranges cannot be compared\n')
    print('max WEATHER date = ', max_weather_date)
elif max_fire_date < max_weather_date:
    for _ in range(3):
        print('\nOH NO!')
    print('\ndate mismatch ALERT!!!!')
    print('\nWARNING - DOES THE FIRE DATA DATE RANGE MATCH THE WEATHER DATA DATE RANGE?\n')
    print('max WEATHER date = ', max_weather_date)
    print('max FIRE date    = ', max_fire_date)
    print('\nIf there is no fire data for the max date in weather data all predictions will be effected\n')
elif max_fire_date == max_weather_date:
    print('\nNo stress - date ranges match real nice :) ')


No stress - date ranges match real nice :) 


In [66]:
# Columns removed before modelling: date/location identifiers and the raw FRP aggregates.
# Each label is listed exactly once ('suburb' and 'region' used to appear twice).
drop_features3 = ['date', 'acq_date', 'suburb', 'region', 'state', 'frp_mean', 'frp_max']

In [92]:
# Work on a copy so the merged weather/fire frame itself is not modified.
wf1b = weather_fire.copy()

# Binary target derived from frp_max: 1 when frp_max >= 12, otherwise 0.
# Rows on the most recent date are never flagged and keep the default 0.
wf1b['cat'] = 0
latest_date = wf1b['date'].max()
high_frp = (wf1b['frp_max'] >= 12) & (wf1b['date'] != latest_date)
wf1b.loc[high_frp, 'cat'] = 1

# No per-region filtering is applied: wf1_alb is the full frame under another name.
# To test on a single region, filter wf1b before this assignment.
wf1_alb = wf1b

pd.set_option('display.max_columns', 34)
wf1_alb

Unnamed: 0,date,maxC,minC,sunHour,uv,DewPointC,FeelsLikeC,HeatIndexC,WindChillC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,windAZI,windKmph,lat,long,suburb,region,state,acq_date,frp_max,frp_mean,cat
0,2020-12-01,31,11,13.4,6,12,23,23,22,18,32,63,0.2,1009,8,216,13,-35.27767,149.11829,Acton,Australian Capital Territory,Australian Capital Territory,2020-12-01,0.0,0.0,0
1,2020-12-02,24,12,14.5,6,7,19,20,19,16,24,56,0.0,1018,10,271,12,-35.27767,149.11829,Acton,Australian Capital Territory,Australian Capital Territory,2020-12-02,0.0,0.0,0
2,2020-12-03,26,12,13.4,5,8,18,19,18,14,29,57,0.0,1019,10,197,11,-35.27767,149.11829,Acton,Australian Capital Territory,Australian Capital Territory,2020-12-03,0.0,0.0,0
3,2020-12-04,26,11,14.5,6,4,20,20,20,16,1,45,0.0,1015,10,245,11,-35.27767,149.11829,Acton,Australian Capital Territory,Australian Capital Territory,2020-12-04,0.0,0.0,0
4,2020-12-05,27,11,10.2,5,11,18,18,18,21,66,71,33.6,1007,9,229,15,-35.27767,149.11829,Acton,Australian Capital Territory,Australian Capital Territory,2020-12-05,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2233921,2021-04-16,25,18,8.7,6,16,22,22,21,18,23,71,0.0,1016,10,213,12,-27.35591,153.04453,Zillmere,Brisbane,Queensland,2021-04-16,0.0,0.0,0
2233922,2021-04-17,21,18,4.2,4,16,20,20,20,17,86,77,6.1,1018,9,186,12,-27.35591,153.04453,Zillmere,Brisbane,Queensland,2021-04-17,0.0,0.0,0
2233923,2021-04-18,21,15,5.7,4,13,18,18,18,15,51,75,1.0,1016,10,201,10,-27.35591,153.04453,Zillmere,Brisbane,Queensland,2021-04-18,0.0,0.0,0
2233924,2021-04-19,23,15,8.7,5,13,19,19,19,12,12,70,0.1,1017,10,154,8,-27.35591,153.04453,Zillmere,Brisbane,Queensland,2021-04-19,0.0,0.0,0


In [93]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a sequence of observations as lagged rows for supervised learning.

    Every output row is the observations at steps t-n_in, ..., t-1 followed by the
    observations at steps t, ..., t+n_out-1, concatenated left to right.

    Parameters
    ----------
    data : list, array-like or DataFrame
        Observations in step order, one row per step. 1-D input is treated as a
        single variable.
    n_in : int, default 1
        Number of preceding steps placed before the current step.
    n_out : int, default 1
        Number of steps, starting at the current one, placed after the lags.
    dropnan : bool, default True
        Drop every row that contains a NaN. This removes the incomplete rows created
        by shifting, and also any row whose source observations contained NaN.

    Returns
    -------
    numpy.ndarray
        2-D array with (n_in + n_out) * n_variables columns.
    """
    # `pd` is used instead of bare DataFrame/concat names so the function does not depend
    # on an import made in a later cell. The former unused `data.shape[1]` lookup is gone:
    # it raised IndexError for 1-D array input.
    frame = pd.DataFrame(data)
    shifted = [frame.shift(lag) for lag in range(n_in, 0, -1)]
    shifted.extend(frame.shift(-lead) for lead in range(n_out))
    combined = pd.concat(shifted, axis=1)
    if dropnan:
        combined = combined.dropna()
    return combined.values

In [94]:
from numpy import asarray
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.metrics import mean_absolute_error
from xgboost import XGBClassifier
from matplotlib import pyplot
from datetime import datetime, timedelta

In [95]:
dataset = wf1_alb
dataset.name = 'wf1_alb'

# The most recent date is the day to predict; the day before it supplies the lag features.
max_date = dataset['date'].max()
max_date_minus_1 = max_date - timedelta(days=1)

is_latest_day = dataset['date'] == max_date
is_previous_day = dataset['date'] == max_date_minus_1

# Untouched rows of the latest day, kept with all columns.
test_data = dataset.loc[is_latest_day].reset_index(drop=True)
previous_day = dataset.loc[is_previous_day].reset_index(drop=True)

# Training rows are everything except the latest day.
train = dataset.loc[~is_latest_day].reset_index(drop=True)

# Test rows are built by hand as [previous day | latest day], side by side.
# NOTE(review): after reset_index the two frames are paired purely by row position, so
# this presumably relies on both days listing the same locations in the same order - confirm.
test = pd.concat([previous_day, test_data], axis=1)

# Remove the identifier / raw FRP columns and continue with plain arrays.
train = train.drop(drop_features3, axis=1).values
test = test.drop(drop_features3, axis=1).values

# Only the training rows go through series_to_supervised (one lag step by default);
# the test rows already have the lagged layout from the concat above.
# NOTE(review): the shift runs over the whole array, so a row that follows a change of
# location appears to be paired with the previous location's last row - confirm.
train = pd.DataFrame(series_to_supervised(train))

cat_labels = labels = ["low", "high"]

# The last column is the target ('cat' of the current day); the rest are features.
x_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]

x_test = test[:, :-1]
y_test = test[:, -1]


In [96]:
# Report the shape of every split; a True flag adds the trailing blank line after an entry.
shape_report = (
    ('train shape =', train, False),
    ('test shape = ', test, True),
    ('x_train shape =', x_train, False),
    ('y_train shape =', y_train, True),
    ('x_test shape = ', x_test, False),
    ('y_test shape = ', y_test, True),
)
for caption, split, blank_after in shape_report:
    if blank_after:
        print(caption, split.shape, '\n')
    else:
        print(caption, split.shape)

print('Classes = ', cat_labels, '\n')

train shape = (2232832, 38)
test shape =  (1093, 38) 

x_train shape = (2232832, 37)
y_train shape = (2232832,) 

x_test shape =  (1093, 37)
y_test shape =  (1093,) 

Classes =  ['low', 'high'] 



In [97]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Randomly under-sample the majority class of the training data (fixed seed).
undersample = RandomUnderSampler(random_state=24, sampling_strategy='majority')
x_under, y_under = undersample.fit_resample(x_train, y_train)

# Class counts before and after resampling.
print('')
for target in (y_train, y_under):
    print(Counter(target))


Counter({0.0: 2162325, 1.0: 70507})
Counter({0.0: 70507, 1.0: 70507})


In [98]:
import xgboost as xgb
# Fit a 1000-estimator XGBoost classifier on the under-sampled training data,
# then predict a class for every row of the test array.
classifier_params = {'n_estimators': 1000}
model = xgb.XGBClassifier(**classifier_params)
model.fit(x_under, y_under)
y_pred = model.predict(x_test)




In [99]:
# Attach the predicted class to a copy of the latest-day rows; test_data itself is unchanged.
daily_predict = test_data.assign(pred=y_pred)

daily_predict

Unnamed: 0,date,maxC,minC,sunHour,uv,DewPointC,FeelsLikeC,HeatIndexC,WindChillC,WindGustKmph,cloudcover,humidity,precipMM,pressure,visibility,windAZI,windKmph,lat,long,suburb,region,state,acq_date,frp_max,frp_mean,cat,pred
0,2021-04-20,17,5,8.7,4,4,11,11,11,20,29,65,0.1,1014,10,284,14,-35.27767,149.11829,Acton,Australian Capital Territory,Australian Capital Territory,2021-04-20,0.0,0.00,0,0.0
1,2021-04-20,17,10,4.1,3,10,13,13,13,17,53,79,4.4,1017,10,243,11,-34.91119,138.70735,Adelaide Hills,Adelaide Hills,South Australia,2021-04-20,0.0,0.00,0,0.0
2,2021-04-20,17,14,5.7,4,11,15,16,15,13,67,73,0.6,1021,9,141,8,-35.00310,117.86595,Albany,Albany,Western Australia,2021-04-20,0.0,0.00,0,0.0
3,2021-04-20,18,13,5.7,4,11,16,16,16,26,54,70,4.8,1017,10,224,18,-34.85925,138.52138,Alberton,Port Adelaide Enfield,South Australia,2021-04-20,0.0,0.00,0,0.0
4,2021-04-20,17,12,4.1,4,10,15,15,15,29,68,72,5.0,1017,10,220,20,-35.28333,138.48333,Aldinga,Onkaparinga,South Australia,2021-04-20,0.0,0.00,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1088,2021-04-20,27,17,8.7,6,14,23,23,22,16,7,66,0.0,1017,10,102,10,-23.12683,150.74406,Yeppoon,Rockhampton,Queensland,2021-04-20,0.0,0.00,0,1.0
1089,2021-04-20,23,14,8.7,5,8,17,18,17,18,10,56,0.0,1019,10,130,11,-31.88809,116.76780,York,York,Western Australia,2021-04-20,5.3,3.07,0,1.0
1090,2021-04-20,26,22,5.9,5,22,26,26,24,21,70,89,11.0,1012,10,144,13,-16.80278,145.72083,Yorkeys Knob,Cairns,Queensland,2021-04-20,0.0,0.00,0,0.0
1091,2021-04-20,19,7,8.7,4,5,13,13,13,15,42,64,0.2,1015,10,219,10,-34.31350,148.30107,Young,Young,New South Wales,2021-04-20,44.7,14.71,0,1.0
