In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
pd.set_option('max_colwidth', None)
var=pd.read_csv('/content/drive/MyDrive/fire-extent-prediction-challenge-120240716-19716-ztmgh3/variable_definitions.csv')
var

Unnamed: 0,Variable,Description
0,ID,The IDs take the form of [area ID]_yyyy-mm-dd. There are 533 area squares each with a unique ID ranging from 0 to 532.
1,area,Area ID
2,date,The date that the data is aggregated over
3,lat,Latitude of the center of the area
4,lon,Longitude of the center of the area
5,burn_area,Percentage of the area burnt
6,climate_aet,"Actual evapotranspiration, derived using a one-dimensional soil water balance model"
7,climate_def,"Climate water deficit, derived using a one-dimensional soil water balance model"
8,climate_pdsi,Palmer Drought Severity Index
9,climate_pet,Reference evapotranspiration (ASCE Penman-Montieth)


In [4]:
train=pd.read_csv('/content/drive/MyDrive/fire-extent-prediction-challenge-120240716-19716-ztmgh3/Train.csv')

In [5]:
pd.set_option('display.max_columns', None)
train

Unnamed: 0,ID,lat,lon,burn_area,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,climate_soil,climate_srad,climate_swe,climate_tmmn,climate_tmmx,climate_vap,climate_vpd,climate_vs,elevation,landcover_0,landcover_1,landcover_2,landcover_3,landcover_4,landcover_5,landcover_6,landcover_7,landcover_8,precipitation
0,0_2001-01-01,-15.858835,29.237029,0.0,1195,0,263,1195,206,10,1692,1861,0,211,317,2493,111,141,413.474762,0.0,0.0,0.018654,0.0,0.714446,0.012174,0.244890,0.009836,0.0,0.256932
1,1_2001-01-01,-15.858835,29.487029,0.0,1196,0,232,1196,201,10,1859,1867,0,211,318,2497,112,138,429.034543,0.0,0.0,0.000000,0.0,0.654783,0.000095,0.345121,0.000000,0.0,0.273093
2,2_2001-01-01,-15.858835,29.737029,0.0,1190,0,314,1190,192,10,1677,1861,0,208,317,2486,109,141,477.246432,0.0,0.0,0.000000,0.0,0.516421,0.000000,0.483579,0.000000,0.0,0.285109
3,3_2001-01-01,-15.858835,29.987029,0.0,1144,0,321,1144,186,66,1061,1864,0,196,303,2333,98,124,646.388681,0.0,0.0,0.000000,0.0,0.299000,0.163902,0.537098,0.000000,0.0,0.298418
4,4_2001-01-01,-15.858835,30.237029,0.0,1187,0,413,1187,186,9,1714,1852,0,204,314,2426,109,145,582.340637,0.0,0.0,0.000000,0.0,0.277392,0.067742,0.654866,0.000000,0.0,0.315621
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83143,528_2013-12-01,-22.108835,30.737029,0.0,1210,324,142,1535,127,6,51,2233,0,206,319,2111,148,223,464.141579,0.0,0.0,0.000000,0.0,0.008768,0.000000,0.991232,0.000000,0.0,0.132015
83144,529_2013-12-01,-22.108835,30.987029,0.0,1295,195,126,1490,136,7,61,2210,0,203,320,2156,143,206,402.388535,0.0,0.0,0.000000,0.0,0.216560,0.000000,0.783440,0.000000,0.0,0.124280
83145,530_2013-12-01,-22.108835,31.237029,0.0,1254,272,111,1527,132,7,29,2205,0,211,333,2225,160,198,366.349846,0.0,0.0,0.000000,0.0,0.392621,0.000000,0.607379,0.000000,0.0,0.115464
83146,531_2013-12-01,-22.108835,31.487029,0.0,1221,289,68,1510,128,6,62,2156,0,212,332,2258,155,208,293.380329,0.0,0.0,0.000000,0.0,0.637696,0.000000,0.362304,0.000000,0.0,0.119895


In [6]:
# Display information of the train dataset
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83148 entries, 0 to 83147
Data columns (total 29 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             83148 non-null  object 
 1   lat            83148 non-null  float64
 2   lon            83148 non-null  float64
 3   burn_area      83148 non-null  float64
 4   climate_aet    83148 non-null  int64  
 5   climate_def    83148 non-null  int64  
 6   climate_pdsi   83148 non-null  int64  
 7   climate_pet    83148 non-null  int64  
 8   climate_pr     83148 non-null  int64  
 9   climate_ro     83148 non-null  int64  
 10  climate_soil   83148 non-null  int64  
 11  climate_srad   83148 non-null  int64  
 12  climate_swe    83148 non-null  int64  
 13  climate_tmmn   83148 non-null  int64  
 14  climate_tmmx   83148 non-null  int64  
 15  climate_vap    83148 non-null  int64  
 16  climate_vpd    83148 non-null  int64  
 17  climate_vs     83148 non-null  int64  
 18  elevat

In [7]:
train.isnull().sum()

Unnamed: 0,0
ID,0
lat,0
lon,0
burn_area,0
climate_aet,0
climate_def,0
climate_pdsi,0
climate_pet,0
climate_pr,0
climate_ro,0


In [8]:
train.describe()

Unnamed: 0,lat,lon,burn_area,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,climate_soil,climate_srad,climate_swe,climate_tmmn,climate_tmmx,climate_vap,climate_vpd,climate_vs,elevation,landcover_0,landcover_1,landcover_2,landcover_3,landcover_4,landcover_5,landcover_6,landcover_7,landcover_8,precipitation
count,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0,83148.0
mean,-19.014557,29.860856,0.007632,484.491124,828.892721,-30.806237,1313.382992,54.576322,6.215134,305.477919,2281.967901,0.0,140.830303,285.288654,1554.532123,127.380117,208.414574,978.992612,0.006158,1.7e-05,0.008583,4.066313e-08,0.160628,0.037453,0.785382,0.000354,0.001424,0.078551
std,1.588444,1.75109,0.029226,470.46748,546.934469,235.80293,315.209838,71.997561,21.480399,400.956123,333.049616,0.0,44.631924,34.840502,458.461471,44.172381,47.550163,297.497348,0.053111,0.000141,0.059046,3.535121e-06,0.192957,0.10605,0.233953,0.001773,0.014233,0.104464
min,-22.358835,25.487029,0.0,0.0,0.0,-563.0,587.0,0.0,0.0,1.0,1167.0,0.0,19.0,139.0,704.0,32.0,48.0,265.379418,0.0,0.0,0.0,0.0,0.0,0.0,1.9e-05,0.0,0.0,0.0
25%,-20.358835,28.487029,0.0,50.0,375.0,-212.0,1081.0,1.0,0.0,70.0,2028.0,0.0,103.0,262.0,1139.0,97.0,174.0,768.375959,0.0,0.0,0.0,0.0,0.014993,0.0,0.655322,0.0,0.0,0.002017
50%,-18.858835,29.987029,0.0,302.0,858.0,-108.0,1290.0,22.0,1.0,149.0,2272.0,0.0,150.0,287.0,1544.0,118.0,205.0,1022.813217,0.0,0.0,0.0,0.0,0.076706,0.000139,0.8806,0.0,0.0,0.028503
75%,-17.858835,31.237029,0.0,931.0,1211.0,142.0,1517.0,87.0,4.0,329.0,2545.0,0.0,177.0,310.0,1942.0,149.0,241.0,1197.977194,0.0,0.0,0.0,0.0,0.246498,0.015166,0.971209,0.0,0.0,0.1255
max,-15.858835,32.987029,0.843886,1713.0,2614.0,851.0,2620.0,501.0,342.0,3319.0,3170.0,0.0,244.0,400.0,2786.0,344.0,393.0,1771.930689,0.828131,0.004918,0.711603,0.0003073689,0.981472,0.881323,1.0,0.016708,0.301823,0.801328


In [9]:
#drop all duplicates
train.drop_duplicates(inplace=True)

In [10]:
# Split the ID to get date string
train['date'] = pd.to_datetime(train['ID'].apply(lambda x: x.split('_')[1]))

In [11]:
# Extract year,month,day from datetime column
train['year'] = pd.to_datetime(train['date']).dt.year
train ['month'] = pd.to_datetime(train['date']).dt.month
train['day']=pd.to_datetime(train['date']).dt.day

In [12]:
train.nunique()

Unnamed: 0,0
ID,83148
lat,27
lon,31
burn_area,16152
climate_aet,1584
climate_def,2348
climate_pdsi,1249
climate_pet,1779
climate_pr,420
climate_ro,272


In [13]:
#drop date column and other columns with one value(day ,climate)
train.drop(columns=['day','date','climate_swe'],inplace=True)

In [14]:
train

Unnamed: 0,ID,lat,lon,burn_area,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,climate_soil,climate_srad,climate_tmmn,climate_tmmx,climate_vap,climate_vpd,climate_vs,elevation,landcover_0,landcover_1,landcover_2,landcover_3,landcover_4,landcover_5,landcover_6,landcover_7,landcover_8,precipitation,year,month
0,0_2001-01-01,-15.858835,29.237029,0.0,1195,0,263,1195,206,10,1692,1861,211,317,2493,111,141,413.474762,0.0,0.0,0.018654,0.0,0.714446,0.012174,0.244890,0.009836,0.0,0.256932,2001,1
1,1_2001-01-01,-15.858835,29.487029,0.0,1196,0,232,1196,201,10,1859,1867,211,318,2497,112,138,429.034543,0.0,0.0,0.000000,0.0,0.654783,0.000095,0.345121,0.000000,0.0,0.273093,2001,1
2,2_2001-01-01,-15.858835,29.737029,0.0,1190,0,314,1190,192,10,1677,1861,208,317,2486,109,141,477.246432,0.0,0.0,0.000000,0.0,0.516421,0.000000,0.483579,0.000000,0.0,0.285109,2001,1
3,3_2001-01-01,-15.858835,29.987029,0.0,1144,0,321,1144,186,66,1061,1864,196,303,2333,98,124,646.388681,0.0,0.0,0.000000,0.0,0.299000,0.163902,0.537098,0.000000,0.0,0.298418,2001,1
4,4_2001-01-01,-15.858835,30.237029,0.0,1187,0,413,1187,186,9,1714,1852,204,314,2426,109,145,582.340637,0.0,0.0,0.000000,0.0,0.277392,0.067742,0.654866,0.000000,0.0,0.315621,2001,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83143,528_2013-12-01,-22.108835,30.737029,0.0,1210,324,142,1535,127,6,51,2233,206,319,2111,148,223,464.141579,0.0,0.0,0.000000,0.0,0.008768,0.000000,0.991232,0.000000,0.0,0.132015,2013,12
83144,529_2013-12-01,-22.108835,30.987029,0.0,1295,195,126,1490,136,7,61,2210,203,320,2156,143,206,402.388535,0.0,0.0,0.000000,0.0,0.216560,0.000000,0.783440,0.000000,0.0,0.124280,2013,12
83145,530_2013-12-01,-22.108835,31.237029,0.0,1254,272,111,1527,132,7,29,2205,211,333,2225,160,198,366.349846,0.0,0.0,0.000000,0.0,0.392621,0.000000,0.607379,0.000000,0.0,0.115464,2013,12
83146,531_2013-12-01,-22.108835,31.487029,0.0,1221,289,68,1510,128,6,62,2156,212,332,2258,155,208,293.380329,0.0,0.0,0.000000,0.0,0.637696,0.000000,0.362304,0.000000,0.0,0.119895,2013,12


In [15]:
test=pd.read_csv('/content/drive/MyDrive/fire-extent-prediction-challenge-120240716-19716-ztmgh3/Test.csv')
test

Unnamed: 0,ID,lat,lon,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,climate_soil,climate_srad,climate_swe,climate_tmmn,climate_tmmx,climate_vap,climate_vpd,climate_vs,elevation,landcover_0,landcover_1,landcover_2,landcover_3,landcover_4,landcover_5,landcover_6,landcover_7,landcover_8,precipitation
0,0_2014-01-01,-15.858835,29.237029,1146,74,-451,1220,120,6,212,1974,0,217,303,2481,99,148,413.474762,0.0,0.000615,0.015418,0.0,0.416704,0.023724,0.530016,0.013524,0.0,0.404243
1,1_2014-01-01,-15.858835,29.487029,1118,98,-405,1216,117,6,330,1973,0,216,304,2481,100,143,429.034543,0.0,0.000000,0.000000,0.0,0.406436,0.000446,0.593118,0.000000,0.0,0.421489
2,2_2014-01-01,-15.858835,29.737029,1067,137,-468,1204,111,6,194,1958,0,214,302,2465,97,143,477.246432,0.0,0.000000,0.000000,0.0,0.254791,0.000000,0.745209,0.000000,0.0,0.419122
3,3_2014-01-01,-15.858835,29.987029,969,187,-533,1156,101,5,78,1954,0,202,290,2317,88,122,646.388681,0.0,0.000000,0.000000,0.0,0.219340,0.182153,0.598507,0.000000,0.0,0.404596
4,4_2014-01-01,-15.858835,30.237029,927,270,-505,1197,96,5,139,1939,0,210,301,2412,99,143,582.340637,0.0,0.000000,0.000000,0.0,0.220239,0.097748,0.682013,0.000000,0.0,0.371100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25579,528_2017-12-01,-22.108835,30.737029,802,1030,168,1832,84,4,81,2757,0,202,328,1958,173,239,464.141579,0.0,0.000000,0.000000,0.0,0.029888,0.000000,0.970112,0.000000,0.0,0.049016
25580,529_2017-12-01,-22.108835,30.987029,819,971,169,1790,85,4,81,2725,0,199,330,1980,172,221,402.388535,0.0,0.000000,0.000000,0.0,0.396455,0.000000,0.603545,0.000000,0.0,0.041220
25581,530_2017-12-01,-22.108835,31.237029,756,1076,156,1833,79,4,88,2710,0,208,343,2021,194,213,366.349846,0.0,0.000000,0.000000,0.0,0.455142,0.000000,0.544858,0.000000,0.0,0.031323
25582,531_2017-12-01,-22.108835,31.487029,749,1071,178,1820,78,4,102,2653,0,210,342,2051,190,223,293.380329,0.0,0.000000,0.000000,0.0,0.680874,0.000000,0.319126,0.000000,0.0,0.032025


In [16]:
test.drop_duplicates(inplace=True)

In [17]:
# Split the ID to get the date string
test['date'] = pd.to_datetime(test['ID'].apply(lambda x: x.split('_')[1]))

In [18]:
#extract year,month,day from datetime column
test['year'] = pd.to_datetime(test['date']).dt.year
test['month'] = pd.to_datetime(test['date']).dt.month
test['day']=pd.to_datetime(test['date']).dt.day

In [19]:
test.nunique()

Unnamed: 0,0
ID,25584
lat,27
lon,31
climate_aet,1626
climate_def,2248
climate_pdsi,1990
climate_pet,1701
climate_pr,473
climate_ro,340
climate_soil,1754


In [20]:
#drop date column and other columns with one value(day ,climate)
test.drop(columns=['day','date','climate_swe'],inplace=True)

In [21]:
test

Unnamed: 0,ID,lat,lon,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,climate_soil,climate_srad,climate_tmmn,climate_tmmx,climate_vap,climate_vpd,climate_vs,elevation,landcover_0,landcover_1,landcover_2,landcover_3,landcover_4,landcover_5,landcover_6,landcover_7,landcover_8,precipitation,year,month
0,0_2014-01-01,-15.858835,29.237029,1146,74,-451,1220,120,6,212,1974,217,303,2481,99,148,413.474762,0.0,0.000615,0.015418,0.0,0.416704,0.023724,0.530016,0.013524,0.0,0.404243,2014,1
1,1_2014-01-01,-15.858835,29.487029,1118,98,-405,1216,117,6,330,1973,216,304,2481,100,143,429.034543,0.0,0.000000,0.000000,0.0,0.406436,0.000446,0.593118,0.000000,0.0,0.421489,2014,1
2,2_2014-01-01,-15.858835,29.737029,1067,137,-468,1204,111,6,194,1958,214,302,2465,97,143,477.246432,0.0,0.000000,0.000000,0.0,0.254791,0.000000,0.745209,0.000000,0.0,0.419122,2014,1
3,3_2014-01-01,-15.858835,29.987029,969,187,-533,1156,101,5,78,1954,202,290,2317,88,122,646.388681,0.0,0.000000,0.000000,0.0,0.219340,0.182153,0.598507,0.000000,0.0,0.404596,2014,1
4,4_2014-01-01,-15.858835,30.237029,927,270,-505,1197,96,5,139,1939,210,301,2412,99,143,582.340637,0.0,0.000000,0.000000,0.0,0.220239,0.097748,0.682013,0.000000,0.0,0.371100,2014,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25579,528_2017-12-01,-22.108835,30.737029,802,1030,168,1832,84,4,81,2757,202,328,1958,173,239,464.141579,0.0,0.000000,0.000000,0.0,0.029888,0.000000,0.970112,0.000000,0.0,0.049016,2017,12
25580,529_2017-12-01,-22.108835,30.987029,819,971,169,1790,85,4,81,2725,199,330,1980,172,221,402.388535,0.0,0.000000,0.000000,0.0,0.396455,0.000000,0.603545,0.000000,0.0,0.041220,2017,12
25581,530_2017-12-01,-22.108835,31.237029,756,1076,156,1833,79,4,88,2710,208,343,2021,194,213,366.349846,0.0,0.000000,0.000000,0.0,0.455142,0.000000,0.544858,0.000000,0.0,0.031323,2017,12
25582,531_2017-12-01,-22.108835,31.487029,749,1071,178,1820,78,4,102,2653,210,342,2051,190,223,293.380329,0.0,0.000000,0.000000,0.0,0.680874,0.000000,0.319126,0.000000,0.0,0.032025,2017,12


model building

In [22]:
# Features to use
train_features = ['lat','lon','climate_aet','climate_def','climate_pdsi','climate_pet','climate_pr','climate_ro',
                   'climate_soil','climate_srad','climate_tmmn','climate_tmmx','climate_vap','climate_vpd',
                   'climate_vs','elevation','landcover_0','landcover_1','landcover_2','landcover_3','landcover_4',
                   'landcover_5','landcover_6','landcover_7','landcover_8','precipitation','year','month']

In [23]:
# Define features and target variable
X_train = train[train_features]
y_train = train['burn_area']
X_test = test[train_features]

In [24]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
# Feature Selection with RFE
xgb = XGBRegressor(random_state=42)
selector = RFECV(estimator=xgb, step=1, cv=5)
selector = selector.fit(X_train_scaled, y_train)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

In [26]:
# Initialize XGBoost model
xgb = XGBRegressor(random_state=42)

In [27]:
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)

In [28]:

#Define parameter grid for GridSearchCV
param_grid = {'n_estimators': [100, 200, 300],
                       'max_depth': [3, 5, 7, 10],
                       'learning_rate': [0.01, 0.1, 0.2],
                       'subsample': [0.8, 0.9, 1.0],
                       'colsample_bytree': [0.8, 0.9, 1.0]}

In [29]:
# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='neg_mean_squared_error',n_jobs=-1, verbose=2)
grid_search.fit(X_train_selected, y_train)

# Best parameters from Grid Search
print(f"Best Parameters: {grid_search.best_params_}")

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}


In [30]:
# Train the model with the best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_selected, y_train)

In [31]:
# Evaluate on the validation set
val_predictions = best_model.predict(X_train_selected)
val_mse = mean_squared_error(y_train, val_predictions)
rmse = mean_squared_error(y_train, val_predictions,squared=False)
val_r2 = r2_score(y_train, val_predictions)

In [32]:
print(f"Training rmse: {rmse}")

Training rmse: 0.02035244294567497


In [33]:
print(f"Training Mean Squared Error: {val_mse}")
print(f"Training R^2 Score: {val_r2}")

Training Mean Squared Error: 0.00041422193385695486
Training R^2 Score: 0.5150493817938411


In [34]:
# Make predictions on the test set
test_predictions = best_model.predict(X_test_selected)

In [35]:
# Prepare the submission file
submission = pd.DataFrame({
    'ID': test['ID'],
    'burn_area': test_predictions
})
submission.to_csv('submission.csv', index=False)

In [36]:
pd.set_option('display.max_row', None)
submission

Unnamed: 0,ID,burn_area
0,0_2014-01-01,0.0002177294
1,1_2014-01-01,-0.0005695627
2,2_2014-01-01,0.000590094
3,3_2014-01-01,-0.00015164
4,4_2014-01-01,0.0003202027
5,5_2014-01-01,0.0004805573
6,6_2014-01-01,0.002130142
7,7_2014-01-01,0.00147383
8,8_2014-01-01,-0.0004977161
9,9_2014-01-01,-0.0001843375
