**Import Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

**Load data**

In [None]:
# Read the raw CSV into a DataFrame and display it as the cell output.
# NOTE(review): absolute path, presumably a Colab runtime location - confirm
# before running elsewhere.
dataset_path = '/content/data/Problem3.csv'
data_df = pd.read_csv(filepath_or_buffer=dataset_path)
data_df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,4.468204,26.2,94.3,1.808289,8.2,51,6.7,False,0.000000
1,7,4,oct,tue,4.517431,35.4,669.1,2.041220,18.0,33,0.9,False,0.000000
2,7,4,oct,sat,4.517431,43.7,686.9,2.041220,14.6,33,1.3,False,0.000000
3,8,6,mar,fri,4.529368,33.3,77.5,2.302585,8.3,97,4.0,True,0.000000
4,8,6,mar,sun,4.503137,51.3,102.2,2.360854,11.4,99,1.8,False,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,4,3,aug,sun,4.414010,56.7,665.6,1.064711,27.8,32,2.7,False,2.006871
506,2,4,aug,sun,4.414010,56.7,665.6,1.064711,21.9,71,5.8,False,4.012592
507,7,4,aug,sun,4.414010,56.7,665.6,1.064711,21.2,70,6.7,False,2.498152
508,1,4,aug,sat,4.558079,146.0,614.7,2.509599,25.6,42,4.0,False,0.000000


Convert all data into numbers. In this case, convert the columns `month` and `day` (string -> number) and `rain` (boolean -> number).

In [None]:
# Columns held as strings or booleans must be turned into numbers before
# they can be fed to the model; report how many distinct values each has.
categorical_cols = data_df.select_dtypes(include=['object', 'bool']).columns.to_list()
for col_name in categorical_cols:
  n_categories = data_df[col_name].nunique()
  print(f'Number of categories in {col_name}: {n_categories}')

# OrdinalEncoder assigns each category a float code following the sorted order
# of the category values (e.g. 'mar' -> 7.0), not calendar order.
ordinal_encoder = OrdinalEncoder()
encoded_categorical_cols = ordinal_encoder.fit_transform(data_df[categorical_cols])

# fit_transform returns a bare array, so the row labels are lost. Re-attach
# data_df's index explicitly: pd.concat(axis=1) aligns on index labels, and a
# fresh RangeIndex would misalign rows (and introduce NaNs) whenever data_df
# does not carry the default 0..n-1 index.
encoded_categorical_df = pd.DataFrame(encoded_categorical_cols,
                                      columns=categorical_cols,
                                      index=data_df.index)

# Numeric columns first, followed by the encoded categorical columns.
numerical_df = data_df.drop(columns=categorical_cols)
encoded_df = pd.concat([numerical_df, encoded_categorical_df], axis=1)

Number of categories in month: 12
Number of categories in day: 7
Number of categories in rain: 2


In [None]:
encoded_df  # Fully numeric frame: original numeric columns followed by the encoded month/day/rain columns

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,area,month,day,rain
0,7,5,4.468204,26.2,94.3,1.808289,8.2,51,6.7,0.000000,7.0,0.0,0.0
1,7,4,4.517431,35.4,669.1,2.041220,18.0,33,0.9,0.000000,10.0,5.0,0.0
2,7,4,4.517431,43.7,686.9,2.041220,14.6,33,1.3,0.000000,10.0,2.0,0.0
3,8,6,4.529368,33.3,77.5,2.302585,8.3,97,4.0,0.000000,7.0,0.0,1.0
4,8,6,4.503137,51.3,102.2,2.360854,11.4,99,1.8,0.000000,7.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,4,3,4.414010,56.7,665.6,1.064711,27.8,32,2.7,2.006871,1.0,3.0,0.0
506,2,4,4.414010,56.7,665.6,1.064711,21.9,71,5.8,4.012592,1.0,3.0,0.0
507,7,4,4.414010,56.7,665.6,1.064711,21.2,70,6.7,2.498152,1.0,3.0,0.0
508,1,4,4.558079,146.0,614.7,2.509599,25.6,42,4.0,0.000000,1.0,2.0,0.0


**Split the data into train and test sets, and extract the labeled output y (this is a supervised learning problem)**

In [None]:
# Features: every column except the target.
# The displayed frame also shows an 'Unnamed: 0' column, which looks like a
# saved row counter rather than a measurement. If it really is a column of the
# frame, it must not be used as a predictor, so drop it when present
# (errors='ignore' makes this a no-op when the column does not exist).
X = (
    encoded_df
    .drop(columns=['area'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
y = encoded_df['area']  # Labeled output (regression target)

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

**Build XGBoost model, with seed=7, learning_rate=0.01, n_estimators=102, max_depth=3**

In [None]:
# XGBoost regressor configured with the hyper-parameters stated in the heading.
# `random_state` is the documented scikit-learn-style seed argument of
# XGBRegressor; it replaces the legacy `seed` keyword used previously.
xg_reg = xgb.XGBRegressor(
    random_state=7,
    learning_rate=0.01,
    n_estimators=102,
    max_depth=3,
)

# Train on the training split only; the test split is kept for evaluation.
xg_reg.fit(X_train, y_train)

**Get the predictions and evaluate the error metrics (MSE/MAE)**

In [None]:
# Run the fitted regressor on the held-out test features.
preds = xg_reg.predict(X_test)
print(f'Predictions: {preds}')

# Compare the predictions with the true targets of the test split.
mae = mean_absolute_error(y_true=y_test, y_pred=preds)
mse = mean_squared_error(y_true=y_test, y_pred=preds)

# Emit the report in one call; sep='\n' puts each entry on its own line.
print(
    '--------------------------------------------------------------------',
    'Evaluation results on test set:',
    f'Mean Absolute Error : {mae}',
    f'Mean Squared Error : {mse}',
    sep='\n',
)

Predictions: [1.0842953  1.1191528  0.8486002  2.0224624  1.2788543  1.2250828
 1.3104433  1.2034497  1.049782   1.1554059  1.199015   1.1923933
 1.0917312  1.0008674  1.0240752  1.2901965  0.8638762  1.0937988
 0.99773127 1.1649472  1.1940109  1.3383218  1.0232271  1.2133856
 1.1919765  1.1378535  1.0194002  1.0811108  1.184996   1.3049333
 1.1830077  1.3067408  1.1667005  1.3425988  1.3937967  1.0007488
 1.208391   1.2034497  1.0767705  1.2819778  1.1705064  1.2479149
 0.92552763 1.8053247  1.0428364  1.0156305  1.1495866  1.1290253
 1.026823   1.174421   1.0199474  1.1087017  1.3341991  1.292637
 1.3510199  1.0883718  0.9601164  1.2643764  1.117172   1.3104433
 1.1043178  1.1449616  1.0224761  1.1609155  1.2342656  1.1611481
 1.4087317  1.0568017  0.9867578  1.0175698  0.9044889  1.1912946
 1.3288516  1.2732868  1.0695987  1.272253   1.1667005  1.3258281
 0.94080156 1.0633507  1.2068255  1.117172   1.366845   1.1897882
 1.075942   1.1825707  1.0438967  1.3289585  1.0032521  0.827287