# 1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# 2. Load data

In [2]:
# Download the dataset CSV from Google Drive via the `gdown` CLI (IPython shell escape).
# Per the captured output below, the file is saved as /content/Problem3.csv.
# NOTE(review): the URL is unquoted; `?` is a glob character in some shells — consider quoting.
! gdown https://drive.google.com/uc?id=1xwJmYJxEia06sxUdJyGO7JFx4DNK1fbp

Downloading...
From: https://drive.google.com/uc?id=1xwJmYJxEia06sxUdJyGO7JFx4DNK1fbp
To: /content/Problem3.csv
  0% 0.00/37.4k [00:00<?, ?B/s]100% 37.4k/37.4k [00:00<00:00, 75.3MB/s]


In [3]:
# Read the downloaded CSV into a DataFrame; the bare name on the last line
# makes the notebook render the frame as the cell's output.
dataset_path = '/content/Problem3.csv'
data_df = pd.read_csv(filepath_or_buffer=dataset_path)
data_df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,4.468204,26.2,94.3,1.808289,8.2,51,6.7,False,0.000000
1,7,4,oct,tue,4.517431,35.4,669.1,2.041220,18.0,33,0.9,False,0.000000
2,7,4,oct,sat,4.517431,43.7,686.9,2.041220,14.6,33,1.3,False,0.000000
3,8,6,mar,fri,4.529368,33.3,77.5,2.302585,8.3,97,4.0,True,0.000000
4,8,6,mar,sun,4.503137,51.3,102.2,2.360854,11.4,99,1.8,False,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,4,3,aug,sun,4.414010,56.7,665.6,1.064711,27.8,32,2.7,False,2.006871
506,2,4,aug,sun,4.414010,56.7,665.6,1.064711,21.9,71,5.8,False,4.012592
507,7,4,aug,sun,4.414010,56.7,665.6,1.064711,21.2,70,6.7,False,2.498152
508,1,4,aug,sat,4.558079,146.0,614.7,2.509599,25.6,42,4.0,False,0.000000


In [5]:
# info() prints the column dtypes and non-null counts to stdout.
data_df.info()
# describe() is the last expression, so its summary-statistics table (numeric
# columns only, per the output below) is rendered as the cell's rich output.
data_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       510 non-null    int64  
 1   Y       510 non-null    int64  
 2   month   510 non-null    object 
 3   day     510 non-null    object 
 4   FFMC    510 non-null    float64
 5   DMC     510 non-null    float64
 6   DC      510 non-null    float64
 7   ISI     510 non-null    float64
 8   temp    510 non-null    float64
 9   RH      510 non-null    int64  
 10  wind    510 non-null    float64
 11  rain    510 non-null    bool   
 12  area    510 non-null    float64
dtypes: bool(1), float64(7), int64(3), object(2)
memory usage: 48.4+ KB


Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,area
count,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0,510.0
mean,4.680392,4.294118,4.522609,111.837647,550.470392,2.228457,18.951569,44.029412,4.017255,1.121351
std,2.320534,1.234636,0.034979,63.853719,246.630662,0.428759,5.78993,15.968323,1.788793,1.401431
min,1.0,2.0,4.332048,3.0,7.9,0.741937,2.2,15.0,0.4,0.0
25%,3.0,4.0,4.514151,73.25,442.3,2.04122,15.625,32.25,2.7,0.0
50%,4.0,4.0,4.528829,108.4,664.2,2.24071,19.3,41.5,4.0,0.457245
75%,7.0,5.0,4.54223,142.4,714.2,2.484907,22.8,53.0,4.9,2.025183
max,9.0,9.0,4.576771,291.3,860.6,4.044804,33.3,99.0,9.4,6.99562


# 3. Process categorical columns

In [9]:
# Collect every column stored as string (object) or boolean; these are the
# ones that get integer-encoded below.
categorical_cols = data_df.select_dtypes(include=['object', 'bool']).columns.to_list()

# Report how many distinct values each categorical column holds.
for col_name in categorical_cols:
    n_categories = data_df[col_name].nunique()
    print(f'Number of categories in { col_name }: { n_categories }')

# Replace each category with an integer code. With the default settings the
# codes follow the sorted order of the labels, not calendar order (in the
# printed output 'mar' -> 7 and 'oct' -> 10).
ordinal_encoder = OrdinalEncoder()
encoded_categorical_cols = ordinal_encoder.fit_transform(data_df[categorical_cols])
print(encoded_categorical_cols)

# fit_transform returns a bare NumPy array, so the original row labels are
# re-attached explicitly. Without `index=`, the frame would get a fresh
# 0..n-1 index and the axis=1 concat below (which aligns on index) would
# misalign rows / introduce NaNs whenever data_df has any other index.
encoded_categorical_df = pd.DataFrame(
    encoded_categorical_cols,
    columns=categorical_cols,
    index=data_df.index,
)
print(encoded_categorical_df)

# Everything that was not encoded is kept unchanged.
numerical_df = data_df.drop(columns=categorical_cols)
print(numerical_df)

# Final modelling table: numeric columns first, encoded categoricals last.
encoded_df = pd.concat([numerical_df, encoded_categorical_df], axis=1)
print(encoded_df)

Number of categories in month: 12
Number of categories in day: 7
Number of categories in rain: 2
[[ 7.  0.  0.]
 [10.  5.  0.]
 [10.  2.  0.]
 ...
 [ 1.  3.  0.]
 [ 1.  2.  0.]
 [ 9.  5.  0.]]
     month  day  rain
0      7.0  0.0   0.0
1     10.0  5.0   0.0
2     10.0  2.0   0.0
3      7.0  0.0   1.0
4      7.0  3.0   0.0
..     ...  ...   ...
505    1.0  3.0   0.0
506    1.0  3.0   0.0
507    1.0  3.0   0.0
508    1.0  2.0   0.0
509    9.0  5.0   0.0

[510 rows x 3 columns]
     X  Y      FFMC    DMC     DC       ISI  temp  RH  wind      area
0    7  5  4.468204   26.2   94.3  1.808289   8.2  51   6.7  0.000000
1    7  4  4.517431   35.4  669.1  2.041220  18.0  33   0.9  0.000000
2    7  4  4.517431   43.7  686.9  2.041220  14.6  33   1.3  0.000000
3    8  6  4.529368   33.3   77.5  2.302585   8.3  97   4.0  0.000000
4    8  6  4.503137   51.3  102.2  2.360854  11.4  99   1.8  0.000000
..  .. ..       ...    ...    ...       ...   ...  ..   ...       ...
505  4  3  4.414010   56.7  6

# 4. Separate features and label

In [10]:
# Label: the `area` column. Features: every other column of encoded_df.
y = encoded_df['area']
X = encoded_df.drop('area', axis=1)

# 5. Split the dataset into train and test sets

In [11]:
# Hold out 30% of the rows as the test set; the fixed random_state keeps the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

# 6. Build an XGBoost regression model

In [12]:
# Gradient-boosted tree regressor: 102 boosting rounds, trees limited to
# depth 3, learning rate 0.01.
# `random_state` is the documented seeding parameter of XGBoost's
# scikit-learn wrapper; it replaces the legacy `seed` alias used previously.
xg_reg = xgb.XGBRegressor(
    random_state=7,
    learning_rate=0.01,
    n_estimators=102,
    max_depth=3,
)

# Fit on the training split; as the last expression, the fitted estimator's
# repr is shown as the cell output.
xg_reg.fit(X_train, y_train)

# 7. Predict on the test set and evaluate the model

In [15]:
# Score the held-out rows with the fitted regressor.
preds = xg_reg.predict(X_test)

# MAE is in the units of the `area` target as stored in the dataset;
# MSE is in those units squared.
mse = mean_squared_error(y_true=y_test, y_pred=preds)
mae = mean_absolute_error(y_true=y_test, y_pred=preds)

print('Evaluation results on test set:')
print(f'Mean Absolute Error : {mae}')
print(f'Mean Squared Error : {mse}')

Evaluation results on test set:
Mean Absolute Error : 1.1484401341167767
Mean Squared Error : 1.8845074196256495
