# Baseline - xgboost

We create a simple baseline where we just throw all columns at xgboost model to see how well we can estimate emissions an how can we improve

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data_path = "./data/"
df_train = pd.read_csv(data_path + "train.csv")
df_test = pd.read_csv(data_path + "test.csv")

id_columns = "ID_LAT_LON_YEAR_WEEK"
target_column = "emission"
#df_train.drop('id', axis=1, inplace=True)

df_train.head(10)

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317
5,ID_-0.510_29.290_2019_05,-0.51,29.29,2019,5,0.000294,0.871713,0.000242,0.227656,-13.45369,...,5530.354546,49846.00101,4495.301362,7.556143,0.250228,30.128401,37.00702,-137.388719,21.15782,4.310819
6,ID_-0.510_29.290_2019_06,-0.51,29.29,2019,6,-0.000285,0.791956,-0.000226,0.119397,72.198647,...,4378.614648,58804.276563,3537.050659,10.493107,0.240828,4.585368,30.491541,-128.196338,21.721202,4.269334
7,ID_-0.510_29.290_2019_07,-0.51,29.29,2019,7,-1.6e-05,0.976311,-1.6e-05,0.106799,-100.956055,...,3468.924146,63697.007267,2597.071563,20.5336,0.250168,16.374829,35.840248,-122.250699,20.432005,4.251361
8,ID_-0.510_29.290_2019_08,-0.51,29.29,2019,8,-0.000115,0.796941,-0.000101,0.164971,-40.179074,...,3160.26467,67222.556159,2530.851476,24.253928,0.260543,-12.021751,42.688789,-114.296369,22.365602,4.281937
9,ID_-0.510_29.290_2019_09,-0.51,29.29,2019,9,5.6e-05,0.998541,1.4e-05,0.157726,-100.108744,...,4141.07334,60298.717969,3510.293652,13.813379,0.289295,-66.258392,28.596211,-105.687177,21.084021,4.352933


In [8]:
df_train.drop(id_columns, axis=1, inplace=True)

# we must impute missing values because xgboost can't handle them

# impute missing values with mean of given latitude, longitude
place_mean = df_train.groupby(['latitude', 'longitude']).mean()

# impute missing values with mean of given latitude, longitude from place_mean
df_train = df_train.groupby(['latitude', 'longitude']).transform(lambda x: x.fillna(x.mean()))
df_train

Unnamed: 0,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,SulphurDioxide_sensor_zenith_angle,SulphurDioxide_solar_azimuth_angle,SulphurDioxide_solar_zenith_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,2019,0,-0.000108,0.603019,-0.000065,0.255668,-98.593887,50.843559,-130.050797,35.874496,...,3664.436218,61085.809570,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.752140,3.750994
1,2019,1,0.000021,0.728214,0.000014,0.130988,16.592861,39.137194,-140.874435,28.965133,...,3651.190311,66969.478735,3174.572424,8.690601,0.256830,30.359375,39.557633,-145.183930,27.251779,4.025176
2,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,52.868816,-150.191757,23.206415,...,4216.986492,60068.894448,3516.282669,21.103410,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,2019,3,0.000080,0.856283,0.000066,0.141984,2.757432,38.027457,-87.673337,27.081862,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,2019,4,-0.000079,0.676296,-0.000048,0.121164,4.121269,35.515587,-137.409159,24.331972,...,3980.598120,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79018,2021,48,0.000284,1.195643,0.000340,0.191313,72.820518,55.988022,-140.821274,25.148563,...,5459.185355,60657.101913,4590.879504,20.245954,0.304797,-35.140368,40.113533,-129.935508,32.095214,29.404171
79019,2021,49,0.000083,1.130868,0.000063,0.177222,-12.856753,19.435339,-131.114411,31.197638,...,5606.449457,60168.191528,4659.130378,6.104610,0.314015,4.667058,47.528435,-134.252871,30.771469,29.186497
79020,2021,50,0.000003,0.944682,0.000005,0.169407,-1.474139,37.973965,-81.200528,27.685121,...,6222.646776,56596.027209,5222.646823,14.817885,0.288058,-0.340922,35.328098,-134.731723,30.716166,29.131205
79021,2021,51,-0.000034,0.879397,-0.000028,0.184209,-100.344827,32.599393,-129.573396,33.906037,...,7896.456885,46533.348194,6946.858022,32.594768,0.274047,8.427699,48.295652,-139.447849,29.112868,28.125792


In [None]:
# check if there are any missing values

df_train.isnull().sum()

year                                                0
week_no                                             0
SulphurDioxide_SO2_column_number_density          477
SulphurDioxide_SO2_column_number_density_amf      477
SulphurDioxide_SO2_slant_column_number_density    477
                                                 ... 
Cloud_sensor_azimuth_angle                        477
Cloud_sensor_zenith_angle                         477
Cloud_solar_azimuth_angle                         477
Cloud_solar_zenith_angle                          477
emission                                            0
Length: 73, dtype: int64

In [None]:
# check if the imputed values are actually the column mean for given latitude, longitud
