In [7]:
 # linear algebra
 # data processing, CSV file I/O (e.g. pd.read_csv)

In [8]:
%matplotlib inline

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import xgboost as xgb

from IPython.display import display
from collections import defaultdict

Dataset Description
The objective of this challenge is to create machine learning models that use open-source emissions data (from Sentinel-5P satellite observations) to predict carbon emissions.

Approximately 497 unique locations were selected from multiple areas in Rwanda, with a distribution around farm lands, cities and power plants. The data for this competition is split by time; the years 2019 - 2021 are included in the training data, and your task is to predict the CO2 emissions data for 2022 through November.

Seven main features were extracted weekly from Sentinel-5P from January 2019 to November 2022. Each feature (Sulphur Dioxide, Carbon Monoxide, etc) contain sub features such as column_number_density which is the vertical column density at ground level, calculated using the DOAS technique. You can read more about each feature in the below links, including how they are measured and variable definitions. You are given the values of these features in the test set and your goal to predict CO2 emissions using time information as well as these features.

Sulphur Dioxide - COPERNICUS/S5P/NRTI/L3_SO2
Carbon Monoxide - COPERNICUS/S5P/NRTI/L3_CO
Nitrogen Dioxide - COPERNICUS/S5P/NRTI/L3_NO2
Formaldehyde - COPERNICUS/S5P/NRTI/L3_HCHO
UV Aerosol Index - COPERNICUS/S5P/NRTI/L3_AER_AI
Ozone - COPERNICUS/S5P/NRTI/L3_O3
Cloud - COPERNICUS/S5P/OFFL/L3_CLOUD
Important: Please only use the data provided for this challenge as part of your modeling effort. Do not use any external data, including any data from Sentinel-5P not provided on this page.

Files
train.csv - the training set
test.csv - the test set; your task is to predict the emission target for each week at each location
sample_submission.csv - a sample submission file in the correct format

In [9]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/co2-rwanda-data/train.csv
/kaggle/input/co2-rwanda-data/test.csv


In [10]:
train_data = pd.read_csv('/kaggle/input/co2-rwanda-data/train.csv')
test_data = pd.read_csv('/kaggle/input/co2-rwanda-data/test.csv')

# Display the first few rows of the train and test data
print("Train data:")
print(train_data.head())

print("\nTest data:")
print(test_data.head())

Train data:
       ID_LAT_LON_YEAR_WEEK  latitude  longitude  year  week_no  \
0  ID_-0.510_29.290_2019_00     -0.51      29.29  2019        0   
1  ID_-0.510_29.290_2019_01     -0.51      29.29  2019        1   
2  ID_-0.510_29.290_2019_02     -0.51      29.29  2019        2   
3  ID_-0.510_29.290_2019_03     -0.51      29.29  2019        3   
4  ID_-0.510_29.290_2019_04     -0.51      29.29  2019        4   

   SulphurDioxide_SO2_column_number_density  \
0                                 -0.000108   
1                                  0.000021   
2                                  0.000514   
3                                       NaN   
4                                 -0.000079   

   SulphurDioxide_SO2_column_number_density_amf  \
0                                      0.603019   
1                                      0.728214   
2                                      0.748199   
3                                           NaN   
4                                      0.676296

In [11]:
##TRAIN TEST DATA 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



In [12]:

def display_all(data):
    with pd.option_context('display.max_rows', 1000):
        with pd.option_context('display.max_columns', 1000):
            return display(data)

In [15]:
print(train_data.shape)
print(test_data.shape)

(79023, 76)
(24353, 75)


In [18]:
train_data.columns

Index(['ID_LAT_LON_YEAR_WEEK', 'latitude', 'longitude', 'year', 'week_no',
       'SulphurDioxide_SO2_column_number_density',
       'SulphurDioxide_SO2_column_number_density_amf',
       'SulphurDioxide_SO2_slant_column_number_density',
       'SulphurDioxide_cloud_fraction', 'SulphurDioxide_sensor_azimuth_angle',
       'SulphurDioxide_sensor_zenith_angle',
       'SulphurDioxide_solar_azimuth_angle',
       'SulphurDioxide_solar_zenith_angle',
       'SulphurDioxide_SO2_column_number_density_15km',
       'CarbonMonoxide_CO_column_number_density',
       'CarbonMonoxide_H2O_column_number_density',
       'CarbonMonoxide_cloud_height', 'CarbonMonoxide_sensor_altitude',
       'CarbonMonoxide_sensor_azimuth_angle',
       'CarbonMonoxide_sensor_zenith_angle',
       'CarbonMonoxide_solar_azimuth_angle',
       'CarbonMonoxide_solar_zenith_angle',
       'NitrogenDioxide_NO2_column_number_density',
       'NitrogenDioxide_tropospheric_NO2_column_number_density',
       'NitrogenDioxide

In [19]:
print(train_data.year.value_counts(normalize=True))

2019    0.333333
2020    0.333333
2021    0.333333
Name: year, dtype: float64


In [20]:
display_all(train_data.head())

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,SulphurDioxide_sensor_zenith_angle,SulphurDioxide_solar_azimuth_angle,SulphurDioxide_solar_zenith_angle,SulphurDioxide_SO2_column_number_density_15km,CarbonMonoxide_CO_column_number_density,CarbonMonoxide_H2O_column_number_density,CarbonMonoxide_cloud_height,CarbonMonoxide_sensor_altitude,CarbonMonoxide_sensor_azimuth_angle,CarbonMonoxide_sensor_zenith_angle,CarbonMonoxide_solar_azimuth_angle,CarbonMonoxide_solar_zenith_angle,NitrogenDioxide_NO2_column_number_density,NitrogenDioxide_tropospheric_NO2_column_number_density,NitrogenDioxide_stratospheric_NO2_column_number_density,NitrogenDioxide_NO2_slant_column_number_density,NitrogenDioxide_tropopause_pressure,NitrogenDioxide_absorbing_aerosol_index,NitrogenDioxide_cloud_fraction,NitrogenDioxide_sensor_altitude,NitrogenDioxide_sensor_azimuth_angle,NitrogenDioxide_sensor_zenith_angle,NitrogenDioxide_solar_azimuth_angle,NitrogenDioxide_solar_zenith_angle,Formaldehyde_tropospheric_HCHO_column_number_density,Formaldehyde_tropospheric_HCHO_column_number_density_amf,Formaldehyde_HCHO_slant_column_number_density,Formaldehyde_cloud_fraction,Formaldehyde_solar_zenith_angle,Formaldehyde_solar_azimuth_angle,Formaldehyde_sensor_zenith_angle,Formaldehyde_sensor_azimuth_angle,UvAerosolIndex_absorbing_aerosol_index,UvAerosolIndex_sensor_altitude,UvAerosolIndex_sensor_azimuth_angle,UvAerosolIndex_sensor_zenith_angle,UvAerosolIndex_solar_azimuth_angle,UvAerosolIndex_solar_zenith_angle,Ozone_O3_column_number_density,Ozone_O3_column_number_density_amf,Ozone_O3_slant_column_number_density,Ozone_O3_effective_temperature,Ozone_cloud_fraction,Ozone_sensor_azimuth_angle,Ozone_sensor_zenith_angle,Ozone_solar_azimuth_angle,Ozone_solar_zenith_angle,UvAerosolLayerHeight_aerosol_height,UvAerosolLayerHeight_aerosol_pressure,UvAerosolLayerHeight_aerosol_optical_depth,UvAerosolLayerHeight_sensor_zenith_angle,UvAerosolLayerHeight_sensor_azimuth_angle,UvAerosolLayerHeight_solar_azimuth_angle,UvAerosolLayerHeight_solar_zenith_angle,Cloud_cloud_fraction,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,50.843559,-130.050797,35.874496,-2.7e-05,0.03537,1589.024536,4061.098145,829530.5,71.111977,52.775928,-149.875565,25.965214,,,,,,,,,,,,,0.000117,0.86323,3.8e-05,0.255668,35.874496,-130.050797,50.843559,-98.593887,-1.280761,829864.546875,-12.628979,35.632416,-138.786446,30.752128,0.115927,2.506609,0.295663,225.731144,0.595473,-12.628979,35.632416,-138.786446,30.752128,,,,,,,,0.595473,53534.732422,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,39.137194,-140.874435,28.965133,1.2e-05,0.036526,1772.574405,1869.040414,829787.28713,-1.019594,38.982368,-140.158048,29.562,4.7e-05,1.639765e-05,3e-05,9.3e-05,7311.869141,-1.935386,0.067038,829859.960368,5.471037,35.265195,-138.343908,30.054262,0.00017,1.172826,0.000143,0.200754,29.071781,-141.814827,43.050213,4.678839,-1.548119,829747.856973,16.152492,43.485327,-142.786141,28.573627,0.116775,2.657704,0.315733,226.17217,0.175166,24.464335,42.596541,-143.097868,28.213655,,,,,,,,0.213608,63790.296241,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,52.868816,-150.191757,23.206415,0.000154,0.035338,2703.2368,2809.138386,829883.828686,-54.801144,52.344378,-133.683714,31.586838,3.1e-05,4.267369e-07,3e-05,8e-05,7311.869141,-2.754374,0.072135,829527.125,72.795837,52.868816,-150.191757,23.206415,8e-05,1.175467,1.9e-05,0.279449,30.99429,-135.66716,52.810561,-41.363579,-1.038673,829892.960629,-41.557633,41.269033,-135.364627,30.273304,0.117039,2.619104,0.310828,227.469292,0.606091,-41.557633,41.269033,-135.364627,30.273304,,,,,,,,0.70354,55923.790554,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,,,,,0.03679,2076.073332,3917.707873,829657.163571,28.916541,39.676184,-142.575915,24.810699,,,,,,,,,,,,,,,,,,,,,-0.626435,829794.848214,-0.00127,34.45874,-137.489602,26.936477,0.116434,2.525818,0.297966,225.58944,0.787398,-0.00127,34.45874,-137.489602,26.936477,,,,,,,,0.782806,44569.130636,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,35.515587,-137.409159,24.331972,-2.8e-05,0.034675,2053.60849,2667.310013,829735.09375,-12.501663,33.703073,-134.854258,24.629593,5.1e-05,2.056437e-05,3e-05,9.3e-05,7637.262458,-1.450563,0.049393,829744.84375,-13.431798,35.078624,-136.257947,24.729026,0.000269,0.869081,0.000146,0.16009,25.977935,-134.826557,39.949069,-12.837398,-1.584896,829736.142857,-0.604325,41.794705,-136.448518,25.045785,0.117373,2.572243,0.306688,228.95584,0.215739,-0.604325,41.794705,-136.448518,25.045785,,,,,,,,0.189336,59904.314844,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


In [21]:
display_all(test_data.head())

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,SulphurDioxide_sensor_zenith_angle,SulphurDioxide_solar_azimuth_angle,SulphurDioxide_solar_zenith_angle,SulphurDioxide_SO2_column_number_density_15km,CarbonMonoxide_CO_column_number_density,CarbonMonoxide_H2O_column_number_density,CarbonMonoxide_cloud_height,CarbonMonoxide_sensor_altitude,CarbonMonoxide_sensor_azimuth_angle,CarbonMonoxide_sensor_zenith_angle,CarbonMonoxide_solar_azimuth_angle,CarbonMonoxide_solar_zenith_angle,NitrogenDioxide_NO2_column_number_density,NitrogenDioxide_tropospheric_NO2_column_number_density,NitrogenDioxide_stratospheric_NO2_column_number_density,NitrogenDioxide_NO2_slant_column_number_density,NitrogenDioxide_tropopause_pressure,NitrogenDioxide_absorbing_aerosol_index,NitrogenDioxide_cloud_fraction,NitrogenDioxide_sensor_altitude,NitrogenDioxide_sensor_azimuth_angle,NitrogenDioxide_sensor_zenith_angle,NitrogenDioxide_solar_azimuth_angle,NitrogenDioxide_solar_zenith_angle,Formaldehyde_tropospheric_HCHO_column_number_density,Formaldehyde_tropospheric_HCHO_column_number_density_amf,Formaldehyde_HCHO_slant_column_number_density,Formaldehyde_cloud_fraction,Formaldehyde_solar_zenith_angle,Formaldehyde_solar_azimuth_angle,Formaldehyde_sensor_zenith_angle,Formaldehyde_sensor_azimuth_angle,UvAerosolIndex_absorbing_aerosol_index,UvAerosolIndex_sensor_altitude,UvAerosolIndex_sensor_azimuth_angle,UvAerosolIndex_sensor_zenith_angle,UvAerosolIndex_solar_azimuth_angle,UvAerosolIndex_solar_zenith_angle,Ozone_O3_column_number_density,Ozone_O3_column_number_density_amf,Ozone_O3_slant_column_number_density,Ozone_O3_effective_temperature,Ozone_cloud_fraction,Ozone_sensor_azimuth_angle,Ozone_sensor_zenith_angle,Ozone_solar_azimuth_angle,Ozone_solar_zenith_angle,UvAerosolLayerHeight_aerosol_height,UvAerosolLayerHeight_aerosol_pressure,UvAerosolLayerHeight_aerosol_optical_depth,UvAerosolLayerHeight_sensor_zenith_angle,UvAerosolLayerHeight_sensor_azimuth_angle,UvAerosolLayerHeight_solar_azimuth_angle,UvAerosolLayerHeight_solar_zenith_angle,Cloud_cloud_fraction,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle
0,ID_-0.510_29.290_2022_00,-0.51,29.29,2022,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.561522,829937.0,-100.113785,33.696957,-133.047562,33.779572,0.112175,2.374888,0.269846,227.1931,0.48534,-100.113785,33.696957,-133.047562,33.779572,,,,,,,,0.485129,36022.027344,8472.313477,41047.9375,7472.313477,7.935617,0.240773,-100.113792,33.697044,-133.047546,33.779583
1,ID_-0.510_29.290_2022_01,-0.51,29.29,2022,1,0.000456,0.691164,0.000316,0.0,76.239196,15.600607,-140.529848,28.896124,0.000157,0.037641,1688.656342,2814.309683,829652.957598,26.072167,25.189549,-142.612636,28.318923,4.8e-05,1.117653e-05,3.7e-05,9.3e-05,7311.869141,-1.416309,0.036769,829736.125,76.239196,15.600607,-140.529848,28.896124,0.000123,0.957096,5.8e-05,0.0,28.896124,-140.529848,15.600607,76.239196,-0.823662,829753.051343,-0.009998,34.745542,-139.171039,30.405075,0.112643,2.537305,0.290184,228.489219,0.515679,-0.009998,34.745542,-139.171039,30.405075,,,,,,,,0.711262,48539.737242,6476.147323,54915.708579,5476.147161,11.448437,0.293119,-30.510319,42.402593,-138.632822,31.01238
2,ID_-0.510_29.290_2022_02,-0.51,29.29,2022,2,0.000161,0.605107,0.000106,0.07987,-42.055341,39.88906,-136.908976,30.054682,5.3e-05,0.037795,2629.692089,3233.654973,829589.971617,14.881259,40.833582,-143.94934,26.760219,5.5e-05,1.679787e-05,3.8e-05,0.000109,8613.426417,-0.008104,0.03649,829922.5625,-99.816841,36.693165,-131.658058,32.784893,0.000277,0.764546,0.000151,0.225136,27.418839,-143.602313,47.325487,3.650128,-1.034709,829642.010417,15.736805,42.092969,-142.989044,27.440719,0.112113,2.602328,0.295976,226.155523,0.355003,15.736805,42.092969,-142.989044,27.440719,,,,,,,,0.425682,34133.080469,8984.795703,39006.09375,7984.795703,10.753179,0.26713,39.087361,45.93648,-144.784988,26.743361
3,ID_-0.510_29.290_2022_03,-0.51,29.29,2022,3,0.00035,0.696917,0.000243,0.201028,72.169566,58.862543,-152.99944,21.806625,9.3e-05,0.039743,1905.403107,3157.021515,829691.671875,-12.772312,49.306163,-138.109138,28.208616,3.4e-05,-2.399639e-07,3.7e-05,0.000109,11191.20061,-1.279531,0.134641,829375.749671,72.169566,58.862543,-152.99944,21.806625,0.000151,0.93521,7.8e-05,0.272235,23.121284,-148.466265,48.366559,73.226178,-0.641272,829800.856545,-24.466617,42.147727,-135.011886,29.596561,0.111779,2.654894,0.300949,228.088779,0.695621,-24.466617,42.147727,-135.011886,29.596561,,,,,,,,0.71216,50854.991076,6014.724059,57646.368368,5014.724115,11.764556,0.304679,-24.465127,42.140419,-135.027891,29.604774
4,ID_-0.510_29.290_2022_04,-0.51,29.29,2022,4,-0.000317,0.580527,-0.000184,0.204352,76.190865,15.646016,-135.794754,24.328299,-7.7e-05,0.039232,1307.190702,2228.402754,829847.718216,-40.978057,30.043796,-130.176646,28.060722,,,,,,,,,,,,,0.000273,0.776969,0.00014,0.332917,22.856263,-142.256966,34.42743,74.476742,-0.712758,829761.772395,-12.909506,30.140668,-135.499142,26.273346,0.113054,2.352987,0.270268,229.342882,0.657205,-12.909506,30.140668,-135.499142,26.273346,,,,,,,,0.667876,46594.685145,6849.280477,52896.541873,5849.280394,13.065317,0.284221,-12.90785,30.122641,-135.500119,26.276807


In [38]:
train_data.replace(to_replace=-1, value=np.nan, inplace=True)
test_data.replace(to_replace=-1, value=np.nan, inplace=True)

In [39]:
display_all(test_data.head())

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,SulphurDioxide_sensor_zenith_angle,SulphurDioxide_solar_azimuth_angle,SulphurDioxide_solar_zenith_angle,SulphurDioxide_SO2_column_number_density_15km,CarbonMonoxide_CO_column_number_density,CarbonMonoxide_H2O_column_number_density,CarbonMonoxide_cloud_height,CarbonMonoxide_sensor_altitude,CarbonMonoxide_sensor_azimuth_angle,CarbonMonoxide_sensor_zenith_angle,CarbonMonoxide_solar_azimuth_angle,CarbonMonoxide_solar_zenith_angle,NitrogenDioxide_NO2_column_number_density,NitrogenDioxide_tropospheric_NO2_column_number_density,NitrogenDioxide_stratospheric_NO2_column_number_density,NitrogenDioxide_NO2_slant_column_number_density,NitrogenDioxide_tropopause_pressure,NitrogenDioxide_absorbing_aerosol_index,NitrogenDioxide_cloud_fraction,NitrogenDioxide_sensor_altitude,NitrogenDioxide_sensor_azimuth_angle,NitrogenDioxide_sensor_zenith_angle,NitrogenDioxide_solar_azimuth_angle,NitrogenDioxide_solar_zenith_angle,Formaldehyde_tropospheric_HCHO_column_number_density,Formaldehyde_tropospheric_HCHO_column_number_density_amf,Formaldehyde_HCHO_slant_column_number_density,Formaldehyde_cloud_fraction,Formaldehyde_solar_zenith_angle,Formaldehyde_solar_azimuth_angle,Formaldehyde_sensor_zenith_angle,Formaldehyde_sensor_azimuth_angle,UvAerosolIndex_absorbing_aerosol_index,UvAerosolIndex_sensor_altitude,UvAerosolIndex_sensor_azimuth_angle,UvAerosolIndex_sensor_zenith_angle,UvAerosolIndex_solar_azimuth_angle,UvAerosolIndex_solar_zenith_angle,Ozone_O3_column_number_density,Ozone_O3_column_number_density_amf,Ozone_O3_slant_column_number_density,Ozone_O3_effective_temperature,Ozone_cloud_fraction,Ozone_sensor_azimuth_angle,Ozone_sensor_zenith_angle,Ozone_solar_azimuth_angle,Ozone_solar_zenith_angle,UvAerosolLayerHeight_aerosol_height,UvAerosolLayerHeight_aerosol_pressure,UvAerosolLayerHeight_aerosol_optical_depth,UvAerosolLayerHeight_sensor_zenith_angle,UvAerosolLayerHeight_sensor_azimuth_angle,UvAerosolLayerHeight_solar_azimuth_angle,UvAerosolLayerHeight_solar_zenith_angle,Cloud_cloud_fraction,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle
0,ID_-0.510_29.290_2022_00,-0.51,29.29,2022,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.561522,829937.0,-100.113785,33.696957,-133.047562,33.779572,0.112175,2.374888,0.269846,227.1931,0.48534,-100.113785,33.696957,-133.047562,33.779572,,,,,,,,0.485129,36022.027344,8472.313477,41047.9375,7472.313477,7.935617,0.240773,-100.113792,33.697044,-133.047546,33.779583
1,ID_-0.510_29.290_2022_01,-0.51,29.29,2022,1,0.000456,0.691164,0.000316,0.0,76.239196,15.600607,-140.529848,28.896124,0.000157,0.037641,1688.656342,2814.309683,829652.957598,26.072167,25.189549,-142.612636,28.318923,4.8e-05,1.117653e-05,3.7e-05,9.3e-05,7311.869141,-1.416309,0.036769,829736.125,76.239196,15.600607,-140.529848,28.896124,0.000123,0.957096,5.8e-05,0.0,28.896124,-140.529848,15.600607,76.239196,-0.823662,829753.051343,-0.009998,34.745542,-139.171039,30.405075,0.112643,2.537305,0.290184,228.489219,0.515679,-0.009998,34.745542,-139.171039,30.405075,,,,,,,,0.711262,48539.737242,6476.147323,54915.708579,5476.147161,11.448437,0.293119,-30.510319,42.402593,-138.632822,31.01238
2,ID_-0.510_29.290_2022_02,-0.51,29.29,2022,2,0.000161,0.605107,0.000106,0.07987,-42.055341,39.88906,-136.908976,30.054682,5.3e-05,0.037795,2629.692089,3233.654973,829589.971617,14.881259,40.833582,-143.94934,26.760219,5.5e-05,1.679787e-05,3.8e-05,0.000109,8613.426417,-0.008104,0.03649,829922.5625,-99.816841,36.693165,-131.658058,32.784893,0.000277,0.764546,0.000151,0.225136,27.418839,-143.602313,47.325487,3.650128,-1.034709,829642.010417,15.736805,42.092969,-142.989044,27.440719,0.112113,2.602328,0.295976,226.155523,0.355003,15.736805,42.092969,-142.989044,27.440719,,,,,,,,0.425682,34133.080469,8984.795703,39006.09375,7984.795703,10.753179,0.26713,39.087361,45.93648,-144.784988,26.743361
3,ID_-0.510_29.290_2022_03,-0.51,29.29,2022,3,0.00035,0.696917,0.000243,0.201028,72.169566,58.862543,-152.99944,21.806625,9.3e-05,0.039743,1905.403107,3157.021515,829691.671875,-12.772312,49.306163,-138.109138,28.208616,3.4e-05,-2.399639e-07,3.7e-05,0.000109,11191.20061,-1.279531,0.134641,829375.749671,72.169566,58.862543,-152.99944,21.806625,0.000151,0.93521,7.8e-05,0.272235,23.121284,-148.466265,48.366559,73.226178,-0.641272,829800.856545,-24.466617,42.147727,-135.011886,29.596561,0.111779,2.654894,0.300949,228.088779,0.695621,-24.466617,42.147727,-135.011886,29.596561,,,,,,,,0.71216,50854.991076,6014.724059,57646.368368,5014.724115,11.764556,0.304679,-24.465127,42.140419,-135.027891,29.604774
4,ID_-0.510_29.290_2022_04,-0.51,29.29,2022,4,-0.000317,0.580527,-0.000184,0.204352,76.190865,15.646016,-135.794754,24.328299,-7.7e-05,0.039232,1307.190702,2228.402754,829847.718216,-40.978057,30.043796,-130.176646,28.060722,,,,,,,,,,,,,0.000273,0.776969,0.00014,0.332917,22.856263,-142.256966,34.42743,74.476742,-0.712758,829761.772395,-12.909506,30.140668,-135.499142,26.273346,0.113054,2.352987,0.270268,229.342882,0.657205,-12.909506,30.140668,-135.499142,26.273346,,,,,,,,0.667876,46594.685145,6849.280477,52896.541873,5849.280394,13.065317,0.284221,-12.90785,30.122641,-135.500119,26.276807


In [23]:
pred_columns = train_data.columns[2:].values

In [24]:
### Let's do the column dtype conversions
pred_columns = train_data.columns[2:]
bin_cols = [c for c in pred_columns if 'bin' in c]
cat_cols = [c for c in pred_columns if 'cat' in c]
num_cols= [c for c in pred_columns if c not in bin_cols and c not in cat_cols]

In [26]:
for c in pred_columns:
    if 'cat' in c or 'bin' in c:
        print(f'Column: {c.upper()}')
        print('Train Summary')
        print(f'Cardinality {len(train_data[c].unique())}')
        print(train_data[c].value_counts(dropna=False))
        print('Test Summary')
        print(f'Cardinality {len(train_data[c].unique())}')
        print(test_data[c].value_counts(dropna=False))
        print()
        print()

Column: NITROGENDIOXIDE_ABSORBING_AEROSOL_INDEX
Train Summary
Cardinality 60694
 NaN         18320
-1.026992        2
-1.944507        2
-1.764952        2
-0.188380        2
             ...  
-2.493408        1
-2.128244        1
-0.930142        1
-1.857277        1
-1.102552        1
Name: NitrogenDioxide_absorbing_aerosol_index, Length: 60694, dtype: int64
Test Summary
Cardinality 60694
 NaN         5655
 0.597650       2
-0.499108       1
-0.392971       1
-1.335838       1
             ... 
-0.653249       1
-0.872600       1
-0.902581       1
-0.113438       1
-1.786733       1
Name: NitrogenDioxide_absorbing_aerosol_index, Length: 18698, dtype: int64


Column: UVAEROSOLINDEX_ABSORBING_AEROSOL_INDEX
Train Summary
Cardinality 78485
 NaN         539
-1.280761      1
-1.936038      1
-1.518674      1
-1.580908      1
            ... 
-1.478669      1
-1.820849      1
-1.444432      1
-2.014702      1
-0.942918      1
Name: UvAerosolIndex_absorbing_aerosol_index, Length: 78485, dty

In [43]:
def subsample(data, ratio=0.5):
    subsample, _ = train_test_split(data, test_size =ratio, stratify=train_data.Year)
    return subsample

In [44]:
subsample = subsample(train_data)

AttributeError: 'DataFrame' object has no attribute 'Year'

In [42]:
# If feature is not binary include in correlation matrix
corr_cols = [c for c in num_cols if len(train_data[c].unique()) > 2]
plt.imshow((subsample[corr_cols].corr()), cmap='hot', interpolation='nearest')
plt.show()

TypeError: 'function' object is not subscriptable