## Kaggle Playground Series 3 - Episode 20

Predict CO2 emissions in Rwanda.

In [21]:
import os
import pandas as pd

from tqdm import tqdm
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data import * 

### Utility Functions
Some functions that make our lives easier.

In [22]:
def moveTo(obj, device):
    """
    Recursively relocate `obj`, or whatever it contains, onto `device`.

    obj: the python object to move to a device, or to move its
        contents to a device
    device: the compute device to move objects to

    Lists, tuples, sets and dicts (both keys and values) are rebuilt as a
    plain list / tuple / set / dict with every element moved. Any other
    object that exposes a `to` attribute is moved with `obj.to(device)`;
    everything else is returned unchanged.
    """
    if isinstance(obj, list):
        return [moveTo(item, device) for item in obj]
    if isinstance(obj, tuple):
        return tuple(moveTo(item, device) for item in obj)
    if isinstance(obj, set):
        return {moveTo(item, device) for item in obj}
    if isinstance(obj, dict):
        return {moveTo(k, device): moveTo(v, device) for k, v in obj.items()}
    if hasattr(obj, "to"):
        return obj.to(device)
    # Plain Python values (numbers, strings, None, ...) have nothing to move.
    return obj

In [23]:
def train_simple_network(model, loss_func, training_loader, epochs=20, device="cpu", lr=0.001):
    """
    Train `model` in place with plain SGD.

    model: the torch module to optimise; it is moved to `device`
    loss_func: callable taking (predictions, labels) and returning a
        scalar loss tensor
    training_loader: sized iterable yielding (inputs, labels) batches
    epochs: number of full passes over `training_loader`
    device: the compute device to train on
    lr: learning rate handed to `torch.optim.SGD` (defaults to 0.001)

    Before each epoch the zero-based epoch counter is printed; after each
    epoch the *sum* of the per-batch loss values is printed. Nothing is
    returned.
    """
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    model.to(device)

    for epoch in range(epochs):
        print('{0}/{1}'.format(epoch, epochs))
        model = model.train()  # make sure the model is in training mode
        running_loss = 0.0

        for inputs, labels in tqdm(training_loader, total=len(training_loader)):
            # Batches come from the loader on the CPU; move them next to the model.
            inputs = moveTo(inputs, device)
            labels = moveTo(labels, device)

            # Gradients accumulate across backward() calls, so clear them first.
            optimizer.zero_grad()

            y_hat = model(inputs)

            loss = loss_func(y_hat, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print('loss: {0}'.format(running_loss))

In [24]:
# Folder holding the competition files. Set the PLAYGROUND_S3E20_DIR
# environment variable to point somewhere else; the default is the local
# folder this notebook was first written against.
DATA_DIR = os.environ.get(
    'PLAYGROUND_S3E20_DIR',
    r'C:\data\playground_s3_e20\playground-series-s3e20',
)

df_train_path = os.path.join(DATA_DIR, 'train.csv')
# NOTE(review): the next two names do not match the files they point to:
# `df_val_path` is test.csv and `df_test_path` is sample_submission.csv.
# The names are kept unchanged because other cells may rely on them.
df_val_path = os.path.join(DATA_DIR, 'test.csv')
df_test_path = os.path.join(DATA_DIR, 'sample_submission.csv')

In [25]:
# Read the training CSV located at `df_train_path` into a DataFrame.
df_train = pd.read_csv(df_train_path)

In [26]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


### Convert the data into NumPy arrays for the model to consume

In [27]:
# Number of rows in the training frame.
len(df_train)

79023

In [28]:
# Feature columns used for this experiment.
feature_columns = [
    'SulphurDioxide_SO2_column_number_density',
    'SulphurDioxide_SO2_column_number_density_amf',
]

# Drop incomplete rows from the *whole* frame so that features and targets
# are taken from exactly the same rows. Dropping NaNs from the feature frame
# alone would leave `y_train` with every original row, so row i of `x_train`
# would no longer belong to row i of `y_train`.
df_train_clean = df_train.dropna(subset=feature_columns)
df_train_data = df_train_clean[feature_columns]

x_train = df_train_data.to_numpy()

n_features = x_train.shape[1]
y_train = df_train_clean[['emission']].to_numpy()

assert x_train.shape[0] == y_train.shape[0], "features and targets must have the same number of rows"

In [29]:
# Preview the selected feature columns after rows with missing values were dropped.
df_train_data.head()

Unnamed: 0,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf
0,-0.000108,0.603019
1,2.1e-05,0.728214
2,0.000514,0.748199
4,-7.9e-05,0.676296
5,0.000294,0.871713


In [30]:
class RegressionDataset(Dataset):
    """
    Map-style dataset yielding (features, target) pairs as float32 tensors.

    X: array of input features, reshaped to (n_samples, n_features)
    y: array of targets, reshaped to (n_samples, 1)
    n_features: optional number of feature columns. When omitted it is read
        from the last axis of `X` (a 1-D `X` is treated as one feature per
        sample), so the dataset does not rely on any module-level variable.
    """
    def __init__(self, X, y, n_features=None):
        super(RegressionDataset, self).__init__()
        if n_features is None:
            n_features = X.shape[-1] if X.ndim > 1 else 1
        self.X = X.reshape(-1, n_features)
        self.y = y.reshape(-1, 1)

    def __getitem__(self, index):
        """Return row `index` as a (features, target) pair of float32 tensors."""
        features = torch.tensor(self.X[index, :], dtype=torch.float32)
        target = torch.tensor(self.y[index], dtype=torch.float32)
        return features, target

    def __len__(self):
        """Number of samples, i.e. the number of rows of the feature matrix."""
        return self.X.shape[0]

# Wrap the arrays in the dataset, then serve them in shuffled mini-batches of 16.
train_dataset = RegressionDataset(x_train, y_train)
training_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [31]:
# Two linear layers stacked with no activation in between, so the network as
# a whole is still a linear map of its inputs.
# Disabled alternatives from earlier experiments: an nn.Tanh() after the first
# layer, optionally followed by further nn.Linear(30, 30) + nn.Tanh() pairs.
hidden_units = 30
layers = [
    nn.Linear(n_features, hidden_units),
    nn.Linear(hidden_units, 1),
]
model = nn.Sequential(*layers)

In [32]:
out_features = 1
# model = nn.Linear(n_features, out_features)
loss_func = nn.MSELoss()
# Use the GPU when one is available and fall back to the CPU otherwise, so
# this cell also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_simple_network(model, loss_func, training_loader, device=device, epochs=2)

0/2


100%|██████████| 4026/4026 [00:09<00:00, 437.92it/s]


loss: 100983432.6147461
1/2


100%|██████████| 4026/4026 [00:09<00:00, 446.57it/s]

loss: 100408860.8178711





: 