In [1]:
import os
os.chdir("..")

# MLP - Multi-Layer Perceptron

In this notebook, I create a deep learning model to predict the `diameter` and `albedo` values. Then save the model for use in `data-cleaning` notebook`.

## Designing the Architecture

In [2]:
from typing import List

import torch
from torch.nn import LazyLinear, Dropout, ReLU, Sequential, LayerNorm
from torch.nn import MSELoss

torch.manual_seed(29)

<torch._C.Generator at 0x7f73dbf04af0>

In [3]:
class SimpleBlock(torch.nn.Module):
    def __init__(self, num_output: int, dropout: float):
        super().__init__()

        self.net = Sequential(
            LazyLinear(num_output),
            LayerNorm(num_output),
            ReLU(),
            Dropout(dropout),
        )

    def __call__(self, X):
        return self.net(X)


block = SimpleBlock(64, 0.2)
test = torch.arange(60, dtype=torch.float32)
block(test)



tensor([0.0000, 0.0000, 0.0000, 2.9995, 0.0956, 0.0641, 0.0000, 1.8861, 0.0000,
        0.0000, 0.0000, 0.0342, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0211, 0.0000, 0.0000, 2.9773, 0.0000, 0.7055, 1.3559, 0.0000,
        0.6901, 0.2212, 0.0000, 0.7959, 0.0000, 0.0000, 0.0000, 1.7026, 0.0000,
        0.0000, 0.0000, 1.9092, 0.0000, 0.0000, 0.2701, 2.1247, 0.0000, 0.3357,
        0.0000, 0.0852, 0.0000, 1.1907, 0.0000, 0.0000, 0.0000, 0.0000, 0.8020,
        0.0000, 0.0000, 0.0000, 0.0000, 0.8124, 0.7961, 0.0000, 0.0000, 0.0729,
        0.0000], grad_fn=<MulBackward0>)

In [4]:
class Block(torch.nn.Module):
    def __init__(
        self, num_output_list: List[int], dropout_list: List[float], num_output: int
    ) -> None:
        super().__init__()

        self.layers = [
            SimpleBlock(no, d) for no, d in zip(num_output_list, dropout_list)
        ]

        self.layer_out = LazyLinear(num_output)

    def __call__(self, X):
        output = X
        
        for layer in self.layers:
            output = torch.concat([output, layer(X)])
        
        return self.layer_out(output)


block = Block(num_output_list=[32, 16, 16], dropout_list=[0.3, 0.2, 0.2], num_output=64)
block(test).shape

torch.Size([64])

In [5]:
class MLP(torch.nn.Module):
    def __init__(self, n: int, block_io_shape: int = 64):
        super().__init__()

        self.layer_in = LazyLinear(block_io_shape)

        self.layers = [
            Block(
                num_output_list=[32, 16, 16],
                dropout_list=[0.3, 0.2, 0.2],
                num_output=block_io_shape,
            )
            for _ in range(n)
        ]
        
        self.output_layer = LazyLinear(2)

        self.apply(self._init)

        self.loss = MSELoss()

    def _init(self, module):
        if type(module) is torch.nn.Linear:
            torch.nn.init.xavier_uniform_(module.weight)
            torch.nn.init.zeros_(module.bias)

    def __call__(self, X):
        output = self.layer_in(X)

        for l in self.layers:
            output = l(output) + output

        return self.output_layer(output)

    def loss(self, y, pred):
        return self.loss(y, pred)


mlp = MLP(3)
mlp(test)

tensor([-25.0437, -13.5082], grad_fn=<ViewBackward0>)

## Setup Dataset

In [6]:
import pandas as pd

In [7]:
df = pd.read_csv("data/Asteroid_Imputed.csv")
df.shape

  df = pd.read_csv("data/Asteroid_Imputed.csv")


(839714, 21)

In [8]:
df.columns

Index(['a', 'e', 'i', 'om', 'w', 'q', 'ad', 'per_y', 'data_arc',
       'condition_code', 'n_obs_used', 'H', 'neo', 'pha', 'diameter', 'albedo',
       'moid', 'class', 'n', 'per', 'ma'],
      dtype='object')

In [9]:
df.drop(df[(df.condition_code == 'D') | (df.condition_code == 'E')].index, inplace=True)

In [10]:
df.condition_code = df.condition_code.map(lambda x: int(x))

In [11]:
df_processed = df.copy(deep=True)

Normalizing numerical columns using min-max scaling.

In [12]:
from sklearn.preprocessing import MinMaxScaler

In [13]:
categorical_columns = ["pha", "neo", "condition_code", "class"]
numerical_columns = df.columns[~df.columns.isin(categorical_columns)]
print(numerical_columns)

Index(['a', 'e', 'i', 'om', 'w', 'q', 'ad', 'per_y', 'data_arc', 'n_obs_used',
       'H', 'diameter', 'albedo', 'moid', 'n', 'per', 'ma'],
      dtype='object')


Before **Min-Max Normalization**

I won't be normalizing the values for `diameter` and `albedo`, because I want the deep learning model to predict these values.

In [14]:
numerical_columns = numerical_columns.drop(["diameter", "albedo"])

In [15]:
df_processed[numerical_columns].sample(3)

Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,n_obs_used,H,moid,n,per,ma
614610,3.198038,0.308327,11.802038,187.64074,135.594892,1.826876,3.938094,5.454249,39.0,36,16.5,0.835575,0.172337,2088.929536,290.19197
395427,2.28828,0.073797,3.467121,102.449277,295.556111,2.457067,2.588706,4.800755,7110.0,121,17.8,1.46627,0.284735,1264.335197,46.269512
399609,3.209218,0.209471,10.722528,346.063768,76.635257,2.010219,3.759093,4.183545,5321.0,202,16.4,1.05357,0.171437,2099.893013,244.960696


In [16]:
for column in numerical_columns:
    values = df_processed[column].values.reshape(-1, 1)
    df_processed[column] = MinMaxScaler().fit_transform(values)

After **Min-Max Normalization**

In [17]:
df_processed[numerical_columns].sample(3)

Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,n_obs_used,H,moid,n,per,ma
127971,0.971667,0.063012,0.04309,0.999725,0.644162,0.026015,0.000361,2.7e-05,0.084958,0.067789,0.483965,0.014599,0.114174,1.9e-05,0.691284
580755,0.971673,0.043696,0.028716,0.791756,0.044079,0.032894,0.000466,2.2e-05,0.000165,0.000858,0.516035,0.021354,0.079457,2.9e-05,0.181094
716423,0.971674,0.323119,0.051522,0.476163,0.5331,0.0153,0.00041,2e-05,0.001623,0.005899,0.511458,0.003746,0.075963,3e-05,0.706963


One hot encode categorical columns.

In [18]:
df_processed = pd.get_dummies(
    df_processed,
    columns=categorical_columns,
    dummy_na=True,
)
df_processed.shape

(839559, 49)

In [19]:
df_processed.columns

Index(['a', 'e', 'i', 'om', 'w', 'q', 'ad', 'per_y', 'data_arc', 'n_obs_used',
       'H', 'diameter', 'albedo', 'moid', 'n', 'per', 'ma', 'pha_N', 'pha_Y',
       'pha_nan', 'neo_N', 'neo_Y', 'neo_nan', 'condition_code_0.0',
       'condition_code_1.0', 'condition_code_2.0', 'condition_code_3.0',
       'condition_code_4.0', 'condition_code_5.0', 'condition_code_6.0',
       'condition_code_7.0', 'condition_code_8.0', 'condition_code_9.0',
       'condition_code_nan', 'class_AMO', 'class_APO', 'class_AST',
       'class_ATE', 'class_CEN', 'class_HYA', 'class_IEO', 'class_IMB',
       'class_MBA', 'class_MCA', 'class_OMB', 'class_PAA', 'class_TJN',
       'class_TNO', 'class_nan'],
      dtype='object')

Now, I'll split them into two sets. 

* **Set 1**: Diameter and albedo are both not null. These will be used for training purposes.

* **Set 2**: Diameter or albedo are null. These will be used for inferencing purposes. The model will predict their values given the other column values, and then I'll use the predicted values to impute the missing values.

In [20]:
set_1 = df_processed[df_processed.diameter.notnull() & df_processed.albedo.notnull()]
set_2 = df_processed[~df_processed.isin(set_1)]

print(f"Number of examples for training purposes: {set_1.shape[0]}")
print(f"Number of examples for inference purposes: {set_2.shape[0]}")

Number of examples for training purposes: 136406
Number of examples for inference purposes: 839559


I'll now split the first set into

1. **Training set**: Used specifically to train the model.
2. **Validation set**: Used to check model's performance on unseen data.
3. **Test set**: Used to check generalization performance.

In [21]:
import numpy as np

In [22]:
train = set_1.sample(frac=0.9, random_state=29)
valid = set_1[~set_1.isin(train)].dropna()
test = valid.sample(frac=0.5)
valid = valid[~valid.isin(test)].dropna()

train.shape, valid.shape, test.shape

((122765, 49), (6821, 49), (6820, 49))

In [23]:
train_X = train.drop(columns=["diameter", "albedo"]).values.astype(np.float64)
test_X = valid.drop(columns=["diameter", "albedo"]).values.astype(np.float64)
valid_X = test.drop(columns=["diameter", "albedo"]).values.astype(np.float64)

print(f"Training set X shape: {train_X.shape}")
print(f"Validation set X shape: {valid_X.shape}")
print(f"Test set X shape: {test_X.shape}")

Training set X shape: (122765, 47)
Validation set X shape: (6820, 47)
Test set X shape: (6821, 47)


In [24]:
train_Y = train[["diameter", "albedo"]].values.astype(np.float64)
valid_Y = valid[["diameter", "albedo"]].values.astype(np.float64)
test_Y = test[["diameter", "albedo"]].values.astype(np.float64)

print(f"Training set Y shape: {train_Y.shape}")
print(f"Validation set Y shape: {valid_Y.shape}")
print(f"Test set Y shape: {test_Y.shape}")

Training set Y shape: (122765, 2)
Validation set Y shape: (6821, 2)
Test set Y shape: (6820, 2)


In [25]:
del train, valid, test

Creating a Data Loader for batch processing

In [26]:
from torch.utils.data import DataLoader

In [27]:
TRAIN_BATCH_SIZE = 32
VALIDATION_BATCH_SIZE = 64
TEST_BATCH_SIZE = 64

In [28]:
train_X = torch.tensor(train_X)
train_Y = torch.tensor(train_Y)

valid_X = torch.tensor(valid_X)
valid_Y = torch.tensor(valid_Y)

test_X = torch.tensor(test_X)
test_Y = torch.tensor(test_Y)

In [29]:
train_loader = DataLoader(
    list(zip(train_X, train_Y)),
    shuffle=True,
    batch_size=TRAIN_BATCH_SIZE,
)

valid_loader = DataLoader(
    list(zip(valid_X, valid_Y)),
    shuffle=True,
    batch_size=VALIDATION_BATCH_SIZE,
)

test_loader = DataLoader(
    list(zip(test_X, test_Y)),
    shuffle=True,
    batch_size=TEST_BATCH_SIZE,
)

In [31]:
for X, y in train_loader:
    print(X.shape, y.shape)
    break

torch.Size([32, 47]) torch.Size([32, 2])


## Training Script