In [1]:
import os

os.chdir("../")

# Data Cleaning

This notebook deals with everything associated with data cleaning. Which includes filling in missing values, handling noise, tackling inconsistancies, etc.

## Loading Dataset

In [2]:
import pandas as pd
import plotly.express as px

In [3]:
df = pd.read_csv("data/Asteroid_Data.csv", low_memory=False)
print(f"Number of (rows, columns) = {df.shape}")

Number of (rows, columns) = (1340607, 43)


In [4]:
df.sample(3)

Unnamed: 0,full_name,a,e,i,om,w,q,ad,per_y,data_arc,...,moid,moid_ld,sigma_e,sigma_a,sigma_q,sigma_i,sigma_per,class,first_obs,last_obs
378060,378061 (2006 TZ91),2.303,0.1618,2.57,248.39,138.95,1.931,2.68,3.5,6175.0,...,0.935,364.0,4.3e-08,7.9e-09,1e-07,5e-06,7e-06,MBA,2006-10-13,2023-09-09
846878,(2013 GK125),2.661,0.2665,8.25,142.64,67.35,1.952,3.37,4.34,1436.0,...,0.964,375.0,4.1e-06,2.6e-06,1.3e-05,5.3e-05,0.0023,MBA,2013-04-11,2017-03-17
5466,5467 (1988 AG),2.807,0.1696,8.76,339.01,141.03,2.331,3.28,4.7,18165.0,...,1.35,526.0,3e-09,7.6e-09,4e-09,7e-08,7e-06,MBA,1974-02-16,2023-11-11


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340607 entries, 0 to 1340606
Data columns (total 43 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   full_name       1340607 non-null  object 
 1   a               1340607 non-null  float64
 2   e               1340607 non-null  float64
 3   i               1340607 non-null  float64
 4   om              1340607 non-null  float64
 5   w               1340607 non-null  float64
 6   q               1340607 non-null  float64
 7   ad              1340602 non-null  float64
 8   per_y           1340602 non-null  float64
 9   data_arc        1340063 non-null  float64
 10  condition_code  1340584 non-null  object 
 11  n_obs_used      1340607 non-null  int64  
 12  n_del_obs_used  1034 non-null     float64
 13  n_dop_obs_used  1034 non-null     float64
 14  H               1339457 non-null  float64
 15  epoch_mjd       1340607 non-null  int64  
 16  ma              1340606 non-null  fl

## Missing Values

This section deals with handling missing values.

### Identify Missing Columns

In this subsection, I'll identify which columns have missing values. What percentage of the values are missing. I visualize the missing statistics in a bar plot. I, then, chart a course on how to handle the different levels of missing values.

In [6]:
missing = pd.DataFrame(
    df.apply(lambda x: x.isna(), axis=1).sum().sort_values(ascending=True)
).reset_index()

missing.rename(columns={0: "Missing", "index": "Column"}, inplace=True)
missing["Percent"] = missing["Missing"] / df.shape[0] * 100

In [7]:
fig = px.bar(missing[missing.Missing > 0], x="Column", y="Percent", text="Missing")
fig.update_layout(
    height=600,
    width=800,
    title_x=0.5,
    title_text=f"Bar Chart<br><sup>Missing Values of each column</sup>",
)
fig.show()

**Observation 1**

Nearly all values in `rot_per` to `IR` are missing. Predicting them from the existing ones will be hard as there isn't enough data. 

    The best way to deal with these columns is to drop them. If I learn of a better way to handle these missing values, I'll come and deal with them later on.

In [8]:
missing[missing.Percent > 90]

Unnamed: 0,Column,Missing,Percent
32,rot_per,1306504,97.456152
33,spec_B,1338941,99.875728
34,n_del_obs_used,1339573,99.922871
35,n_dop_obs_used,1339573,99.922871
36,BV,1339586,99.92384
37,spec_T,1339627,99.926899
38,UB,1339628,99.926973
39,G,1340488,99.991123
40,extent,1340587,99.998508
41,GM,1340592,99.998881


**Observation 2**

A big chunk of `diameter` and `albedo` values are missing. 

    Predicting them with a Machine Learning model should be possible from the 20\% data that is available. I'll use a simple deep learning model to do this.

In [9]:
missing[missing.Column.isin(["diameter", "albedo"])]

Unnamed: 0,Column,Missing,Percent
29,diameter,1200983,89.585016
31,albedo,1202111,89.669157


**Observation 3**

Some columns have absolutely no missing values. 

    Nothing needs to be done for these columns. I'll use these to help me in imputing other missing values.

In [10]:
missing[missing.Missing == 0]

Unnamed: 0,Column,Missing,Percent
0,full_name,0,0.0
1,class,0,0.0
2,n,0,0.0
3,first_obs,0,0.0
4,epoch_mjd,0,0.0
5,n_obs_used,0,0.0
6,last_obs,0,0.0
7,a,0,0.0
8,e,0,0.0
9,w,0,0.0


**Observation 4**

Most columns have $<5\%$ data is missing. 

    These can be filled in using imputation techniques. For numerical columns, I'll use imputation by group median. For categorical, I'll impute by group mode.

In [11]:
missing[(missing.Percent < 5) & (missing.Missing > 0)]

Unnamed: 0,Column,Missing,Percent
13,ma,1,7.5e-05
14,per,5,0.000373
15,neo,5,0.000373
16,ad,5,0.000373
17,per_y,5,0.000373
18,condition_code,23,0.001716
19,data_arc,544,0.040579
20,H,1150,0.085782
21,moid_ld,2112,0.157541
22,moid,2112,0.157541


### Dropping Columns

In this subsection, I drop the columns that have more than 90\% of their values missing.

In [12]:
df.drop(
    columns=[
        "full_name",
        "rot_per",
        "spec_B",
        "spec_T",
        "G",
        "BV",
        "UB",
        "IR",
        "GM",
        "extent",
        "n_del_obs_used",
        "n_dop_obs_used",
        "sigma_i",
        "sigma_q",
        "sigma_a",
        "sigma_e",
        "sigma_per",
        "diameter_sigma",
    ],
    inplace=True,
)

print(f"After dropping, dataframe shape = {df.shape}")

After dropping, dataframe shape = (1340607, 25)


### Imputation

#### Imputation by Group

In this subsection, I'll impute missing values for columns with less than 5\% of their data missing.

##### `neo` column

In [13]:
df[df.neo.isna()]

Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,condition_code,...,albedo,neo,pha,n,per,moid,moid_ld,class,first_obs,last_obs
1084193,-57.17,1.0219,145.42,166.04,77.96,1.25,,,17.0,,...,,,N,0.00228,,0.601,234.0,HYA,2016-12-11,2016-12-28
1125900,-1.272,1.2011,122.74,24.6,241.81,0.256,,,80.0,,...,,,N,0.6867,,0.0958,37.3,HYA,2017-10-14,2018-01-02
1220367,-50670.0,1.0011,72.83,287.13,29.58,53.433,,,56.0,,...,,,N,8.641e-08,,52.5,20400.0,HYA,2020-06-20,2020-08-15
1285026,-2290.0,1.0013,137.14,228.8,193.28,2.995,,,389.0,,...,,,N,8.991e-06,,2.01,784.0,HYA,2021-10-30,2022-11-23
1328329,-15820.0,1.0001,12.11,347.39,86.85,1.358,,,25.0,,...,,,N,4.955e-07,,0.445,173.0,HYA,2023-09-15,2023-10-10


I can use `pha` column to group and impute. Other categorical columns, 

* `condition_code` is null for all these rows.
* `class` can't be used. Because all instances of the groups have null `neo` values.

In [14]:
df.groupby("pha").neo.apply(lambda x: x.mode().iloc[0])

pha
N    N
Y    Y
Name: neo, dtype: object

Rows with `pha` of **N** has a `neo` mode of **N**. So, I'll impute the missing `neo` values with this.

In [15]:
df.neo.fillna("N", inplace=True)

To confirm, I should have zero missing values now.

In [16]:
df.neo.isna().sum()

0

##### `condition_code` column

In [17]:
df["condition_code"] = (
    df.groupby(["class", "neo"])
    .transform(lambda x: x.fillna(x.mode().iloc[0]))
    .condition_code
)

In [18]:
df[df.condition_code.isna()]

Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,condition_code,...,albedo,neo,pha,n,per,moid,moid_ld,class,first_obs,last_obs
1084193,-57.17,1.0219,145.42,166.04,77.96,1.25,,,17.0,,...,,N,N,0.00228,,0.601,234.0,HYA,2016-12-11,2016-12-28
1125900,-1.272,1.2011,122.74,24.6,241.81,0.256,,,80.0,,...,,N,N,0.6867,,0.0958,37.3,HYA,2017-10-14,2018-01-02
1220367,-50670.0,1.0011,72.83,287.13,29.58,53.433,,,56.0,,...,,N,N,8.641e-08,,52.5,20400.0,HYA,2020-06-20,2020-08-15
1285026,-2290.0,1.0013,137.14,228.8,193.28,2.995,,,389.0,,...,,N,N,8.991e-06,,2.01,784.0,HYA,2021-10-30,2022-11-23
1328329,-15820.0,1.0001,12.11,347.39,86.85,1.358,,,25.0,,...,,N,N,4.955e-07,,0.445,173.0,HYA,2023-09-15,2023-10-10


In [19]:
df.groupby("neo").apply(lambda x: x.mode().iloc[0]).condition_code

neo
N    0
Y    7
Name: condition_code, dtype: object

Rows with `neo` value **N** has mostly have a `condition_code` value of 0.

In [20]:
df.condition_code.fillna("0", inplace=True)

In [21]:
df.condition_code.isna().sum()

0

##### `pha` column

In [22]:
df["pha"] = (
    df.groupby(["class", "neo"]).transform(lambda x: x.fillna(x.mode().iloc[0])).pha
)

df["pha"].isnull().sum()

0

##### Numerical Columns

In [23]:
impute_columns = ["ma", "per", "ad", "per_y", "data_arc", "H", "moid", "moid_ld"]

df[impute_columns] = (
    df.groupby(by=["neo", "condition_code"])[impute_columns]
    .apply(lambda x: x.fillna(x.median()))
    .reset_index()[impute_columns]
)

df[impute_columns].isnull().sum()

ma          0
per         0
ad          0
per_y       0
data_arc    0
H           1
moid        1
moid_ld     1
dtype: int64

In [24]:
df[df["H"].isnull()].dropna(how="all")

Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,condition_code,...,albedo,neo,pha,n,per,moid,moid_ld,class,first_obs,last_obs
1306656,3.171,0.1008,8.52,342.48,8.14,2.851,3.53,4.62,2.0,7,...,,N,N,0.1745,1690.0,,,MBA,2022-09-16,2022-09-28


In [25]:
df.groupby("class")[impute_columns].apply(lambda x: x.median()).loc[
    "MBA", ["H", "moid", "moid_ld"]
]

H           17.40
moid         1.25
moid_ld    488.00
Name: MBA, dtype: float64

In [26]:
df.H.fillna(17.40, inplace=True)
df.moid.fillna(1.25, inplace=True)
df.moid_ld.fillna(488.00, inplace=True)

In [27]:
df[impute_columns].isnull().sum()

ma          0
per         0
ad          0
per_y       0
data_arc    0
H           0
moid        0
moid_ld     0
dtype: int64

#### Impute using MLP

##### `albedo` column

In [28]:
import torch
torch.manual_seed(29)

from src.deep_learning import mlp, train_script, create_dataloader

In [29]:
categorical_columns = ["pha", "neo", "condition_code", "class"]
numerical_columns = df.columns.drop(categorical_columns).drop(["first_obs", "last_obs", "diameter"])
target_columns = ["albedo"]
exclude_columns = ["first_obs", "last_obs", "diameter"]

In [30]:
train_loader, valid_loader, inf_loader = create_dataloader.create_dataloader(
    df,
    numerical_columns,
    categorical_columns,
    target_columns,
    exclude_columns,
    2048,
    False,
)

Number of examples for training purposes: 138496
Number of examples for inference purposes: 1202111
Training X shape: torch.Size([128256, 45])
Training y shape: torch.Size([128256, 1])
Validation X shape: torch.Size([10240, 45])
Validation y shape: torch.Size([10240, 1])
Inference X shape: torch.Size([1202111, 45])
Inference y shape: torch.Size([1202111, 1])


In [107]:
import importlib
importlib.reload(mlp)
importlib.reload(train_script)

<module 'src.deep_learning.train_script' from '/home/ishrak/Documents/Asteroid-Mining-Analysis/src/deep_learning/train_script.py'>

In [108]:
model = mlp.MLP_Albedo(
    n=3,
    num_output_list=[256, 128, 64],
    dropout_list=[0.2, 0.15, 0.1],
    device=mlp.device,
)

model = train_script.train_epoch(
    model,
    device=mlp.device,
    num_epochs=10000,
    learning_rate=1e-2,
    gamma=0.999,
    patience=50,
    root_save_dir="model_dir/model_albedo",
    model_name="resnet",
    train_loader=train_loader,
    valid_loader=valid_loader,
)


Lazy modules are a new feature under heavy development so changes to the API or functionality can happen at any moment.



The training process has done!


In [109]:
model.load_state_dict(torch.load("model_dir/model_albedo/resnet"))

<All keys matched successfully>

In [110]:
predictions = []

for X, _ in inf_loader:
    X = X.to(mlp.device)
    predictions += model(X).cpu().tolist()
    
len(predictions)

1202111

In [111]:
df[target_columns].describe()

Unnamed: 0,albedo
count,138496.0
mean,0.130099
std,0.110358
min,0.001
25%,0.053
50%,0.078
75%,0.189
max,1.0


In [118]:
df.loc[(df["albedo"].isnull()), "albedo"] = predictions
df["albedo"].describe()

Unnamed: 0,a,e,i,om,w,q,ad,per_y,data_arc,n_obs_used,H,epoch_mjd,ma,diameter,albedo,n,per,moid,moid_ld
count,1340607.0,1340607.0,1340607.0,1340607.0,1340607.0,1340607.0,1340607.0,1340607.0,1340607.0,1340607.0,1340607.0,1340607.0,1340607.0,139624.0,1340607.0,1340607.0,1340607.0,1340607.0,1340607.0
mean,2.852933,0.1582127,9.277613,169.1431,181.8311,2.395253,3.413375,8.491103,6188.569,308.4926,17.4333,60124.31,177.5757,5.458924,0.1409907,0.2354267,3102.006,1.411861,549.4518
std,48.65619,0.09366901,6.640524,102.8926,103.7753,2.091184,32.0209,1750.673,3947.785,611.8063,1.82014,551.7375,105.8745,9.308008,0.0517633,0.08185746,639765.2,2.087489,812.3503
min,-50670.0,0.0,0.01,0.0,0.0,0.07,0.65,0.314,1.0,3.0,-1.22,25051.0,-0.01,0.0025,-0.05229052,8.641e-08,115.0,3.93e-07,0.000153
25%,2.397,0.0931,4.37,81.23,91.88,1.972,2.8,3.71,4013.0,44.0,16.57,60200.0,84.875,2.763,0.1144609,0.1878,1360.0,0.98,381.0
50%,2.662,0.1472,7.82,160.34,183.62,2.238,3.08,4.34,6260.0,105.0,17.44,60200.0,175.5,3.949,0.1370179,0.2269,1590.0,1.25,487.0
75%,3.021,0.2035,12.65,254.17,271.7,2.588,3.39,5.25,8118.0,286.0,18.29,60200.0,269.6,5.731,0.159909,0.2657,1920.0,1.6,623.0
max,14510.0,1.2011,178.46,360.0,360.0,80.538,29020.06,1750000.0,79979.0,17783.0,33.58,60296.0,367.88,939.4,1.0,3.142,639000000.0,79.6,31000.0


In [123]:
df.drop(index=df[df.albedo < 0].index, inplace=True)

##### `diameter` column

In [124]:
categorical_columns = ["pha", "neo", "condition_code", "class"]
numerical_columns = df.columns.drop(categorical_columns).drop(["first_obs", "last_obs"])
target_columns = ["diameter"]
exclude_columns = ["first_obs", "last_obs"]

In [125]:
train_loader, valid_loader, inf_loader = create_dataloader.create_dataloader(
    df,
    numerical_columns,
    categorical_columns,
    target_columns,
    exclude_columns,
    2048,
    False,
)

Number of examples for training purposes: 139624
Number of examples for inference purposes: 1200975
Training X shape: torch.Size([129384, 46])
Training y shape: torch.Size([129384, 1])
Validation X shape: torch.Size([10240, 46])
Validation y shape: torch.Size([10240, 1])
Inference X shape: torch.Size([1200975, 46])
Inference y shape: torch.Size([1200975, 1])


In [141]:
import importlib
importlib.reload(mlp)
importlib.reload(train_script)

<module 'src.deep_learning.train_script' from '/home/ishrak/Documents/Asteroid-Mining-Analysis/src/deep_learning/train_script.py'>

In [151]:
model = mlp.MLP_Diameter(
    n=3,
    num_output_list=[256, 128, 64],
    dropout_list=[0.2, 0.15, 0.1],
    device=mlp.device,
)

model = train_script.train_epoch(
    model,
    device=mlp.device,
    num_epochs=1000,
    learning_rate=1e-4,
    gamma=0.99,
    patience=50,
    root_save_dir="model_dir/model_diameter",
    model_name="resnet",
    train_loader=train_loader,
    valid_loader=valid_loader,
)


Lazy modules are a new feature under heavy development so changes to the API or functionality can happen at any moment.



The training process has done!


In [152]:
model.load_state_dict(torch.load("model_dir/model_diameter/resnet"))

<All keys matched successfully>

In [153]:
predictions = []

for X, _ in inf_loader:
    X = X.to(mlp.device)
    predictions += model(X).cpu().tolist()
    
len(predictions)

1200975

In [154]:
df[target_columns].describe()

Unnamed: 0,diameter
count,139624.0
mean,5.458924
std,9.308008
min,0.0025
25%,2.763
50%,3.949
75%,5.731
max,939.4


In [157]:
df.loc[(df["diameter"].isnull()), "diameter"] = predictions
df["diameter"].describe()

count    1.340599e+06
mean     2.968320e+00
std      3.139675e+00
min      2.500000e-03
25%      2.616776e+00
50%      2.807216e+00
75%      2.922971e+00
max      9.394000e+02
Name: diameter, dtype: float64

In [158]:
df.isna().sum()

a                 0
e                 0
i                 0
om                0
w                 0
q                 0
ad                0
per_y             0
data_arc          0
condition_code    0
n_obs_used        0
H                 0
epoch_mjd         0
ma                0
diameter          0
albedo            0
neo               0
pha               0
n                 0
per               0
moid              0
moid_ld           0
class             0
first_obs         0
last_obs          0
dtype: int64

In [159]:
df.to_csv("data/Asteroid_Imputed.csv")

## Resolving Inconsistences