In [155]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

In [156]:
# read data from csv 
data = pd.read_csv('datasets/combined_data.csv')
df = pd.DataFrame(data)
df.sample(5) # random sample of 5 rows as df.head() shows first 5 rows


Unnamed: 0,RIDAGEYR,INDFMPIR,BMXBMI,BMXHT,Pulse,LBXWBCSI,LBXPLTSI,LBXHGB,LBXMCVSI,LBXSCR,...,RIDRETH3_7.0,Cardiovascular_target,Waist_Label,Triglycerides_Label,HDL_Label,BP_Label,Glucose_Label,ACR_Log,ALT_Log,data_type
39023,-0.023583,0.390057,1.073744,0.681405,2.500771,-0.601688,-0.070546,-0.551114,0.159845,0.44648,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.279768,2.772589,training
40622,-0.023583,1.575171,1.085977,0.054677,0.014118,0.276924,1.11236,0.31627,0.379046,-0.678388,...,0.0,0.0,1.0,,1.0,1.0,,0.609766,4.317488,training
19017,0.552643,-0.735004,1.293935,1.02908,-0.742689,-0.122445,-0.464849,0.116105,0.770476,0.06215,...,0.0,1.0,1.0,,1.0,0.0,,3.044522,,training
27833,1.746253,-0.784026,0.278613,0.35203,0.749303,-0.628313,0.017076,-1.15161,0.441674,0.985479,...,0.0,1.0,1.0,,0.0,,,2.467031,2.397895,training
1920,-0.105901,0.815326,1.416263,0.836944,2.095338,-0.441941,-0.362622,0.916767,-0.059356,0.235567,...,0.0,0.0,1.0,,1.0,1.0,,1.684896,3.663562,training


In [157]:
df.shape

(50946, 36)

In [158]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50946 entries, 0 to 50945
Data columns (total 36 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   RIDAGEYR                 50946 non-null  float64
 1   INDFMPIR                 50946 non-null  float64
 2   BMXBMI                   50946 non-null  float64
 3   BMXHT                    50946 non-null  float64
 4   Pulse                    50946 non-null  float64
 5   LBXWBCSI                 50946 non-null  float64
 6   LBXPLTSI                 50946 non-null  float64
 7   LBXHGB                   50946 non-null  float64
 8   LBXMCVSI                 50946 non-null  float64
 9   LBXSCR                   50946 non-null  float64
 10  LBXSASSI                 50946 non-null  float64
 11  LBXSTB                   50946 non-null  float64
 12  LBXSGTSI                 50946 non-null  float64
 13  LBXSUA                   50946 non-null  float64
 14  LBXSNASI              

In [159]:
df.columns

Index(['RIDAGEYR', 'INDFMPIR', 'BMXBMI', 'BMXHT', 'Pulse', 'LBXWBCSI',
       'LBXPLTSI', 'LBXHGB', 'LBXMCVSI', 'LBXSCR', 'LBXSASSI', 'LBXSTB',
       'LBXSGTSI', 'LBXSUA', 'LBXSNASI', 'LBXSKSI', 'LBXTC',
       'Alcohol_Drinks_Per_Week', 'SMQ040', 'RIAGENDR_1.0', 'RIAGENDR_2.0',
       'RIDRETH3_1.0', 'RIDRETH3_2.0', 'RIDRETH3_3.0', 'RIDRETH3_4.0',
       'RIDRETH3_6.0', 'RIDRETH3_7.0', 'Cardiovascular_target', 'Waist_Label',
       'Triglycerides_Label', 'HDL_Label', 'BP_Label', 'Glucose_Label',
       'ACR_Log', 'ALT_Log', 'data_type'],
      dtype='object')

In [160]:
df.describe() # .describe() is used to get the summary statistics of the data

Unnamed: 0,RIDAGEYR,INDFMPIR,BMXBMI,BMXHT,Pulse,LBXWBCSI,LBXPLTSI,LBXHGB,LBXMCVSI,LBXSCR,...,RIDRETH3_6.0,RIDRETH3_7.0,Cardiovascular_target,Waist_Label,Triglycerides_Label,HDL_Label,BP_Label,Glucose_Label,ACR_Log,ALT_Log
count,50946.0,50946.0,50946.0,50946.0,50946.0,50946.0,50946.0,50946.0,50946.0,50946.0,...,50946.0,50946.0,34097.0,45339.0,16870.0,39336.0,38904.0,16355.0,44626.0,34503.0
mean,0.000861,0.007494,0.021404,0.020932,0.128421,-0.006657,0.031322,-0.036764,-0.027022,-0.142193,...,0.102108,0.059298,0.119629,0.417786,0.186426,0.283455,0.270409,0.541975,2.475383,2.947035
std,1.000818,0.959924,0.973572,0.976718,0.955075,0.878313,0.961032,0.949224,0.956963,0.936877,...,0.302793,0.236184,0.324532,0.4932,0.389461,0.450681,0.444177,0.49825,1.039504,0.522609
min,-1.464147,-1.488721,-1.886592,-3.605053,-3.175283,-1.560175,-3.633871,-5.555251,-8.075846,-1.451734,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078545,0.693147
25%,-0.970239,-0.808537,-0.687778,-0.19693,-0.553487,-0.378042,-0.610887,-0.657869,-0.591701,-0.608083,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.82735,2.564949
50%,-0.064742,-0.193921,-0.03944,0.257334,0.068176,-0.074521,-0.055943,-0.084061,0.050244,-0.256562,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.214616,2.890372
75%,0.923073,0.784687,0.561802,0.635659,0.753357,0.276924,0.5691,0.583158,0.582589,0.141828,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,2.813951,3.218876
max,1.746253,1.575171,8.046437,2.085827,8.663344,104.565548,11.159767,4.119415,4.653463,38.762289,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9.959545,6.52503


In [161]:
df.nunique() # .nunique() is used to get the number of unique values in a column

RIDAGEYR                      79
INDFMPIR                    2613
BMXBMI                      1846
BMXHT                       2791
Pulse                       2261
LBXWBCSI                     926
LBXPLTSI                    2083
LBXHGB                      1211
LBXMCVSI                    3218
LBXSCR                      1457
LBXSASSI                     478
LBXSTB                       333
LBXSGTSI                     918
LBXSUA                       929
LBXSNASI                     153
LBXSKSI                     2619
LBXTC                       1336
Alcohol_Drinks_Per_Week     3577
SMQ040                        15
RIAGENDR_1.0                   2
RIAGENDR_2.0                   2
RIDRETH3_1.0                   2
RIDRETH3_2.0                   2
RIDRETH3_3.0                   2
RIDRETH3_4.0                   2
RIDRETH3_6.0                   2
RIDRETH3_7.0                   2
Cardiovascular_target          2
Waist_Label                    2
Triglycerides_Label            2
HDL_Label 

In [162]:

#df[target_column].value_counts(normalize=True)

### Data Cleaning

In [163]:
df.isnull().sum()

RIDAGEYR                       0
INDFMPIR                       0
BMXBMI                         0
BMXHT                          0
Pulse                          0
LBXWBCSI                       0
LBXPLTSI                       0
LBXHGB                         0
LBXMCVSI                       0
LBXSCR                         0
LBXSASSI                       0
LBXSTB                         0
LBXSGTSI                       0
LBXSUA                         0
LBXSNASI                       0
LBXSKSI                        0
LBXTC                          0
Alcohol_Drinks_Per_Week        0
SMQ040                         0
RIAGENDR_1.0                   0
RIAGENDR_2.0                   0
RIDRETH3_1.0                   0
RIDRETH3_2.0                   0
RIDRETH3_3.0                   0
RIDRETH3_4.0                   0
RIDRETH3_6.0                   0
RIDRETH3_7.0                   0
Cardiovascular_target      16849
Waist_Label                 5607
Triglycerides_Label        34076
HDL_Label 

In [164]:
# Check for missing values per column
missing_percent = df.isnull().sum() / len(df) * 100
missing_df = pd.DataFrame({
    'Missing Count': df.isnull().sum(),
    'Missing Percent': missing_percent
}).sort_values(by='Missing Count', ascending=False)
missing_df = missing_df[missing_df['Missing Count'] > 0]
print(missing_df)

                       Missing Count  Missing Percent
Glucose_Label                  34591        67.897382
Triglycerides_Label            34076        66.886507
Cardiovascular_target          16849        33.072273
ALT_Log                        16443        32.275350
BP_Label                       12042        23.636792
HDL_Label                      11610        22.788835
ACR_Log                         6320        12.405292
Waist_Label                     5607        11.005771


In [165]:
df.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
50941    False
50942    False
50943    False
50944    False
50945    False
Length: 50946, dtype: bool

In [166]:
## add MASK for missing values
# Define feature columns, target column, and data type column
feature_cols = df.columns[:35].tolist()
target_col = 'ACR_Log'
data_type_col = 'data_type'

In [167]:
# Combine features and target for cleaning
data = df[feature_cols + [target_col, data_type_col]].copy()
data.dropna(subset=[target_col], inplace=True)
print ("After dropping rows with missing target values:", data.shape)

After dropping rows with missing target values: (44626, 37)


In [168]:
# 1a. Separate features and data_type
X_data = data[feature_cols]
data_types = data[data_type_col]
y_data = data[target_col].values # Reshape target to be 2D array with 1 column

In [169]:
# 1b. Create the Mask Matrix (True where NOT NaN, False where IS NaN)
mask_matrix_bool = ~X_data.isna()
mask_matrix = mask_matrix_bool.astype(np.float32).values

In [170]:
# 1c. Impute NaN values in the features with 0
X_imputed = X_data.fillna(0).values

In [171]:
print("Shape of original data:", df.shape)
print("Shape after operations:")
print("X_imputed shape:", X_imputed.shape)
print("y_data shape:", y_data.shape)
print("mask_matrix shape:", mask_matrix.shape)
print("data_types shape:", data_types.shape if hasattr(data_types, 'shape') else len(data_types))
print("Unique values in data_types:", data_types.unique() if hasattr(data_types, 'unique') else set(data_types))

Shape of original data: (50946, 36)
Shape after operations:
X_imputed shape: (44626, 36)
y_data shape: (44626, 2)
mask_matrix shape: (44626, 36)
data_types shape: (44626,)
Unique values in data_types: ['training' 'testing']


In [172]:
X_train = X_imputed[data_types == 'training']
y_train = y_data[data_types == 'training']
Mask_train = mask_matrix[data_types == 'training']

X_test = X_imputed[data_types == 'testing']
y_test = y_data[data_types == 'testing']
Mask_test = mask_matrix[data_types == 'testing']

In [173]:
# Convert to PyTorch Tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
Mask_train_tensor = torch.tensor(Mask_train, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
Mask_test_tensor = torch.tensor(Mask_test, dtype=torch.float32)

In [174]:
#Update the INPUT_SIZE to the full number of features
INPUT_SIZE = X_train_tensor.shape[1] # Now 35
OUTPUT_SIZE = y_train_tensor.shape[1] # 1
print(f"New Input Size: {INPUT_SIZE} features")

New Input Size: 36 features


### Building Model

In [175]:
class RegressionModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(RegressionModel, self).__init__()
         # Deeper architecture with Batch Normalization and Dropout
        self.fc1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.dropout1 = nn.Dropout(0.3)
        
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(0.3)
        
        self.fc3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.dropout3 = nn.Dropout(0.2)
        
        self.fc4 = nn.Linear(32, output_size)

    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        
        x = torch.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        
        x = torch.relu(self.bn3(self.fc3(x)))
        x = self.dropout3(x)
        
        return self.fc4(x)

In [176]:
model = RegressionModel(INPUT_SIZE, OUTPUT_SIZE) # input size , output size

In [177]:
#Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001) 
num_epochs = 100
print("\nStarting Training with Masking...")
# Training loop with masking
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    masked_Inputs = X_train_tensor * Mask_train_tensor
    # Forward pass with masked inputs and targets 
    y_predicted = model(masked_Inputs)
    loss = criterion(y_predicted, y_train_tensor)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        # Placeholder output since the training execution failed in the environment
        # In a real environment, this would show the decreasing loss
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print("Training finished.")


Starting Training with Masking...
Epoch [10/100], Loss: 7.8892
Epoch [20/100], Loss: 6.7644
Epoch [30/100], Loss: 5.9412
Epoch [40/100], Loss: 5.2215
Epoch [50/100], Loss: 4.6146
Epoch [60/100], Loss: 4.0322
Epoch [70/100], Loss: 3.5006
Epoch [80/100], Loss: 3.0648
Epoch [90/100], Loss: 2.6417
Epoch [100/100], Loss: 2.2510
Training finished.


In [178]:
model.eval() 
with torch.no_grad():
    # ðŸŒŸ APPLICATION OF THE MASK TO TEST DATA ðŸŒŸ
    masked_test_input = X_test_tensor * Mask_test_tensor
    
    y_test_pred = model(masked_test_input)
    
    test_loss = criterion(y_test_pred, y_test_tensor)
    
    # Placeholder output for test error
    print(f'\nTest Set Mean Squared Error (MSE): {test_loss.item():.4f}')
    print(f'Test Set Root Mean Squared Error (RMSE): {torch.sqrt(test_loss).item():.4f}')


Test Set Mean Squared Error (MSE): 2.0909
Test Set Root Mean Squared Error (RMSE): 1.4460


In [179]:
# Plotting
plt.figure(figsize=(10, 5))
plt.plot(training_loss, label='Training Loss')
plt.title('Training Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('MSE Loss')
plt.legend()
plt.savefig('training_loss_curve.png')
print("Loss curve saved as 'training_loss_curve.png'")

NameError: name 'training_loss' is not defined

<Figure size 1000x500 with 0 Axes>