Note: All debugging, output and plotting statements have been commented out for the ease of the reader. Please uncomment whatever output/plot is to be analyzed.


Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from matplotlib_venn import venn2
import ast
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow
from tensorflow import keras
from PIL import Image
import io
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.utils import pad_sequences
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, LSTM, Dense, Concatenate, Flatten, Conv2D, MaxPooling2D, Dropout
)
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
import torch
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')

Feature Data Loading

In [2]:
# Training split: meal images, CGM traces and demographic/Viome records.
image_train, cgm_train, demo_train = (
    pd.read_csv(csv_path)
    for csv_path in ('img_train.csv', 'cgm_train.csv', 'demo_viome_train.csv')
)

# Test split: the same three tables.
image_test, cgm_test, demo_test = (
    pd.read_csv(csv_path)
    for csv_path in ('img_test.csv', 'cgm_test.csv', 'demo_viome_test.csv')
)

Data Preprocessing - Demographic Data (Part a)





In [3]:
# Integer codes for the race categories present in the demographic tables.
race_mapping = {"Hispanic/Latino": 0, "White": 1, "African American": 2}

# Encode both splits with the same mapping; values without an entry in the
# mapping become NaN.
# NOTE(review): this is not idempotent - re-running the cell maps the already
# encoded integers to NaN, so re-load the CSVs before re-running.
for demo_frame in (demo_train, demo_test):
    demo_frame["Race"] = demo_frame["Race"].map(race_mapping)

def calculate_average(viome_string):
    """Return the arithmetic mean of a comma-separated string of numbers.

    Args:
        viome_string: text such as ``"0.1,0.5,0.9"``. Empty tokens (for
            example from a trailing comma) are ignored. A value that is not a
            string (e.g. the float NaN pandas uses for a missing cell, or an
            already numeric cell) is passed through ``float()``.

    Returns:
        The mean as a float, or NaN when there is nothing to average.

    Raises:
        ValueError: if a non-empty token is not a valid number.
    """
    if not isinstance(viome_string, str):
        # Missing or already-numeric cells: keep the value (NaN stays NaN)
        # instead of failing on the missing .split attribute.
        try:
            return float(viome_string)
        except (TypeError, ValueError):
            return float("nan")

    numbers = [float(token) for token in viome_string.split(',') if token.strip()]
    if not numbers:
        # Avoid ZeroDivisionError on empty input.
        return float("nan")
    return sum(numbers) / len(numbers)

# Collapse each training subject's comma-separated Viome string to its mean.
# NOTE(review): only demo_train is converted here; demo_test['Viome'] is left as loaded.
demo_train['Viome'] = demo_train['Viome'].map(calculate_average)

Data Preprocessing - Images (Part a)

In [4]:
def preprocess_image_data(df, column_name, target_size=(64, 64, 3)):
    """Decode a column of stringified pixel arrays into one float image batch.

    Every cell of ``df[column_name]`` is parsed as a Python literal, turned
    into an RGB image, resized to ``target_size`` and scaled by 1/255. Cells
    that are empty or cannot be decoded become all-zero images, so the output
    always has one image per row of ``df``.

    Args:
        df: DataFrame that holds the image column.
        column_name: name of the column containing the stringified images.
        target_size: ``(height, width, channels)`` of each returned image.

    Returns:
        numpy array of shape ``(len(df), *target_size)``.
    """
    height, width = target_size[:2]

    def process_single_image(img_str):
        try:
            # literal_eval only accepts Python literals, so a crafted CSV cell
            # cannot execute arbitrary code the way eval() would.
            img_array = np.array(ast.literal_eval(img_str))

            if img_array.size == 0:  # empty
                return np.zeros(target_size)

            if img_array.ndim == 3 and img_array.shape[2] == 3:
                pil_img = Image.fromarray(img_array.astype('uint8'), 'RGB')
            elif img_array.ndim == 2:
                pil_img = Image.fromarray(img_array.astype('uint8'), 'L')
            else:
                pil_img = Image.fromarray(img_array.reshape(target_size[:2]).astype('uint8'), 'L')

            # PIL's resize expects (width, height), whereas target_size is
            # (height, width, channels); the order only matters for
            # non-square targets.
            pil_img = pil_img.resize((width, height))

            if pil_img.mode != 'RGB':
                pil_img = pil_img.convert('RGB')

            return np.array(pil_img).astype(float) / 255.0
        except Exception:
            # Deliberate best-effort: an undecodable cell becomes a blank
            # image so rows stay aligned with the labels.
            return np.zeros(target_size)

    processed_images = [process_single_image(cell) for cell in df[column_name]]
    if not processed_images:
        # Keep the trailing image dimensions even when there are no rows.
        return np.zeros((0, *target_size))

    return np.array(processed_images)

# Decode both training photo columns into float image batches.
# NOTE(review): after this cell the two columns hold arrays instead of strings,
# so re-running it without re-loading img_train.csv makes every image fall back
# to the all-zero placeholder.
decoded_train_images = {
    image_column: preprocess_image_data(image_train, image_column)
    for image_column in ('Image Before Breakfast', 'Image Before Lunch')
}
X_breakfast = decoded_train_images['Image Before Breakfast']
X_lunch = decoded_train_images['Image Before Lunch']

# Store the decoded images back on the frame, one array per row.
for image_column, image_batch in decoded_train_images.items():
    image_train[image_column] = list(image_batch)
# print("X_breakfast shape:", X_breakfast.shape)
# print("X_lunch shape:", X_lunch.shape)


Loading Prediction Label (Part a)

In [5]:
# Regression target: lunch calories, one value per row of label_train.csv.
label_train = pd.read_csv('label_train.csv')
y = label_train['Lunch Calories'].to_numpy()

Preprocessing Time Series (CGM) Data (Part a)

In [6]:
# Working frame built from the loaded training CGM table; the parsed readings
# are added to it as a new column further down in this cell.
df = pd.DataFrame(cgm_train)

def preprocess_row(row):  # skipping normalization on this since we did not see a need
    """Extract the glucose readings from one row's stringified CGM trace.

    ``row['CGM Data']`` is parsed as a Python literal that should be a
    sequence of two-item pairs; the second item of each pair is kept.

    Args:
        row: mapping-like row (e.g. a pandas Series) with a 'CGM Data' entry.

    Returns:
        List with the second element of every pair, or ``[]`` when the entry
        is missing, empty, unparsable, or not a sequence of pairs.
    """
    try:
        # literal_eval only accepts Python literals, so a crafted CSV cell
        # cannot execute arbitrary code the way eval() would.
        cgm_data = ast.literal_eval(row['CGM Data'])
        return [value for _, value in cgm_data]
    except (KeyError, TypeError, ValueError, SyntaxError, MemoryError, RecursionError):
        # Best-effort: a bad row yields an empty sequence (padded later)
        # instead of aborting the whole preprocessing pass.
        return []

# Parse every row's CGM trace into a plain list of readings.
cgm_sequences = df.apply(preprocess_row, axis=1)
df['processed_cgm'] = cgm_sequences

X = list(df['processed_cgm'])

# Zero-pad at the end so all sequences share the length of the longest one.
X_cgm = pad_sequences(X, dtype='float32', padding='post')

# Keep the padded readings on the original frame, one array per row.
cgm_train['CGM Data'] = [padded_row for padded_row in X_cgm]
# print("X (padded) shape:", X_cgm.shape)
# print("First padded sequence:", X_cgm[0])

### Explanation of Demographic Data Handling

For why demographic data is not pre-processed, refer to the report.

---

#### Summary:

1. The data in `label_train.csv` has the same lunch calories for each subject on a given day.
   - For example, Subject ID 1 on Day 2 has the same lunch calories as Subject ID 2 on Day 2, and so on for different subject IDs on the same days.

2. The demographic data does not include day-specific information. This creates two possibilities:
   
   a. **Averaging lunch calories across all days in `label_train.csv`:**  
      - This would result in the same average lunch calories for all subjects, rendering the demographic data ineffective.

   b. **Duplicating demographic data for each day:**  
      - This would lead to identical demographic data being associated with varying outputs, potentially worsening the model's performance.


Defining Kaggle Loss Function (Part d)

In [7]:
def rmsre(y_true, y_pred):
    """Root mean squared relative error, computed with TensorFlow ops.

    Both inputs are cast to float32; the error of each prediction is divided
    by its true value before squaring, averaging and taking the square root.
    A true value of zero therefore produces a division by zero.
    """
    targets = tf.cast(y_true, tf.float32)
    estimates = tf.cast(y_pred, tf.float32)
    relative_error = (targets - estimates) / targets
    mean_squared = tf.reduce_mean(tf.square(relative_error))
    return tf.sqrt(mean_squared)

def rmsre_torch(predictions, targets):
    """Root mean squared relative error for PyTorch tensors.

    Both tensors are flattened first so that, for example, ``(N, 1)`` model
    outputs and ``(N,)`` targets are compared element by element rather than
    being broadcast into an ``(N, N)`` error matrix.

    Args:
        predictions: tensor of predicted values.
        targets: tensor of true values holding the same number of elements.
            A target of zero produces a division by zero.

    Returns:
        Scalar tensor ``sqrt(mean(((predictions - targets) / targets) ** 2))``.
    """
    predictions = predictions.reshape(-1)
    targets = targets.reshape(-1)
    relative_errors = (predictions - targets) / targets
    return torch.sqrt(torch.mean(relative_errors ** 2))


Creating DataLoader for all three modalities (Part b)

In [8]:
# Build one flat table per (Subject ID, Day) from all three modalities and wrap
# it in a PyTorch DataLoader.

# Join CGM rows with image rows on (Subject ID, Day), then attach each
# subject's demographic record to every one of that subject's days.
merged_data = cgm_train.merge(image_train, on=['Subject ID', 'Day'])
merged_with_demo = merged_data.merge(demo_train, on='Subject ID', how='left')
# The labels are read again from disk here rather than reusing label_train.
train_labels = pd.read_csv("label_train.csv")
final_merged = merged_with_demo.merge(train_labels, on=['Subject ID', 'Day'])
# print(final_merged)

# Downcast every int64/float64 column to float32.
numeric_columns = final_merged.select_dtypes(include=['int64', 'float64']).columns
final_merged[numeric_columns] = final_merged[numeric_columns].astype('float32')

categorical_columns = ['Breakfast Time', 'Lunch Time', 'CGM Data', 'Image Before Breakfast', 'Image Before Lunch']

# Object-typed columns from the list above are replaced by integer ids of their
# string form; fit_transform refits the encoder for each column.
# NOTE(review): earlier cells store arrays in 'CGM Data' and the two image
# columns, so those columns are encoded from the text form of an array rather
# than from their content - confirm this is intended.
label_encoder = LabelEncoder()
for col in categorical_columns:
    if final_merged[col].dtype == 'object':
        final_merged[col] = label_encoder.fit_transform(final_merged[col].astype(str))

# Remaining missing values are replaced with 0.
final_merged.fillna(0, inplace=True)

# print(final_merged.dtypes)

# Every column except the join keys and the target is used as a feature.
feature_columns = [col for col in final_merged.columns if col not in ['Subject ID', 'Day', 'Lunch Calories']]
# NOTE(review): X and y rebind names set by earlier cells; the model-training
# cell further down reads this y, so its order must match X_cgm / X_breakfast /
# X_lunch - verify the merges neither drop nor reorder rows.
X = final_merged[feature_columns].values
y = final_merged['Lunch Calories'].values

X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# DataLoader and TensorDataset are already imported in the imports cell; this
# re-import is redundant but harmless.
from torch.utils.data import DataLoader, TensorDataset
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


Encoding data together

Includes: data preparation, multimodal model implementation, and model training using PyTorch (Parts c, d and e, without hyperparameter tuning)

In [9]:
class LSTMEncoder(nn.Module):
    """Encode a sequence as the tanh of a single-layer LSTM's final hidden state."""

    def __init__(self, sequence_length, num_features):
        # sequence_length is accepted but not used when building the layer.
        super().__init__()
        self.lstm = nn.LSTM(input_size=num_features, hidden_size=64, batch_first=True)

    def forward(self, x):
        """Map a batch-first input to a (batch, 64) encoding."""
        lstm_state = self.lstm(x)[1]
        final_hidden = lstm_state[0]
        # Drop the leading layer axis, then squash once more with tanh.
        return final_hidden.squeeze(0).tanh()

class CNNEncoder(nn.Module):
    """Encode channels-last images into 128-dimensional feature vectors.

    Two Conv-ReLU-MaxPool stages are followed by a flatten, a 128-unit linear
    layer, ReLU and dropout. ``image_shape`` is ``(height, width, channels)``.
    """

    def __init__(self, image_shape):
        super().__init__()
        conv_stages = [
            nn.Conv2d(in_channels=image_shape[2], out_channels=32, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        ]
        self.cnn = nn.Sequential(*conv_stages)
        # The linear layer's input width is measured from the conv stack itself.
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(self._get_flattened_size(image_shape), 128),
            nn.ReLU(),
            nn.Dropout(0.3),
        )

    def _get_flattened_size(self, shape):
        """Return how many values the conv stack emits for one image of `shape`."""
        probe = torch.zeros(1, *shape).permute(0, 3, 1, 2)
        with torch.no_grad():
            return self.cnn(probe).numel()

    def forward(self, x):
        """Encode a (batch, height, width, channels) tensor to (batch, 128)."""
        channels_first = x.permute(0, 3, 1, 2)
        return self.fc(self.cnn(channels_first))

class FusionModel(nn.Module):
    """Late-fusion regressor over one CGM sequence and two meal images.

    The LSTM encoding (64 values) and the two CNN encodings (128 values each)
    are concatenated and passed through a small MLP that emits one value per
    sample.
    """

    def __init__(self, sequence_length, num_features, image_shape):
        super().__init__()
        self.lstm_encoder = LSTMEncoder(sequence_length, num_features)
        self.cnn_breakfast_encoder = CNNEncoder(image_shape)
        self.cnn_lunch_encoder = CNNEncoder(image_shape)

        head_layers = [
            nn.Linear(64 + 128 + 128, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, cgm, breakfast, lunch):
        """Return a (batch, 1) prediction from the three modality inputs."""
        encodings = (
            self.lstm_encoder(cgm),
            self.cnn_breakfast_encoder(breakfast),
            self.cnn_lunch_encoder(lunch),
        )
        return self.fc(torch.cat(encodings, dim=1))

# Model, optimizer and training tensors for the PyTorch fusion model.
device = torch.device("cpu") #can be changed
model = FusionModel(sequence_length=98, num_features=1, image_shape=(64, 64, 3)).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 5
batch_size = 16


def _to_float_tensor(data):
    """Return `data` as a float32 tensor on `device`; tensors are reused, not re-wrapped."""
    if torch.is_tensor(data):
        return data.to(device=device, dtype=torch.float32)
    return torch.tensor(data, dtype=torch.float32).to(device)


# The conversions below rebind X_cgm, X_breakfast, X_lunch and y. They are
# written to be safe to re-run: existing tensors are reused, and the feature
# axis is only appended while X_cgm is still 2-D.
X_cgm = _to_float_tensor(X_cgm)
if X_cgm.dim() == 2:
    X_cgm = X_cgm.unsqueeze(-1) #added dimension to make it consistent with other inputs
X_breakfast = _to_float_tensor(X_breakfast)
X_lunch = _to_float_tensor(X_lunch)
y = _to_float_tensor(y)
# print(X_cgm.shape)
# print(X_breakfast.shape)
# print(X_lunch.shape)

dataset = torch.utils.data.TensorDataset(X_cgm, X_breakfast, X_lunch, y)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Train the fusion model with the RMSRE loss for a fixed number of epochs.
for epoch in range(epochs):
    model.train()
    epoch_loss = 0.0

    for batch_cgm, batch_breakfast, batch_lunch, batch_y in data_loader:
        optimizer.zero_grad()

        # Drop only the trailing output axis: a bare .squeeze() would also
        # remove the batch axis when the last batch holds a single sample.
        outputs = model(batch_cgm, batch_breakfast, batch_lunch).squeeze(-1)
        loss = rmsre_torch(outputs, batch_y)

        loss.backward()
        optimizer.step()

        # Sum of per-batch losses; divide by len(data_loader) for the mean.
        epoch_loss += loss.item()

    #print(f"Epoch [{epoch + 1}/{epochs}], Loss: {epoch_loss / len(data_loader):.4f}") #debugger

# Model summary equivalent
# print(model)

# Switch to inference mode (disables dropout) before predicting on the test set.
model.eval()

# CGM: parse and zero-pad the test traces, then append the feature axis.
df_test = pd.DataFrame(cgm_test)
df_test['processed_cgm'] = df_test.apply(preprocess_row, axis=1)
test_sequences = df_test['processed_cgm'].tolist()
X_test_cgm = pad_sequences(test_sequences, padding='post', dtype='float32')
X_test_cgm = torch.tensor(X_test_cgm, dtype=torch.float32).unsqueeze(-1).to(device)

# Images: decode both photo columns exactly like the training images.
X_test_breakfast = preprocess_image_data(image_test, 'Image Before Breakfast')
X_test_breakfast = torch.tensor(X_test_breakfast, dtype=torch.float32).to(device)
X_test_lunch = preprocess_image_data(image_test, 'Image Before Lunch')
X_test_lunch = torch.tensor(X_test_lunch, dtype=torch.float32).to(device)

# One forward pass over the whole test set, without tracking gradients.
with torch.no_grad():
    predictions = model(X_test_cgm, X_test_breakfast, X_test_lunch)

predictions_np = predictions.cpu().numpy()

# Submission format: a running row id and the predicted value.
row_ids = np.arange(len(predictions_np))
submission_df = pd.DataFrame({'row_id': row_ids, 'label': predictions_np.flatten()})

submission_df.to_csv('submission_torch_notuning.csv', index=False)


KeyboardInterrupt: 

Hyperparameter Tuning (Part e)

Note: We designed a PyTorch model, as required for Part c, but for hyperparameter tuning we implemented the equivalent model in TensorFlow, since that is the framework we were more comfortable with.
Hyperparameter tuning involved dynamic size calculation, which we found easier to do in TensorFlow.

In [11]:
# Search space for the grid search: every combination of these values is tried.
hyperparams = dict(
    lstm_units=[64, 128],
    cnn_filters=[64, 96],
    dense_units=[128, 256],
    learning_rate=[0.001],
    dropout_rate=[0.3, 0.5],
)

def _build_image_branch(image_shape, cnn_filters, dense_units, dropout_rate, first_pool_name=None):
    """Build one Conv-Pool-Conv-Pool-Dense image encoder; return (input, encoded output)."""
    branch_input = Input(shape=image_shape)
    x = Conv2D(cnn_filters, (3, 3), activation="relu")(branch_input)
    x = MaxPooling2D((2, 2), name=first_pool_name)(x)
    x = Conv2D(cnn_filters * 2, (3, 3), activation="relu")(x)
    x = MaxPooling2D((2, 2))(x)
    x = Flatten()(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    return branch_input, x


def create_model(lstm_units, cnn_filters, dense_units, learning_rate, dropout_rate,
                 sequence_length=98, num_features=1, image_shape=(64, 64, 3)):
    """Build and compile the Keras fusion model used for hyperparameter tuning.

    One LSTM branch encodes the CGM sequence and two identically structured
    CNN branches encode the breakfast and lunch images. The three encodings
    are concatenated and passed through a dense head with a single linear
    output.

    Args:
        lstm_units: number of units in the LSTM layer.
        cnn_filters: filters in each branch's first convolution (the second
            convolution uses twice as many).
        dense_units: width of each branch's dense layer and of the first
            dense layer of the head (the second head layer uses half).
        learning_rate: learning rate for the Adam optimizer.
        dropout_rate: dropout rate used in both branches and in the head.
        sequence_length: number of CGM time steps per sample; the default is
            the length the CGM sequences were padded to earlier.
        num_features: number of features per CGM time step.
        image_shape: shape of each input image.

    Returns:
        A compiled Model taking ``[cgm, breakfast_image, lunch_image]`` inputs,
        with ``rmsre`` as both loss and metric.
    """
    lstm_input = Input(shape=(sequence_length, num_features))
    lstm_out = LSTM(lstm_units, activation="tanh")(lstm_input)

    # Branches are built breakfast first, then lunch, so layer creation order
    # (and therefore auto-generated layer names) matches the original code.
    cnn_breakfast_input, breakfast_out = _build_image_branch(
        image_shape, cnn_filters, dense_units, dropout_rate, first_pool_name="breakfast_pool1"
    )
    cnn_lunch_input, lunch_out = _build_image_branch(
        image_shape, cnn_filters, dense_units, dropout_rate
    )

    fusion = Concatenate()([lstm_out, breakfast_out, lunch_out])

    fc = Dense(dense_units, activation="relu")(fusion)
    fc = Dropout(dropout_rate)(fc)
    fc = Dense(dense_units//2, activation="relu")(fc)
    output = Dense(1, activation="linear")(fc)

    model = Model(inputs=[lstm_input, cnn_breakfast_input, cnn_lunch_input], outputs=output)

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=rmsre, metrics=[rmsre])

    return model

# Exhaustive grid search; the configuration with the lowest training loss wins.
# NOTE(review): the score is the minimum *training* loss over the epochs (no
# validation split is used here), while the model that is kept is the one
# after the final epoch.
best_model = None
best_history = None
best_score = float('inf')
best_params = None

# Enumerate combinations in the same nesting order as five nested loops.
search_grid = [
    (lstm_units, cnn_filters, dense_units, lr, dropout)
    for lstm_units in hyperparams['lstm_units']
    for cnn_filters in hyperparams['cnn_filters']
    for dense_units in hyperparams['dense_units']
    for lr in hyperparams['learning_rate']
    for dropout in hyperparams['dropout_rate']
]

for lstm_units, cnn_filters, dense_units, lr, dropout in search_grid:
    #print(f"\nTesting parameters: LSTM={lstm_units}, CNN={cnn_filters}, Dense={dense_units}, LR={lr}, Dropout={dropout}")

    model = create_model(lstm_units, cnn_filters, dense_units, lr, dropout)

    training_inputs = [X_cgm, X_breakfast, X_lunch]
    history = model.fit(training_inputs, y, epochs=5, batch_size=32, verbose=1)

    train_loss = min(history.history['loss'])
    if train_loss < best_score:
        best_score = train_loss
        best_params = dict(
            lstm_units=lstm_units,
            cnn_filters=cnn_filters,
            dense_units=dense_units,
            learning_rate=lr,
            dropout_rate=dropout,
        )
        best_history, best_model = history, model

#print("\nBest parameters found:")
# for param, value in best_params.items():
#     #print(f"{param}: {value}")
# print(f"Best training loss: {best_score}")


# plt.figure(figsize=(10, 6))
# plt.plot(best_history.history['loss'], label='Training Loss')
# plt.title('Loss vs Epochs for Best Model')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid(True)
# plt.show()


Epoch 1/5
[1m 7/11[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m5s[0m 1s/step - loss: 0.9869 - rmsre: 0.9869

KeyboardInterrupt: 

Processing the test data in the same way as the training data and making predictions

In [None]:

# Decode the test images exactly like the training images.
X_test_breakfast = preprocess_image_data(image_test, 'Image Before Breakfast')
X_test_lunch = preprocess_image_data(image_test, 'Image Before Lunch')

df_test = pd.DataFrame(cgm_test)
df_test['processed_cgm'] = df_test.apply(preprocess_row, axis=1)

# The Keras model was built for a fixed CGM length, so pad (or truncate) the
# test sequences to that length instead of to the longest test sequence;
# otherwise a test set with a different maximum length cannot be fed to it.
expected_cgm_length = int(best_model.inputs[0].shape[1])
X_test_cgm = pad_sequences(
    df_test['processed_cgm'].tolist(),
    maxlen=expected_cgm_length,
    padding='post',
    truncating='post',  # keep the start of over-long sequences, matching the post-padding
    dtype='float32',
)

predictions = best_model.predict([X_test_cgm, X_test_breakfast, X_test_lunch])
predictions = predictions.flatten()

Making appropriate csv to submit to Kaggle

In [None]:
# Submission format: a running row id and the predicted value.
row_ids = np.arange(len(predictions))
submission_df = pd.DataFrame({'row_id': row_ids, 'label': predictions})

submission_df.to_csv('submission_tensorflow_tuning.csv', index=False)

NOTE: For the final run we removed the validation split so that the full training dataset could be used for the Kaggle submission. While designing and selecting the best models, we did use a validation set.