# Experiment 098: PCA-Reduced ChemBERTa Embeddings + Domain Constraints

**Goal**: Fix the dimensionality problem from exp_097 by applying PCA to reduce ChemBERTa embeddings from 768 to 20 dimensions.

**Rationale**:
- exp_097 used 786 features (768 ChemBERTa + 13 Spange + 5 Arrhenius) for ~600 samples
- Feature-to-sample ratio of 1.31 is WAY too high (should be < 0.1)
- Best model (exp_030) uses only 18 features
- PCA reduction to 20 components should help generalization

**Approach**:
1. Extract ChemBERTa embeddings (768-dim)
2. Apply PCA to reduce to 20 dimensions
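3. Combine the 20 PCA components with Spange (13) + Arrhenius (5) features = 38 total features (39 for the full/mixture data, which also passes `SolventB%` as a feature)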
4. Train GP+MLP+LGBM ensemble
5. Apply domain constraints (mass balance)

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.multioutput import MultiOutputRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
import lightgbm as lgb
from abc import ABC, abstractmethod
import warnings
warnings.filterwarnings('ignore')

# Report whether PyTorch can see a CUDA device. This is informational: nothing
# in the visible notebook moves tensors or models onto the GPU.
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Name of CUDA device index 0.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

GPU available: True
GPU: NVIDIA H100 80GB HBM3


In [2]:
# Data loading functions
# Directory holding the input CSV files read by load_data / load_features.
DATA_PATH = '/home/data'

# Column-name groups used to slice the raw CSVs.
# Numeric process conditions only (not referenced elsewhere in the visible notebook).
INPUT_LABELS_NUMERIC = ["Residence Time", "Temperature"]
# Input columns of the single-solvent dataset.
INPUT_LABELS_SINGLE_SOLVENT = ["Residence Time", "Temperature", "SOLVENT NAME"]
# Input columns of the full (two-solvent) dataset; downstream code divides
# "SolventB%" by 100 to obtain the blend weight of solvent B.
INPUT_LABELS_FULL_SOLVENT = ["Residence Time", "Temperature", "SOLVENT A NAME", "SOLVENT B NAME", "SolventB%"]
# Target columns; their order defines the column order of Y and of predictions.
TARGET_LABELS = ["Product 2", "Product 3", "SM"]

def load_data(name="full"):
    """Read one of the two yield datasets and split it into inputs and targets.

    Args:
        name: ``"full"`` selects the two-solvent dataset; any other value
            (the notebook passes ``"single_solvent"``) selects the
            single-solvent dataset.

    Returns:
        ``(X, Y)`` where ``X`` holds the input columns for the chosen dataset
        and ``Y`` holds the ``TARGET_LABELS`` columns.
    """
    wants_full = name == "full"
    if wants_full:
        frame = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')
        input_columns = INPUT_LABELS_FULL_SOLVENT
    else:
        frame = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
        input_columns = INPUT_LABELS_SINGLE_SOLVENT
    inputs = frame[input_columns]
    targets = frame[TARGET_LABELS]
    return inputs, targets

def load_features(name):
    """Load ``<DATA_PATH>/<name>_lookup.csv`` with its first column as the index."""
    lookup_path = f'{DATA_PATH}/{name}_lookup.csv'
    lookup_table = pd.read_csv(lookup_path, index_col=0)
    return lookup_table

def generate_leave_one_out_splits(X, Y):
    """Yield leave-one-solvent-out splits, one per solvent in sorted order.

    Each item is ``((X_train, Y_train), (X_test, Y_test))`` where the test part
    contains exactly the rows whose ``"SOLVENT NAME"`` is the held-out solvent.
    """
    held_out_order = sorted(X["SOLVENT NAME"].unique())
    for held_out in held_out_order:
        is_train = X["SOLVENT NAME"] != held_out
        is_test = ~is_train
        train_part = (X[is_train], Y[is_train])
        test_part = (X[is_test], Y[is_test])
        yield train_part, test_part

def generate_leave_one_ramp_out_splits(X, Y):
    """Yield leave-one-solvent-pair-out splits for the two-solvent dataset.

    A "ramp" is a distinct (``"SOLVENT A NAME"``, ``"SOLVENT B NAME"``) pair.
    Pairs are visited in order of first appearance in ``X``. Each item is
    ``((X_train, Y_train), (X_test, Y_test))`` with the test part holding all
    rows of the held-out pair.
    """
    pair_columns = ["SOLVENT A NAME", "SOLVENT B NAME"]
    distinct_pairs = X[pair_columns].drop_duplicates()
    for solvent_a, solvent_b in distinct_pairs.itertuples(index=False, name=None):
        matches_a = X["SOLVENT A NAME"] == solvent_a
        matches_b = X["SOLVENT B NAME"] == solvent_b
        is_train = ~(matches_a & matches_b)
        is_test = ~is_train
        yield (X[is_train], Y[is_train]), (X[is_test], Y[is_test])

print('Data loading functions defined')

Data loading functions defined


In [3]:
# Base classes
class SmilesFeaturizer(ABC):
    """Placeholder interface for featurizers.

    Both ``__init__`` and ``featurize`` raise ``NotImplementedError`` here, so a
    usable subclass has to override both. No subclass is defined or used in the
    visible notebook.
    """

    def __init__(self):
        # Instantiating the base class directly is not supported.
        raise NotImplementedError

    def featurize(self, X):
        """Must be overridden; the base implementation always raises."""
        raise NotImplementedError

class BaseModel(ABC):
    """Minimal model interface: ``train_model`` to fit, ``predict`` to infer.

    The base methods raise ``NotImplementedError``; concrete models override them.
    """

    def __init__(self):
        # No shared state; subclasses set up their own attributes.
        pass

    def train_model(self, X_train, y_train):
        """Fit the model on training inputs and targets (must be overridden)."""
        raise NotImplementedError

    def predict(self, X):
        """Return predictions for ``X`` (must be overridden)."""
        raise NotImplementedError

In [4]:
# Load SMILES and Spange descriptors
# Solvent lookup of SMILES strings; the first CSV column becomes the index.
SMILES_DF = pd.read_csv(f'{DATA_PATH}/smiles_lookup.csv', index_col=0)
# Spange descriptors, read from spange_descriptors_lookup.csv via load_features.
SPANGE_DF = load_features("spange_descriptors")

# Sanity check: table sizes and the first few SMILES rows.
print(f"SMILES lookup: {SMILES_DF.shape}")
print(f"Spange: {SPANGE_DF.shape}")
print(f"\nSample SMILES:")
print(SMILES_DF.head())

SMILES lookup: (26, 1)
Spange: (26, 13)

Sample SMILES:
                                           solvent smiles
SOLVENT NAME                                             
Cyclohexane                                      C1CCCCC1
Ethyl Acetate                                   O=C(OCC)C
Acetic Acid                                       CC(=O)O
2-Methyltetrahydrofuran [2-MeTHF]              O1C(C)CCC1
1,1,1,3,3,3-Hexafluoropropan-2-ol  C(C(F)(F)F)(C(F)(F)F)O


In [5]:
# Load ChemBERTa model and extract embeddings
from transformers import AutoModel, AutoTokenizer

print("Loading ChemBERTa model...")
# Pretrained tokenizer and encoder, resolved by name through from_pretrained.
tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
chemberta_model = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
# Put the encoder in eval mode; it is only used to extract embeddings below.
chemberta_model.eval()

# hidden_size is the per-token embedding width (768 in the recorded output).
print(f"Model loaded. Hidden size: {chemberta_model.config.hidden_size}")

Loading ChemBERTa model...


Model loaded. Hidden size: 768


In [6]:
# Extract embeddings for all solvents
def extract_chemberta_embedding(smiles, tokenizer, model):
    """Return the attention-masked mean of the model's last hidden states.

    Args:
        smiles: SMILES string to embed.
        tokenizer: callable producing a mapping that contains
            ``'attention_mask'`` and can be unpacked into ``model``.
        model: callable whose result exposes ``last_hidden_state``.

    Returns:
        NumPy array obtained by mean-pooling the token states over the token
        axis (padding positions excluded) and squeezing singleton dimensions.
    """
    with torch.no_grad():
        encoded = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True, max_length=512)
        token_states = model(**encoded).last_hidden_state
        # Broadcast the 0/1 attention mask across the hidden dimension.
        keep = encoded['attention_mask'].unsqueeze(-1).expand(token_states.size()).float()
        masked_total = (token_states * keep).sum(dim=1)
        # Clamp the token count so an all-padding row cannot divide by zero.
        token_count = keep.sum(dim=1).clamp(min=1e-9)
        pooled = masked_total / token_count
        return pooled.squeeze().numpy()

print("Extracting ChemBERTa embeddings for all solvents...")
# One embedding vector per solvent, keyed by solvent name (insertion order
# follows the SMILES lookup table).
chemberta_embeddings = {}
for solvent_name, smiles in SMILES_DF['solvent smiles'].items():
    embedding = extract_chemberta_embedding(smiles, tokenizer, chemberta_model)
    chemberta_embeddings[solvent_name] = embedding
    print(f"  {solvent_name}: {embedding.shape}")

# Convert to DataFrame: rows are solvents, columns are embedding dimensions.
CHEMBERTA_DF = pd.DataFrame(chemberta_embeddings).T
# Derive the column count from the data instead of hard-coding 768, so a
# different encoder width does not raise a length-mismatch error here.
CHEMBERTA_DF.columns = [f'chemberta_{i}' for i in range(CHEMBERTA_DF.shape[1])]
print(f"\nChemBERTa embeddings shape: {CHEMBERTA_DF.shape}")

Extracting ChemBERTa embeddings for all solvents...


  Cyclohexane: (768,)
  Ethyl Acetate: (768,)
  Acetic Acid: (768,)
  2-Methyltetrahydrofuran [2-MeTHF]: (768,)
  1,1,1,3,3,3-Hexafluoropropan-2-ol: (768,)
  IPA [Propan-2-ol]: (768,)
  Ethanol: (768,)
  Methanol: (768,)
  Ethylene Glycol [1,2-Ethanediol]: (768,)
  Acetonitrile: (768,)
  Water: (768,)
  Diethyl Ether [Ether]: (768,)
  MTBE [tert-Butylmethylether]: (768,)
  Dimethyl Carbonate: (768,)
  tert-Butanol [2-Methylpropan-2-ol]: (768,)
  DMA [N,N-Dimethylacetamide]: (768,)
  2,2,2-Trifluoroethanol: (768,)
  Dihydrolevoglucosenone (Cyrene): (768,)
  Decanol: (768,)
  Butanone [MEK]: (768,)
  Ethyl Lactate: (768,)
  Methyl Propionate: (768,)
  THF [Tetrahydrofuran]: (768,)
  Water.Acetonitrile: (768,)
  Acetonitrile.Acetic Acid: (768,)
  Water.2,2,2-Trifluoroethanol: (768,)

ChemBERTa embeddings shape: (26, 768)


In [7]:
# Apply PCA to reduce ChemBERTa embeddings from 768 to 20 dimensions
print("Applying PCA to reduce ChemBERTa embeddings...")

# Number of principal components kept from the ChemBERTa embedding.
N_PCA_COMPONENTS = 20

# Fit PCA on all solvents. Note this includes solvents that are later held out
# during cross-validation; the fit is unsupervised (no targets are used), but
# the projection is still computed with the test solvents' embeddings present.
# random_state pins the result: with the default svd_solver='auto',
# scikit-learn may select a randomized SVD for a wide matrix like this one,
# which would otherwise make the reduced features vary between runs.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=42)
chemberta_reduced = pca.fit_transform(CHEMBERTA_DF.values)

print(f"Original shape: {CHEMBERTA_DF.shape}")
print(f"Reduced shape: {chemberta_reduced.shape}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.4f}")

# Create reduced DataFrame, indexed by solvent name like CHEMBERTA_DF.
CHEMBERTA_PCA_DF = pd.DataFrame(
    chemberta_reduced,
    index=CHEMBERTA_DF.index,
    columns=[f'chemberta_pca_{i}' for i in range(chemberta_reduced.shape[1])]
)
print(f"\nPCA-reduced ChemBERTa embeddings shape: {CHEMBERTA_PCA_DF.shape}")

Applying PCA to reduce ChemBERTa embeddings...
Original shape: (26, 768)
Reduced shape: (26, 20)
Explained variance ratio: 0.9959

PCA-reduced ChemBERTa embeddings shape: (26, 20)


In [10]:
# Define the PCA ChemBERTa Ensemble Model
class ChemBERTaPCAEnsembleModel(BaseModel):
    """GP + MLP + LGBM ensemble using PCA-reduced ChemBERTa embeddings.

    Features are built from five kinetic terms (residence time, temperature,
    1/T in Kelvin, log residence time, a simplified Arrhenius term), the Spange
    descriptors and the PCA-reduced ChemBERTa embedding of the solvent. In
    mixture mode the raw ``SolventB%`` column is added as a feature and the
    per-solvent descriptors are blended linearly by that percentage.

    Args:
        data: ``'single'`` for the single-solvent dataset; any other value is
            treated as the full two-solvent (mixture) dataset.
        num_epochs: number of full-batch optimisation steps for the MLP.
        seed: seed used for the GP, LGBM and MLP. ``None`` leaves all three
            unseeded. The default matches the ``random_state=42`` that the GP
            and LGBM already used, and additionally makes the MLP reproducible.
    """

    # Fixed blend weights of the three base learners (they sum to 1).
    GP_WEIGHT = 0.4
    MLP_WEIGHT = 0.3
    LGBM_WEIGHT = 0.3

    def __init__(self, data='single', num_epochs=100, seed=42):
        self.data = data
        self.num_epochs = num_epochs
        self.seed = seed
        self.scaler = StandardScaler()
        self.gp_model = None
        self.mlp_model = None
        self.lgbm_model = None
        # Per-target mean of the training targets. Stored by train_model but
        # not read anywhere in this class.
        self.train_mean = None

    @staticmethod
    def _kinetic_features(X):
        """Return the five kinetic feature columns, each shaped (n_rows, 1)."""
        res_time = X['Residence Time'].values.reshape(-1, 1)
        temp = X['Temperature'].values.reshape(-1, 1)
        # assumes Temperature is in degrees Celsius — TODO confirm
        temp_kelvin = temp + 273.15
        inv_temp = 1.0 / temp_kelvin
        # Small offset keeps the log finite for a zero residence time.
        log_res_time = np.log(res_time + 1e-6)
        # Simplified Arrhenius term with a fixed exponent constant of 1000.
        arrhenius = res_time * np.exp(-1000 / temp_kelvin)
        return [res_time, temp, inv_temp, log_res_time, arrhenius]

    @staticmethod
    def _lookup(table, solvent_names):
        """Stack the rows of ``table`` for each name, preserving input order."""
        return np.array([table.loc[name].values for name in solvent_names])

    def _blend(self, table, X, weight_b):
        """Linearly mix solvent A and B descriptor rows by ``weight_b``."""
        rows_a = self._lookup(table, X['SOLVENT A NAME'])
        rows_b = self._lookup(table, X['SOLVENT B NAME'])
        return (1 - weight_b) * rows_a + weight_b * rows_b

    def _get_features(self, X):
        """Extract the (unscaled) feature matrix from input data.

        Column order is: kinetic features, then ``SolventB%`` (mixture mode
        only), then Spange descriptors, then PCA-reduced ChemBERTa components.
        """
        blocks = self._kinetic_features(X)

        if self.data == 'single':
            solvent_names = X['SOLVENT NAME']
            blocks.append(self._lookup(SPANGE_DF, solvent_names))
            blocks.append(self._lookup(CHEMBERTA_PCA_DF, solvent_names))
        else:  # full data
            solvent_b_pct = X['SolventB%'].values.reshape(-1, 1)
            blocks.append(solvent_b_pct)
            # assumes SolventB% is expressed on a 0-100 scale — TODO confirm
            weight_b = solvent_b_pct / 100.0
            blocks.append(self._blend(SPANGE_DF, X, weight_b))
            blocks.append(self._blend(CHEMBERTA_PCA_DF, X, weight_b))

        return np.hstack(blocks)

    def train_model(self, X, Y):
        """Fit the scaler and the three base learners on the training data.

        Args:
            X: input DataFrame with the columns required by the chosen mode.
            Y: target DataFrame; ``Y.values`` is used as a 2-D target matrix.
        """
        features = self._get_features(X)
        features_scaled = self.scaler.fit_transform(features)
        y_values = Y.values

        self.train_mean = y_values.mean(axis=0)

        # 1. GP, one independent regressor per target, trained on all scaled features.
        kernel = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5) + WhiteKernel(noise_level=0.1)
        self.gp_model = MultiOutputRegressor(
            GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=2, random_state=self.seed)
        )
        self.gp_model.fit(features_scaled, y_values)

        # 2. MLP, one network predicting all targets jointly.
        self.mlp_model = self._train_mlp(features_scaled, y_values)

        # 3. LGBM, one independent regressor per target.
        self.lgbm_model = MultiOutputRegressor(
            lgb.LGBMRegressor(n_estimators=100, learning_rate=0.05, max_depth=5,
                              num_leaves=31, random_state=self.seed, verbose=-1)
        )
        self.lgbm_model.fit(features_scaled, y_values)

    def _train_mlp(self, X, Y):
        """Train a small MLP with full-batch Adam and return it in eval mode.

        Weight initialisation and dropout draw from torch's default CPU
        generator. When ``self.seed`` is set, that generator is seeded inside a
        forked RNG context, so training is reproducible and the caller's
        global RNG state is restored afterwards.
        """
        input_dim = X.shape[1]
        output_dim = Y.shape[1]

        with torch.random.fork_rng(devices=[], enabled=self.seed is not None):
            if self.seed is not None:
                torch.default_generator.manual_seed(self.seed)

            model = nn.Sequential(
                nn.Linear(input_dim, 64),
                nn.ReLU(),
                nn.Dropout(0.2),
                nn.Linear(64, 32),
                nn.ReLU(),
                nn.Dropout(0.2),
                nn.Linear(32, output_dim)
            )

            optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
            criterion = nn.MSELoss()

            X_tensor = torch.tensor(X, dtype=torch.float32)
            Y_tensor = torch.tensor(Y, dtype=torch.float32)

            model.train()
            # Every epoch is a single gradient step on the whole training set.
            for _ in range(self.num_epochs):
                optimizer.zero_grad()
                loss = criterion(model(X_tensor), Y_tensor)
                loss.backward()
                optimizer.step()

        model.eval()
        return model

    def predict(self, X):
        """Predict with the weighted ensemble and apply the yield constraints.

        Returns:
            float32 ``torch.Tensor`` with one row per input row and one column
            per target.
        """
        features = self._get_features(X)
        features_scaled = self.scaler.transform(features)

        gp_pred = self.gp_model.predict(features_scaled)

        with torch.no_grad():
            mlp_pred = self.mlp_model(torch.tensor(features_scaled, dtype=torch.float32)).numpy()

        lgbm_pred = self.lgbm_model.predict(features_scaled)

        # Fixed-weight average; the GP gets the largest weight.
        ensemble_pred = (
            self.GP_WEIGHT * gp_pred
            + self.MLP_WEIGHT * mlp_pred
            + self.LGBM_WEIGHT * lgbm_pred
        )

        ensemble_pred = self._enforce_mass_balance(ensemble_pred)

        return torch.tensor(ensemble_pred, dtype=torch.float32)

    def _enforce_mass_balance(self, predictions):
        """Clip predictions to [0, 1] and rescale rows whose sum exceeds 1.

        Rows summing to at most 1 are left as clipped; rows above 1 are divided
        by their sum so they total exactly 1. The input array is not modified.
        """
        clipped = np.clip(predictions, 0, 1)
        row_sums = clipped.sum(axis=1, keepdims=True)
        # Divide by the row sum only where it exceeds 1; elsewhere divide by 1.
        divisors = np.where(row_sums > 1, row_sums, 1.0)
        return clipped / divisors

print("ChemBERTaPCAEnsembleModel defined")
print("Features: 5 (Arrhenius) + 13 (Spange) + 20 (ChemBERTa PCA) = 38 total")

ChemBERTaPCAEnsembleModel defined
Features: 5 (Arrhenius) + 13 (Spange) + 20 (ChemBERTa PCA) = 38 total


In [11]:
# Run CV
import tqdm

def _cross_validate_task(X, Y, split_fn, data_mode, fold_label, summary_label, verbose):
    """Run one CV task and return the unweighted mean of the per-fold MSEs.

    A fresh ``ChemBERTaPCAEnsembleModel`` is trained for every split produced
    by ``split_fn(X, Y)``. Each fold's MSE is averaged over all rows and
    targets of that fold; folds then count equally regardless of their size.
    """
    fold_mses = []
    for fold_idx, ((train_X, train_Y), (test_X, test_Y)) in enumerate(split_fn(X, Y)):
        model = ChemBERTaPCAEnsembleModel(data=data_mode, num_epochs=100)
        model.train_model(train_X, train_Y)

        predictions_np = model.predict(test_X).detach().cpu().numpy()

        mse = np.mean((predictions_np - test_Y.values) ** 2)
        fold_mses.append(mse)
        if verbose:
            print(f"{fold_label} Fold {fold_idx}: MSE = {mse:.6f}")

    task_cv = np.mean(fold_mses)
    if verbose:
        print(f"\n{summary_label} CV MSE: {task_cv:.6f}")
    return task_cv


def compute_cv_score(verbose=True):
    """Compute CV score with PCA-reduced ChemBERTa embeddings.

    Runs leave-one-solvent-out CV on the single-solvent data and
    leave-one-ramp-out CV on the full data, in that order.

    Args:
        verbose: when true, print per-fold and summary MSEs.

    Returns:
        ``(single_cv, full_cv, combined_cv)`` where ``combined_cv`` is the
        plain average of the two task scores.
    """
    # Single solvent CV
    X_single, Y_single = load_data("single_solvent")
    single_cv = _cross_validate_task(
        X_single, Y_single, generate_leave_one_out_splits,
        data_mode='single', fold_label="Single", summary_label="Single Solvent",
        verbose=verbose,
    )

    # Full data CV
    X_full, Y_full = load_data("full")
    full_cv = _cross_validate_task(
        X_full, Y_full, generate_leave_one_ramp_out_splits,
        data_mode='full', fold_label="Full", summary_label="Full Data",
        verbose=verbose,
    )

    combined_cv = (single_cv + full_cv) / 2
    if verbose:
        print(f"\n=== Combined CV MSE: {combined_cv:.6f} ===")

    return single_cv, full_cv, combined_cv

print("Running CV with PCA-reduced ChemBERTa embeddings...")
single_cv, full_cv, combined_cv = compute_cv_score()

Running CV with PCA-reduced ChemBERTa embeddings...


Single Fold 0: MSE = 0.041562


Single Fold 1: MSE = 0.017304


Single Fold 2: MSE = 0.006700


Single Fold 3: MSE = 0.007438


Single Fold 4: MSE = 0.023706


Single Fold 5: MSE = 0.003313


Single Fold 6: MSE = 0.006805


Single Fold 7: MSE = 0.015436


Single Fold 8: MSE = 0.035206


Single Fold 9: MSE = 0.017351


Single Fold 10: MSE = 0.007266


Single Fold 11: MSE = 0.012168


Single Fold 12: MSE = 0.003641


Single Fold 13: MSE = 0.004840


Single Fold 14: MSE = 0.004309


Single Fold 15: MSE = 0.023920


Single Fold 16: MSE = 0.024051


Single Fold 17: MSE = 0.006335


Single Fold 18: MSE = 0.003236


Single Fold 19: MSE = 0.008072


Single Fold 20: MSE = 0.003486


Single Fold 21: MSE = 0.009069


Single Fold 22: MSE = 0.031475


Single Fold 23: MSE = 0.001435

Single Solvent CV MSE: 0.013255


Full Fold 0: MSE = 0.012540


Full Fold 1: MSE = 0.027881


Full Fold 2: MSE = 0.025746


Full Fold 3: MSE = 0.032094


Full Fold 4: MSE = 0.017479


Full Fold 5: MSE = 0.008385


Full Fold 6: MSE = 0.025360


Full Fold 7: MSE = 0.028409


Full Fold 8: MSE = 0.006475


Full Fold 9: MSE = 0.006530


Full Fold 10: MSE = 0.006947


Full Fold 11: MSE = 0.008297


Full Fold 12: MSE = 0.005972

Full Data CV MSE: 0.016317

=== Combined CV MSE: 0.014786 ===


In [None]:
# Save results
import json
import os

# Combined CV MSE this experiment is compared against.
# NOTE(review): presumably the score of the best previous experiment — confirm.
BASELINE_CV = 0.0081
METRICS_PATH = '/home/code/experiments/098_chemberta_pca/metrics.json'

# Relative improvement over the baseline in percent; negative when this run is worse.
improvement_pct = (BASELINE_CV - combined_cv) / BASELINE_CV * 100

results = {
    'cv_score': float(combined_cv),
    'single_cv': float(single_cv),
    'full_cv': float(full_cv),
    'model': 'ChemBERTaPCAEnsembleModel (GP+MLP+LGBM with PCA-reduced ChemBERTa)',
    'baseline_cv': BASELINE_CV,
    'improvement': f"{improvement_pct:.2f}%",
    'features': '5 (Arrhenius) + 13 (Spange) + 20 (ChemBERTa PCA) = 38 total',
    'pca_explained_variance': float(pca.explained_variance_ratio_.sum())
}

# Create the experiment directory if needed so the write cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(METRICS_PATH), exist_ok=True)
with open(METRICS_PATH, 'w') as f:
    json.dump(results, f, indent=2)

print("Results saved")
print(f"Combined CV: {combined_cv:.6f}")
print(f"Baseline CV: {BASELINE_CV}")
print(f"Improvement: {improvement_pct:.2f}%")

## Generate Submission (if CV is better than baseline)

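The following cells follow the official template structure. Note that they run unconditionally: the `GENERATE_SUBMISSION` flag set in the next cell is informational only and is not checked by the template cells.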

**CRITICAL**: The model class in submission cells MUST match the CV computation class (`ChemBERTaPCAEnsembleModel`).

In [None]:
# Check if we should generate submission
# Flag is True only when this run's combined CV MSE is strictly below the baseline.
GENERATE_SUBMISSION = bool(combined_cv < 0.0081)
if not GENERATE_SUBMISSION:
    print(f"CV {combined_cv:.6f} is WORSE than baseline 0.0081")
    print("Not generating submission.")
else:
    print(f"CV {combined_cv:.6f} is BETTER than baseline 0.0081!")
    print("Generating submission...")

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

import tqdm

X, Y = load_data("single_solvent")

split_generator = generate_leave_one_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ChemBERTaPCAEnsembleModel(data='single', num_epochs=100)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 0,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_single_solvent = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE THIRD LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

X, Y = load_data("full")

split_generator = generate_leave_one_ramp_out_splits(X, Y)
all_predictions = []

for fold_idx, split in tqdm.tqdm(enumerate(split_generator)):
    (train_X, train_Y), (test_X, test_Y) = split

    model = ChemBERTaPCAEnsembleModel(data='full', num_epochs=100)  # CHANGE THIS LINE ONLY
    model.train_model(train_X, train_Y)

    predictions = model.predict(test_X)  # Shape: [N, 3]

    # Move to CPU and convert to numpy
    predictions_np = predictions.detach().cpu().numpy()

    # Add metadata and flatten to long format
    for row_idx, row in enumerate(predictions_np):
        all_predictions.append({
            "task": 1,
            "fold": fold_idx,
            "row": row_idx,
            "target_1": row[0],
            "target_2": row[1],
            "target_3": row[2]
        })

# Save final submission
submission_full_data = pd.DataFrame(all_predictions)

########### DO NOT CHANGE ANYTHING IN THIS CELL OTHER THAN THE MODEL #################
########### THIS MUST BE THE SECOND LAST CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

In [None]:
########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################

submission = pd.concat([submission_single_solvent, submission_full_data])
submission = submission.reset_index()
submission.index.name = "id"
submission.to_csv("submission.csv", index=True)

# Also save to standard location
import shutil
import os
os.makedirs('/home/submission', exist_ok=True)
shutil.copy("submission.csv", "/home/submission/submission.csv")
print("Submission saved!")
print(f"Shape: {submission.shape}")
print(submission.head())

########### DO NOT CHANGE ANYTHING IN THIS CELL #################
########### THIS MUST BE THE FINAL CELL IN YOUR NOTEBOOK FOR A VALID SUBMISSION #################