# Neural Network

To predict the 6 analytes using the given spectral data

## Data Preparation

### Load the datasets

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
data = pd.read_csv('data/data.csv')
target = pd.read_csv('data/target.csv')

### Feature Scaling:

In [4]:
# feature scaling
scaler = StandardScaler()
data = scaler.fit_transform(data)

# convert array back to dataframe
data = pd.DataFrame(data)

In [5]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2790,2791,2792,2793,2794,2795,2796,2797,2798,2799
0,-0.316769,-0.315116,-0.313438,-0.311752,-0.310101,-0.308489,-0.306886,-0.305345,-0.303805,-0.302371,...,0.307467,0.308729,0.309938,0.311335,0.312953,0.314581,0.316054,0.316725,0.316783,0.316769
1,-0.392352,-0.390931,-0.389515,-0.388140,-0.386799,-0.385506,-0.384272,-0.383046,-0.381832,-0.380493,...,0.393738,0.393679,0.393555,0.393591,0.393748,0.393820,0.393732,0.393476,0.392953,0.392352
2,-0.395732,-0.394205,-0.392682,-0.391210,-0.389802,-0.388441,-0.387181,-0.385971,-0.384794,-0.383556,...,0.385905,0.386156,0.386736,0.387808,0.389403,0.391396,0.393625,0.394835,0.395313,0.395732
3,-0.353691,-0.351974,-0.350242,-0.348509,-0.346777,-0.345091,-0.343411,-0.341739,-0.340078,-0.338397,...,0.353261,0.352495,0.351930,0.351762,0.352015,0.352671,0.353454,0.353844,0.353813,0.353691
4,-0.388814,-0.387189,-0.385506,-0.383811,-0.382188,-0.380599,-0.379026,-0.377480,-0.375925,-0.374382,...,0.377174,0.378062,0.379091,0.380494,0.382271,0.384355,0.386594,0.387837,0.388371,0.388814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,1.216063,1.212980,1.209800,1.206363,1.202908,1.199597,1.196533,1.193830,1.191242,1.188586,...,-1.198214,-1.201348,-1.203690,-1.206748,-1.211088,-1.216074,-1.220527,-1.222032,-1.219498,-1.216063
177,1.405710,1.402492,1.399350,1.396283,1.393269,1.390305,1.387590,1.385171,1.382781,1.380179,...,-1.399635,-1.401556,-1.402483,-1.403973,-1.406681,-1.409943,-1.412725,-1.413162,-1.409897,-1.405710
178,1.452745,1.448847,1.445136,1.441636,1.438428,1.435438,1.432754,1.430351,1.427990,1.425488,...,-1.458494,-1.459141,-1.458486,-1.458382,-1.459472,-1.461106,-1.462207,-1.461516,-1.457577,-1.452745
179,1.370582,1.367806,1.365097,1.362142,1.359241,1.356519,1.354035,1.351736,1.349495,1.346956,...,-1.392243,-1.392203,-1.390443,-1.388581,-1.387419,-1.386407,-1.384495,-1.381759,-1.376632,-1.370582


The names of the columns (the wavelengths) are not necessary or useful in prediction, so doesn't matter if renamed.

### Train-Test Split:

In [16]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [17]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2790,2791,2792,2793,2794,2795,2796,2797,2798,2799
65,-0.294132,-0.292470,-0.290753,-0.289059,-0.287421,-0.285885,-0.284432,-0.283031,-0.281643,-0.280298,...,0.276976,0.278672,0.280488,0.282665,0.285239,0.288108,0.291088,0.292727,0.293466,0.294132
67,-0.284122,-0.282493,-0.280881,-0.279314,-0.277783,-0.276313,-0.274894,-0.273518,-0.272146,-0.270753,...,0.287967,0.288118,0.287882,0.287565,0.287224,0.286867,0.286299,0.285740,0.284973,0.284122
31,-0.306980,-0.305355,-0.303725,-0.302108,-0.300526,-0.299003,-0.297521,-0.296112,-0.294748,-0.293489,...,0.290890,0.292695,0.294566,0.296723,0.299148,0.301887,0.304602,0.305979,0.306527,0.306980
12,-0.283304,-0.281756,-0.280143,-0.278480,-0.276874,-0.275329,-0.273899,-0.272542,-0.271209,-0.269888,...,0.272093,0.273169,0.274347,0.275795,0.277602,0.279657,0.281763,0.282801,0.283100,0.283304
41,-0.276594,-0.275093,-0.273528,-0.271935,-0.270347,-0.268851,-0.267408,-0.266029,-0.264725,-0.263381,...,0.298471,0.296658,0.294318,0.291676,0.288821,0.285722,0.282349,0.280222,0.278439,0.276594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1.287983,1.285159,1.282166,1.279110,1.276117,1.273214,1.270348,1.267523,1.264811,1.262283,...,-1.271121,-1.273053,-1.275304,-1.277644,-1.279920,-1.282017,-1.283961,-1.285168,-1.286476,-1.287983
14,-0.303077,-0.301640,-0.300190,-0.298733,-0.297363,-0.296033,-0.294755,-0.293592,-0.292444,-0.291202,...,0.288615,0.290797,0.292942,0.295185,0.297425,0.299608,0.301592,0.302569,0.302859,0.303077
92,0.113718,0.112752,0.111822,0.110825,0.109820,0.108743,0.107677,0.106608,0.105618,0.104834,...,-0.096473,-0.099153,-0.101798,-0.104224,-0.106403,-0.108327,-0.110121,-0.111282,-0.112436,-0.113718
179,1.370582,1.367806,1.365097,1.362142,1.359241,1.356519,1.354035,1.351736,1.349495,1.346956,...,-1.392243,-1.392203,-1.390443,-1.388581,-1.387419,-1.386407,-1.384495,-1.381759,-1.376632,-1.370582


In [18]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2790,2791,2792,2793,2794,2795,2796,2797,2798,2799
19,-0.355102,-0.353486,-0.351813,-0.350161,-0.348585,-0.347129,-0.345743,-0.344429,-0.34318,-0.341912,...,0.340416,0.341895,0.343482,0.345356,0.347572,0.350045,0.352639,0.354027,0.35461,0.355102
42,-0.282375,-0.280648,-0.278907,-0.277137,-0.275392,-0.273708,-0.272107,-0.270569,-0.269033,-0.26757,...,0.297368,0.296231,0.294651,0.292879,0.291017,0.289009,0.286725,0.285213,0.28383,0.282375
153,2.635599,2.635815,2.6362,2.636678,2.637186,2.637641,2.637959,2.638116,2.638125,2.638064,...,-2.634756,-2.633414,-2.632869,-2.632611,-2.632381,-2.632199,-2.632285,-2.632771,-2.63407,-2.635599
78,0.82426,0.821456,0.818978,0.81687,0.8151,0.813557,0.812046,0.810543,0.809171,0.807972,...,-0.859986,-0.85447,-0.849215,-0.84402,-0.838745,-0.833657,-0.828785,-0.826249,-0.825185,-0.82426
145,1.015041,1.018451,1.022076,1.025791,1.029497,1.033103,1.036628,1.040053,1.043411,1.046847,...,-1.017165,-1.016343,-1.015912,-1.015459,-1.01476,-1.013998,-1.013447,-1.01347,-1.014196,-1.015041
15,-0.311459,-0.309956,-0.30847,-0.307018,-0.305663,-0.30437,-0.303054,-0.301791,-0.300531,-0.299335,...,0.303902,0.304389,0.304934,0.305798,0.307034,0.308599,0.310247,0.311081,0.311317,0.311459
24,-0.391316,-0.389854,-0.388355,-0.386846,-0.385327,-0.383907,-0.382503,-0.381139,-0.379829,-0.378499,...,0.383391,0.384632,0.385814,0.387141,0.388509,0.389851,0.390985,0.391457,0.391426,0.391316
68,-0.307925,-0.306302,-0.304696,-0.303123,-0.301648,-0.30019,-0.298805,-0.297472,-0.296194,-0.294981,...,0.297898,0.298984,0.300117,0.301462,0.303127,0.304972,0.306807,0.307662,0.307835,0.307925
113,-0.297175,-0.295772,-0.294428,-0.293066,-0.291698,-0.29037,-0.289084,-0.287829,-0.286619,-0.285454,...,0.31966,0.317251,0.314478,0.311598,0.308681,0.30574,0.302597,0.300601,0.298929,0.297175
118,1.324047,1.320891,1.317578,1.314243,1.311018,1.307931,1.304869,1.301836,1.299072,1.296605,...,-1.333095,-1.331216,-1.329523,-1.327962,-1.326339,-1.32489,-1.32362,-1.323171,-1.323514,-1.324047


### Dimensionality Reduction:

 Use PCA to reduce the input dimensionality if the dataset is too high-dimensional.

In [40]:
# Dimensionality Reduction
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# convert back to a dataframe
X_train_pca = pd.DataFrame(X_train_pca)
X_test_pca = pd.DataFrame(X_test_pca)

In [41]:
X_train_pca.shape

(144, 3)

In [42]:
X_train_pca

Unnamed: 0,0,1,2
0,-30.397715,-7.173837,0.505082
1,-30.639217,-7.951355,1.682380
2,-30.803980,-6.855545,0.180291
3,-30.928344,-7.593637,0.553239
4,-29.201630,-9.662276,1.238003
...,...,...,...
139,73.491355,-0.817233,-12.861807
140,-30.279794,-7.323924,0.720564
141,7.762607,4.279655,-14.742861
142,92.428913,-13.540680,15.012772


In [43]:
X_test_pca.shape

(37, 3)

In [44]:
X_test_pca

Unnamed: 0,0,1,2
0,-31.353769,-5.490578,-0.029496
1,-29.071803,-9.11502,0.351931
2,42.753934,-90.722974,21.305769
3,36.982381,-8.221943,3.612671
4,-34.08543,-74.027118,11.974625
5,-30.136958,-7.429598,1.266313
6,-30.365386,-4.896786,0.51304
7,-30.055368,-7.852576,2.747932
8,-28.149508,-4.202456,-6.073699
9,76.569013,0.869426,-11.531683


---

# Define the Neural Network Architecture

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the model
model = Sequential()

# Input layer
model.add(Dense(units=128, input_dim=X_train_pca.shape[1], activation='relu'))

# Hidden layers
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(rate=0.3))
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(rate=0.3))

# Output layer
model.add(Dense(units=6, activation='linear'))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Summary of the model
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 128)               512       
                                                                 
 dense_13 (Dense)            (None, 64)                8256      
                                                                 
 dropout_6 (Dropout)         (None, 64)                0         
                                                                 
 dense_14 (Dense)            (None, 32)                2080      
                                                                 
 dropout_7 (Dropout)         (None, 32)                0         
                                                                 
 dense_15 (Dense)            (None, 6)                 198       
                                                                 
Total params: 11046 (43.15 KB)
Trainable params: 11046

# Model Training

In [50]:
# validation data for model evaluation split from training data
X_train_pca, X_val_pca, y_train, y_val = train_test_split(X_train_pca, y_train, test_size=0.2, random_state=0)

In [None]:
# Train the model
history = model.fit(X_train_pca, y_train, epochs=100, batch_size=32, validation_data=(X_val_pca, y_val))

# Test the model
loss, mae = model.evaluate(X_test_pca, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test MAE: {mae:.4f}')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Model Evaluation

In [58]:
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error

# Calculate metrics

# Predict on test data
y_pred = model.predict(X_test_pca)

r2 = r2_score(y_test, y_pred)
print(f'R^2 Score: {r2:.4f}')

MSE = mean_squared_error(y_test, y_pred)
print(f'MSE: {MSE:.4f}')

MAE = mean_absolute_error(y_test, y_pred)
print(f'MAE: {MAE:.4f}')

R^2 Score: 0.3561
MSE: 25.9599
MAE: 2.4431


---

# Predicting the target variables using the complete dataset

### Train - Test Split

#### Validation data

In [62]:
# split training data into training and validation data
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

### Neural Network Architecture

In [63]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define the model
model = Sequential()

# Input layer
model.add(Dense(units=128, input_dim=X_train.shape[1], activation='relu'))

# Hidden layers
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(rate=0.3))
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(rate=0.3))

# Output layer
model.add(Dense(units=6, activation='linear'))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Summary of the model
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_16 (Dense)            (None, 128)               358528    
                                                                 
 dense_17 (Dense)            (None, 64)                8256      
                                                                 
 dropout_8 (Dropout)         (None, 64)                0         
                                                                 
 dense_18 (Dense)            (None, 32)                2080      
                                                                 
 dropout_9 (Dropout)         (None, 32)                0         
                                                                 
 dense_19 (Dense)            (None, 6)                 198       
                                                                 
Total params: 369062 (1.41 MB)
Trainable params: 36906

### Model Training

In [64]:
# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val))

# Test the model
loss, mae = model.evaluate(X_test, y_test)

print(f'Test Loss: {loss:.4f}')
print(f'Test MAE: {mae:.4f}')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Model Evaluation

In [66]:
# Calculate metrics

# Predict on test data
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f'R^2 Score: {r2:.4f}')

MSE = mean_squared_error(y_test, y_pred)
print(f'MSE: {MSE:.4f}')

MAE = mean_absolute_error(y_test, y_pred)
print(f'MAE: {MAE:.4f}')

R^2 Score: 0.2032
MSE: 40.6203
MAE: 2.8175


#### Model Evaluation Results on PCA data:

R^2 Score: 0.3561

MSE: 25.9599

MAE: 2.4431

So, it's better to use data with PCA

---

# Let's predict each target variable separately

## Neural Network Architecture

In [74]:
def NN_model(X_train, y_train, X_val, y_val, X_test, y_test):
    # Define the model
    model = Sequential()

    # Input layer
    model.add(Dense(units=128, input_dim=X_train.shape[1], activation='relu'))

    # Hidden layers
    model.add(Dense(units=64, activation='relu'))
    model.add(Dropout(rate=0.3))
    model.add(Dense(units=32, activation='relu'))
    model.add(Dropout(rate=0.3))

    # Output layer
    model.add(Dense(units=1, activation='linear'))

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

    # Train the model
    history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val))

    # Test the model
    loss, mae = model.evaluate(X_test, y_test)

    print(f'Test Loss: {loss:.4f}')
    print(f'Test MAE: {mae:.4f}')

    # Calculate metrics

    # Predict on test data
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    print(f'R^2 Score: {r2:.4f}')

    MSE = mean_squared_error(y_test, y_pred)
    print(f'MSE: {MSE:.4f}')

    MAE = mean_absolute_error(y_test, y_pred)
    print(f'MAE: {MAE:.4f}')

    return model

## We'll do this using 3 variations of the dataset:

1. With the complete dataset

2. With the complete dataset undergone PCA

3. With the selected features dataset for each target variable

---

# 1. With the complete dataset

In [75]:
y_predicted = pd.DataFrame(columns=target.columns)
y_truth = pd.DataFrame(columns=target.columns)

In [76]:
for col in target.columns:
    y = target[col]
    X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=0)

    # split training data into training and validation data
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

    model = NN_model(X_train, y_train, X_val, y_val, X_test, y_test)

    y_predicted[col] = model.predict(X_test).flatten()
    y_truth[col] = y_test.values

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [84]:
y_predicted.head()

Unnamed: 0,Glucose (g/L),Lactate (g/L),Ethanol (g/L),Acetate (g/L),Biomass (g/L),Formate (g/L)
0,20.024462,4.836507,0.58165,0.420489,0.110299,0.586327
1,24.768311,0.306895,0.226264,0.303186,0.110299,0.338433
2,-0.140375,0.171545,0.144541,0.154827,0.110299,0.18305
3,19.868496,33.733025,0.144541,0.342344,2.72268,0.109338
4,3.520661,0.101497,0.048329,0.072261,0.18034,0.049769


In [83]:
y_truth.head()

Unnamed: 0,Glucose (g/L),Lactate (g/L),Ethanol (g/L),Acetate (g/L),Biomass (g/L),Formate (g/L)
0,30.0,25.0,0.5,2.0,0.0,1.0
1,50.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.5,0.0,0.0
3,34.739658,40.735563,0.0,0.0,3.91524,0.0
4,8.66,2.18,0.0,0.0,0.72,0.0


In [80]:
# r2 score for all target variables
r2 = r2_score(y_truth, y_predicted)
r2

0.42913293043177275

In [82]:
# r2 score for each target variable
r2 = r2_score(y_truth, y_predicted, multioutput='raw_values')

# convert to a dataframe
r2 = pd.DataFrame(r2, index=target.columns, columns=['R^2 Score'])
r2

Unnamed: 0,R^2 Score
Glucose (g/L),0.549681
Lactate (g/L),0.568295
Ethanol (g/L),0.156038
Acetate (g/L),0.144207
Biomass (g/L),0.857244
Formate (g/L),0.299333


### The combined r2 score is: **0.42913293043177275** [curr]

The combined r2 score is: **0.35816286323129254** *[2]*

The combined r2 score is: **0.2843934946185081** *[3]*

The combined r2 score is: **0.33809704738378416** *[4]*


Which is an average score.

Individually for each target variable, the r2 score has high variability. With Biomass being predicted most accurately with a score of 0.857244, and acetate being predicted most poorly with a score of 0.144207.

---

# 2. With the complete dataset undergone PCA

In [85]:
y_predicted = pd.DataFrame(columns=target.columns)
y_truth = pd.DataFrame(columns=target.columns)

In [87]:
for col in target.columns:
    y = target[col]
    X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=0)

    # Dimensionality Reduction
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # convert back to a dataframe
    X_train_pca = pd.DataFrame(X_train_pca)
    X_test_pca = pd.DataFrame(X_test_pca)

    # split training data into training and validation data
    X_train, X_val, y_train, y_val = train_test_split(X_train_pca, y_train, test_size=0.2, random_state=0)

    model = NN_model(X_train, y_train, X_val, y_val, X_test_pca, y_test)

    y_predicted[col] = model.predict(X_test_pca).flatten()
    y_truth[col] = y_test.values

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [88]:
y_predicted.head()

Unnamed: 0,Glucose (g/L),Lactate (g/L),Ethanol (g/L),Acetate (g/L),Biomass (g/L),Formate (g/L)
0,16.82399,3.329969,0.414321,0.30851,0.066362,0.21609
1,6.614796,-0.00238,0.108477,0.093146,0.05184,0.093849
2,2.313177,0.181588,0.114306,0.090248,0.062036,0.109636
3,22.379244,31.750568,0.108698,0.034434,2.765123,0.156546
4,5.483399,0.295539,0.143515,0.050766,0.284747,0.075028


In [89]:
y_truth.head()

Unnamed: 0,Glucose (g/L),Lactate (g/L),Ethanol (g/L),Acetate (g/L),Biomass (g/L),Formate (g/L)
0,30.0,25.0,0.5,2.0,0.0,1.0
1,50.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.5,0.0,0.0
3,34.739658,40.735563,0.0,0.0,3.91524,0.0
4,8.66,2.18,0.0,0.0,0.72,0.0


In [90]:
# r2 score for all target variables
r2 = r2_score(y_truth, y_predicted)
r2

0.35816286323129254

In [91]:
# r2 score for each target variable
r2 = r2_score(y_truth, y_predicted, multioutput='raw_values')

# convert to a dataframe
r2 = pd.DataFrame(r2, index=target.columns, columns=['R^2 Score'])
r2

Unnamed: 0,R^2 Score
Glucose (g/L),0.057638
Lactate (g/L),0.604816
Ethanol (g/L),0.128175
Acetate (g/L),0.165103
Biomass (g/L),0.884916
Formate (g/L),0.308329


The combined r2 score is: **0.42913293043177275** [1]

### The combined r2 score is: **0.35816286323129254** [curr]

The combined r2 score is: **0.2843934946185081** [3]

The combined r2 score is: **0.33809704738378416** [4]


Which is an average score.

Individually for each target variable, the r2 score has high variability. With Biomass being predicted most accurately with a score of 0.884916 (higher than before) and Glucose being predicted most poorly with a score of 0.057638.

---

# 3. With the selected features dataset for each target variable

In [102]:
y_predicted = pd.DataFrame(columns=target.columns)
y_truth = pd.DataFrame(columns=target.columns)

In [103]:
glucose = pd.read_csv('modified_data/data_glucose2.csv')
lactate = pd.read_csv('modified_data/data_lactate.csv')
ethanol = pd.read_csv('modified_data/data_ethanol.csv')
acetate = pd.read_csv('modified_data/data_acetate.csv')
biomass = pd.read_csv('modified_data/data_biomass.csv')
formate = pd.read_csv('modified_data/data_formate.csv')

In [104]:
datasets = [glucose, lactate, ethanol, acetate, biomass, formate]

In [105]:
for dataset in datasets:
    # y is the last column in the dataset
    y = dataset.iloc[:, -1]
    X = dataset.iloc[:, :-1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # split training data into training and validation data
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

    model = NN_model(X_train, y_train, X_val, y_val, X_test, y_test)

    y_predicted[dataset.columns[-1]] = model.predict(X_test).flatten()
    y_truth[dataset.columns[-1]] = y_test.values

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [106]:
y_predicted.head()

Unnamed: 0,Glucose (g/L),Lactate (g/L),Ethanol (g/L),Acetate (g/L),Biomass (g/L),Formate (g/L)
0,12.044641,5.81445,0.337607,0.181046,0.139045,0.216647
1,8.300912,5.204381,0.208007,0.117998,0.127974,0.154979
2,4.568072,5.015829,0.12705,0.089438,0.165664,0.138042
3,19.185236,15.85512,0.501427,0.325027,3.737548,0.416546
4,3.817746,3.584099,0.064924,0.037689,0.362659,0.129893


In [107]:
y_truth.head()

Unnamed: 0,Glucose (g/L),Lactate (g/L),Ethanol (g/L),Acetate (g/L),Biomass (g/L),Formate (g/L)
0,30.0,25.0,0.5,2.0,0.0,1.0
1,50.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.5,0.0,0.0
3,34.739658,40.735563,0.0,0.0,3.91524,0.0
4,8.66,2.18,0.0,0.0,0.72,0.0


In [108]:
# r2 score for all target variables
r2 = r2_score(y_truth, y_predicted)
r2

0.2843934946185081

In [109]:
# r2 score for each target variable
r2 = r2_score(y_truth, y_predicted, multioutput='raw_values')

# convert to a dataframe
r2 = pd.DataFrame(r2, index=target.columns, columns=['R^2 Score'])
r2

Unnamed: 0,R^2 Score
Glucose (g/L),0.052366
Lactate (g/L),0.16148
Ethanol (g/L),0.079092
Acetate (g/L),0.117112
Biomass (g/L),0.970402
Formate (g/L),0.325907


The combined r2 score is: **0.42913293043177275** [1]

The combined r2 score is: **0.35816286323129254** [2]

### The combined r2 score is: **0.2843934946185081** [curr] THE WORST

The combined r2 score is: **0.33809704738378416** [4]

Which is a below average score. :(

BUT

Individually for each target variable, the r2 score has high variability. With Biomass being predicted most accurately with a score of ***0.970402***, and acetate being predicted most poorly with a score of 0.052366.

---

# 4. With the selected features dataset (after dimensionality reduction by PCA) for each target variable

In [93]:
y_predicted = pd.DataFrame(columns=target.columns)
y_truth = pd.DataFrame(columns=target.columns)

In [94]:
glucose = pd.read_csv('modified_data/data_glucose2.csv')
lactate = pd.read_csv('modified_data/data_lactate.csv')
ethanol = pd.read_csv('modified_data/data_ethanol.csv')
acetate = pd.read_csv('modified_data/data_acetate.csv')
biomass = pd.read_csv('modified_data/data_biomass.csv')
formate = pd.read_csv('modified_data/data_formate.csv')

In [95]:
datasets = [glucose, lactate, ethanol, acetate, biomass, formate]

In [97]:
for dataset in datasets:
    # y is the last column in the dataset
    y = dataset.iloc[:, -1]
    X = dataset.iloc[:, :-1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Dimensionality Reduction
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # convert back to a dataframe
    X_train_pca = pd.DataFrame(X_train_pca)
    X_test_pca = pd.DataFrame(X_test_pca)

    # split training data into training and validation data
    X_train, X_val, y_train, y_val = train_test_split(X_train_pca, y_train, test_size=0.2, random_state=0)

    model = NN_model(X_train, y_train, X_val, y_val, X_test_pca, y_test)

    y_predicted[dataset.columns[-1]] = model.predict(X_test_pca).flatten()
    y_truth[dataset.columns[-1]] = y_test.values

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [98]:
y_predicted.head()

Unnamed: 0,Glucose (g/L),Lactate (g/L),Ethanol (g/L),Acetate (g/L),Biomass (g/L),Formate (g/L)
0,12.639973,6.241774,0.941666,0.830453,0.02564,0.414972
1,8.473408,5.801205,0.087066,0.071283,0.021768,0.034079
2,4.326695,6.053002,0.077514,0.087387,0.025963,0.231419
3,19.522985,18.556988,0.031337,0.012789,3.682764,-0.010527
4,4.298193,2.921015,0.030391,0.047471,0.881406,0.037265


In [99]:
y_truth.head()

Unnamed: 0,Glucose (g/L),Lactate (g/L),Ethanol (g/L),Acetate (g/L),Biomass (g/L),Formate (g/L)
0,30.0,25.0,0.5,2.0,0.0,1.0
1,50.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.5,0.0,0.0
3,34.739658,40.735563,0.0,0.0,3.91524,0.0
4,8.66,2.18,0.0,0.0,0.72,0.0


In [100]:
# r2 score for all target variables
r2 = r2_score(y_truth, y_predicted)
r2

0.33809704738378416

In [101]:
# r2 score for each target variable
r2 = r2_score(y_truth, y_predicted, multioutput='raw_values')

# convert to a dataframe
r2 = pd.DataFrame(r2, index=target.columns, columns=['R^2 Score'])
r2

Unnamed: 0,R^2 Score
Glucose (g/L),0.069825
Lactate (g/L),0.182412
Ethanol (g/L),0.157033
Acetate (g/L),0.253995
Biomass (g/L),0.993745
Formate (g/L),0.371572


The combined r2 score is: **0.42913293043177275** [1]

The combined r2 score is: **0.35816286323129254** [2]

The combined r2 score is: **0.2843934946185081** [3]

### The combined r2 score is: **0.33809704738378416** [curr]

Which is an average score. Worse than previous models.

BUT

Individually for each target variable, the r2 score has high variability. With Biomass being predicted most accurately with a score of ***0.993745***, and acetate being predicted most poorly with a score of 0.069825.

---