In [27]:
import os
import numpy as np
import pandas as pd
import joblib

In [28]:
# Load models
nb = joblib.load('models/binary_classification/nb_model.pkl')


In [29]:
# Load second dataset
df = pd.read_csv("second_data/training_data.csv")  # update path if needed
npz_dir = "second_data/npz_data"  # folder with .npz files


In [30]:
df.head()

Unnamed: 0,id,mz_low,mz_high,Name,root_region_intensity,m/z,spec_id
0,0,87.0007,87.0157,Pyruvic acid,3749.830566,87.0082,901a7d45-9bb9-48ae-aab0-77e82d482817
1,1,88.0328,88.0478,L-Alanine,3823.749268,88.0403,6bb3a4ac-966b-4629-9424-a30b444e4d78
2,2,89.0164,89.0314,Lactic acid,899.031799,89.0239,ce1be6c8-a38f-4df1-abdf-af193d56328b
3,4,104.0277,104.0427,Serine,542.10144,104.0352,6a4ad015-c341-457e-83df-a9e68beaa5b5
4,5,108.0049,108.0199,Hypotaurine,191.645645,108.0124,c54facc5-1090-40a7-b9cd-6dabb6c20b74


In [31]:
# loading npz 
import os
spec_id=df.loc[0,'spec_id']
npz_path=os.path.join(npz_dir,f"{spec_id}.npz")

with np.load(npz_path) as npz:
    mz = npz['mz']
    intensities = npz['intensities']

print("\nFirst Spec ID:", spec_id)
print("m/z shape:", mz.shape)
print("Intensity shape:", intensities.shape)
print("\nSample m/z values:", mz[:10])
print("Sample intensities:", intensities[:10])


First Spec ID: 901a7d45-9bb9-48ae-aab0-77e82d482817
m/z shape: (80,)
Intensity shape: (80,)

Sample m/z values: [86.99064875 86.9910837  86.99151866 86.99195361 86.99238857 86.99282354
 86.9932585  86.99369347 86.99412844 86.99456341]
Sample intensities: [4.8089743 8.8732    5.7162895 7.9441743 5.0840626 6.9757705 5.213771
 9.002102  4.0344863 9.366201 ]


In [32]:
# Build data for regression
X_reg = []
y_reg = []

for _, row in df.iterrows():
    spec_id = row['spec_id']
    mz_val = row['m/z']
    mz_low = row['mz_low']
    mz_high = row['mz_high']
    npz_path = os.path.join(npz_dir, f"{spec_id}.npz")

    if not os.path.exists(npz_path):
        continue
    with np.load(npz_path) as data:
        mz = data['mz']
        intensities = data['intensities']

    # Normalize mz and intensity
    norm_mz = (mz - np.min(mz)) / (np.max(mz) - np.min(mz) + 1e-8)
    norm_intensities = intensities / (np.max(intensities) + 1e-8)

    # Combine: [m/z from CSV] + mz spectrum + intensity spectrum
    features = np.concatenate([norm_mz, norm_intensities])

    # Apply classifier
    if nb.predict([features])[0] == 0:  # Not Discarded
        X_reg.append(features)
        y_reg.append([mz_low, mz_high])

In [33]:
from sklearn.model_selection import train_test_split
# Convert to arrays
X_reg = np.array(X_reg)
y_reg = np.array(y_reg)

# Split and train regression model
X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

In [34]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

reg = RandomForestRegressor(random_state=42)
reg.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [35]:
# Predict and evaluate
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae= mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R² Score:", r2)

Mean Squared Error: 2554.013563485544
Mean Absolute Error: 38.38731961790022
R² Score: 0.9597190977527585


In [36]:
from sklearn.linear_model import Ridge, Lasso

ridge = Ridge(alpha=1.0, random_state=42)  # you can tune alpha
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

In [37]:
mse = mean_squared_error(y_test, y_pred_ridge)
mae= mean_absolute_error(y_test, y_pred_ridge)
r2 = r2_score(y_test, y_pred_ridge)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R² Score:", r2)

Mean Squared Error: 108495.90151935612
Mean Absolute Error: 240.332965665785
R² Score: -0.711154893690285


In [38]:
lasso = Lasso(alpha=0.01, random_state=42)  # alpha must be small due to multi-output
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

  model = cd_fast.enet_coordinate_descent(


In [39]:
mse = mean_squared_error(y_test, y_pred_lasso)
mae= mean_absolute_error(y_test, y_pred_lasso)
r2 = r2_score(y_test, y_pred_lasso)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R² Score:", r2)

Mean Squared Error: 350471.0241684929
Mean Absolute Error: 488.27853710381333
R² Score: -4.527491815463552


## with 3 features

In [41]:
rf = joblib.load('models/binary_classification/rf_model.pkl')


In [42]:
# Build data for regression
X_reg = []
y_reg = []

for _, row in df.iterrows():
    spec_id = row['spec_id']
    mz_val = row['m/z']
    mz_low = row['mz_low']
    mz_high = row['mz_high']
    npz_path = os.path.join(npz_dir, f"{spec_id}.npz")

    if not os.path.exists(npz_path):
        continue
    with np.load(npz_path) as data:
        mz = data['mz']
        intensities = data['intensities']

    # Normalize mz and intensity
    norm_mz = (mz - np.min(mz)) / (np.max(mz) - np.min(mz) + 1e-8)
    norm_intensities = intensities / (np.max(intensities) + 1e-8)

    # Combine: [m/z from CSV] + mz spectrum + intensity spectrum
    features = np.concatenate([[mz_val],norm_mz, norm_intensities])

    # Apply classifier
    if nb.predict([features])[0] == 0:  # Not Discarded
        X_reg.append(features)
        y_reg.append([mz_low, mz_high])

In [43]:
from sklearn.model_selection import train_test_split
# Convert to arrays
X_reg = np.array(X_reg)
y_reg = np.array(y_reg)

# Split and train regression model
X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

### Random Forest

In [44]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

reg = RandomForestRegressor(random_state=42)
reg.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [45]:
# Predict and evaluate
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae= mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R² Score:", r2)

Mean Squared Error: 16952.865857417193
Mean Absolute Error: 101.66557127962896
R² Score: 0.7794682363145833


### Ridge and Lasso Regression

In [46]:
from sklearn.linear_model import Ridge, Lasso

ridge = Ridge(alpha=1.0, random_state=42)  # you can tune alpha
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

In [47]:
mse = mean_squared_error(y_test, y_pred_ridge)
mae= mean_absolute_error(y_test, y_pred_ridge)
r2 = r2_score(y_test, y_pred_ridge)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R² Score:", r2)

Mean Squared Error: 7.915733837625562e-06
Mean Absolute Error: 0.00223942406054789
R² Score: 0.9999999998970264


In [48]:
lasso = Lasso(alpha=0.01, random_state=42)  # alpha must be small due to multi-output
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

In [49]:
mse = mean_squared_error(y_test, y_pred_lasso)
mae= mean_absolute_error(y_test, y_pred_lasso)
r2 = r2_score(y_test, y_pred_lasso)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R² Score:", r2)

Mean Squared Error: 1.3852105549596192e-05
Mean Absolute Error: 0.0020706792416831377
R² Score: 0.9999999998198015


### XGBoost

In [50]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
model = MultiOutputRegressor(xgb.XGBRegressor(
    objective='reg:squarederror',  # regression objective
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=42
))

model.fit(X_train, y_train)

0,1,2
,estimator,"XGBRegressor(...ree=None, ...)"
,n_jobs,

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [51]:
y_pred=model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae= mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R² Score:", r2)

Mean Squared Error: 20607.142676155003
Mean Absolute Error: 102.36768092060235
R² Score: 0.7319314881446758
