<a href="https://colab.research.google.com/github/RainXie23/MANE4961---Machine-Learning-for-Engineering-Fall-2025-/blob/main/MLE_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
url = "https://www.fpbase.org/api/proteins/?format=csv"
df = pd.read_csv(url)
df.to_csv("fpbase_with_sequences.csv", index=False)
print(df.columns)

Index(['agg', 'doi', 'genbank', 'ipg_id', 'name', 'pdb', 'pdb.0', 'pdb.1',
       'pdb.10', 'pdb.2',
       ...
       'transitions.2.to_state', 'transitions.2.trans_wave',
       'transitions.3.from_state', 'transitions.3.to_state',
       'transitions.3.trans_wave', 'transitions.4.from_state',
       'transitions.4.to_state', 'transitions.4.trans_wave', 'uniprot',
       'uuid'],
      dtype='object', length=117)


In [22]:
!pip install biopython
from Bio.SeqUtils import molecular_weight

df_no_ambiguous = df[df['seq'].apply(lambda x: ('X' not in x and 'Z' not in x and 'B' not in x) if isinstance(x, str) else False)]
df_no_ambiguous['mol_weight'] = df_no_ambiguous['seq'].apply(lambda s: molecular_weight(s, seq_type='protein') if isinstance(s, str) else None)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_ambiguous['mol_weight'] = df_no_ambiguous['seq'].apply(lambda s: molecular_weight(s, seq_type='protein') if isinstance(s, str) else None)


In [26]:
df = df_no_ambiguous[
    (df_no_ambiguous['switch_type'] == 'b') &
    df_no_ambiguous['agg'].notna() &
    (df_no_ambiguous['agg'] != '') &
    df_no_ambiguous['states.0.em_max'].notna() &
    df_no_ambiguous['states.0.brightness'].notna()
]

len(df['name'])

435

In [34]:
import collections
import numpy as np

amino_acids_list = list('ARNDCQEGHILKMFPSTWYV')
aaindex = {aa: i for i, aa in enumerate(amino_acids_list)}

def Seq2OneHot(seq, amino_acid_alphabet):
  max_len = max(len(s) for s in df['seq'])
  one_hot_seq = np.zeros((max_len, len(amino_acid_alphabet)))
  for i, char in enumerate(seq):
    if i < max_len and char in amino_acid_alphabet:
      one_hot_seq[i, aaindex[char]] = 1
  return one_hot_seq

def Position(seq, amino_acid_alphabet):
  most_aa_in_seq = 0
  if isinstance(seq, str) and len(seq) > 0:
    counts = collections.Counter(seq)
    if counts:
      most_aa_in_seq = max(counts.values())

  # Ensure most_aa is at least 1 to avoid issues with empty sequences or zero counts
  if most_aa_in_seq == 0 and len(seq) > 0: # If there are AAs but counts are all zero, it's an issue
      most_aa_in_seq = 1 # Smallest possible dimension for position if seq is not empty
  elif len(seq) == 0:
      return np.zeros((len(amino_acid_alphabet), 0)) # Return empty array if sequence is empty

  position = np.zeros((len(amino_acid_alphabet), most_aa_in_seq))
  aa_occurrence_count = {aa: 0 for aa in amino_acid_alphabet}

  for i, char_in_seq in enumerate(seq):
    if char_in_seq in aaindex:
      aa_index = aaindex[char_in_seq]
      occurrence_idx = aa_occurrence_count[char_in_seq]
      if occurrence_idx < most_aa_in_seq:
          position[aa_index, occurrence_idx] = 1
      aa_occurrence_count[char_in_seq] += 1
  return position

In [35]:
df['seq_one_hot'] = df['seq'].apply(lambda s: Seq2OneHot(s, amino_acids_list))
df['seq_position'] = df['seq'].apply(lambda s: Position(s, amino_acids_list))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['seq_one_hot'] = df['seq'].apply(lambda s: Seq2OneHot(s, amino_acids_list))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['seq_position'] = df['seq'].apply(lambda s: Position(s, amino_acids_list))


In [36]:
def Agg2Num(agg):
  if agg == 'm':
    return 1
  elif agg == 'd' or agg == 'wd':
    return 2
  elif agg == 'td':
    return 2.5
  elif agg == 't':
    return 4
  else:
    return np.nan # Handle cases where agg might not match known categories

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np # Ensure numpy is imported if used in this cell for np.array

# Create a numeric representation for 'agg'
df['agg_numeric'] = df['agg'].apply(Agg2Num)

# Define target variables (y) for multi-output regression
# Using .values to get a numpy array from the DataFrame subset
y = df[['agg_numeric', 'states.0.em_max', 'states.0.brightness']].values

# Flatten the 'seq_one_hot' arrays into a 2D numpy array for X
# First, determine the maximum sequence length used for one-hot encoding
# The 'amino_acids_list' is already defined in a previous cell.
max_len_seq = max(len(s) for s in df['seq'])
num_amino_acids = len(amino_acids_list) # From aaindex and amino_acids_list in previous cells

# Each entry in df['seq_one_hot'] is a numpy array of shape (max_len_seq, num_amino_acids)
# We flatten each of these arrays into a 1D vector and stack them.
X = np.array([arr.flatten() for arr in df['seq_one_hot']])

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['agg_numeric'] = df['agg'].apply(Agg2Num)


In [38]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [39]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Define the Neural Network model
model = Sequential([
    Input(shape=(x_train.shape[1],)), # Input layer expects flattened feature vector
    Dense(256, activation='relu'), # First hidden layer
    Dense(128, activation='relu'), # Second hidden layer
    Dense(64, activation='relu'),  # Third hidden layer
    Dense(y_train.shape[1])       # Output layer: 3 neurons for 'agg_numeric', 'em_max', 'brightness'
])

# Compile the model
# Using Adam optimizer and Mean Squared Error (MSE) loss for regression
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae', 'mse'])

# Setup Early Stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
print("Training the Neural Network model...")
history = model.fit(
    x_train,
    y_train,
    epochs=100, # Increased epochs, but early stopping will manage it
    batch_size=32,
    validation_split=0.2, # Use a portion of training data for validation
    callbacks=[early_stopping], # Add early stopping callback
    verbose=1
)
print("Model training complete.")

# Evaluate the model on the test set
print("\nEvaluating the model on the test set...")
loss, mae, mse = model.evaluate(x_test, y_test, verbose=0)
print(f"Test Loss (MSE): {loss:.4f}")
print(f"Test MAE: {mae:.4f}")
print(f"Test MSE: {mse:.4f}")

# Make predictions on the test set
y_pred = model.predict(x_test)

print("\nSample predictions vs actual values (first 5 test samples):")
for i in range(5):
    print(f"  Sample {i+1}:")
    print(f"    Predicted: {y_pred[i]}")
    print(f"    Actual:    {y_test[i]}")

Training the Neural Network model...
Epoch 1/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 88ms/step - loss: 109112.6094 - mae: 201.0556 - mse: 109112.6094 - val_loss: 94510.6719 - val_mae: 191.0801 - val_mse: 94510.6719
Epoch 2/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 93225.2188 - mae: 190.4466 - mse: 93225.2188 - val_loss: 68303.3984 - val_mae: 170.3733 - val_mse: 68303.3984
Epoch 3/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 58ms/step - loss: 55148.5156 - mae: 153.9451 - mse: 55148.5156 - val_loss: 22206.7734 - val_mae: 89.1603 - val_mse: 22206.7734
Epoch 4/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - loss: 12845.4717 - mae: 72.5647 - mse: 12845.4717 - val_loss: 13198.9336 - val_mae: 66.5615 - val_mse: 13198.9336
Epoch 5/100
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step - loss: 5836.8926 - mae: 49.6996 - mse: 5836.8926 - val_loss: 10115.4121