In [1]:
!pip install rarfile
!pip install catboost

Collecting rarfile
  Downloading rarfile-4.0-py3-none-any.whl (28 kB)
Installing collected packages: rarfile
Successfully installed rarfile-4.0
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [36]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
import rarfile
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Specify the path and filename of the RAR file
rar_path = '/content/drive/MyDrive/data/covid.rar'


# Specify the directory where you want to extract the files
extract_directory = '/content/data'
# Open the RAR file
with rarfile.RarFile(rar_path, 'r') as rar_ref:
    # Extract all the files to the specified directory
    rar_ref.extractall(extract_directory)

print('File extracted successfully.')

File extracted successfully.


In [5]:
# Load the dataset
train_data = pd.read_csv('/content/data/covid/train.csv')
train_data["id_seqpos"] = [id.split('_')[-1] for id in train_data["id_seqpos"]]

test_data = pd.read_csv('/content/data/covid/test.csv')
test_id = test_data['id_seqpos']
test_data["id_seqpos"] = [id.split('_')[-1] for id in test_data["id_seqpos"]]

# Drop target variables and id column
# Extract the multi-label target features
target_columns = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']
target_features = train_data[target_columns]
train_data = train_data.drop(['id'] + ["id_seqpos"], axis=1)

test_data=test_data.drop(['id']+["id_seqpos"], axis=1)

In [6]:
train_data

Unnamed: 0,sequence,structure,predicted_loop_type,reactivity,reactivity_error,deg_Mg_pH10,deg_error_Mg_pH10,deg_pH10,deg_error_pH10,deg_Mg_50C,...,b4_structure,a4_structure,b4_predicted_loop_type,a4_predicted_loop_type,b5_sequence,a5_sequence,b5_structure,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type
0,A,.,H,1.7786,0.1195,0.7346,0.0790,0.3680,0.0851,0.7228,...,(,),S,S,C,A,.,),I,S
1,G,.,I,0.4397,0.1033,0.1630,0.0799,0.0878,0.0878,0.1100,...,(,.,S,H,C,C,.,.,B,H
2,G,(,S,0.1120,0.1375,0.0000,0.0744,0.1835,0.1969,0.0933,...,(,(,S,S,A,G,(,(,S,S
3,G,(,S,0.0622,0.0471,0.1179,0.1075,0.0657,0.0615,0.0548,...,.,(,E,S,A,U,.,(,E,S
4,U,(,S,0.1378,0.1352,0.4069,0.2168,0.3375,0.2623,0.2378,...,(,(,S,S,G,G,(,.,S,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130555,U,),S,0.1244,0.0393,0.2672,0.0728,0.1479,0.0493,0.1921,...,),),S,S,A,A,.,.,H,M
130556,A,.,X,0.0201,0.0351,0.1271,0.0583,0.0580,0.0660,0.0245,...,.,.,H,X,A,U,.,.,H,X
130557,A,.,H,0.4597,0.0601,0.2014,0.0515,0.2838,0.0793,0.2042,...,.,.,H,I,U,U,.,),H,S
130558,G,.,I,0.3634,0.1063,0.1928,0.0738,0.1885,0.1276,0.1627,...,),.,S,I,G,G,),),S,S


In [7]:
# Assuming df is your DataFrame

# Merge sequences
sequence_columns = ['sequence', 'b1_sequence', 'a1_sequence', 'b2_sequence', 'a2_sequence',
                    'b3_sequence', 'a3_sequence', 'b4_sequence', 'a4_sequence',
                    'b5_sequence', 'a5_sequence']
train_data['merged_sequence'] = train_data[sequence_columns].apply(lambda row: ''.join(row), axis=1)

# Merge structures
structure_columns = ['structure', 'b1_structure', 'a1_structure', 'b2_structure', 'a2_structure',
                     'b3_structure', 'a3_structure', 'b4_structure', 'a4_structure',
                     'b5_structure', 'a5_structure']
train_data['merged_structure'] = train_data[structure_columns].apply(lambda row: ''.join(row), axis=1)

# Merge predicted loop types
loop_type_columns = ['predicted_loop_type', 'b1_predicted_loop_type', 'a1_predicted_loop_type',
                     'b2_predicted_loop_type', 'a2_predicted_loop_type',
                     'b3_predicted_loop_type', 'a3_predicted_loop_type',
                     'b4_predicted_loop_type', 'a4_predicted_loop_type',
                     'b5_predicted_loop_type', 'a5_predicted_loop_type']
train_data['merged_predicted_loop_type'] = train_data[loop_type_columns].apply(lambda row: ''.join(row), axis=1)

In [8]:
train_data

Unnamed: 0,sequence,structure,predicted_loop_type,reactivity,reactivity_error,deg_Mg_pH10,deg_error_Mg_pH10,deg_pH10,deg_error_pH10,deg_Mg_50C,...,a4_predicted_loop_type,b5_sequence,a5_sequence,b5_structure,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type,merged_sequence,merged_structure,merged_predicted_loop_type
0,A,.,H,1.7786,0.1195,0.7346,0.0790,0.3680,0.0851,0.7228,...,S,C,A,.,),I,S,ACUGCUGGCCA,.(.(.()().),HSHSHSSSSIS
1,G,.,I,0.4397,0.1033,0.1630,0.0799,0.0878,0.0878,0.1100,...,H,C,C,.,.,B,H,GGACCAGGUCC,.(.(((((...,ISISSSSSHBH
2,G,(,S,0.1120,0.1375,0.0000,0.0744,0.1835,0.1969,0.0933,...,S,A,G,(,(,S,S,GUGUGUAUUAG,(((((((((((,SSSSSSSSSSS
3,G,(,S,0.0622,0.0471,0.1179,0.1075,0.0657,0.0615,0.0548,...,S,A,U,.,(,E,S,GCGGCGGAGAU,(((((((.(.(,SSSSSSSESES
4,U,(,S,0.1378,0.1352,0.4069,0.2168,0.3375,0.2623,0.2378,...,S,G,G,(,.,S,H,UGACCAGUAGG,(.((((((((.,SBSSSSSSSSH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130555,U,),S,0.1244,0.0393,0.2672,0.0728,0.1479,0.0493,0.1921,...,S,A,A,.,.,H,M,UUGGAAGUAAA,)).).))))..,SSISISSSSHM
130556,A,.,X,0.0201,0.0351,0.1271,0.0583,0.0580,0.0660,0.0245,...,X,A,U,.,.,H,X,AUACGGAAAAU,.).).).....,XSXSXSXHXHX
130557,A,.,H,0.4597,0.0601,0.2014,0.0515,0.2838,0.0793,0.2042,...,I,U,U,.,),H,S,AUCACAAUGUU,..).).)...),HHSHSHSHIHS
130558,G,.,I,0.3634,0.1063,0.1928,0.0738,0.1885,0.1276,0.1627,...,I,G,G,),),S,S,GCGGUGGCGGG,.))))).).)),ISSSSSISISS


In [9]:
# Assuming df is your DataFrame

# Merge sequences
sequence_columns = ['sequence', 'b1_sequence', 'a1_sequence', 'b2_sequence', 'a2_sequence',
                    'b3_sequence', 'a3_sequence', 'b4_sequence', 'a4_sequence',
                    'b5_sequence', 'a5_sequence']
test_data['merged_sequence'] = test_data[sequence_columns].apply(lambda row: ''.join(row), axis=1)

# Merge structures
structure_columns = ['structure', 'b1_structure', 'a1_structure', 'b2_structure', 'a2_structure',
                     'b3_structure', 'a3_structure', 'b4_structure', 'a4_structure',
                     'b5_structure', 'a5_structure']
test_data['merged_structure'] = test_data[structure_columns].apply(lambda row: ''.join(row), axis=1)

# Merge predicted loop types
loop_type_columns = ['predicted_loop_type', 'b1_predicted_loop_type', 'a1_predicted_loop_type',
                     'b2_predicted_loop_type', 'a2_predicted_loop_type',
                     'b3_predicted_loop_type', 'a3_predicted_loop_type',
                     'b4_predicted_loop_type', 'a4_predicted_loop_type',
                     'b5_predicted_loop_type', 'a5_predicted_loop_type']
test_data['merged_predicted_loop_type'] = test_data[loop_type_columns].apply(lambda row: ''.join(row), axis=1)

# Drop the original sequence, structure, and predicted_loop_type columns if needed
#train_data.drop(sequence_columns + structure_columns + loop_type_columns, axis=1, inplace=True)

# Now your DataFrame has 'merged_sequence', 'merged_structure', and 'merged_predicted_loop_type' columns


In [10]:
def featurize(df):

    df['total_A_count'] = df['merged_sequence'].apply(lambda s: s.count('A'))
    df['total_G_count'] = df['merged_sequence'].apply(lambda s: s.count('G'))


    return df

In [11]:
train_data = featurize(train_data)
test_data = featurize(test_data)

In [12]:
train_data = train_data.drop(['merged_sequence', 'merged_structure', 'merged_predicted_loop_type'], axis=1)
test_data = test_data.drop(['merged_sequence', 'merged_structure', 'merged_predicted_loop_type'], axis=1)


In [13]:
train_data

Unnamed: 0,sequence,structure,predicted_loop_type,reactivity,reactivity_error,deg_Mg_pH10,deg_error_Mg_pH10,deg_pH10,deg_error_pH10,deg_Mg_50C,...,b4_predicted_loop_type,a4_predicted_loop_type,b5_sequence,a5_sequence,b5_structure,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type,total_A_count,total_G_count
0,A,.,H,1.7786,0.1195,0.7346,0.0790,0.3680,0.0851,0.7228,...,S,S,C,A,.,),I,S,2,3
1,G,.,I,0.4397,0.1033,0.1630,0.0799,0.0878,0.0878,0.1100,...,S,H,C,C,.,.,B,H,2,4
2,G,(,S,0.1120,0.1375,0.0000,0.0744,0.1835,0.1969,0.0933,...,S,S,A,G,(,(,S,S,2,4
3,G,(,S,0.0622,0.0471,0.1179,0.1075,0.0657,0.0615,0.0548,...,E,S,A,U,.,(,E,S,2,6
4,U,(,S,0.1378,0.1352,0.4069,0.2168,0.3375,0.2623,0.2378,...,S,S,G,G,(,.,S,H,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130555,U,),S,0.1244,0.0393,0.2672,0.0728,0.1479,0.0493,0.1921,...,S,S,A,A,.,.,H,M,5,3
130556,A,.,X,0.0201,0.0351,0.1271,0.0583,0.0580,0.0660,0.0245,...,H,X,A,U,.,.,H,X,6,2
130557,A,.,H,0.4597,0.0601,0.2014,0.0515,0.2838,0.0793,0.2042,...,H,I,U,U,.,),H,S,4,1
130558,G,.,I,0.3634,0.1063,0.1928,0.0738,0.1885,0.1276,0.1627,...,S,I,G,G,),),S,S,0,8


In [27]:
# Identify categorical columns
from sklearn.preprocessing import LabelEncoder
categorical_columns = train_data.select_dtypes(include=['object']).columns

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply LabelEncoder to all categorical columns in both training and test data
for column in categorical_columns:
    # Fit LabelEncoder using training data and apply to test data
    label_encoder.fit(train_data[column])
    train_data[column] = label_encoder.transform(train_data[column])

    # Transform test data with a 'transform' call, not 'fit_transform'
    test_data[column] = test_data[column].apply(lambda x: label_encoder.transform([x])[0] if x in label_encoder.classes_ else -1)  # Use -1 for unseen labels

In [15]:
'''import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Assume you have the 'train_data' and 'test_data' DataFrames

# Identify categorical columns
categorical_columns = train_data.select_dtypes(include=['object']).columns

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit and transform OneHotEncoder on training data
encoded_train = onehot_encoder.fit_transform(train_data[categorical_columns])
encoded_train_df = pd.DataFrame(encoded_train, columns=onehot_encoder.get_feature_names_out(categorical_columns))

# Transform test data using the trained OneHotEncoder
encoded_test = onehot_encoder.transform(test_data[categorical_columns])
encoded_test_df = pd.DataFrame(encoded_test, columns=encoded_train_df.columns)

# Replace categorical columns with encoded columns in both training and test data
train_data_encoded = pd.concat([train_data.drop(categorical_columns, axis=1), encoded_train_df], axis=1)
test_data_encoded = pd.concat([test_data.drop(categorical_columns, axis=1), encoded_test_df], axis=1)'''


"import pandas as pd\nfrom sklearn.preprocessing import OneHotEncoder\n\n# Assume you have the 'train_data' and 'test_data' DataFrames\n\n# Identify categorical columns\ncategorical_columns = train_data.select_dtypes(include=['object']).columns\n\n# Initialize OneHotEncoder\nonehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')\n\n# Fit and transform OneHotEncoder on training data\nencoded_train = onehot_encoder.fit_transform(train_data[categorical_columns])\nencoded_train_df = pd.DataFrame(encoded_train, columns=onehot_encoder.get_feature_names_out(categorical_columns))\n\n# Transform test data using the trained OneHotEncoder\nencoded_test = onehot_encoder.transform(test_data[categorical_columns])\nencoded_test_df = pd.DataFrame(encoded_test, columns=encoded_train_df.columns)\n\n# Replace categorical columns with encoded columns in both training and test data\ntrain_data_encoded = pd.concat([train_data.drop(categorical_columns, axis=1), encoded_train_df], axis=1)\ntest

In [31]:
# Split data in features and labels
target = train_data[['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']]
train_data_encoded = train_data.drop(['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C'], axis=1)


In [17]:
# Create a Random Forest model
model = RandomForestRegressor()  # For regression



# Fit the model on the training data
model.fit(train_data_encoded, target["reactivity"])

# Get feature importance scores
feature_importances = model.feature_importances_

# Create a DataFrame to store feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': train_data_encoded.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance scores in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)



In [32]:
# Normalize specified features using Min-Max scaling
scaler = MinMaxScaler()
features_to_normalize = ['reactivity_error', 'deg_pH10', 'deg_error_50C']
train_data_encoded[features_to_normalize] = scaler.fit_transform(train_data_encoded[features_to_normalize])

# Convert scaled values to a DataFrame for better display
train_data_encoded[features_to_normalize] = pd.DataFrame(train_data_encoded, columns=features_to_normalize)

# Display the scaled values with formatting
pd.set_option('display.float_format', '{:.10f}'.format)
train_data_encoded

Unnamed: 0,sequence,structure,predicted_loop_type,reactivity_error,deg_error_Mg_pH10,deg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_50C,deg_error_50C,...,b4_predicted_loop_type,a4_predicted_loop_type,b5_sequence,a5_sequence,b5_structure,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type,total_A_count,total_G_count
0,0,2,2,0.0000008046,0.0790000000,0.5041000039,0.0851000000,0.0792000000,0.5685000000,0.0000003636,...,6,5,2,0,3,1,4,5,2,3
1,2,2,3,0.0000006938,0.0799000000,0.5009529800,0.0878000000,0.0683000000,0.0666000000,0.0000002947,...,6,2,2,1,3,2,1,2,2,4
2,2,0,5,0.0000009278,0.0744000000,0.5020278201,0.1969000000,0.1163000000,0.0000000000,0.0000003773,...,6,5,1,2,0,0,6,5,2,4
3,2,0,5,0.0000003093,0.1075000000,0.5007047671,0.0615000000,0.0506000000,0.0749000000,0.0000003127,...,2,5,1,3,3,0,2,5,2,6
4,3,0,5,0.0000009121,0.2168000000,0.5037574478,0.2623000000,0.1569000000,0.6247000000,0.0000012188,...,6,5,3,2,0,2,6,2,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130555,3,1,5,0.0000002559,0.0728000000,0.5016279840,0.0493000000,0.0481000000,0.4711000000,0.0000003017,...,6,5,1,0,3,2,3,4,5,3
130556,0,2,6,0.0000002272,0.0583000000,0.5006182858,0.0660000000,0.0419000000,0.2559000000,0.0000003592,...,3,6,1,3,3,2,3,6,6,2
130557,0,2,2,0.0000003982,0.0515000000,0.5031543244,0.0793000000,0.0528000000,0.3089000000,0.0000002936,...,3,3,4,3,3,1,3,5,4,1
130558,2,2,3,0.0000007143,0.0738000000,0.5020839768,0.1276000000,0.0773000000,0.1001000000,0.0000004311,...,6,3,3,2,1,1,6,5,0,8


In [33]:
# Assuming you have loaded your data into X_train, y_train, X_test, y_test, and defined target_features
X_train, X_test, y_train, y_test = train_test_split(train_data_encoded[feature_importance_df["Feature"]], target, test_size=0.20,random_state=42)

In [25]:
# Assuming you have loaded your data into X_train, y_train, X_test, y_test, and defined target_features

# Create a MultiOutputRegressor with CatBoost
catboost_model = MultiOutputRegressor(CatBoostRegressor(iterations=100, random_state=42))

# Train the model
catboost_model.fit(X_train, y_train)

# Predict target values on the test set
y_pred = catboost_model.predict(X_test)

# Calculate MCRMSE for each target
mcrmse_per_target = []
for i in range(len(target_features.columns)):
    mse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i])
    mcrmse = np.sqrt(mse)
    mcrmse_per_target.append(mcrmse)

# Calculate average MCRMSE across all targets
average_mcrmse = np.mean(mcrmse_per_target)

# Print results
print("CatBoost Model")
for i, target in enumerate(target_features.columns):
    print(f"MCRMSE for {target}: {mcrmse_per_target[i]}")
print(f"Average MCRMSE: {average_mcrmse}\n")


Learning rate set to 0.5
0:	learn: 0.6499621	total: 35.6ms	remaining: 3.53s
1:	learn: 0.6070330	total: 83.9ms	remaining: 4.11s
2:	learn: 0.5866013	total: 109ms	remaining: 3.51s
3:	learn: 0.5751177	total: 143ms	remaining: 3.43s
4:	learn: 0.5661343	total: 177ms	remaining: 3.37s
5:	learn: 0.5552534	total: 202ms	remaining: 3.17s
6:	learn: 0.5480625	total: 231ms	remaining: 3.07s
7:	learn: 0.5425874	total: 260ms	remaining: 2.99s
8:	learn: 0.5353585	total: 283ms	remaining: 2.86s
9:	learn: 0.5299929	total: 308ms	remaining: 2.77s
10:	learn: 0.5256369	total: 345ms	remaining: 2.79s
11:	learn: 0.5199443	total: 371ms	remaining: 2.72s
12:	learn: 0.5145917	total: 401ms	remaining: 2.68s
13:	learn: 0.5123207	total: 427ms	remaining: 2.62s
14:	learn: 0.5093327	total: 453ms	remaining: 2.57s
15:	learn: 0.5070123	total: 485ms	remaining: 2.55s
16:	learn: 0.4994166	total: 512ms	remaining: 2.5s
17:	learn: 0.4971379	total: 561ms	remaining: 2.55s
18:	learn: 0.4934228	total: 585ms	remaining: 2.49s
19:	learn: 0.49

In [34]:
# Create a MultiOutputRegressor with Random Forest
random_forest_model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))

# Train the model
random_forest_model.fit(X_train, y_train)

# Predict target values on the test set
y_pred = random_forest_model.predict(X_test)

# Calculate MCRMSE for each target
mcrmse_per_target = []
for i in range(len(target_features.columns)):
    mse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i])
    mcrmse = np.sqrt(mse)
    mcrmse_per_target.append(mcrmse)

# Calculate average MCRMSE across all targets
average_mcrmse = np.mean(mcrmse_per_target)

# Print results
print("Random Forest Model")
for i, target in enumerate(target_features.columns):
    print(f"MCRMSE for {target}: {mcrmse_per_target[i]}")
print(f"Average MCRMSE: {average_mcrmse}\n")


Random Forest Model
MCRMSE for reactivity: 0.5053379881973803
MCRMSE for deg_Mg_pH10: 0.4091480744873961
MCRMSE for deg_Mg_50C: 0.4756372520364208
Average MCRMSE: 0.46337443824039903



In [35]:
# Create a MultiOutputRegressor with GradientBoostingRegressor
GradientBoostingRegressor_model = MultiOutputRegressor(GradientBoostingRegressor(n_estimators=100, random_state=42))

# Train the model
GradientBoostingRegressor_model.fit(X_train, y_train)

# Predict target values on the test set
y_pred = GradientBoostingRegressor_model.predict(X_test)

# Calculate MCRMSE for each target
mcrmse_per_target = []
for i in range(len(target_features.columns)):
    mse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i])
    mcrmse = np.sqrt(mse)
    mcrmse_per_target.append(mcrmse)

# Calculate average MCRMSE across all targets
average_mcrmse = np.mean(mcrmse_per_target)

# Print results
print("Random Forest Model")
for i, target in enumerate(target_features.columns):
    print(f"MCRMSE for {target}: {mcrmse_per_target[i]}")
print(f"Average MCRMSE: {average_mcrmse}\n")


Random Forest Model
MCRMSE for reactivity: 0.5180035197191839
MCRMSE for deg_Mg_pH10: 0.4380689117295897
MCRMSE for deg_Mg_50C: 0.5417245504379395
Average MCRMSE: 0.4992656606289044



In [39]:
# Create a MultiOutputRegressor with XGBoostRegressor
xgboost_model = MultiOutputRegressor(XGBRegressor(
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=2,
    reg_alpha=1,
    n_estimators=500,
    learning_rate=0.12,
    random_state=42
))

# Train the model
xgboost_model.fit(X_train, y_train)

# Predict target values on the test set
y_pred = xgboost_model.predict(X_test)

# Calculate MCRMSE for each target
mcrmse_per_target = []
for i in range(len(target_features.columns)):
    mse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i])
    mcrmse = np.sqrt(mse)
    mcrmse_per_target.append(mcrmse)

# Calculate average MCRMSE across all targets
average_mcrmse = np.mean(mcrmse_per_target)

# Print results
print("XGBoost Model")
for i, target in enumerate(target_features.columns):
    print(f"MCRMSE for {target}: {mcrmse_per_target[i]}")
print(f"Average MCRMSE: {average_mcrmse}\n")


XGBoost Model
MCRMSE for reactivity: 0.3432676294430101
MCRMSE for deg_Mg_pH10: 0.32900570328836226
MCRMSE for deg_Mg_50C: 0.35347859427959305
Average MCRMSE: 0.34191730900365513



In [40]:
# Create a MultiOutputRegressor with LinearRegression
linear_regression_model = MultiOutputRegressor(LinearRegression())

# Train the model
linear_regression_model.fit(X_train, y_train)

# Predict target values on the test set
y_pred = linear_regression_model.predict(X_test)

# Calculate MCRMSE for each target
mcrmse_per_target = []
for i in range(len(target_features.columns)):
    mse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i])
    mcrmse = np.sqrt(mse)
    mcrmse_per_target.append(mcrmse)

# Calculate average MCRMSE across all targets
average_mcrmse = np.mean(mcrmse_per_target)

# Print results
print("Linear Regression Model")
for i, target in enumerate(target_features.columns):
    print(f"MCRMSE for {target}: {mcrmse_per_target[i]}")
print(f"Average MCRMSE: {average_mcrmse}\n")


Linear Regression Model
MCRMSE for reactivity: 0.5742745895262231
MCRMSE for deg_Mg_pH10: 0.5831883497835979
MCRMSE for deg_Mg_50C: 0.6961495731885803
Average MCRMSE: 0.6178708374994671



In [37]:
# Create a MultiOutputRegressor with ExtraTreesRegressor
extra_trees_model = MultiOutputRegressor(ExtraTreesRegressor(n_estimators=100, random_state=42))

# Train the model
extra_trees_model.fit(X_train, y_train)

# Predict target values on the test set
y_pred = extra_trees_model.predict(X_test)

# Calculate MCRMSE for each target
mcrmse_per_target = []
for i in range(len(target_features.columns)):
    mse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i])
    mcrmse = np.sqrt(mse)
    mcrmse_per_target.append(mcrmse)

# Calculate average MCRMSE across all targets
average_mcrmse = np.mean(mcrmse_per_target)

# Print results
print("Extra Trees Model")
for i, target in enumerate(target_features.columns):
    print(f"MCRMSE for {target}: {mcrmse_per_target[i]}")
print(f"Average MCRMSE: {average_mcrmse}\n")


Extra Trees Model
MCRMSE for reactivity: 0.525190703976936
MCRMSE for deg_Mg_pH10: 0.44084416539035465
MCRMSE for deg_Mg_50C: 0.5438405325793291
Average MCRMSE: 0.5032918006488732



In [41]:
# Create a MultiOutputRegressor with AdaBoostRegressor
adaboost_model = MultiOutputRegressor(AdaBoostRegressor(n_estimators=100, random_state=42))

# Train the model
adaboost_model.fit(X_train, y_train)

# Predict target values on the test set
y_pred = adaboost_model.predict(X_test)

# Calculate MCRMSE for each target
mcrmse_per_target = []
for i in range(len(target_features.columns)):
    mse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i])
    mcrmse = np.sqrt(mse)
    mcrmse_per_target.append(mcrmse)

# Calculate average MCRMSE across all targets
average_mcrmse = np.mean(mcrmse_per_target)

# Print results
print("AdaBoost Model")
for i, target in enumerate(target_features.columns):
    print(f"MCRMSE for {target}: {mcrmse_per_target[i]}")
print(f"Average MCRMSE: {average_mcrmse}\n")


AdaBoost Model
MCRMSE for reactivity: 1.3042072630291706
MCRMSE for deg_Mg_pH10: 0.9598297309951456
MCRMSE for deg_Mg_50C: 1.0797297570598197
Average MCRMSE: 1.1145889170280452



In [42]:
test_data

features_to_normalize = ['reactivity_error', 'deg_pH10', 'deg_error_50C']
test_data[features_to_normalize] = scaler.transform(test_data[features_to_normalize])

# Convert scaled values to a DataFrame for better display
test_data[features_to_normalize] = pd.DataFrame(test_data, columns=features_to_normalize)

# Display the scaled values with formatting
pd.set_option('display.float_format', '{:.10f}'.format)
test_data

Unnamed: 0,sequence,structure,predicted_loop_type,reactivity_error,deg_error_Mg_pH10,deg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_50C,deg_error_50C,...,b4_predicted_loop_type,a4_predicted_loop_type,b5_sequence,a5_sequence,b5_structure,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type,total_A_count,total_G_count
0,2,2,1,0.0000009169,0.2613000000,0.5262201457,0.2631000000,0.1501000000,0.6382000000,0.0000007752,...,0,1,0,0,2,0,0,5,4,2
1,0,2,1,0.0000008861,0.1798000000,0.5029263280,0.1000000000,0.1369000000,0.7877000000,0.0000006041,...,2,5,0,1,2,0,0,5,4,3
2,3,0,5,0.0000005043,0.1056000000,0.5005621290,0.0517000000,0.0705000000,0.0585000000,0.0000002113,...,2,2,1,3,3,2,2,2,5,1
3,1,0,5,0.0000007307,0.1896000000,0.5059733929,0.1474000000,0.1588000000,0.7349000000,0.0000006413,...,6,2,1,0,3,2,2,2,5,1
4,3,2,2,0.0000009367,0.1810000000,0.5086138831,0.1417000000,0.1382000000,1.0442000000,0.0000006314,...,6,2,4,2,0,1,6,5,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32635,0,0,5,0.0000000493,0.0344000000,0.5001241064,0.0303000000,0.0284000000,0.0182000000,0.0000001147,...,6,5,2,0,0,2,6,2,2,4
32636,0,1,5,0.0000001492,0.0390000000,0.5008721142,0.0529000000,0.0361000000,0.0237000000,0.0000001409,...,6,5,3,2,1,1,6,5,2,4
32637,0,2,0,0.0000004352,0.0672000000,0.5061867886,0.1022000000,0.0694000000,0.4242000000,0.0000003592,...,5,5,3,1,1,1,6,5,2,4
32638,0,1,5,0.0000001362,0.0391000000,0.5018110550,0.0611000000,0.0520000000,0.1120000000,0.0000002017,...,6,5,4,1,1,0,6,5,2,2


In [43]:
# Predict target values on the test set
y_pred = xgboost_model.predict(test_data[feature_importance_df["Feature"]])

In [44]:
# Create a DataFrame for the predictions
prediction_df = pd.DataFrame(y_pred, columns=['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C'])
prediction_df['id_seqpos'] = test_id
prediction_df = prediction_df[['id_seqpos', 'reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']]

# Save the predictions to a CSV file
prediction_df.to_csv('predictions.csv', index=False)

In [45]:
prediction_df

Unnamed: 0,id_seqpos,reactivity,deg_Mg_pH10,deg_Mg_50C
0,id_001f94081_0,0.5212914944,0.7746857405,0.5664007664
1,id_001f94081_4,0.6886702180,0.5260664225,0.5813571215
2,id_001f94081_8,0.2849986255,0.1862083375,0.1218476892
3,id_001f94081_9,0.3524409235,0.8843256831,1.0188076496
4,id_001f94081_13,1.1513599157,1.3702285290,1.1130084991
...,...,...,...,...
32635,id_fff546103_39,-0.0115549425,0.0846866295,0.0484697334
32636,id_fff546103_52,0.0480578281,0.1677568108,0.1085437313
32637,id_fff546103_62,0.5024493933,0.5958237052,0.4661529958
32638,id_fff546103_65,0.0465703085,0.1841044128,0.2244262993
