In [786]:
!pip install rarfile
!pip install catboost



In [787]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
import rarfile
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [788]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [789]:
# Specify the path and filename of the RAR file
rar_path = '/content/drive/MyDrive/data/covid.rar'


# Specify the directory where you want to extract the files
extract_directory = '/content/data'
# Open the RAR file
with rarfile.RarFile(rar_path, 'r') as rar_ref:
    # Extract all the files to the specified directory
    rar_ref.extractall(extract_directory)

print('File extracted successfully.')

File extracted successfully.


In [790]:
# Load the dataset
train_data = pd.read_csv('/content/data/covid/train.csv')
train_data["id_seqpos"] = [id.split('_')[-1] for id in train_data["id_seqpos"]]

test_data = pd.read_csv('/content/data/covid/test.csv')
test_id = test_data['id_seqpos']
test_data["id_seqpos"] = [id.split('_')[-1] for id in test_data["id_seqpos"]]

# Drop target variables and id column
# Extract the multi-label target features
target_columns = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']
target_features = train_data[target_columns]
train_data = train_data.drop(['id'] + ["id_seqpos"], axis=1)

test_data=test_data.drop(['id']+["id_seqpos"], axis=1)

In [791]:
train_data

Unnamed: 0,sequence,structure,predicted_loop_type,reactivity,reactivity_error,deg_Mg_pH10,deg_error_Mg_pH10,deg_pH10,deg_error_pH10,deg_Mg_50C,...,b4_structure,a4_structure,b4_predicted_loop_type,a4_predicted_loop_type,b5_sequence,a5_sequence,b5_structure,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type
0,A,.,H,1.7786000000,0.1195000000,0.7346000000,0.0790000000,0.3680000000,0.0851000000,0.7228000000,...,(,),S,S,C,A,.,),I,S
1,G,.,I,0.4397000000,0.1033000000,0.1630000000,0.0799000000,0.0878000000,0.0878000000,0.1100000000,...,(,.,S,H,C,C,.,.,B,H
2,G,(,S,0.1120000000,0.1375000000,0.0000000000,0.0744000000,0.1835000000,0.1969000000,0.0933000000,...,(,(,S,S,A,G,(,(,S,S
3,G,(,S,0.0622000000,0.0471000000,0.1179000000,0.1075000000,0.0657000000,0.0615000000,0.0548000000,...,.,(,E,S,A,U,.,(,E,S
4,U,(,S,0.1378000000,0.1352000000,0.4069000000,0.2168000000,0.3375000000,0.2623000000,0.2378000000,...,(,(,S,S,G,G,(,.,S,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130555,U,),S,0.1244000000,0.0393000000,0.2672000000,0.0728000000,0.1479000000,0.0493000000,0.1921000000,...,),),S,S,A,A,.,.,H,M
130556,A,.,X,0.0201000000,0.0351000000,0.1271000000,0.0583000000,0.0580000000,0.0660000000,0.0245000000,...,.,.,H,X,A,U,.,.,H,X
130557,A,.,H,0.4597000000,0.0601000000,0.2014000000,0.0515000000,0.2838000000,0.0793000000,0.2042000000,...,.,.,H,I,U,U,.,),H,S
130558,G,.,I,0.3634000000,0.1063000000,0.1928000000,0.0738000000,0.1885000000,0.1276000000,0.1627000000,...,),.,S,I,G,G,),),S,S


In [792]:
# Assuming df is your DataFrame

# Merge sequences
sequence_columns = ['sequence', 'b1_sequence', 'a1_sequence', 'b2_sequence', 'a2_sequence',
                    'b3_sequence', 'a3_sequence', 'b4_sequence', 'a4_sequence',
                    'b5_sequence', 'a5_sequence']
train_data['merged_sequence'] = train_data[sequence_columns].apply(lambda row: ''.join(row), axis=1)

# Merge structures
structure_columns = ['structure', 'b1_structure', 'a1_structure', 'b2_structure', 'a2_structure',
                     'b3_structure', 'a3_structure', 'b4_structure', 'a4_structure',
                     'b5_structure', 'a5_structure']
train_data['merged_structure'] = train_data[structure_columns].apply(lambda row: ''.join(row), axis=1)

# Merge predicted loop types
loop_type_columns = ['predicted_loop_type', 'b1_predicted_loop_type', 'a1_predicted_loop_type',
                     'b2_predicted_loop_type', 'a2_predicted_loop_type',
                     'b3_predicted_loop_type', 'a3_predicted_loop_type',
                     'b4_predicted_loop_type', 'a4_predicted_loop_type',
                     'b5_predicted_loop_type', 'a5_predicted_loop_type']
train_data['merged_predicted_loop_type'] = train_data[loop_type_columns].apply(lambda row: ''.join(row), axis=1)

In [793]:
train_data

Unnamed: 0,sequence,structure,predicted_loop_type,reactivity,reactivity_error,deg_Mg_pH10,deg_error_Mg_pH10,deg_pH10,deg_error_pH10,deg_Mg_50C,...,a4_predicted_loop_type,b5_sequence,a5_sequence,b5_structure,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type,merged_sequence,merged_structure,merged_predicted_loop_type
0,A,.,H,1.7786000000,0.1195000000,0.7346000000,0.0790000000,0.3680000000,0.0851000000,0.7228000000,...,S,C,A,.,),I,S,ACUGCUGGCCA,.(.(.()().),HSHSHSSSSIS
1,G,.,I,0.4397000000,0.1033000000,0.1630000000,0.0799000000,0.0878000000,0.0878000000,0.1100000000,...,H,C,C,.,.,B,H,GGACCAGGUCC,.(.(((((...,ISISSSSSHBH
2,G,(,S,0.1120000000,0.1375000000,0.0000000000,0.0744000000,0.1835000000,0.1969000000,0.0933000000,...,S,A,G,(,(,S,S,GUGUGUAUUAG,(((((((((((,SSSSSSSSSSS
3,G,(,S,0.0622000000,0.0471000000,0.1179000000,0.1075000000,0.0657000000,0.0615000000,0.0548000000,...,S,A,U,.,(,E,S,GCGGCGGAGAU,(((((((.(.(,SSSSSSSESES
4,U,(,S,0.1378000000,0.1352000000,0.4069000000,0.2168000000,0.3375000000,0.2623000000,0.2378000000,...,S,G,G,(,.,S,H,UGACCAGUAGG,(.((((((((.,SBSSSSSSSSH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130555,U,),S,0.1244000000,0.0393000000,0.2672000000,0.0728000000,0.1479000000,0.0493000000,0.1921000000,...,S,A,A,.,.,H,M,UUGGAAGUAAA,)).).))))..,SSISISSSSHM
130556,A,.,X,0.0201000000,0.0351000000,0.1271000000,0.0583000000,0.0580000000,0.0660000000,0.0245000000,...,X,A,U,.,.,H,X,AUACGGAAAAU,.).).).....,XSXSXSXHXHX
130557,A,.,H,0.4597000000,0.0601000000,0.2014000000,0.0515000000,0.2838000000,0.0793000000,0.2042000000,...,I,U,U,.,),H,S,AUCACAAUGUU,..).).)...),HHSHSHSHIHS
130558,G,.,I,0.3634000000,0.1063000000,0.1928000000,0.0738000000,0.1885000000,0.1276000000,0.1627000000,...,I,G,G,),),S,S,GCGGUGGCGGG,.))))).).)),ISSSSSISISS


In [794]:
# Assuming df is your DataFrame

# Merge sequences
sequence_columns = ['sequence', 'b1_sequence', 'a1_sequence', 'b2_sequence', 'a2_sequence',
                    'b3_sequence', 'a3_sequence', 'b4_sequence', 'a4_sequence',
                    'b5_sequence', 'a5_sequence']
test_data['merged_sequence'] = test_data[sequence_columns].apply(lambda row: ''.join(row), axis=1)

# Merge structures
structure_columns = ['structure', 'b1_structure', 'a1_structure', 'b2_structure', 'a2_structure',
                     'b3_structure', 'a3_structure', 'b4_structure', 'a4_structure',
                     'b5_structure', 'a5_structure']
test_data['merged_structure'] = test_data[structure_columns].apply(lambda row: ''.join(row), axis=1)

# Merge predicted loop types
loop_type_columns = ['predicted_loop_type', 'b1_predicted_loop_type', 'a1_predicted_loop_type',
                     'b2_predicted_loop_type', 'a2_predicted_loop_type',
                     'b3_predicted_loop_type', 'a3_predicted_loop_type',
                     'b4_predicted_loop_type', 'a4_predicted_loop_type',
                     'b5_predicted_loop_type', 'a5_predicted_loop_type']
test_data['merged_predicted_loop_type'] = test_data[loop_type_columns].apply(lambda row: ''.join(row), axis=1)

# Drop the original sequence, structure, and predicted_loop_type columns if needed
#train_data.drop(sequence_columns + structure_columns + loop_type_columns, axis=1, inplace=True)

# Now your DataFrame has 'merged_sequence', 'merged_structure', and 'merged_predicted_loop_type' columns


In [795]:
def featurize(df):

    df['total_A_count'] = df['sequence'].apply(lambda s: s.count('A'))
    df['total_G_count'] = df['sequence'].apply(lambda s: s.count('G'))
    df['total_U_count'] = df['sequence'].apply(lambda s: s.count('U'))
    df['total_C_count'] = df['sequence'].apply(lambda s: s.count('C'))

    df['total_dot_count'] = df['structure'].apply(lambda s: s.count('.'))
    df['total_ob_count'] = df['structure'].apply(lambda s: s.count('('))
    df['total_cb_count'] = df['structure'].apply(lambda s: s.count(')'))

    return df

In [796]:
train_data = featurize(train_data)
test_data = featurize(test_data)

In [797]:
train_data = train_data.drop(['merged_sequence', 'merged_structure', 'merged_predicted_loop_type'], axis=1)
test_data = test_data.drop(['merged_sequence', 'merged_structure', 'merged_predicted_loop_type'], axis=1)


In [798]:
train_data

Unnamed: 0,sequence,structure,predicted_loop_type,reactivity,reactivity_error,deg_Mg_pH10,deg_error_Mg_pH10,deg_pH10,deg_error_pH10,deg_Mg_50C,...,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type,total_A_count,total_G_count,total_U_count,total_C_count,total_dot_count,total_ob_count,total_cb_count
0,A,.,H,1.7786000000,0.1195000000,0.7346000000,0.0790000000,0.3680000000,0.0851000000,0.7228000000,...,),I,S,1,0,0,0,1,0,0
1,G,.,I,0.4397000000,0.1033000000,0.1630000000,0.0799000000,0.0878000000,0.0878000000,0.1100000000,...,.,B,H,0,1,0,0,1,0,0
2,G,(,S,0.1120000000,0.1375000000,0.0000000000,0.0744000000,0.1835000000,0.1969000000,0.0933000000,...,(,S,S,0,1,0,0,0,1,0
3,G,(,S,0.0622000000,0.0471000000,0.1179000000,0.1075000000,0.0657000000,0.0615000000,0.0548000000,...,(,E,S,0,1,0,0,0,1,0
4,U,(,S,0.1378000000,0.1352000000,0.4069000000,0.2168000000,0.3375000000,0.2623000000,0.2378000000,...,.,S,H,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130555,U,),S,0.1244000000,0.0393000000,0.2672000000,0.0728000000,0.1479000000,0.0493000000,0.1921000000,...,.,H,M,0,0,1,0,0,0,1
130556,A,.,X,0.0201000000,0.0351000000,0.1271000000,0.0583000000,0.0580000000,0.0660000000,0.0245000000,...,.,H,X,1,0,0,0,1,0,0
130557,A,.,H,0.4597000000,0.0601000000,0.2014000000,0.0515000000,0.2838000000,0.0793000000,0.2042000000,...,),H,S,1,0,0,0,1,0,0
130558,G,.,I,0.3634000000,0.1063000000,0.1928000000,0.0738000000,0.1885000000,0.1276000000,0.1627000000,...,),S,S,0,1,0,0,1,0,0


In [799]:
# Identify categorical columns
from sklearn.preprocessing import LabelEncoder
categorical_columns = train_data.select_dtypes(include=['object']).columns

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply LabelEncoder to all categorical columns in both training and test data
for column in categorical_columns:
    # Fit LabelEncoder using training data and apply to test data
    label_encoder.fit(train_data[column])
    train_data[column] = label_encoder.transform(train_data[column])

    # Transform test data with a 'transform' call, not 'fit_transform'
    test_data[column] = test_data[column].apply(lambda x: label_encoder.transform([x])[0] if x in label_encoder.classes_ else -1)  # Use -1 for unseen labels

In [800]:
'''import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Assume you have the 'train_data' and 'test_data' DataFrames

# Identify categorical columns
categorical_columns = train_data.select_dtypes(include=['object']).columns

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit and transform OneHotEncoder on training data
encoded_train = onehot_encoder.fit_transform(train_data[categorical_columns])
encoded_train_df = pd.DataFrame(encoded_train, columns=onehot_encoder.get_feature_names_out(categorical_columns))

# Transform test data using the trained OneHotEncoder
encoded_test = onehot_encoder.transform(test_data[categorical_columns])
encoded_test_df = pd.DataFrame(encoded_test, columns=encoded_train_df.columns)

# Replace categorical columns with encoded columns in both training and test data
train_data_encoded = pd.concat([train_data.drop(categorical_columns, axis=1), encoded_train_df], axis=1)
test_data_encoded = pd.concat([test_data.drop(categorical_columns, axis=1), encoded_test_df], axis=1)'''


"import pandas as pd\nfrom sklearn.preprocessing import OneHotEncoder\n\n# Assume you have the 'train_data' and 'test_data' DataFrames\n\n# Identify categorical columns\ncategorical_columns = train_data.select_dtypes(include=['object']).columns\n\n# Initialize OneHotEncoder\nonehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')\n\n# Fit and transform OneHotEncoder on training data\nencoded_train = onehot_encoder.fit_transform(train_data[categorical_columns])\nencoded_train_df = pd.DataFrame(encoded_train, columns=onehot_encoder.get_feature_names_out(categorical_columns))\n\n# Transform test data using the trained OneHotEncoder\nencoded_test = onehot_encoder.transform(test_data[categorical_columns])\nencoded_test_df = pd.DataFrame(encoded_test, columns=encoded_train_df.columns)\n\n# Replace categorical columns with encoded columns in both training and test data\ntrain_data_encoded = pd.concat([train_data.drop(categorical_columns, axis=1), encoded_train_df], axis=1)\ntest

In [801]:
# Split data in features and labels
target = train_data[['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']]
train_data_encoded = train_data.drop(['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C'], axis=1)


In [838]:
# Create a Random Forest model
model = RandomForestRegressor()  # For regression



# Fit the model on the training data
model.fit(train_data_encoded, target["reactivity"])

# Get feature importance scores
feature_importances = model.feature_importances_

# Create a DataFrame to store feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': train_data_encoded.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance scores in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)



In [839]:
# Create a Random Forest model
model = RandomForestRegressor()  # For regression



# Fit the model on the training data
model.fit(train_data_encoded, target["deg_Mg_pH10"])

# Get feature importance scores
feature_importances = model.feature_importances_

# Create a DataFrame to store feature names and their importance scores
feature_importance_df1 = pd.DataFrame({'Feature': train_data_encoded.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance scores in descending order
feature_importance_df1 = feature_importance_df.sort_values(by='Importance', ascending=False)



In [840]:
# Create a Random Forest model
model = RandomForestRegressor()  # For regression



# Fit the model on the training data
model.fit(train_data_encoded, target["deg_Mg_50C"])

# Get feature importance scores
feature_importances = model.feature_importances_

# Create a DataFrame to store feature names and their importance scores
feature_importance_df2 = pd.DataFrame({'Feature': train_data_encoded.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance scores in descending order
feature_importance_df2 = feature_importance_df.sort_values(by='Importance', ascending=False)



In [841]:
# Normalize specified features using Min-Max scaling
scaler = MinMaxScaler()
features_to_normalize = ['reactivity_error', 'deg_pH10', 'deg_error_50C']
train_data_encoded[features_to_normalize] = scaler.fit_transform(train_data_encoded[features_to_normalize])

# Convert scaled values to a DataFrame for better display
train_data_encoded[features_to_normalize] = pd.DataFrame(train_data_encoded, columns=features_to_normalize)

# Display the scaled values with formatting
pd.set_option('display.float_format', '{:.10f}'.format)
train_data_encoded

Unnamed: 0,sequence,structure,predicted_loop_type,reactivity_error,deg_error_Mg_pH10,deg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_50C,deg_error_50C,...,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type,total_A_count,total_G_count,total_U_count,total_C_count,total_dot_count,total_ob_count,total_cb_count
0,0,2,2,0.0000008046,0.0790000000,0.5041000039,0.0851000000,0.0792000000,0.5685000000,0.0000003636,...,1,4,5,1,0,0,0,1,0,0
1,2,2,3,0.0000006938,0.0799000000,0.5009529800,0.0878000000,0.0683000000,0.0666000000,0.0000002947,...,2,1,2,0,1,0,0,1,0,0
2,2,0,5,0.0000009278,0.0744000000,0.5020278201,0.1969000000,0.1163000000,0.0000000000,0.0000003773,...,0,6,5,0,1,0,0,0,1,0
3,2,0,5,0.0000003093,0.1075000000,0.5007047671,0.0615000000,0.0506000000,0.0749000000,0.0000003127,...,0,2,5,0,1,0,0,0,1,0
4,3,0,5,0.0000009121,0.2168000000,0.5037574478,0.2623000000,0.1569000000,0.6247000000,0.0000012188,...,2,6,2,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130555,3,1,5,0.0000002559,0.0728000000,0.5016279840,0.0493000000,0.0481000000,0.4711000000,0.0000003017,...,2,3,4,0,0,1,0,0,0,1
130556,0,2,6,0.0000002272,0.0583000000,0.5006182858,0.0660000000,0.0419000000,0.2559000000,0.0000003592,...,2,3,6,1,0,0,0,1,0,0
130557,0,2,2,0.0000003982,0.0515000000,0.5031543244,0.0793000000,0.0528000000,0.3089000000,0.0000002936,...,1,3,5,1,0,0,0,1,0,0
130558,2,2,3,0.0000007143,0.0738000000,0.5020839768,0.1276000000,0.0773000000,0.1001000000,0.0000004311,...,1,6,5,0,1,0,0,1,0,0


In [893]:
# Assuming you have loaded your data into X_train, y_train, X_test, y_test, and defined target_features
X_train, X_test, y_train, y_test = train_test_split(train_data_encoded[feature_importance_df["Feature"][0:13]], target, test_size=0.20)

# Split the data into training and testing sets
# Create a CatBoostRegressor
catboost_model = CatBoostRegressor(iterations=100)

# Train the model
catboost_model.fit(X_train, y_train["reactivity"])  # Replace "reactivity" with the desired label name

# Predict target values on the test set
y_pred = catboost_model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test["reactivity"], y_pred))

# Print results
print("CatBoost Model")
print(f"RMSE for reactivity: {rmse}\n")


Learning rate set to 0.5
0:	learn: 0.6603696	total: 16.9ms	remaining: 1.67s
1:	learn: 0.6230197	total: 31.5ms	remaining: 1.54s
2:	learn: 0.6062218	total: 46.7ms	remaining: 1.51s
3:	learn: 0.5956023	total: 64.2ms	remaining: 1.54s
4:	learn: 0.5844509	total: 79.1ms	remaining: 1.5s
5:	learn: 0.5746262	total: 94.5ms	remaining: 1.48s
6:	learn: 0.5673605	total: 110ms	remaining: 1.46s
7:	learn: 0.5593344	total: 129ms	remaining: 1.49s
8:	learn: 0.5539919	total: 145ms	remaining: 1.47s
9:	learn: 0.5503974	total: 165ms	remaining: 1.48s
10:	learn: 0.5448075	total: 181ms	remaining: 1.46s
11:	learn: 0.5418284	total: 205ms	remaining: 1.5s
12:	learn: 0.5363011	total: 220ms	remaining: 1.47s
13:	learn: 0.5336390	total: 238ms	remaining: 1.46s
14:	learn: 0.5320873	total: 256ms	remaining: 1.45s
15:	learn: 0.5276603	total: 272ms	remaining: 1.43s
16:	learn: 0.5237253	total: 287ms	remaining: 1.4s
17:	learn: 0.5196329	total: 301ms	remaining: 1.37s
18:	learn: 0.5186082	total: 319ms	remaining: 1.36s
19:	learn: 0.

In [849]:
# Assuming you have loaded your data into X_train, y_train, X_test, y_test, and defined target_features
X_train1, X_test1, y_train1, y_test1 = train_test_split(train_data_encoded[feature_importance_df1["Feature"][:18]], target, test_size=0.20)

# Split the data into training and testing sets
# Create a CatBoostRegressor
catboost_model = CatBoostRegressor(iterations=100)

# Train the model
catboost_model.fit(X_train1, y_train1["deg_Mg_pH10"])  # Replace "reactivity" with the desired label name

# Predict target values on the test set
y_pred = catboost_model.predict(X_test1)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test1["deg_Mg_pH10"], y_pred))

# Print results
print("CatBoost Model")
print(f"RMSE for reactivity: {rmse}\n")


Learning rate set to 0.5
0:	learn: 0.6144460	total: 16.6ms	remaining: 1.64s
1:	learn: 0.5683103	total: 32.1ms	remaining: 1.57s
2:	learn: 0.5449084	total: 47.7ms	remaining: 1.54s
3:	learn: 0.5335435	total: 63.8ms	remaining: 1.53s
4:	learn: 0.5246309	total: 79.6ms	remaining: 1.51s
5:	learn: 0.5177807	total: 108ms	remaining: 1.7s
6:	learn: 0.5118515	total: 129ms	remaining: 1.71s
7:	learn: 0.5024431	total: 145ms	remaining: 1.67s
8:	learn: 0.4971100	total: 163ms	remaining: 1.64s
9:	learn: 0.4935671	total: 180ms	remaining: 1.62s
10:	learn: 0.4883439	total: 201ms	remaining: 1.62s
11:	learn: 0.4829388	total: 218ms	remaining: 1.59s
12:	learn: 0.4782977	total: 233ms	remaining: 1.56s
13:	learn: 0.4750363	total: 251ms	remaining: 1.54s
14:	learn: 0.4708517	total: 270ms	remaining: 1.53s
15:	learn: 0.4686279	total: 288ms	remaining: 1.51s
16:	learn: 0.4652697	total: 305ms	remaining: 1.49s
17:	learn: 0.4622394	total: 321ms	remaining: 1.46s
18:	learn: 0.4594906	total: 338ms	remaining: 1.44s
19:	learn: 0

In [880]:
# Assuming you have loaded your data into X_train, y_train, X_test, y_test, and defined target_features
X_train2, X_test2, y_train2, y_test2 = train_test_split(train_data_encoded[feature_importance_df2["Feature"]], target, test_size=0.20)

# Split the data into training and testing sets
# Create a CatBoostRegressor
catboost_model = CatBoostRegressor(iterations=100)

# Train the model
catboost_model.fit(X_train2, y_train2["deg_Mg_50C"])  # Replace "reactivity" with the desired label name

# Predict target values on the test set
y_pred = catboost_model.predict(X_test2)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test2["deg_Mg_50C"], y_pred))

# Print results
print("CatBoost Model")
print(f"RMSE for reactivity: {rmse}\n")


Learning rate set to 0.5
0:	learn: 0.7098125	total: 58.1ms	remaining: 5.75s
1:	learn: 0.6569900	total: 110ms	remaining: 5.38s
2:	learn: 0.6347711	total: 162ms	remaining: 5.25s
3:	learn: 0.6226481	total: 232ms	remaining: 5.56s
4:	learn: 0.6078190	total: 276ms	remaining: 5.25s
5:	learn: 0.6018697	total: 342ms	remaining: 5.35s
6:	learn: 0.5950135	total: 404ms	remaining: 5.37s
7:	learn: 0.5881990	total: 452ms	remaining: 5.19s
8:	learn: 0.5824854	total: 494ms	remaining: 5s
9:	learn: 0.5780516	total: 558ms	remaining: 5.02s
10:	learn: 0.5750826	total: 609ms	remaining: 4.92s
11:	learn: 0.5702124	total: 667ms	remaining: 4.89s
12:	learn: 0.5656611	total: 718ms	remaining: 4.8s
13:	learn: 0.5631180	total: 781ms	remaining: 4.8s
14:	learn: 0.5578150	total: 818ms	remaining: 4.64s
15:	learn: 0.5552474	total: 864ms	remaining: 4.54s
16:	learn: 0.5538678	total: 901ms	remaining: 4.4s
17:	learn: 0.5505330	total: 924ms	remaining: 4.21s
18:	learn: 0.5471163	total: 947ms	remaining: 4.04s
19:	learn: 0.5445220	

In [894]:
# Create an XGBoost model
xgb_model = xgb.XGBRegressor(
 subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=2,
    reg_alpha=1,
    n_estimators = 800,
    learning_rate =0.18,
    max_depth = 5
)

# Train the model
xgb_model.fit(X_train, y_train["reactivity"])  # Replace "reactivity" with the desired label name

In [896]:
# Predict target values on the test set
y_pred = xgb_model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test["reactivity"], y_pred))

# Print results
print("XGBoost Model")
print(f"RMSE for reactivity: {rmse}\n")

XGBoost Model
RMSE for reactivity: 0.3504704295834129



In [865]:

# Assuming you have loaded your data into X_train, y_train, X_test, y_test, and defined target_features

# Create an XGBoost model
xgb_model1 = xgb.XGBRegressor(
   subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=2,
    reg_alpha=1,

    n_estimators = 800,
    learning_rate = 0.12,

)

# Train the model
xgb_model1.fit(X_train1, y_train1["deg_Mg_pH10"])  # Replace "reactivity" with the desired label name

# Predict target values on the test set
y_pred = xgb_model1.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test["deg_Mg_pH10"], y_pred))

# Print results
print("XGBoost Model")
print(f"RMSE for reactivity: {rmse}\n")


XGBoost Model
RMSE for reactivity: 0.26407720207456803



In [881]:
# Create an XGBoost model
xgb_model2 = XGBRegressor(
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=2,
    reg_alpha=1,
    n_estimators=800,
    learning_rate=0.12,
    max_depth=5
)

# Train the XGBoost model
xgb_model2.fit(X_train2, y_train2["deg_Mg_50C"])

# Predict target values using the XGBoost model
y_pred_xgb = xgb_model2.predict(X_test2)

# Calculate RMSE for XGBoost model
rmse_xgb = np.sqrt(mean_squared_error(y_test2["deg_Mg_50C"], y_pred_xgb))



print("XGBoost Model")
print(f"RMSE for deg_Mg_50C (XGBoost): {rmse_xgb}\n")


XGBoost Model
RMSE for deg_Mg_50C (XGBoost): 0.34474230633822217



In [867]:
test_data

features_to_normalize = ['reactivity_error', 'deg_pH10', 'deg_error_50C']
test_data[features_to_normalize] = scaler.transform(test_data[features_to_normalize])

# Convert scaled values to a DataFrame for better display
test_data[features_to_normalize] = pd.DataFrame(test_data, columns=features_to_normalize)

# Display the scaled values with formatting
pd.set_option('display.float_format', '{:.10f}'.format)
test_data

Unnamed: 0,sequence,structure,predicted_loop_type,reactivity_error,deg_error_Mg_pH10,deg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_50C,deg_error_50C,...,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type,total_A_count,total_G_count,total_U_count,total_C_count,total_dot_count,total_ob_count,total_cb_count
0,2,2,1,0.0000009169,0.2613000000,0.5262201457,0.2631000000,0.1501000000,0.6382000000,0.0000007752,...,0,0,5,0,1,0,0,1,0,0
1,0,2,1,0.0000008861,0.1798000000,0.5029263280,0.1000000000,0.1369000000,0.7877000000,0.0000006041,...,0,0,5,1,0,0,0,1,0,0
2,3,0,5,0.0000005043,0.1056000000,0.5005621290,0.0517000000,0.0705000000,0.0585000000,0.0000002113,...,2,2,2,0,0,1,0,0,1,0
3,1,0,5,0.0000007307,0.1896000000,0.5059733929,0.1474000000,0.1588000000,0.7349000000,0.0000006413,...,2,2,2,0,0,0,1,0,1,0
4,3,2,2,0.0000009367,0.1810000000,0.5086138831,0.1417000000,0.1382000000,1.0442000000,0.0000006314,...,1,6,5,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32635,0,0,5,0.0000000493,0.0344000000,0.5001241064,0.0303000000,0.0284000000,0.0182000000,0.0000001147,...,2,6,2,1,0,0,0,0,1,0
32636,0,1,5,0.0000001492,0.0390000000,0.5008721142,0.0529000000,0.0361000000,0.0237000000,0.0000001409,...,1,6,5,1,0,0,0,0,0,1
32637,0,2,0,0.0000004352,0.0672000000,0.5061867886,0.1022000000,0.0694000000,0.4242000000,0.0000003592,...,1,6,5,1,0,0,0,1,0,0
32638,0,1,5,0.0000001362,0.0391000000,0.5018110550,0.0611000000,0.0520000000,0.1120000000,0.0000002017,...,0,6,5,1,0,0,0,0,0,1


In [897]:

y_pred2 = xgb_model.predict(test_data[feature_importance_df["Feature"][0:13]])
y_pred3 = xgb_model1.predict(test_data[feature_importance_df1["Feature"][0:18]])

In [811]:

# Now remove the column from prediction_df3
#test_data2.drop(columns='deg_Mg_pH10', inplace=True)

In [898]:
y_pred = xgb_model2.predict(test_data[feature_importance_df2["Feature"]])

In [899]:
# Create a DataFrame for the predictions
prediction_dict = {
    'id_seqpos': test_id,
    'reactivity': y_pred2,
    'deg_Mg_pH10': y_pred3,
    'deg_Mg_50C': y_pred
}

prediction_df = pd.DataFrame(prediction_dict)

# Save the predictions to a CSV file
prediction_df.to_csv('predictions.csv', index=False)

In [900]:
prediction_df

Unnamed: 0,id_seqpos,reactivity,deg_Mg_pH10,deg_Mg_50C
0,id_001f94081_0,0.4398843050,0.8117605448,0.5366960168
1,id_001f94081_4,0.8943917155,0.4686833024,0.6369154453
2,id_001f94081_8,0.3298865259,0.1719706655,0.1265527010
3,id_001f94081_9,0.5688855052,0.9699054360,0.9917514324
4,id_001f94081_13,1.0647768974,1.2408246994,1.0858554840
...,...,...,...,...
32635,id_fff546103_39,-0.0053966413,0.0838072449,0.0666786954
32636,id_fff546103_52,0.0184414722,0.1455953866,0.1061036661
32637,id_fff546103_62,0.4225301147,0.5312647223,0.4693120122
32638,id_fff546103_65,0.0182480700,0.2032098472,0.2401004881
