In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from scipy.stats import spearmanr
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Notebook-wide settings; the cells below read these instead of hard-coding values.
VER = 3                    # version tag embedded in every submission file name
random_state = 42          # shared seed: train/hold-out split, grid search, AutoML
total_time_limit = 5 * 60  # AutoML training budget, in seconds (5 minutes)

In [4]:
# ---------------------------------------------------------------------------
# Data preparation: load the CSVs, apply the corrected training rows, derive
# features, and build the train / hold-out split used by the modelling cells.
# ---------------------------------------------------------------------------
MAX_PROTEIN_LENGTH = 851  # training sequences longer than this are dropped
MIN_TM = 51.5             # only training rows with tm strictly above this are kept

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_update = pd.read_csv("train_updates_20220929.csv")

# Apply corrected rows to the training set, matched on seq_id.
# Only update rows with pH > -0.1 are used; rows failing that test (including
# NaN pH) are ignored and the matching training rows are left untouched.
# NOTE(review): confirm that ignored update rows should not be removed from train.
updates = (
    df_update[df_update["pH"] > -0.1]
    .drop_duplicates(subset="seq_id", keep="last")  # a later correction wins
    .set_index("seq_id")
)
has_update = df_train["seq_id"].isin(updates.index)
if has_update.any():
    matched_ids = df_train.loc[has_update, "seq_id"].to_numpy()
    # One vectorised assignment per column, aligned by column name, instead of
    # rewriting the whole frame once per corrected row.
    for column in df_train.columns.drop("seq_id"):
        df_train.loc[has_update, column] = updates.loc[matched_ids, column].to_numpy()

# Rows whose pH exceeds 14 get their pH and tm values exchanged; every other
# row keeps its values (the `other` argument supplies the unswapped pair).
ph_tm_swapped = df_train["pH"] > 14
df_train[["pH", "tm"]] = df_train[["tm", "pH"]].where(
    ph_tm_swapped, df_train[["pH", "tm"]].values
)

# Drop identifier columns and incomplete rows, then renumber from 0.
df_train = (
    df_train
    .drop(columns=["seq_id", "data_source"])
    .dropna()
    .reset_index(drop=True)
)

# Feature: sequence length.
df_train["protein_length"] = df_train["protein_sequence"].str.len()
df_test["protein_length"] = df_test["protein_sequence"].str.len()

# Drop training sequences that are too long.
df_train = df_train[df_train["protein_length"] <= MAX_PROTEIN_LENGTH].reset_index(drop=True)

# Features: number of occurrences of each amino-acid letter in the sequence.
amino_acids = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
for letter in amino_acids:
    df_train[letter] = df_train["protein_sequence"].str.count(letter)
    df_test[letter] = df_test["protein_sequence"].str.count(letter)

# Keep the test ids before the column is removed from the feature frame.
test_ids = df_test["seq_id"]

df_train = df_train.drop(columns=["protein_sequence"])
df_test = df_test.drop(columns=["seq_id", "protein_sequence", "data_source"])

# Train only on rows whose target is strictly above MIN_TM.
df_train = df_train[df_train["tm"] > MIN_TM]

# Feature columns excluded from both the training and the test matrices.
poplist = ["I"]

train_y_df = df_train["tm"].values
train_x_df = df_train.drop(columns=["tm"] + poplist)
df_test = df_test.drop(columns=poplist)
# Taken after the poplist columns are removed so that it has exactly the same
# columns as train_x_df.
test_x_df = df_test.values

# The models are fitted on train_x_df and later predict on df_test, so both
# must expose the same features in the same order.
assert list(train_x_df.columns) == list(df_test.columns), "train/test feature columns differ"

x_train, x_test, y_train, y_test = train_test_split(
    train_x_df,
    train_y_df,
    test_size=0.2,
    random_state=random_state,
)

In [5]:
def get_model(model_name, params=None):
    """Build an unfitted regressor selected by name.

    Parameters
    ----------
    model_name : str
        Either "XGBRegressor" or "CatBoostRegressor".
    params : dict, optional
        Extra keyword arguments forwarded to the model constructor.
        ``None`` (the default) means no extra arguments.

    Returns
    -------
    XGBRegressor or CatBoostRegressor
        A new model instance, created with ``verbosity=0`` / ``verbose=0``
        respectively in addition to ``params``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    # Copy so the caller's mapping is never shared with the model instance and
    # no mutable default is kept between calls.
    params = dict(params) if params else {}

    if model_name == "XGBRegressor":
        return XGBRegressor(verbosity=0, **params)
    if model_name == "CatBoostRegressor":
        return CatBoostRegressor(verbose=0, **params)

    # Previously an unknown name fell through to `return model` and surfaced
    # as an UnboundLocalError; fail with an explicit message instead.
    raise ValueError(
        f"Unknown model_name {model_name!r}; "
        "expected 'XGBRegressor' or 'CatBoostRegressor'"
    )


def fit_predict(model_name, x_train, x_test, y_train, y_test, gs_params):
    """Grid-search a model on the training split and score it on the hold-out split.

    Parameters
    ----------
    model_name : str
        Name understood by ``get_model``.
    x_train, y_train
        Features and targets used for the cross-validated grid search and for
        fitting the returned model.
    x_test, y_test
        Hold-out features and targets used only for the reported correlation.
    gs_params : dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(spearman, model)``: the result of ``scipy.stats.spearmanr`` between
        ``y_test`` and the hold-out predictions, and the fitted best model.

    Notes
    -----
    No ``scoring`` is passed to ``GridSearchCV``, so candidates are ranked by
    the estimator's own ``score`` method rather than by Spearman correlation.
    """
    gs = GridSearchCV(get_model(model_name), gs_params, verbose=1)
    gs.fit(x_train, y_train)

    print(gs.best_params_)

    # GridSearchCV defaults to refit=True, so the best configuration has
    # already been refitted on all of x_train / y_train. Reuse that estimator
    # instead of training an identical model a second time.
    model = gs.best_estimator_

    predictions = model.predict(x_test)
    spearman = spearmanr(y_test, predictions)
    return spearman, model

In [6]:
# Load the sample submission; later cells overwrite its "tm" column with model
# predictions and write one CSV per model.
submission = pd.read_csv("sample_submission.csv")

In [7]:
# Models tuned with the shared grid below, in the order they are run.
model_names = ["CatBoostRegressor", "XGBRegressor"]

# One grid for every model; random_state is a single-value entry so each
# candidate is built with the notebook-wide seed.
gs_params = dict(
    learning_rate=[0.01, 0.05],
    max_depth=[6, 9],
    n_estimators=[150, 250, 650],
    random_state=[random_state],
)

for model_name in model_names:
    # Tune on the training split and report the hold-out Spearman correlation.
    spearman, model = fit_predict(
        model_name,
        x_train,
        x_test,
        y_train,
        y_test,
        gs_params,
    )
    print(f"{model_name}: {spearman}")

    # Predict the competition test set and write one submission file per model.
    submission["tm"] = model.predict(df_test)
    submission.to_csv(f"submission_{model_name}_ver{VER}.csv", index=False)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 650, 'random_state': 42}
CatBoostRegressor: SpearmanrResult(correlation=0.7190267522007091, pvalue=4.1331578154826e-311)
Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 250, 'random_state': 42}
XGBRegressor: SpearmanrResult(correlation=0.6994704011738112, pvalue=2.1069703261799782e-287)


In [8]:
# AutoML run configured from the notebook-wide settings: "Compete" mode, the
# configured time budget in seconds, and the shared seed.
automl_settings = {
    "mode": "Compete",
    "total_time_limit": total_time_limit,
    "random_state": random_state,
}
automl = AutoML(**automl_settings)
automl.fit(x_train, y_train)

AutoML directory: AutoML_1
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree rmse 8.81425 trained in 0.59 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 5-fold CV Shuffle
* Step simple_algorithms will try to check up to 4 models
1_DecisionTree rmse 9.312058 trained in 1.87 seconds
2_DecisionTree rmse 10.411612 trained in 1.13 seconds
3_DecisionTree rmse 10.411612 trained in 1.25 seconds




4_Linear rmse 9.409044 trained in 1.78 seconds
* Step default_algorithms will try to check up to 7 models
5_Default_LightGBM rmse 6.772319 trained in 7.48 seconds
6_Default_Xgboost rmse 6.948317 trained in 6.51 seconds
7_Default_CatBoost rmse 6.767519 trained in 15.7 seconds




8_Default_NeuralNetwork rmse 7.278111 trained in 5.2 seconds
9_Default_RandomForest rmse 8.849949 trained in 7.33 seconds
10_Default_ExtraTrees rmse 9.61916 trained in 5.41 seconds




There was an error during 11_Default_NearestNeighbors training.
Please check AutoML_1\errors.md for details.
* Step not_so_random will try to check up to 63 models
20_LightGBM rmse 6.910127 trained in 6.45 seconds
11_Xgboost rmse 6.833549 trained in 4.76 seconds
29_CatBoost rmse 6.756253 trained in 18.36 seconds
38_RandomForest rmse 8.931156 trained in 9.39 seconds
47_ExtraTrees rmse 9.472239 trained in 5.62 seconds




56_NeuralNetwork rmse 9.610247 trained in 4.89 seconds




There was an error during 65_NearestNeighbors training.
Please check AutoML_1\errors.md for details.
21_LightGBM rmse 6.932838 trained in 4.43 seconds
12_Xgboost rmse 7.042853 trained in 4.01 seconds
30_CatBoost rmse 6.906099 trained in 9.94 seconds
39_RandomForest rmse 7.958077 trained in 9.46 seconds
* Step golden_features will try to check up to 3 models
None 10
Add Golden Feature: Q_ratio_R
Add Golden Feature: S_ratio_L
Add Golden Feature: R_ratio_Q
Add Golden Feature: L_ratio_S
Add Golden Feature: Q_ratio_L
Add Golden Feature: R_diff_S
Add Golden Feature: Q_ratio_A
Add Golden Feature: A_ratio_N
Add Golden Feature: N_ratio_A
Add Golden Feature: L_ratio_Q
Created 10 Golden Features in 26.05 seconds.
29_CatBoost_GoldenFeatures rmse 6.625433 trained in 49.51 seconds
Skip kmeans_features because of the time limit.
Not enough time to perform features selection. Skip
Time needed for features selection ~ 74.0 seconds
Please increase total_time_limit to at least (802 seconds) to have featu



8_Default_NeuralNetwork_Stacked rmse 6.829121 trained in 7.52 seconds
* Step ensemble_stacked will try to check up to 1 model
Ensemble_Stacked rmse 6.37972 trained in 2.6 seconds
AutoML fit time: 310.79 seconds
AutoML best model: Ensemble_Stacked


In [9]:
# Score the AutoML model on the hold-out split.
predictions = automl.predict(x_test)
automl_spearman = spearmanr(y_test, predictions)
print("Test spearman:", automl_spearman)

# Predict the competition test set and write the AutoML submission file.
submission["tm"] = automl.predict(df_test)
submission.to_csv(f"submission_automl_ver{VER}.csv", index=False)



Test spearman: SpearmanrResult(correlation=0.7325577610190409, pvalue=0.0)


