In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Root folder that holds the competition data.
inpath = '/home/suhan/data/kaggle/novozyme/'
train_csv = inpath + '/novozymes-enzyme-stability-prediction/train.csv'
test_csv = inpath + '/novozymes-enzyme-stability-prediction/test.csv'

# Training rows with any missing value are discarded; the test set is kept as-is.
train = pd.read_csv(train_csv, sep=',').dropna()
test = pd.read_csv(test_csv, sep=',')

In [None]:
# Length (in characters) of every training sequence.
train['prot_len'] = train['protein_sequence'].map(len)

In [None]:
def GetFastaFromDF(df,output):
    """Write the sequences held in ``df`` to ``output`` in FASTA layout.

    Every row produces two lines: ``>`` followed by the row's ``seq_id``,
    then the row's ``protein_sequence``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``seq_id`` and ``protein_sequence`` columns.
    output : str or path-like
        Destination file. It is opened with mode ``'w'``, so any existing
        content is replaced.
    """
    # Convert each column to a list once, outside the loop.
    seq_ids = df['seq_id'].tolist()
    sequences = df['protein_sequence'].tolist()
    # The context manager closes the file even if a write raises.
    with open(output,'w') as outfile:
        for seq_id, sequence in tqdm(zip(seq_ids, sequences), total=len(seq_ids)):
            outfile.write('>'+str(seq_id)+'\n')
            outfile.write(sequence+'\n')
# GetFastaFromDF(train,'./Novozyme.Train.fasta')
# GetFastaFromDF(test,'./Novozyme.Test.fasta')

In [None]:
# Lookup table from DNA codon (three-letter string over A/C/G/T) to a
# one-letter amino-acid symbol. The three codons TAA, TAG and TGA map to '*'.
# In the cells visible here only the set of values is used, to derive the
# symbol alphabet for the bag-of-words features.
aatable = {
            'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 'ACA':'T', 'ACC':'T',
            'ACG':'T', 'ACT':'T', 'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 
            'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R', 'CTA':'L', 'CTC':'L', 
            'CTG':'L', 'CTT':'L', 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 
            'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 'CGA':'R', 'CGC':'R', 
            'CGG':'R', 'CGT':'R', 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 
            'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 'GAC':'D', 'GAT':'D', 
            'GAA':'E', 'GAG':'E', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 
            'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'TTC':'F', 'TTT':'F', 
            'TTA':'L', 'TTG':'L', 'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*', 
            'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W',
            }

In [None]:
# Distinct one-letter symbols from aatable, used as the bag-of-words columns.
# sorted() pins the order: iterating a bare set of strings can differ between
# interpreter runs (string hashing is randomised), which would reorder the
# feature columns from one kernel restart to the next.
# Note: the list includes '*', the value aatable assigns to TAA/TAG/TGA.
aa = sorted(set(aatable.values()))

In [None]:
def aa_bow(sequence,aminolist):
    """Count how often each entry of ``aminolist`` occurs in ``sequence``.

    Returns a list of ``sequence.count(...)`` results, one per entry and in
    the same order as ``aminolist``.
    """
    return [sequence.count(symbol) for symbol in aminolist]

In [None]:
def get_bow(df):
    """Build a bag-of-words count table for the sequences in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``protein_sequence`` column of strings.

    Returns
    -------
    pandas.DataFrame
        One row per input row and one column per symbol in the module-level
        ``aa`` list; each cell is the count returned by ``aa_bow``. The
        result carries ``df``'s index, so it lines up row-for-row with ``df``
        when the two are joined with ``pd.concat(..., axis=1)``, including
        when ``df`` has gaps in its index (e.g. after ``dropna``).
    """
    # Materialise the column once instead of once per row.
    sequences = df['protein_sequence'].tolist()
    bow_lst = [aa_bow(sequence, aa) for sequence in sequences]
    # Reuse the caller's index; a fresh 0..n-1 index would misalign on concat.
    return pd.DataFrame(bow_lst, columns=aa, index=df.index)

In [None]:
# Put the per-symbol count columns in front of each split's original columns.
# pd.concat(axis=1) matches rows by index label, not by position.
train_bow, test_bow = get_bow(train), get_bow(test)
train_df = pd.concat([train_bow, train], axis=1)
test_df = pd.concat([test_bow, test], axis=1)

> Protein sources are important for explaining batch effects or institutional bias.

In [None]:
# Store data_source as a pandas categorical and keep its integer codes as a feature.
source_as_category = train_df["data_source"].astype('category')
train_df["data_source"] = source_as_category
train_df["data_source_cat"] = source_as_category.cat.codes

# Columns that are not fed to the model.
non_feature_columns = ['protein_sequence','data_source','seq_id']
train_input = train_df.drop(columns=non_feature_columns).dropna()
test_input = test_df.drop(columns=non_feature_columns)

In [None]:
# Shuffle all rows. The fixed random_state makes the resulting order the same
# every time this cell is run on the same input frame.
train_input = train_input.sample(frac = 1, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
# Module-level StandardScaler instance; objective() refits it on every trial.
scaler = StandardScaler()

In [None]:
import optuna
import sklearn
import sklearn.ensemble
from sklearn.model_selection import train_test_split


# Define an objective function to be minimized.
def objective(trial):
    """Fit one candidate regressor and return its validation error.

    The trial chooses the model family (``'SVR'`` or ``'RandomForest'``) and
    that family's single tuned hyper-parameter: ``svr_c`` (log-scaled float in
    [1e-10, 1e10]) or ``rf_max_depth`` (integer in [2, 32]).

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyper-parameters via its ``suggest_*`` methods.

    Returns
    -------
    float
        Mean squared error between the held-out ``tm`` values and the model's
        predictions for them.

    Notes
    -----
    Reads the module-level ``train_input`` frame and refits the module-level
    ``scaler``. ``sklearn.svm`` and ``sklearn.metrics`` are reached through
    attribute access on ``sklearn``; the visible import lines do not import
    them explicitly.
    """
    # The choice is stored under the name 'classifier' even though both
    # candidates are regressors; the name is kept so recorded parameters
    # keep the same key.
    regressor_name = trial.suggest_categorical('classifier', ['SVR', 'RandomForest'])
    if regressor_name == 'SVR':
        svr_c = trial.suggest_float('svr_c', 1e-10, 1e10, log=True)
        regressor_obj = sklearn.svm.SVR(C=svr_c)
    else:
        rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32)
        regressor_obj = sklearn.ensemble.RandomForestRegressor(max_depth=rf_max_depth)

    # Fixed random_state: every trial is scored on the same split, so scores
    # differ because of the hyper-parameters rather than the split.
    X_train, X_val, y_train, y_val = train_test_split(
        train_input.drop('tm',axis=1), train_input['tm'], random_state=0)

    # Learn the scaling statistics from the training split only and reuse them
    # for validation; refitting on X_val would scale the two splits differently.
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    regressor_obj.fit(X_train, y_train)
    y_pred = regressor_obj.predict(X_val)

    error = sklearn.metrics.mean_squared_error(y_val, y_pred)

    return error  # An objective value linked with the Trial object.

# create_study() is called without arguments, so Optuna's default settings apply.
search_budget = 5  # number of trials to run
study = optuna.create_study()
study.optimize(objective, n_trials=search_budget)