### Using a baseline model that "predicts" the mean value of each dynamic property, use the loss of that baseline model to adjust the weight of each property.

In [None]:
import pandas as pd
import numpy as np
from config import config
import h5py

In [None]:
# Both input locations are taken from the 'file_path' section of the project config.
file_paths = config['file_path']

# Table of training entries; later cells read its 'name' and 'source' columns.
df = pd.read_csv(file_paths['train_df_path'])

# Opened read-only and intentionally left open: later cells pull the
# '<name>_res_feature' / '<name>_pair_feature' datasets from this handle.
h5py_read = h5py.File(file_paths['h5py_path'], mode='r')

# Index of each pairwise feature along the last axis of a pair-feature array.
pair_idx = {
    'vdw': 0,
    'hbbb': 1,
    'hbsb': 2,
    'hbss': 3,
    'hp': 4,
    'sb': 5,
    'pc': 6,
    'ps': 7,
    'ts': 8,
    'corr': 9,
}

In [75]:
def get_feature_dict(res_feat, pair_feat, pair_idx, source):
    """Split raw feature arrays into named arrays with -1 entries removed.

    Args:
        res_feat: Array indexed as ``res_feat[:, k]`` (one row per entry
            along the first axis, feature columns along the second).
        pair_feat: Array indexed as ``pair_feat[:, :, k]`` (feature channels
            along the last axis).
        pair_idx: Mapping from pair-feature name to its channel index in
            ``pair_feat``. Only used when ``source`` does not contain
            ``'proteinflow'``.
        source: Value tested with ``'proteinflow' in source`` to choose the
            feature layout.

    Returns:
        dict: Feature name -> array, in insertion order.

        * If ``'proteinflow'`` is in ``source``: keys ``nma_res1..3`` and
          ``nma_pair1..3`` (interleaved as res/pair per index), taken from
          the first three columns / channels, each with values equal to -1
          dropped.
        * Otherwise: ``rmsf_nor`` (column 2 divided by its maximum),
          ``sasa_mean`` (column 0), ``sasa_std`` (column 1), ``ss``
          (columns 3-10), ``chi`` (11-22), ``phi`` (23-34), ``psi`` (35-46),
          all restricted to rows whose column 2 is not -1; followed by one
          flattened array per ``pair_idx`` name with values equal to -1
          dropped.
    """
    if 'proteinflow' in source:
        nma_features = {}
        for mode in range(3):
            res_column = res_feat[:, mode]
            pair_channel = pair_feat[:, :, mode]
            nma_features[f'nma_res{mode+1}'] = res_column[res_column != -1]
            nma_features[f'nma_pair{mode+1}'] = pair_channel[pair_channel != -1]
        return nma_features

    rmsf = res_feat[:, 2]
    # Rows flagged with -1 in column 2 are excluded from every per-row feature.
    keep = rmsf != -1

    features = {
        # The maximum is taken over the unmasked column, then the mask is applied.
        'rmsf_nor': (rmsf / rmsf.max())[keep],
        'sasa_mean': res_feat[:, 0][keep],
        'sasa_std': res_feat[:, 1][keep],
        'ss': res_feat[:, 3:11][keep],
        'chi': res_feat[:, 11:23][keep],
        'phi': res_feat[:, 23:35][keep],
        'psi': res_feat[:, 35:47][keep],
    }

    for pair_name, channel in pair_idx.items():
        plane = pair_feat[:, :, channel]
        # Boolean indexing flattens the plane to the retained values only.
        features[pair_name] = plane[plane != -1]

    return features

In [95]:
# Features stored as 2-D arrays: variance is taken per column, then averaged.
multi_column_keys = ('ss', 'chi', 'phi', 'psi')

# For every row of df, load that entry's feature arrays and store one spread
# value per feature in a df column named after the feature.
for row in df.index:
    entry = df.loc[row, 'name']
    res_feat = h5py_read[f'{entry}_res_feature'][:]
    pair_feat = h5py_read[f'{entry}_pair_feature'][:]

    feat_all = get_feature_dict(res_feat, pair_feat, pair_idx, df.loc[row, 'source'])

    for key, values in feat_all.items():
        if key in multi_column_keys:
            spread = values.var(axis=0).mean()
        else:
            # Mean squared deviation from the mean, i.e. the squared error of
            # a baseline that always predicts the mean.
            deviation = values - values.mean()
            spread = (deviation**2).mean()
        df.loc[row, key] = spread

In [None]:
# Persist the per-entry feature variances (row index is not written).
df.to_csv(path_or_buf='feature_var.csv', index=False)

### Get weights
Use the RMSE, as this value is comparable.

In [None]:
df = pd.read_csv('/nfs/user/Users/ch3849/ProDance/data_new/train_data_all/feature_var.csv')

# Columns from position 7 onward are treated as the feature columns; taking
# their square root turns the stored MSE values into RMSE values.
feature_block = df.iloc[:, 7:]
df.iloc[:, 7:] = np.sqrt(feature_block)

In [None]:
# Only rows labelled 'train' contribute to the RMSE table.
is_train = df['label'] == 'train'
feature_columns = list(df.columns[7:])

# One column per source, one row per feature: mean RMSE within each source.
train_features = df[is_train][['source'] + feature_columns]
rmse = train_features.groupby('source').mean().T

# Two extra columns pooling several sources; the assigned Series of column
# means is aligned to the feature rows of the table by label.
md_sources = ['ATLAS', 'GPCRmd', 'PED', 'mdCATH']
in_md_sources = df['source'].isin(md_sources)
rmse['ATLAS_GPCRmd_PED_mdCATH'] = df[is_train & in_md_sources].describe().loc['mean']

in_proteinflow = df['source'].str.contains('proteinflow')
rmse['Proteinflow'] = df[is_train & in_proteinflow].describe().loc['mean']

In [None]:
# Export the feature-by-source RMSE table as a spreadsheet.
rmse.to_excel(excel_writer='source_feature_rmse.xlsx')