In [87]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import sys
sys.path.append("../")
import preproc as preproc
import models as models

# Data reading

In [58]:
# Normalized DMS data and DeMaSk features. Default NA parsing is switched off and only
# empty fields count as missing, so the literal string 'NA' is kept in wt_mut.
norm_dms_demask = pd.read_csv(
    '../data/normalized/normalized_dms_DeMaSk_features.csv',
    index_col=0,
    keep_default_na=False,
    na_values=[''],
)
# Normalized alanine scanning (AS) data, read with pandas' default options.
norm_as = pd.read_csv(filepath_or_buffer='../data/normalized/normalized_alanine_scanning.csv')

# Protein property features (alanine scanning feature is not included).
demask_features = [
    'entropy',
    'log2f_var',
    'matrix',
]

# Model with all alanine scanning data

In [59]:
# Work on an independent copy so later steps cannot alter the loaded alanine scanning table.
all_as = norm_as.copy(deep=True)

In [76]:
# Left-join alanine scanning (AS) records onto the DMS table by protein and position:
# a DMS row is repeated once per matching AS record, and rows without a match keep NaN
# in the AS columns.
dms_all_as = norm_dms_demask.merge(
    all_as[['uniprot_id', 'u_pos', 'Ascan_id', 'AS_score']],
    how='left',
    on=['uniprot_id', 'u_pos'],
)

# Identifier for each DMS+AS combined dataset (a missing Ascan_id is rendered as 'nan').
dms_all_as['dmsa_id'] = dms_all_as['dms_id'] + '@' + dms_all_as['Ascan_id'].astype(str)
# Record which rows carry a measured AS score before the imputation below fills the gaps.
dms_all_as['Ascan_score_avail'] = dms_all_as['AS_score'].notna()

# Impute alanine scanning scores by mean and encode wild-type & mutation type amino acids.
dms_all_as, encoded_col = preproc.impute_encode_features(
    dms_all_as,
    ['aa1', 'aa2'],
    ['AS_score'],
    ['aa2'],
)
# Scale every encoded column row-wise by that row's AS score.
dms_all_as[encoded_col] = dms_all_as[encoded_col].mul(dms_all_as['AS_score'], axis=0)

# Weight each mutant by the reciprocal of the number of non-missing score rows that share
# its (uniprot_id, u_pos, aa2) key.
weight = (1 / dms_all_as.groupby(['uniprot_id', 'u_pos', 'aa2'])['score'].count()).rename('weight')
dms_all_as = dms_all_as.merge(
    weight,
    how='outer',
    left_on=['uniprot_id', 'u_pos', 'aa2'],
    right_index=True,
    validate='m:1',
).reset_index(drop=True)

In [114]:
for unip in dms_all_as['uniprot_id'].unique():  # Leave-one-protein-out cross-validation
    # Rows of the held-out protein form the test set; every other row is used for training.
    held_out = dms_all_as['uniprot_id'] == unip
    train_data = dms_all_as[~held_out]
    test_data = dms_all_as[held_out]
    # NOTE(review): presumably recomputes the 'matrix' feature for this split -- confirm in models.py.
    train_data, test_data = models.refit_matrix_score(train_data, test_data)

    # Fit one model that uses the alanine scanning columns and one baseline that does not.
    for model in ['with_as', 'no_as']:
        output_header = f"../result/prediction/demask_based/all_as/{unip}_{model}_"
        model_features = (demask_features + encoded_col + ['AS_score'] if model == 'with_as'
                          else demask_features.copy())

        estimator = LinearRegression(n_jobs=1)
        predictor = estimator.fit(
            train_data[model_features],
            train_data['score'],
            sample_weight=train_data['weight'],
        )
        models.save_compared_prediction(predictor, test_data, model_features, 'score', output_header)

In [98]:
# Prediction table for protein P38398 from the 'with_as' model, read from the local results folder.
local_pred = pd.read_csv(
    '../result/prediction/demask_based/all_as/P38398_with_as_prediction.csv',
    index_col=0,
)

In [103]:
# Reference prediction table for P38398 to compare against `local_pred`.
# NOTE(review): absolute, machine-specific path -- this cell fails wherever the file is not
# present at exactly this location; consider moving the file under the project data folder.
hpc_pred = pd.read_csv(
    '/Users/fu.j/Documents/Spartan_Process/007_extrapolation/10_missforest/10_selected/10_211203_assay/09_mean_impute/02_no_subset/log/P38398_with_ala_0_test_prediction.csv',
    index_col=0,
)

In [107]:
# Put both prediction tables into the same row order (dataset id, position, mutant residue)
# with a fresh 0..n-1 index so they can be compared row by row.
left, right = (
    frame.sort_values(['dmsa_id', 'position', 'aa2']).reset_index(drop=True)
    for frame in (hpc_pred, local_pred)
)

In [95]:
# Input table for P38398 loaded from another project folder.
# NOTE(review): absolute, machine-specific path -- this cell fails wherever the file is not
# present at exactly this location.
hpc_data = pd.read_csv(
    '/Users/fu.j/AnacondaProjects/Project_3_extrapolation_model/data/dmsa_data/211029/demask_imputed_v12/P38398_mat_AS_prep_dmsa.csv',
    index_col=0,
)

# Data preprocessing

## Data reading

In [20]:
# Normalized DMS data with DeMaSk and with Envision features. For both tables only empty
# fields are parsed as missing, so the literal string 'NA' is kept in wt_mut.
norm_dms_demask = pd.read_csv(
    '../data/normalized/normalized_dms_DeMaSk_features.csv',
    index_col=0,
    keep_default_na=False,
    na_values=[''],
)
norm_dms_envision = pd.read_csv(
    '../data/normalized/normalized_dms_Envision_features.csv',
    index_col=0,
    keep_default_na=False,
    na_values=[''],
)
# Normalized alanine scanning data, read with pandas' default options.
norm_as = pd.read_csv(filepath_or_buffer='../data/normalized/normalized_alanine_scanning.csv')

## Preprocess Envision features

In [3]:
# Score and identifier columns (presumably the non-feature columns to retain; this list is
# not referenced by the cells visible in this notebook).
keep_col = [
    'score', 'dms_id', 'mut_type', 'uniprot_id', 'u_pos', 'pos_id', 'position',
]
# Every Envision feature column, categorical and numerical together.
features = [
    'aa1', 'aa2', 'wt_mut',
    'aa1_polarity', 'aa2_polarity',
    'aa1_PI', 'aa2_PI', 'deltaPI',
    'Grantham',
    'aa1_weight', 'aa2_weight', 'deltaWeight',
    'aa1vol', 'aa2vol', 'deltavolume',
    'aa1_psic', 'aa2_psic', 'delta_psic',
    'accessibility', 'dssp_sec_str', 'phi_psi_reg',
    'delta_solvent_accessibility', 'b_factor',
    'mut_msa_congruency', 'seq_ind_closest_mut',
    'mut_mut_msa_congruency', 'evolutionary_coupling_avg',
]

# Set categorical and numerical features: whatever is not listed as categorical is numerical,
# kept in the order it appears in `features`.
categ_feat = [
    'aa1', 'aa2', 'wt_mut',
    'aa1_polarity', 'aa2_polarity',
    'dssp_sec_str', 'phi_psi_reg',
]
numer_feat = list(filter(lambda name: name not in categ_feat, features))

In [12]:
# Impute and encode the Envision feature table; the last argument is a pickle path handed to the helper.
# NOTE(review): the result is used as a single DataFrame in the Envision merge cell below, while the
# "Model with all alanine scanning data" cell at the top of the notebook calls the same helper with
# four arguments and unpacks two return values -- confirm which signature preproc currently has.
preproc_data = preproc.impute_encode_features(
    norm_dms_envision,
    categ_feat,
    numer_feat,
    ['aa1', 'aa2'],
    '../data/processed/preprocess_encoder.pickle',
)

## Merge DMS and alanine scanning data

In [25]:
# Left-join alanine scanning (AS) records onto the DeMaSk-feature DMS table by protein and
# position: a DMS row is repeated once per matching AS record, and unmatched rows keep NaN.
dms_ascan_dem = (
    norm_dms_demask
    .merge(
        norm_as[['uniprot_id', 'u_pos', 'Ascan_id', 'AS_score']],
        how='left',
        on=['uniprot_id', 'u_pos'],
    )
    .assign(
        # Identifier for each DMS+AS combined dataset (a missing Ascan_id is rendered as 'nan').
        dmsa_id=lambda df: df['dms_id'] + '@' + df['Ascan_id'].astype(str),
        # True where a measured AS score exists for the row.
        Ascan_score_avail=lambda df: df['AS_score'].notna(),
    )
)
dms_ascan_dem.to_csv('../data/processed/dms_as_demask.csv')

In [24]:
# Left-join alanine scanning (AS) records onto the preprocessed Envision table by protein and
# position: a DMS row is repeated once per matching AS record, and unmatched rows keep NaN.
dms_ascan_env = pd.merge(
    left=preproc_data,
    right=norm_as[['uniprot_id', 'u_pos', 'Ascan_id', 'AS_score']],
    how='left',
    on=['uniprot_id', 'u_pos'],
).assign(
    # Identifier for each DMS+AS combined dataset (a missing Ascan_id is rendered as 'nan').
    dmsa_id=lambda df: df['dms_id'] + '@' + df['Ascan_id'].astype(str),
    # True where a measured AS score exists for the row.
    Ascan_score_avail=lambda df: df['AS_score'].notna(),
)
dms_ascan_env.to_csv('../data/processed/dms_as_envision.csv')

# Modelling

In [26]:
# Reload the merged DMS + alanine scanning table written by the preprocessing section.
dms_ascan_dem = pd.read_csv(
    '../data/processed/dms_as_demask.csv',
    index_col=0,
)
# Protein property features (alanine scanning feature is not included).
demask_features = [
    'entropy',
    'log2f_var',
    'matrix',
]

## Model with all alanine scanning data

In [31]:
# Independent copy of the merged table with a fresh 0..n-1 row index.
all_ascan_data = (
    dms_ascan_dem
    .copy()
    .reset_index(drop=True)
)

In [37]:
# Scratch copy of the merged table; `foo` is not referenced by any other cell visible in this notebook.
foo = dms_ascan_dem.copy(deep=True)

In [32]:
# Impute alanine scanning scores by mean (the mean is taken over the non-missing scores).
all_ascan_data['AS_score'] = all_ascan_data['AS_score'].fillna(
    value=all_ascan_data['AS_score'].mean()
)

# Encode with wild-type and mutation type amino acids, then scale each encoded column
# row-wise by that row's AS score.
encoded_aa = preproc.encode_categorical_feature(all_ascan_data[['aa1', 'aa2']])
encoded_as = encoded_aa.mul(all_ascan_data['AS_score'], axis=0)
encoded_col = list(encoded_as.columns)
# Append the scaled columns. Re-running this cell appends them a second time, so rebuild
# `all_ascan_data` first when re-executing.
all_ascan_data = pd.concat(objs=[all_ascan_data, encoded_as], axis='columns')

In [None]:
# Exploratory: builds (and, as the last expression of the cell, displays) the per-position
# groupby object; nothing is assigned.
all_ascan_data.groupby(by=['uniprot_id', 'u_pos'])

In [None]:

# NOTE: this cell was left unfinished. A `for` header without a body does not parse, so a
# placeholder body is supplied to keep the notebook runnable. A complete leave-one-protein-out
# loop exists in the "Model with all alanine scanning data" cell at the top of the notebook.
for unip in all_ascan_data['uniprot_id'].unique():  # Leave-one-protein-out cross-validation
    pass  # TODO: fit and evaluate the model with protein `unip` held out.

# OUTDATED

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
import sys
sys.path.append("../")
import preproc as preproc
import models as models

## Data reading

In [None]:
# Merged DMS + alanine scanning table; the first CSV column is used as the row index.
dms_ascan = pd.read_csv(
    '../data/processed/dms_as_data.csv',
    index_col=0,
)
# Protein property features (alanine scanning features are not included).
# NOTE(review): eval() executes whatever the file contains. If features.txt only holds a plain
# Python literal (it is used as a list below), ast.literal_eval would be a safer way to read it.
with open('../data/data_info/features.txt', mode='r') as file:
    basic_features = eval(file.read())

## Predictor with alanine scanning data available mutants

In [None]:
# Prepare for modelling data.
model_data = dms_ascan.loc[dms_ascan['uniprot_id'] != 'P51681']  # Remove CCR5 data.
# Keep mutants with available alanine scanning data and renumber the rows from 0.
model_data = model_data.loc[model_data['Ascan_score_avail']].reset_index(drop=True)

# Set training weight: each row of a protein gets 1 / (number of distinct dmsa_id values
# of that protein).
unip_weight = 1 / model_data.groupby('uniprot_id')['dmsa_id'].nunique()
model_data['weight'] = model_data['uniprot_id'].map(unip_weight)

In [None]:
# Set parameters.
# The CV splitter is built with keyword arguments: scikit-learn deprecated passing `shuffle`
# positionally (as in `KFold(2, True)`) and newer releases reject it with a TypeError.
# `random_state=seed` makes the shuffled folds reproducible, in line with the seed already
# handed to the optimizer settings.
"""
# These are real parameters
seed = 0
output_dir = '../result/prediction/mutants_with_AS_data/'
search_space = [{'name': 'n_estimators', 'type': 'discrete', 'domain': np.arange(100, 501), 'dtype': int},
                {'name': 'max_depth', 'type': 'discrete', 'domain': np.arange(1, 81), 'dtype': int},
                {'name': 'min_weight_fraction_leaf', 'type': 'continuous', 'domain': (0, 0.5), 'dtype': float}]
bo_kwargs = {'num_iterations': 1000, 'num_cores': 16, 'if_maximize': True, 'random_seed': seed, 'max_time': 165600}
cv_kwargs = {'scoring': 'neg_mean_squared_error', 'cv': KFold(n_splits=10, shuffle=True, random_state=seed), 'n_jobs': 1}
"""
# These are trial parameters which are faster to run, but still can take a while (7 min on MacBook Pro 15-inch. 2018).
seed = 0
output_dir = '../result/prediction/trial/'
# Hyperparameter search space: one entry per tuned parameter.
search_space = [{'name': 'n_estimators', 'type': 'discrete', 'domain': np.arange(1, 3), 'dtype': int},
                {'name': 'max_depth', 'type': 'discrete', 'domain': np.arange(1, 3), 'dtype': int}]
# Settings forwarded to models.fit_best_estimator for the hyperparameter search.
bo_kwargs = {'num_iterations': 1, 'num_cores': 12, 'if_maximize': True, 'random_seed': seed, 'max_time': 1000}
# Cross-validation settings: negative MSE scoring on shuffled, seeded folds.
cv_kwargs = {'scoring': 'neg_mean_squared_error',
             'cv': KFold(n_splits=2, shuffle=True, random_state=seed),
             'n_jobs': 1}

In [None]:
for test_unip in model_data['uniprot_id'].unique():
    for use_as_data in [True, False]:
        # Separate training and testing data: rows of the held-out protein are the test set.
        is_held_out = model_data['uniprot_id'] == test_unip
        train_data = model_data[~is_held_out]
        test_data = model_data[is_held_out]

        # Parameters processing: per-row weights of the current training split go to the CV fit.
        cv_kwargs['fit_params'] = {'sample_weight': train_data['weight']}
        model_name = 'with_ala' if use_as_data else 'nothing'
        model_feature = (basic_features + ['AS_score', 'Ascan_score_avail'] if use_as_data
                         else basic_features.copy())
        run_tag = f"{test_unip}_{model_name}_0"  # The last 0 was used for replicates.
        output_header = f"{output_dir}{run_tag}_"
        bo_kwargs['output_header'] = output_header

        # Training, bracketed by start/end progress messages.
        models.monitor_process(output_dir, f"{run_tag} starts.", 0)
        estimator = RandomForestRegressor(n_jobs=1, random_state=seed)
        predictor = models.fit_best_estimator(
            search_space,
            estimator,
            train_data[model_feature],
            train_data['score'],
            cv_kwargs,
            bo_kwargs,
        )
        models.save_feature_importance(predictor, output_header)
        models.save_tuned_hyperparameters(predictor, search_space, output_header)
        models.save_compared_prediction(
            predictor,
            test_data,
            model_feature,
            'score',
            output_header + 'test_',
            info_col=["dmsa_id", "position", "aa2"],
        )
        models.monitor_process(output_dir, f"{run_tag} ends.", 0)

# For GAL4, '../data/processed/gal4_missing_mutants.csv' can be added to testing data.

## Predictor with all mutants

In [None]:
# Prepare for modelling data: every mutant is kept, whether or not it has an AS score.
model_data = (
    dms_ascan
    .loc[dms_ascan['uniprot_id'] != 'P51681']  # Remove CCR5 data.
    .reset_index(drop=True)
)

# Set training weight: each row of a protein gets 1 / (number of distinct dmsa_id values
# of that protein).
unip_weight = 1 / model_data.groupby('uniprot_id')['dmsa_id'].nunique()
model_data['weight'] = model_data['uniprot_id'].map(unip_weight)

In [None]:
# Set parameters.
# The CV splitter is built with keyword arguments: scikit-learn deprecated passing `shuffle`
# positionally (as in `KFold(2, True)`) and newer releases reject it with a TypeError.
# `random_state=seed` makes the shuffled folds reproducible, in line with the seed already
# handed to the optimizer settings.
"""
# These are real parameters
seed = 0
output_dir = '../result/prediction/all_mutants/'
search_space = [{'name': 'n_estimators', 'type': 'discrete', 'domain': np.arange(100, 501), 'dtype': int},
                {'name': 'max_depth', 'type': 'discrete', 'domain': np.arange(1, 81), 'dtype': int},
                {'name': 'min_weight_fraction_leaf', 'type': 'continuous', 'domain': (0, 0.5), 'dtype': float}]
bo_kwargs = {'num_iterations': 1000, 'num_cores': 16, 'if_maximize': True, 'random_seed': seed, 'max_time': 165600}
cv_kwargs = {'scoring': 'neg_mean_squared_error', 'cv': KFold(n_splits=10, shuffle=True, random_state=seed), 'n_jobs': 1}
"""
# These are trial parameters which are faster to run, but still can take a while (24 min MacBook Pro 15-inch. 2018).
seed = 0
output_dir = '../result/prediction/trial/'
# Hyperparameter search space: one entry per tuned parameter.
search_space = [{'name': 'n_estimators', 'type': 'discrete', 'domain': np.arange(1, 3), 'dtype': int},
                {'name': 'max_depth', 'type': 'discrete', 'domain': np.arange(1, 3), 'dtype': int}]
# Settings forwarded to models.fit_best_estimator for the hyperparameter search.
bo_kwargs = {'num_iterations': 1, 'num_cores': 12, 'if_maximize': True, 'random_seed': seed, 'max_time': 1000}
# Cross-validation settings: negative MSE scoring on shuffled, seeded folds.
cv_kwargs = {'scoring': 'neg_mean_squared_error',
             'cv': KFold(n_splits=2, shuffle=True, random_state=seed),
             'n_jobs': 1}

In [None]:
for test_unip in model_data['uniprot_id'].unique():
    for use_as_data in [True, False]:
        # Separate training and testing data: rows of the held-out protein are the test set.
        is_held_out = model_data['uniprot_id'] == test_unip
        train_data = model_data[~is_held_out]
        test_data = model_data[is_held_out]

        # Parameters processing: per-row weights of the current training split go to the CV fit.
        cv_kwargs['fit_params'] = {'sample_weight': train_data['weight']}
        model_name = 'with_ala' if use_as_data else 'nothing'
        model_feature = (basic_features + ['AS_score', 'Ascan_score_avail'] if use_as_data
                         else basic_features.copy())
        run_tag = f"{test_unip}_{model_name}_0"  # The last 0 was used for replicates.
        output_header = f"{output_dir}{run_tag}_"
        bo_kwargs['output_header'] = output_header

        # Training, bracketed by start/end progress messages.
        models.monitor_process(output_dir, f"{run_tag} starts.", 0)
        estimator = RandomForestRegressor(n_jobs=1, random_state=seed)
        predictor = models.fit_best_estimator(
            search_space,
            estimator,
            train_data[model_feature],
            train_data['score'],
            cv_kwargs,
            bo_kwargs,
        )
        models.save_feature_importance(predictor, output_header)
        models.save_tuned_hyperparameters(predictor, search_space, output_header)
        models.save_compared_prediction(
            predictor,
            test_data,
            model_feature,
            'score',
            output_header + 'test_',
            info_col=["dmsa_id", "position", "aa2"],
        )
        models.monitor_process(output_dir, f"{run_tag} ends.", 0)
        
# For GAL4, '../data/processed/gal4_missing_mutants.csv' can be added to testing data.