# <font color=green>deepBreaks Applications</font>
## Modeling spectral tuning sites of opsin proteins based on amino-acid sequence...  

# <font color=red>Performance vs. Training Data Size Subtest</font>
### **Output** = a set of models trained on progressively smaller subsets of the training data, plus a file (`perf_vs_seqs.tsv`) tracking how performance (R2) changes with training-set size, for use in making graphs/figures.

In [None]:
# importing deepBreaks libraries 
from deepBreaks.utils import get_models, get_scores, get_params, make_pipeline
from deepBreaks.preprocessing import MisCare, ConstantCare, URareCare, CustomOneHotEncoder
from deepBreaks.preprocessing import FeatureSelection, CollinearCare
from deepBreaks.preprocessing import read_data
from deepBreaks.models import model_compare_cv, finalize_top, importance_from_pipe, mean_importance, summarize_results
from deepBreaks.visualization import plot_scatter, dp_plot, plot_imp_model, plot_imp_all
from deepBreaks.preprocessing import write_fasta
import numpy as np
import csv
import pandas as pd
import warnings
import datetime
import os
import shutil
import time

# Silence every warning for the rest of the session so that library warnings
# do not flood the notebook output.
# NOTE(review): both calls install a blanket "ignore" filter, so one of them
# looks redundant - confirm before removing either.
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')

In [None]:
# Defining user parameters, file paths, and analysis type.
# This cell also loads the metadata and the sequence alignment, and initialises
# the two counters that drive the subsampling loop further down.

# Path to the folder containing all the data splits.
path = 'c:/Users/safra/Documents/GitHub/visual-physiology-opsin-db/vpod_data/VPOD_1.0/formatted_data_splits/vpod_2023-10-16_12-13-11'
# Path to the sequences of interest.
seqFileName = f'{path}/VPOD_wds_het_1.0.fasta'
# Path to the corresponding metadata.
metaDataFileName = f'{path}/wds_meta.tsv'

# Name of the phenotype column in the metadata.
mt = 'Lambda_Max'

# Type of the sequences.
seq_type = 'aa'

# Type of the analysis: 'reg' for regression; use 'cl' for a classification model.
ana_type = 'reg'

gap_threshold = 0.60

# Whether or not to drop the reference sequence from the training data -
# usually 'Bovine' or 'Squid'.
# NOTE(review): this flag is not referenced by any of the cells visible here.
drop_ref = True

print('reading meta-data')
# Importing metadata.
meta_data = read_data(metaDataFileName, seq_type = None, is_main=False)
# File name (without directories) of the metadata file. Taking the basename
# works for any path depth, unlike indexing a fixed position of split('/').
metaFile = os.path.basename(metaDataFileName)
# Importing sequence data.
print('reading fasta file')
ref_df = read_data(seqFileName, seq_type = seq_type, is_main=True, gap_threshold=gap_threshold)
# Merge in the phenotype values; the index-on-index merge also drops every
# sequence that has no entry in the metadata file.
ref_df = ref_df.merge(meta_data.loc[:, mt], left_index=True, right_index=True)

# Loop state: first_run tracks whether the first subsampling round / header
# write has happened yet; stop_marker becomes non-zero when the loop should end.
first_run = 0
stop_marker = 0

In [None]:
# Build the complete alignment restricted to sequences that have a phenotype
# entry in the metadata, then discard the phenotype column again so that only
# the sequence columns remain.
full_df = (
    read_data(seqFileName, seq_type=seq_type, is_main=True, gap_threshold=gap_threshold)
    .merge(meta_data.loc[:, mt], left_index=True, right_index=True)
    .drop(columns=mt)
)

In [None]:
# Performance vs. training-data-size loop.
#
# Each round removes another random batch of `sample_n` sequences from the
# pool (`ref_df`), trains and tunes models on what is left, writes the reports
# and plots into a fresh time-stamped directory, and appends one
# "<sequences remaining>\t<R2>" row to perf_vs_seqs.tsv.
# Relies on names created by earlier cells: seqFileName, seq_type,
# gap_threshold, meta_data, mt, ana_type, ref_df, first_run and stop_marker.


def _make_prep_pipeline(keep):
    """Return the preprocessing pipeline used for both model comparison and tuning.

    `keep` is forwarded to the feature-selection and collinearity steps; every
    other step is configured identically for both uses.
    """
    return make_pipeline(
        steps=[
            ('mc', MisCare(missing_threshold=0.05)),
            ('cc', ConstantCare()),
            ('ur', URareCare(threshold=0.025)),
            ('cc2', ConstantCare()),
            ('one_hot', CustomOneHotEncoder()),
            ('feature_selection', FeatureSelection(model_type=ana_type, alpha=0.10, keep=keep)),
            ('collinear_care', CollinearCare(dist_method='correlation', threshold=0.05, keep=keep))
        ])


# The labelled alignment is the same in every round, so load and merge it once
# instead of re-parsing the FASTA file on each iteration. The merge also drops
# all sequences without an entry in the metadata file.
labelled_df = read_data(seqFileName, seq_type=seq_type, is_main=True, gap_threshold=gap_threshold)
labelled_df = labelled_df.merge(meta_data.loc[:, mt], left_index=True, right_index=True)

# Number of sequences removed per round. It depends only on the size of the
# full labelled dataset, so it is constant across rounds.
sample_n = 15 if labelled_df.shape[0] <= 1000 else 100

while stop_marker == 0:
    # Making a unique directory for saving the reports of this round.
    print('direcory preparation')
    dt_label = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    # basename() is independent of how deep the data folder is nested.
    seqFile = os.path.basename(seqFileName)
    print(seqFile)
    seqFile = seqFile.split('.')[0]
    print(seqFile)
    report_dir = f'{seqFile}_{mt}_{dt_label}'
    os.makedirs(report_dir)

    # Draw a fresh batch of sequences from the pool and remove them from it so
    # they cannot be drawn again in a later round.
    new_drop_indices = np.random.choice(ref_df.index, sample_n, replace=False)
    ref_df = ref_df.drop(new_drop_indices)
    if first_run == 0:
        drop_indices = new_drop_indices
        first_run += 1
    else:
        drop_indices = np.append(drop_indices, new_drop_indices)

    print(f'There are {ref_df.shape[0]} sequences remaining in the training data')
    # Make this the last round if another batch would leave fewer than 30
    # sequences. The check runs on every round, including the first, so a small
    # dataset cannot be pushed below that limit.
    if ref_df.shape[0] - sample_n < 30:
        stop_marker += 1

    # Training data for this round: everything not removed so far.
    tr = labelled_df.drop(drop_indices)
    y = tr.loc[:, mt].values
    tr = tr.drop(columns=mt)

    # Setting the parameters for our ML pipeline.
    prep_pipeline = _make_prep_pipeline(keep=False)

    # Training models.
    report, top = model_compare_cv(X=tr, y=y, preprocess_pipe=prep_pipeline,
                                   models_dict=get_models(ana_type=ana_type),
                                   scoring=get_scores(ana_type=ana_type),
                                   report_dir=report_dir,
                                   cv=12, ana_type=ana_type, cache_dir=report_dir)

    model_report_file = f"./{report_dir}/model_report.csv"
    model_report = read_data(model_report_file, seq_type=None, is_main=False)
    # Append this round's result; the context manager guarantees the handle is
    # flushed and closed even if a later step of the round fails.
    with open("perf_vs_seqs.tsv", 'a') as perf_v_seqs:
        if first_run == 1:
            # Header row, written once before the first data row.
            perf_v_seqs.write("# of Sequences\tR2\n")
            first_run += 1
        # First row of the report by position.
        # NOTE(review): presumably the best-ranked model - confirm the report ordering.
        perf_v_seqs.write(f"{ref_df.shape[0]}\t{model_report['R2'].iloc[0]}\n")

    # NOTE(review): the purpose of the one-second pauses is not evident from
    # this file; they are kept unchanged.
    time.sleep(1)
    # Setting parameters for tuning the top performing models.
    prep_pipeline = _make_prep_pipeline(keep=True)

    modified_top = []
    mtml = []
    for model in top:
        # Re-wrap each top model's final step behind the keep=True preprocessing.
        modified_top.append(make_pipeline(steps=[('prep', prep_pipeline), model.steps[-1]]))
        # Name of the final (estimator) step, read directly from the pipeline.
        mtml.append(model.steps[-1][0])

    time.sleep(1)

    # Tuning the top performing models.
    top = finalize_top(X=tr, y=y, top_models=modified_top, grid_param=get_params(), report_dir=report_dir, cv=10)
    # Summarize the results by extracting feature importance and p-values and grouping correlated features.
    sr = summarize_results(top_models=top, report_dir=report_dir)
    # Plot a scatter plot with -log of (p-value) column as the x-axis and the values of the other columns.
    scatter_plot = plot_scatter(summary_result=sr, report_dir=report_dir)

    time.sleep(1)

    # Plot mean relative importance of each feature - corresponding to an amino acid position.
    mean_imp = mean_importance(top, report_dir=report_dir)

    dp_plot(importance=mean_imp, imp_col='mean', model_name='mean', report_dir=report_dir)
    # Apply only the first four preprocessing steps (everything before the
    # one-hot encoder) to the training data used by the per-model plots.
    tr = prep_pipeline[:4].fit_transform(tr)

    for model in top:
        model_name = model.steps[-1][0]
        dp_plot(importance=importance_from_pipe(model),
                imp_col='standard_value',
                model_name=model_name, report_dir=report_dir)

        plot_imp_model(importance=importance_from_pipe(model),
                       X_train=tr, y_train=y, model_name=model_name,
                       meta_var='meta', model_type=ana_type, report_dir=report_dir)

    time.sleep(1)
