# Tool Flow
Per the user input  
  
Functions in this notebook and what they wrap:  
`predict_log2fc()`  
1. `usr_seq()`  
    a. `get_seq()`  
  
2. `get_feat()`  
    a. `get_prot_mass()`  
    b. `get_biopy_feat()`  
    c. `count_aa_types()`  
  
3. `scale_input_feat()`    
    
4. `bagging_regr()` - the modeling function that outputs the predicted log2fc  
5. `vizualization_thing_here()`

`multi_pred_log2fc()` - runs `predict_log2fc()` on each item in a list of inputs and returns the list of predictions

### Imports

In [1]:
import Bio
from Bio import Entrez
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt   

# Import Scikit-Learn library for the regression model
import sklearn   
from sklearn import preprocessing #sklearn for normalization function
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
#for Bagging regressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

## Input EID or sequence

In [None]:
# The query handed to the tool: per `usr_seq`, an integer is treated as an EID
# and anything else as a sequence.
# NOTE(review): `eid` is not defined in any cell shown before this one —
# assign it first, otherwise this cell raises NameError on a fresh kernel.
user_input = eid

## The `usr_seq` Function

In [2]:
def usr_seq(user_input, email):
    """Return the amino-acid sequence for the user's query.

    Parameters
    ----------
    user_input : int or str
        An integer (built-in ``int`` or a NumPy integer such as the ``int64``
        values pandas produces) is treated as an EID and looked up with
        ``get_seq``. Anything else is assumed to already be a sequence.
    email : str
        Forwarded to ``get_seq``; only used when a lookup is needed.

    Returns
    -------
    The sequence fetched for an EID, otherwise ``user_input`` unchanged.
    """
    # bool is a subclass of int, so exclude it explicitly: the previous
    # `type(...) is int` check never treated True/False as an EID either.
    is_eid = (
        isinstance(user_input, (int, np.integer))
        and not isinstance(user_input, bool)
    )
    if is_eid:
        return get_seq(email, user_input)
    return user_input

In [2]:
def test_usr_seq(email='karanjia@uw.edu'):
    """Check both branches of ``usr_seq``.

    Case 1 goes through ``get_seq`` and therefore performs a live Entrez
    request; Case 2 never leaves the process.

    Parameters
    ----------
    email : str
        Address handed to ``usr_seq`` (and from there to ``get_seq``). The
        default is the address ``test_get_seq`` already uses.
    """
    # Case 1: input is EID -> a sequence string must come back
    usr_in = 15599626
    result = usr_seq(usr_in, email)
    # `result is str` compared the value with the type object itself and could
    # never be true; the intent is a type check.
    assert isinstance(result, str), 'the usr_seq function does not work'
    # Case 2: input is sequence -> returned untouched
    usr_in = 'juliaisthecoolest'
    result = usr_seq(usr_in, email)
    assert result == usr_in, 'the usr_seq function does not work'

## The `get_seq` function

In [3]:
# definition of function:
def get_seq(email, prot_accession_num):
    """Fetch a protein sequence from the Entrez "protein" database.

    Parameters
    ----------
    email : str
        Assigned to ``Entrez.email`` before any request is made.
    prot_accession_num : int or str
        Identifier posted to Entrez; it is converted with ``str``.

    Returns
    -------
    The ``GBSeq_sequence`` field of the last record returned by the fetch.

    Raises
    ------
    ValueError
        If the fetch yields no records (previously this surfaced as an
        UnboundLocalError on ``seq``).
    """
    Entrez.email = email

    # Post the id first, then fetch using the WebEnv/QueryKey that the post
    # returned.
    request = Entrez.epost("protein", id=str(prot_accession_num))
    result = Entrez.read(request)
    handle = Entrez.efetch(
        db="protein",
        retmode="xml",
        webenv=result["WebEnv"],
        query_key=result["QueryKey"],
    )

    # The old loop also parsed a "gi" number out of 'GBSeq_other-seqids' but
    # never used it, and raised an uncaught IndexError whenever a record had no
    # "gi" entry — so that lookup is dropped.
    seq = None
    try:
        for record in Entrez.parse(handle):
            seq = record['GBSeq_sequence']
    finally:
        handle.close()

    if seq is None:
        raise ValueError(
            'no protein record returned for {!r}'.format(prot_accession_num)
        )
    return seq

In [6]:
def test_get_seq():
    seq = get_seq("karanjia@uw.edu", 728886557)
    expect = 'mlrsmltasttlnqlqqqidtissnlsnsnttgykakdtnfselvrqqfdqvdekneevakarktppglrlgvgammssrlvsdqgsiqktdrdldiaftspyqylqvnvngnrqytrdgalyvtpsaananqlqlvtgngypvldengntvnidssmknitinkngtltasdgnavqrfnlgvvqvnnpqelksegnnlfsidnaaafeelnganrqnigmqqgslemsnvdiseqmtdlitsqrsyqlnsrtitmgdqmlglinsvr'
    assert seq == expect, 'the get_seq function is not working'

## The `get_feat` function

In [4]:
def get_feat(user_sequence):
    """
    Build the one-row feature table the predictive model expects for a sequence.

    Parameters
    ----------
    user_sequence : the sequence being queried; it is passed unchanged to
        ``get_prot_mass``, ``get_biopy_feat`` and ``count_aa_types``.

    Returns
    -------
    pandas.DataFrame with exactly one row and the columns
    'AA_NP', 'AA_POS', 'AA_POL', 'AA_NEG', 'MW', 'AROM', 'ISO_E' — the same
    names and order that ``bagging_regr`` selects from its training data.
    """
    mass = get_prot_mass(user_sequence)

    # The first value returned by get_biopy_feat (previously bound to
    # `molwt_biopy`) is not used; 'MW' comes from get_prot_mass.
    _, aromaticity, isoelectric_pt = get_biopy_feat(user_sequence)

    nonpolar, positive, polar, negative = count_aa_types(user_sequence)

    # Pair every value with its column name so the two cannot drift apart.
    features = {
        'AA_NP': nonpolar,
        'AA_POS': positive,
        'AA_POL': polar,
        'AA_NEG': negative,
        'MW': mass,
        'AROM': aromaticity,
        'ISO_E': isoelectric_pt,
    }

    # Wrap in a list: a single record is ONE row. Passing the flat list of
    # seven scalars with seven column names made pandas raise a shape error.
    feat_df = pd.DataFrame([features], columns=list(features))

    return feat_df

## The `scale_input_feat` function

In [8]:
def scale_input_feat(X, reference=None):
    '''
    Rescale the input features with min-max normalization.

    Parameters
    ----------
    X : pandas.DataFrame
        Features to rescale.
    reference : pandas.DataFrame, optional
        Frame whose per-column min/max define the scaling; it must contain
        every column of X. When omitted the scaler is fit on X itself, which
        is the original behavior. Note that a single-row X scaled against
        itself has zero range in every column, so pass the data the model was
        trained on as `reference` when scaling one query.

    Returns
    -------
    pandas.DataFrame with X's column names, a fresh 0..n-1 index and the
    rescaled values. With `reference`, values of X outside the reference
    range fall outside [0, 1].
    '''
    # remember the column names so they can be put back on the result
    X_col_names = list(X.columns)

    # the scaler works on plain numpy arrays
    if reference is None:
        fit_arr = X.values
    else:
        # select by name so the reference columns line up with X's order
        fit_arr = reference[X_col_names].values

    # min-max normalization (rescaling) of input features
    min_max_scaler = preprocessing.MinMaxScaler()
    min_max_scaler.fit(fit_arr)
    X_scaled = min_max_scaler.transform(X.values)

    return pd.DataFrame(X_scaled, columns=X_col_names)

## The `bagging_regr` function

In [9]:
def bagging_regr(X_user, csv_file='UWInSpace_ModelData.csv'):
    '''
    Train the bagging regressor on the model data and predict for the user input.

    Parameters
    ----------
    X_user : pandas.DataFrame or array-like
        Features to predict on. The model is trained on min-max scaled
        features, and X_user is NOT rescaled here. A DataFrame is re-ordered
        by column name to the training order; other inputs are used as given
        and must already be in the order
        AA_NP, AA_POS, AA_POL, AA_NEG, MW, AROM, ISO_E.
    csv_file : str, optional
        Path of the training csv; it must contain the seven feature columns
        above plus 'LOG2FC'. Defaults to the file this notebook always used.

    Returns
    -------
    The output of `model.predict(X_user)`.

    Test ratio, random state and n_estimators are fixed (from previous ML
    optimization).
    '''
    feature_cols = ['AA_NP', 'AA_POS', 'AA_POL', 'AA_NEG', 'MW', 'AROM', 'ISO_E']

    # Open and load dataset (read_csv already returns a DataFrame)
    df = pd.read_csv(csv_file)

    # assign input (X) / output (y) features
    X = df[feature_cols]
    y = df['LOG2FC']

    # min-max normalization (rescaling) of the training features
    min_max_scaler = preprocessing.MinMaxScaler()
    X_norm = pd.DataFrame(min_max_scaler.fit_transform(X.values), columns=feature_cols)

    # set Bagging regressor parameters, from ML training:
    test_ratio = 0.30
    seed_random = 42
    n_estim = 20

    # Only the training part of the split is fitted on; the split itself is
    # kept so the fitted model is the same one as before.
    X_train, _, y_train, _ = train_test_split(
        X_norm, y, test_size=test_ratio, random_state=seed_random, shuffle=True
    )

    # Model is Bagging Regressor, base estimator is Decision Tree regressor.
    # The estimator is passed positionally because its keyword name differs
    # between scikit-learn releases (`base_estimator` vs `estimator`).
    model = BaggingRegressor(
        DecisionTreeRegressor(), n_estimators=n_estim, random_state=seed_random
    )
    model.fit(X_train, y_train)

    # Line the user's columns up with the training order; a differently
    # ordered frame would otherwise be mis-read or rejected.
    if isinstance(X_user, pd.DataFrame):
        X_user = X_user[feature_cols]

    user_predict = model.predict(X_user)

    return user_predict

# Overall tool function:

In [10]:
def predict_log2fc(user_input, email):
    """
    Run the whole tool for one query and return the predicted log2fc.

    The user input is either a protein accession number or an amino acid
    sequence. It is resolved to a sequence (`usr_seq`), turned into features
    (`get_feat`), scaled (`scale_input_feat`) and handed to the model
    (`bagging_regr`). The prediction is printed as well as returned.
    """
    # NOTE(review): scale_input_feat is given only this query's features. If
    # that frame holds a single row, min-max scaling it against itself
    # presumably maps every feature to 0 — confirm whether the features should
    # instead be scaled with the training data's min/max.
    prediction = bagging_regr(
        scale_input_feat(
            get_feat(
                usr_seq(user_input, email)
            )
        )
    )

    print('The predicted log2fc is ', prediction)
    return prediction

## Do multiple predictions

In [11]:
def multi_pred_log2fc(user_inputs, email):
    """Predict log2fc for several proteins of interest.

    Parameters
    ----------
    user_inputs : iterable
        The queries, each one something `predict_log2fc` accepts.
    email : str
        Forwarded unchanged to every `predict_log2fc` call.

    Returns
    -------
    list
        One `predict_log2fc` result per query, in input order; an empty
        input gives an empty list.
    """
    # Iterate the `user_inputs` parameter itself: the previous body read the
    # notebook-level `user_input` variable instead, so the argument was ignored.
    return [predict_log2fc(single_input, email) for single_input in user_inputs]