### SIMPLER Bagging Regressor

In [1]:
#Import python libraries/packages

# Pandas to handle dataframes
import pandas as pd    

# Import Scikit-Learn library for the regression model
import sklearn   
from sklearn import preprocessing #sklearn for normalization function
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
#for Bagging regressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

# Import numpy 
import numpy as np

# Import plotting libraries
import matplotlib 
from matplotlib import pyplot as plt


#### Function to do multi-feature regression with a bagging regressor (BR)
`bagging_regr(csv_file, X_user)` — the test ratio, number of estimators and random seed are fixed inside the function.

In [7]:
def bagging_regr(csv_file, X_user):
    '''
    Train a bagging regressor on the master CSV and predict LOG2FC for user input.

    The model is re-trained from scratch on every call; nothing is cached.

    Parameters
    ----------
    csv_file : str
        Path to the master csv, e.g. 'compiled_features_complete.csv'.
        It must contain the input columns 'AA_NP', 'AA_POS', 'AA_NEG',
        'AA_POL', 'MW', 'AROM', 'ISO_E' and the output column 'LOG2FC'.
    X_user : pandas.DataFrame (or array-like)
        Feature values to predict for. When a DataFrame is given, the seven
        input columns are selected by name in the training order, so column
        order and extra columns do not matter. Any other array-like is
        handed to the model unchanged.

    Returns
    -------
    The output of ``model.predict`` for ``X_user`` (for a DataFrame input,
    an array with one predicted LOG2FC value per row).

    Raises
    ------
    ValueError
        If ``X_user`` is a DataFrame that lacks one of the input columns.

    Notes
    -----
    Test ratio, random state and number of estimators are fixed inside the
    function (values come from a previous ML optimization).
    '''
    feature_cols = ['AA_NP', 'AA_POS', 'AA_NEG', 'AA_POL', 'MW', 'AROM', 'ISO_E']
    target_col = 'LOG2FC'

    # Bagging regressor parameters, from ML training
    test_ratio = 0.30
    seed_random = 42
    n_estim = 20

    # read_csv already returns a DataFrame; no extra wrapping needed
    df = pd.read_csv(csv_file)

    # Assign input (X) / output (y) features
    X = df[feature_cols]
    y = df[target_col]

    # Min-max normalization (rescaling) of the input features. The scaler
    # works on a numpy array, so the column names are re-attached afterwards.
    min_max_scaler = preprocessing.MinMaxScaler()
    X_norm = pd.DataFrame(min_max_scaler.fit_transform(X.values), columns=feature_cols)

    # Only the training part of the split is used for fitting; the split is
    # kept so the fitted model is the same one obtained during ML training.
    X_train, _, y_train, _ = train_test_split(
        X_norm, y, test_size=test_ratio, random_state=seed_random, shuffle=True
    )

    # Model is Bagging Regressor, base estimator is Decision Tree regressor.
    # The estimator is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2 (old name
    # removed in 1.4); the positional form works on both.
    model = BaggingRegressor(
        DecisionTreeRegressor(), n_estimators=n_estim, random_state=seed_random
    )
    model.fit(X_train, y_train)

    # Align the user frame with the training columns (names and order) so a
    # differently ordered or wider frame does not break prediction.
    if isinstance(X_user, pd.DataFrame):
        missing = [col for col in feature_cols if col not in X_user.columns]
        if missing:
            raise ValueError("X_user is missing required feature column(s): %s" % missing)
        X_user = X_user[feature_cols]

    # NOTE(review): X_user is NOT passed through min_max_scaler, while the
    # model is trained on min-max scaled features. This presumably expects
    # the caller to supply already-normalized values - confirm.
    return model.predict(X_user)

### EXAMPLE USE OF THE FUNCTION:

User input:

In [8]:
# One record of example input features (all values here lie in [0, 1]),
# wrapped in a list so pandas builds a single-row DataFrame from it.
user_features = [
    dict(
        AA_NP=0.23,
        AA_POS=0.3,
        AA_NEG=0.9,
        AA_POL=0.3,
        MW=0.95,
        AROM=0.891,
        ISO_E=0.012,
    )
]
X_user = pd.DataFrame(data=user_features)
X_user

Unnamed: 0,AA_NP,AA_POS,AA_NEG,AA_POL,MW,AROM,ISO_E
0,0.23,0.3,0.9,0.3,0.95,0.891,0.012


In [9]:
csv_file = 'compiled_features_complete.csv'
pred_input_logFC = bagging_regr(csv_file, X_user)
# bagging_regr returns the array produced by model.predict, so take the first
# (here: only) element explicitly. %-formatting the whole array relies on an
# implicit array-to-scalar conversion that NumPy has deprecated since 1.25.
print("The predicted log2FC for user input is %.5f" % pred_input_logFC[0])

The predicted log2FC for user input is -0.01346
