In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Load the first sheet of the predictor and outcome workbooks.
# Both sheets are expected to list the samples in the same row order.
predictors_df = pd.read_excel('../Predictors_Cleaned.xlsx', sheet_name=0)  # rows: samples, columns: predictors
outcomes_df = pd.read_excel('../Outcomes_Cleaned.xlsx', sheet_name=0)  # rows: samples, columns: mechanisms

# Strip the metadata columns so only model inputs / targets remain
X = predictors_df.drop(columns='SAMPLE NAME')
Y = outcomes_df.drop(columns='MECHANISM')  # full outcome frame; later cells iterate over its columns
feature_names = X.columns
gene_names = Y.columns

# Outcome column labels as a plain list, used for iterating
columns = list(Y.columns)

In [None]:
# SINGLE TEST
# Smoke test on one outcome column before running the full per-column loop.

# Splitting sets (random_state is fixed so the split is identical on every run)
x_train, x_test, y_train, y_test = train_test_split(
    X, Y['Lipopeptides'], test_size=0.4, random_state=42
)

# Peek at the training predictors instead of dumping the whole frame
print(x_train.head())

# Creating the Lasso object with scikit-learn's default settings
lasso = linear_model.Lasso()
lasso = lasso.fit(x_train, y_train)

# Calculating r2 on the held-out rows.
# The score is kept signed: a negative R² means the model is worse than always
# predicting the mean, and taking abs() would make such a fit look good.
predictions = lasso.predict(x_test)
new_r2 = r2_score(y_test, predictions)

# Report the score this cell exists to check
print(f"Lipopeptides r2: {new_r2}")

In [None]:
# IF SINGLE TEST WORKS THEN ITERATE
# For every outcome column: fit a Lasso on several train/test proportions, keep
# the fit with the highest held-out R², then save that model and a
# permutation-importance bar chart for it.
SEED = 42                           # fixed so splits, saved models and plots are reproducible
TEST_SIZES = [0.2, 0.4, 0.6, 0.8]   # common proportions of test/train data
LASSO_ALPHA = 0.1

# joblib.dump / savefig do not create missing folders, so make sure they exist
os.makedirs("Models", exist_ok=True)
os.makedirs("Features", exist_ok=True)

r2_list = []

for i in columns:

    # Resetting the best score and model for each column. R² = 0 is what a
    # model that always predicts the mean scores, so starting at 0 means only
    # fits that beat that baseline can be selected.
    r2 = 0
    best_model = None

    for j in TEST_SIZES:

        # Split Data
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y[i], test_size=j, random_state=SEED
        )

        # Create model. Lasso has no n_estimators, and the previous inner loop
        # over [100, 500, 1000] never used its value, so it only refitted the
        # same model three times; one fit per split gives the same result.
        xg = linear_model.Lasso(alpha=LASSO_ALPHA)
        xg = xg.fit(x_train, y_train)

        # Find model r squared. Kept signed on purpose: a negative R² is a
        # worse-than-baseline fit, and abs() would rank e.g. -0.9 as a 0.9 model.
        predictions = xg.predict(x_test)
        new_r2 = r2_score(y_test, predictions)

        # Compare model (R² cannot exceed 1, so no upper-bound check is needed)
        if new_r2 > r2:
            r2 = new_r2
            # Saving the better model
            best_model = xg
            # Saving the matching test split for the importance plot
            best_x_test = x_test
            best_y_test = y_test

    # Storing highest r2 (stays 0 when no fit beat the baseline)
    r2_list.append(r2)
    # Printing r2
    print(f"{i} r2: {r2}")

    if best_model is not None:  # if a best model exists
        # Finally dumping the best model for the class
        joblib.dump(best_model, f"Models/lasso_{i}.joblib")

        # Visualizing feature importance on the split the best model was scored on
        perm_importance = permutation_importance(
            best_model, best_x_test, best_y_test, random_state=SEED
        )
        sorted_idx = perm_importance.importances_mean.argsort()

        fig, ax = plt.subplots(figsize=(10, 7))
        ax.barh(feature_names[sorted_idx], perm_importance.importances_mean[sorted_idx])
        ax.set_xlabel(f"{i} Permutation Importance")
        fig.savefig(f'Features/{i}_features.png')
        # Close explicitly so figures do not pile up in memory across columns
        plt.close(fig)