In [None]:
# All imports here
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree

# Load the first sheet of each cleaned workbook as a DataFrame.
# Both workbooks are expected to list the samples in the same row order.
predictors_df = pd.read_excel('Predictors_Cleaned.xlsx', sheet_name=0)  # rows: samples, columns: predictors
outcomes_df = pd.read_excel('Outcomes_Cleaned.xlsx', sheet_name=0)  # rows: samples, columns: mechanisms
# Shapes when this notebook was written: predictors 211 x 33, outcomes 211 x 41.

# Fail fast if the two sheets cannot be row-aligned, instead of discovering it
# part-way through the expensive training cell.
# NOTE(review): only the row count is checked here; the row *order* is still
# assumed. Presumably 'SAMPLE NAME' and 'MECHANISM' both hold the sample
# identifiers - if so, compare them as well. TODO confirm.
assert len(predictors_df) == len(outcomes_df), (
    f"Predictors have {len(predictors_df)} rows but outcomes have "
    f"{len(outcomes_df)}; the sheets must contain the same samples in the same order."
)

# Dropping unnecessary metadata
X = predictors_df.drop(columns='SAMPLE NAME')
Y = outcomes_df.drop(columns='MECHANISM')  # full outcome frame; one model is fit per column
feature_names = X.columns

# Storing model scores (one entry per outcome column, filled by the training cell)
r2_list = []
oob_list = []

# Outcome column names to iterate over
columns = list(Y)


In [None]:
# WARNING: expensive cell. For every outcome column it fits 12 random forests
# (4 test-set proportions x 3 forest sizes, up to 1000 trees each), so a full
# run can take a long time on a small machine.
#
# For each outcome column this cell:
#   1. searches test-set proportions and forest sizes for the highest test R^2,
#   2. saves the best model to Models/rf_<column>.joblib,
#   3. records its R^2 and out-of-bag score in r2_list / oob_list,
#   4. saves a permutation-importance bar chart to Features/<column>_features.png.
#
# NOTE: the best model is selected on the same test split it is scored on, so
# the reported R^2 is an optimistic estimate; the OOB score is the less biased one.

# Make sure the output folders exist before any slow training starts
os.makedirs("Models", exist_ok=True)
os.makedirs("Features", exist_ok=True)

# Start from empty score lists so that re-running this cell does not append a
# second set of scores and break the alignment with `columns` in the plots below
r2_list = []
oob_list = []

# Iterate through each class of genes
for i in columns:

    # Reset the best score, model and held-out data for each column.
    # -inf (not 0) so that a fitted model is always selected, even when every
    # candidate has a negative R^2.
    r2 = -np.inf
    best_model = None
    best_x_test = None
    best_y_test = None

    # Iterate through common proportions of test/train data
    for j in [0.2, 0.4, 0.6, 0.8]:

        # Split data; seeded so the search is reproducible
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y[i], test_size=j, random_state=0
        )

        # Iterate through common n_estimators
        for k in [100, 500, 1000]:

            # Create model
            rf = RandomForestRegressor(n_estimators=k, random_state=0, oob_score=True)  # add max_features here
            rf.fit(x_train, y_train)

            # Find model r squared. Keep the sign: a negative R^2 means the
            # model is worse than predicting the mean, so it must never be
            # mistaken for a good fit by taking the absolute value.
            predictions = rf.predict(x_test)
            new_r2 = r2_score(y_test, predictions)

            # Keep the candidate with the highest R^2 (a NaN score compares
            # False and is skipped)
            if new_r2 > r2:
                r2 = new_r2
                # Saving the better model
                best_model = rf
                # Saving the matching held-out data for the importance plot
                best_x_test = x_test
                best_y_test = y_test

    # Only reachable if every candidate scored NaN; keep the score lists
    # aligned with `columns` and move on to the next outcome
    if best_model is None:
        r2_list.append(np.nan)
        oob_list.append(np.nan)
        print(f"{i}: no valid model found, skipping")
        continue

    # Finally dumping the best model for the class
    joblib.dump(best_model, f"Models/rf_{i}.joblib")

    # Storing highest r2 and oob scores
    r2_list.append(r2)
    oob_list.append(best_model.oob_score_)

    # Permutation importance on the test split the best model was scored on
    feature_importance = permutation_importance(best_model, best_x_test, best_y_test, random_state=0)
    forest_importances = pd.Series(feature_importance.importances_mean, index=feature_names)

    # Plot feature importance
    fig, ax = plt.subplots()
    forest_importances.plot.bar(yerr=feature_importance.importances_std, ax=ax)  # adds std bar
    # Title carries the outcome column name (not the last n_estimators value)
    ax.set_title(f"{i} Permutation Feature Importances")
    # Default scoring for a regressor is R^2, so the drop is in R^2, not accuracy
    ax.set_ylabel("Mean Decrease in R-squared")
    fig.tight_layout()  # fit the tick labels inside the figure before saving

    # Saving the figure and releasing it so figures do not pile up in memory
    fig.savefig(f'Features/{i}_features.png')
    plt.close(fig)

    # Printing highest r2 value and oob
    print(f"{i} r2: {r2}\n{i} oob score: {best_model.oob_score_}")



In [None]:
# Overall comparison of the best model's r-squared for every outcome column

# One score per outcome column is required. A mismatch means the training cell
# was not run, was interrupted, or appended to the list more than once.
assert len(r2_list) == len(columns), (
    f"Expected {len(columns)} r2 scores but found {len(r2_list)}; "
    "reset r2_list and re-run the training cell."
)

fig, ax = plt.subplots(figsize=(20, 10))

bar_positions = np.arange(len(columns))
ax.barh(bar_positions, r2_list, align='center')
ax.set_yticks(bar_positions, labels=columns)
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_xlabel('r-squared')
ax.set_title('Model Comparison')

plt.show()

In [None]:
# Overall comparison of the best model's out-of-bag (OOB) score for every outcome column

# One score per outcome column is required. A mismatch means the training cell
# was not run, was interrupted, or appended to the list more than once.
assert len(oob_list) == len(columns), (
    f"Expected {len(columns)} OOB scores but found {len(oob_list)}; "
    "reset oob_list and re-run the training cell."
)

fig, ax = plt.subplots(figsize=(20, 10))

bar_positions = np.arange(len(columns))
ax.barh(bar_positions, oob_list, align='center')
ax.set_yticks(bar_positions, labels=columns)
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_xlabel('OOB Score')
ax.set_title('RandomForest Model Comparison')

plt.show()
