In [1]:
import pandas as pd
from scripts.data_genertion.consts import *
from asodesigner.file_utils import read_human_genome_fasta_dict
from asodesigner.consts import *

In [2]:
# Load the feature table from "features_output.csv" (path is relative to the
# notebook's working directory) into the DataFrame `filtered`, which the later
# cells read their feature and target columns from.
filtered = pd.read_csv("features_output.csv")

In [3]:
from scipy.stats import pearsonr, spearmanr
from sklearn.feature_selection import mutual_info_regression
def print_correlations(df, name1, name2, p_value_threshold=None, random_state=42):
    """Print Pearson, Spearman and mutual-information scores for two columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both columns.
    name1 : str
        Name of the feature column.
    name2 : str
        Name of the target column.
    p_value_threshold : float, optional
        When given, the Pearson / Spearman lines are printed only if their
        p-value is strictly below this value. ``None`` (the default) disables
        the filter, so both lines are always printed.
    random_state : int, optional
        Seed forwarded to ``mutual_info_regression`` so that the printed
        mutual-information score is the same on every run.

    Returns
    -------
    None
        Results are written to stdout only.
    """

    def _passes_filter(p_value):
        # No threshold means "show everything" -- including p == 1.0, which a
        # `p < 1` comparison would silently drop.
        return p_value_threshold is None or p_value < p_value_threshold

    corr, p_value = pearsonr(df[name1], df[name2])
    if _passes_filter(p_value):
        print(f"Feature: {name1:<35}, Pearson: {corr:<5.2f}, p-value: {p_value:<10.2} Target: {name2:<35}")

    corr, p_value = spearmanr(df[name1], df[name2])
    if _passes_filter(p_value):
        print(f"Feature: {name1:<35}, Spearman: {corr:<5.2f}, p-value: {p_value:<10.2} Target: {name2:<35}")

    # The value printed under the "MIC" label is the mutual-information estimate
    # returned by sklearn's mutual_info_regression. It has no p-value, so it is
    # printed regardless of the threshold.
    try:
        mic = mutual_info_regression(
            df[[name1]],
            df[name2],
            discrete_features='auto',
            random_state=random_state,
        )[0]
        print(f"Feature: {name1:<35}, MIC:      {mic:<5.2f}                             Target: {name2:<35}")
    except Exception as e:
        # Best effort: report the failure (e.g. non-numeric column) and carry on.
        print(f"Could not compute MIC for {name1} vs {name2}: {e}")

Compute the Pearson, Spearman, and mutual-information association between every column of the feature table and the target `log_inhibition`.

In [4]:
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.feature_selection import mutual_info_regression
from IPython.display import display

# All features to analyze
# features_to_check = sequence_features  # or replace with your own list
target = 'log_inhibition'

# Seed for the mutual-information estimator. Without it the estimate (and so
# the ranking below) can differ from run to run.
MI_RANDOM_STATE = 42

# The target column is looked up once, outside the per-feature try block, so a
# missing target fails immediately instead of producing one error row per column.
y = filtered[target]

# Collect one record per column of `filtered`. Columns for which any of the
# three measures cannot be computed are kept, with the exception message stored
# under 'Error'.
results = []
for feature in filtered.columns:
    try:
        x = filtered[feature]
        pearson_corr, pearson_p = pearsonr(x, y)
        spearman_corr, spearman_p = spearmanr(x, y)
        # Reported under the 'MIC' key; the value is sklearn's
        # mutual_info_regression estimate.
        mic = mutual_info_regression(
            filtered[[feature]],
            y,
            discrete_features='auto',
            random_state=MI_RANDOM_STATE,
        )[0]
        results.append({
            'Feature': feature,
            'Target': target,
            'Pearson': round(pearson_corr, 3),
            'Pearson_p': pearson_p,
            'Spearman': round(spearman_corr, 3),
            'Spearman_p': spearman_p,
            'MIC': round(mic, 3)
        })
    except Exception as e:
        results.append({
            'Feature': feature,
            'Target': target,
            'Pearson': None,
            'Pearson_p': None,
            'Spearman': None,
            'Spearman_p': None,
            'MIC': None,
            'Error': str(e)
        })

# Build the table, ranked by the mutual-information score (highest first),
# and display every row.
results_df = pd.DataFrame(results).sort_values(by="MIC", ascending=False)
pd.set_option('display.max_rows', None)
display(results_df)


Unnamed: 0,Feature,Target,Pearson,Pearson_p,Spearman,Spearman_p,MIC,Error
43,log_inhibition,log_inhibition,1.0,0.0,1.0,0.0,4.692,
15,Inhibition(%),log_inhibition,0.88,0.0,1.0,0.0,4.682,
0,ISIS,log_inhibition,-0.051,1.557907e-18,-0.129,3.9771790000000003e-112,0.348,
44,sense_start,log_inhibition,-0.027,2.750779e-06,0.199,6.941965e-265,0.245,
5,ASO_volume(nM),log_inhibition,0.3,0.0,0.325,0.0,0.231,
53,enc_score_gene,log_inhibition,-0.066,4.172096e-30,-0.157,4.656026e-164,0.196,
56,enc_score_gene_scaled,log_inhibition,-0.183,3.90977e-225,-0.223,0.0,0.195,
54,tai_score_gene,log_inhibition,0.013,0.02838418,0.044,1.839655e-14,0.185,
57,tai_score_gene_scaled,log_inhibition,0.053,3.9178169999999996e-20,0.06,3.2379230000000003e-25,0.179,
58,ASO_chimera_score_scaled,log_inhibition,0.073,1.2655699999999999e-36,0.113,2.696995e-86,0.092,


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
from sklearn.model_selection import train_test_split

# Target (log inhibition) and the two experimental-condition features "G".
y_log = filtered["log_inhibition"]
X_G = filtered[['Treatment_Period(hours)', 'ASO_volume(nM)']]

# Hold out 20% of the rows as a validation set (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X_G,
    y_log,
    test_size=0.2,
    random_state=42,
)

# Fit the linear regression on the training split only.
model_lin = LinearRegression()
model_lin.fit(X_train, y_train)

# Score the held-out rows.
y_pred_val = model_lin.predict(X_val)
residuals_val = y_val - y_pred_val  # kept for the next stage (e.g., sequence model)
mae_val = mean_absolute_error(y_val, y_pred_val)
r2_val = r2_score(y_val, y_pred_val)

# Report the validation metrics.
print(
    "=== Linear Regression on G (time + volume) ===",
    f"R² on validation:  {r2_val:.4f}",
    f"MAE on validation: {mae_val:.4f}",
    sep="\n",
)



=== Linear Regression on G (time + volume) ===
R² on validation:  0.1030
MAE on validation: 0.5344


Using a linear regression model with only Treatment_Period(hours) and ASO_volume(nM) to predict log_inhibition yielded modest results, with an R² of 0.103 and a mean absolute error (MAE) of 0.534. This means that the linear model was able to explain only about 10% of the variability in the inhibition outcome, suggesting that either the relationship between these experimental conditions and ASO effectiveness is nonlinear, or that these features alone are not sufficient to capture the underlying biological effects. A more flexible model, such as a Random Forest, may better capture potential nonlinear interactions and improve predictive performance.

In [14]:
from sklearn.ensemble import RandomForestRegressor
# Random forest (100 trees, fixed seed) trained on the existing training split.
model_rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model_rf.fit(X_train, y_train)

# Predict the validation rows and compute the same metrics as the other models.
y_pred_rf = model_rf.predict(X_val)
residuals_rf = y_val - y_pred_rf
mae_rf = mean_absolute_error(y_val, y_pred_rf)
r2_rf = r2_score(y_val, y_pred_rf)

print(
    "=== Random Forest on G (time + volume) ===",
    f"R² on validation: {r2_rf:.4f}",
    f"MAE on validation : {mae_rf:.4f}",
    sep="\n",
)


=== Random Forest on G (time + volume) ===
R² on validation: 0.3574
MAE on validation : 0.4347


Using Treatment_Period(hours) and ASO_volume(nM) as input features, a Random Forest regression model was trained to predict log_inhibition. The model achieved an R² of 0.357 and a mean absolute error (MAE) of 0.435 on the validation set. These results indicate that experimental conditions alone explain a moderate portion of the variance in ASO efficacy, and that the relationship between these variables and inhibition is likely nonlinear. The performance of the Random Forest model highlights its strength in capturing complex patterns between treatment parameters and ASO response.

In [None]:
#generating Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Two-stage pipeline: expand the inputs to degree-2 polynomial terms (no bias
# column), then fit a linear regression on the expanded features.
degree_two_terms = PolynomialFeatures(degree=2, include_bias=False)
poly_model = make_pipeline(degree_two_terms, LinearRegression())

# Train on the training split.
poly_model.fit(X_train, y_train)

# Score the validation split.
y_pred_poly = poly_model.predict(X_val)
residuals_poly = y_val - y_pred_poly
mae_poly = mean_absolute_error(y_val, y_pred_poly)
r2_poly = r2_score(y_val, y_pred_poly)

# Print the validation metrics.
print(
    "=== Polynomial Regression on G (time + volume) ===",
    f"R² on validation:  {r2_poly:.4f}",
    f"MAE on validation: {mae_poly:.4f}",
    sep="\n",
)


=== Polynomial Regression on G (time + volume) ===
R² on validation:  0.1382
MAE on validation: 0.5183


In [16]:
from xgboost import XGBRegressor

# Gradient-boosted trees (100 estimators, fixed seed) on the training split.
model_xgb = XGBRegressor(
    n_estimators=100,
    random_state=42,
)
model_xgb.fit(X_train, y_train)

# Validation predictions, residuals and metrics.
y_pred_xgb = model_xgb.predict(X_val)
residuals_xgb = y_val - y_pred_xgb
mae_xgb = mean_absolute_error(y_val, y_pred_xgb)
r2_xgb = r2_score(y_val, y_pred_xgb)

# Print the validation metrics.
print(
    "=== XGBoost on G (time + volume) ===",
    f"R² on validation:  {r2_xgb:.4f}",
    f"MAE on validation: {mae_xgb:.4f}",
    sep="\n",
)


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


=== XGBoost on G (time + volume) ===
R² on validation:  0.3573
MAE on validation: 0.4347


| Model                                | R² (Validation) | MAE (Validation) | Notes                                          |
| ------------------------------------ | --------------- | ---------------- | ---------------------------------------------- |
| **Linear Regression**                | 0.103           | 0.534            | Weak performance; limited explanatory power    |
| **Polynomial Regression (degree 2)** | 0.138           | 0.518            | Slight improvement, but still underwhelming    |
| **Random Forest**                    | 0.357           | 0.435            | Significant improvement; captures nonlinearity |
| **XGBoost**                          | 0.357           | 0.435            | Matches Random Forest almost exactly           |


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import numpy as np

# 1. Define input (G) and target (log(y))
X_G = filtered[['Treatment_Period(hours)', 'ASO_volume(nM)']]
y_log = filtered['log_inhibition']  # already in log scale

# 2. One slot per row; each slot is filled by the model of the fold in which
#    that row was held out, so no row is predicted by a model that saw it.
oof_log_G = np.zeros(len(X_G))

# 3. 5-fold cross-validation with shuffling and a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 4. Loop over folds to generate out-of-fold predictions.
#    Fold-local names are used on purpose: rebinding X_train / X_val / y_train /
#    model_rf here would overwrite the hold-out split and model from the earlier
#    cells while leaving y_val untouched, so re-running an evaluation cell
#    afterwards would compare predictions and targets from different rows.
for train_idx, val_idx in kf.split(X_G):
    X_fold_train = X_G.iloc[train_idx]
    X_fold_val = X_G.iloc[val_idx]
    y_fold_train = y_log.iloc[train_idx]

    fold_model = RandomForestRegressor(n_estimators=100, random_state=42)
    fold_model.fit(X_fold_train, y_fold_train)
    oof_log_G[val_idx] = fold_model.predict(X_fold_val)

# 5. Compute log(M) = log(y) - log(G); the result is a Series that keeps
#    y_log's index.
log_M = y_log - oof_log_G

# Optional: quick sanity check
print(f"log(G) min: {oof_log_G.min():.4f}, max: {oof_log_G.max():.4f}")
print(f"log(M) first 5 values: {log_M.head().round(4).tolist()}")


log(G) min: -4.5675, max: -2.0429
log(M) first 5 values: [-0.8673, -1.1478, -0.2528, -0.9163, -0.8215]


In [None]:
# Store log(M) on the feature table as a new column. This mutates `filtered`
# in place, so any cell that iterates over `filtered.columns` and is run after
# this one will also see "log_inhibition_residual".
filtered["log_inhibition_residual"] = log_M
# Show the first rows as the cell output to confirm the new column is present.
filtered.head()