In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRanker
from sklearn.metrics import ndcg_score

import pandas as pd
from scipy import stats
from sklearn.metrics import make_scorer
from asodesigner.consts import *
from asodesigner.file_utils import read_human_genome_fasta_dict

# Load data
all_data = pd.read_csv(DATA_PATH / 'data_from_article_fixed.csv')

# Feature engineering
all_data['GC_content'] = all_data['Sequence'].str.count('[GC]') / all_data['Sequence'].str.len()
all_data['Sequence_length'] = all_data['Sequence'].str.len()

# (Optional) check initial structure
# print(all_data[['Cell_line', 'ISIS', 'Sequence', 'GC_content', 'Sequence_length', 'Inhibition(%)']].head())

# Filter out cell lines with fewer than 300 ASOs
min_group_size = 300
valid_cell_lines = (
    all_data.groupby('Cell_line')
    .filter(lambda x: len(x) >= min_group_size)['Cell_line']
    .unique()
)
# Ensure inhibition is finite
all_data = all_data[np.isfinite(all_data['Inhibition(%)'])]

all_data = all_data[all_data['Cell_line'].isin(valid_cell_lines)]

# Split by cell line — group-aware
cell_lines = all_data['Cell_line'].unique()
train_lines, test_lines = train_test_split(cell_lines, test_size=0.2, random_state=42)

train_mask = all_data['Cell_line'].isin(train_lines)
test_mask = ~train_mask

# Consistent features for both train and test sets
feature_cols = ['ASO_volume(nM)','Treatment_Period(hours)']

X_train = all_data.loc[train_mask, feature_cols].values
y_train = all_data.loc[train_mask, 'Inhibition(%)'].values
group_train = all_data.loc[train_mask].groupby('Cell_line').size().values.tolist()

X_test = all_data.loc[test_mask, feature_cols].values
y_test = all_data.loc[test_mask, 'Inhibition(%)'].values
group_test = all_data.loc[test_mask].groupby('Cell_line').size().values.tolist()

# Train XGBRanker
ranker = XGBRanker(
    objective='rank:pairwise',
    learning_rate=0.1,
    n_estimators=50,
    random_state=42
)
ranker.fit(X_train, y_train, group=group_train)

# # 4. Predict and evaluate
y_pred = ranker.predict(X_test)

# Compute NDCG@all
def compute_grouped_ndcg(y_true, y_pred, group_sizes, k=None):
    scores = []
    start = 0
    for size in group_sizes:
        end = start + size
        if size < 2:
            start = end
            continue
        yt = y_true[start:end]
        yp = y_pred[start:end]

        # Shift y_true to be non-negative
        if np.min(yt) < 0:
            yt = yt - np.min(yt)

        scores.append(ndcg_score([yt], [yp], k=k))
        start = end
    return np.mean(scores) if scores else 0.0


print("=== Evaluation on Test Set ===")

print(f"NDCG@all: {compute_grouped_ndcg(y_test, y_pred, group_test):.4f}")
print(f"NDCG@5  : {compute_grouped_ndcg(y_test, y_pred, group_test, k=5):.4f}")
print(f"NDCG@10 : {compute_grouped_ndcg(y_test, y_pred, group_test, k=10):.4f}")

# 5. Compute p-value for NDCG
def compute_ndcg_p_value(y_true, group_sizes, y_pred, n_permutations=1000):
    model_ndcg = compute_grouped_ndcg(y_true, y_pred, group_sizes)
    
    null_scores = []
    for _ in range(n_permutations):
        permuted = []
        start = 0
        for size in group_sizes:
            end = start + size
            segment = y_pred[start:end]
            permuted.append(np.random.permutation(segment))
            start = end
        permuted_y_pred = np.concatenate(permuted)
        null_scores.append(compute_grouped_ndcg(y_true, permuted_y_pred, group_sizes))

  
    
    null_scores = np.array(null_scores)
    p_val = np.mean(null_scores >= model_ndcg) # check if coerent
    print(f"Model NDCG: {model_ndcg:.4f}, Null NDCG mean: {np.mean(null_scores):.4f}, Null NDCG std: {np.std(null_scores):.4f}")
    return model_ndcg, p_val

# Example usage
model_ndcg, pval = compute_ndcg_p_value(y_test, group_test,
                                            ranker.predict(X_test), n_permutations=1000)
print(f"Model NDCG: {model_ndcg:.4f}, p-value: {pval:.4e}")





=== Evaluation on Test Set ===
NDCG@all: 0.9770
NDCG@5  : 0.7963
NDCG@10 : 0.7971
Model NDCG: 0.9770, Null NDCG mean: 0.9704, Null NDCG std: 0.0006
Model NDCG: 0.9770, p-value: 0.0000e+00


In [13]:
# Evaluate on training set
y_train_pred = ranker.predict(X_train)

print("=== Evaluation on Training Set ===")
print(f"NDCG@all: {compute_grouped_ndcg(y_train, y_train_pred, group_train):.4f}")
print(f"NDCG@5  : {compute_grouped_ndcg(y_train, y_train_pred, group_train, k=5):.4f}")
print(f"NDCG@10 : {compute_grouped_ndcg(y_train, y_train_pred, group_train, k=10):.4f}")

# Compute p-value on training predictions
model_ndcg_train, pval_train = compute_ndcg_p_value(y_train, group_train, y_train_pred, n_permutations=1000)
print(f"Model NDCG (train): {model_ndcg_train:.4f}, p-value: {pval_train:.4e}")


=== Evaluation on Training Set ===
NDCG@all: 0.9461
NDCG@5  : 0.7287
NDCG@10 : 0.7321
Model NDCG (train): 0.9461, p-value: 0.0000e+00


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRanker
from sklearn.metrics import ndcg_score

import pandas as pd
from scipy import stats
from sklearn.metrics import make_scorer
from asodesigner.consts import *
from asodesigner.file_utils import read_human_genome_fasta_dict

# Load data
all_data = pd.read_csv(DATA_PATH / 'data_from_article_fixed.csv')

# Feature engineering
all_data['GC_content'] = all_data['Sequence'].str.count('[GC]') / all_data['Sequence'].str.len()
all_data['Sequence_length'] = all_data['Sequence'].str.len()

# (Optional) check initial structure
# print(all_data[['Cell_line', 'ISIS', 'Sequence', 'GC_content', 'Sequence_length', 'Inhibition(%)']].head())

# Filter out cell lines with fewer than 300 ASOs
min_group_size = 300
valid_cell_lines = (
    all_data.groupby('Cell_line')
    .filter(lambda x: len(x) >= min_group_size)['Cell_line']
    .unique()
)
# Ensure inhibition is finite
all_data = all_data[np.isfinite(all_data['Inhibition(%)'])]

all_data = all_data[all_data['Cell_line'].isin(valid_cell_lines)]

# Split by cell line — group-aware
cell_lines = all_data['Cell_line'].unique()
train_lines, test_lines = train_test_split(cell_lines, test_size=0.2, random_state=42)

train_mask = all_data['Cell_line'].isin(train_lines)
test_mask = ~train_mask

# Consistent features for both train and test sets
feature_cols = ['ASO_volume(nM)','Treatment_Period(hours)']

X_train = all_data.loc[train_mask, feature_cols].values
y_train = all_data.loc[train_mask, 'Inhibition(%)'].values
group_train = all_data.loc[train_mask].groupby('Cell_line').size().values.tolist()

X_test = all_data.loc[test_mask, feature_cols].values
y_test = all_data.loc[test_mask, 'Inhibition(%)'].values
group_test = all_data.loc[test_mask].groupby('Cell_line').size().values.tolist()

# Train XGBRanker
ranker = XGBRanker(
    objective='rank:pairwise',
    learning_rate=0.1,
    n_estimators=50,
    random_state=42
)
ranker.fit(X_train, y_train, group=group_train)

# # 4. Predict and evaluate
y_pred = ranker.predict(X_test)

# Compute NDCG@all
def compute_grouped_ndcg(y_true, y_pred, group_sizes, k=None):
    scores = []
    start = 0
    for size in group_sizes:
        end = start + size
        if size < 2:
            start = end
            continue
        yt = y_true[start:end]
        yp = y_pred[start:end]

        # Shift y_true to be non-negative
        if np.min(yt) < 0:
            yt = yt - np.min(yt)

        scores.append(ndcg_score([yt], [yp], k=k))
        start = end
    return np.mean(scores) if scores else 0.0


print("=== Evaluation on Test Set ===")

print(f"NDCG@all: {compute_grouped_ndcg(y_test, y_pred, group_test):.4f}")
print(f"NDCG@5  : {compute_grouped_ndcg(y_test, y_pred, group_test, k=5):.4f}")
print(f"NDCG@10 : {compute_grouped_ndcg(y_test, y_pred, group_test, k=10):.4f}")

# 5. Compute p-value for NDCG
def compute_ndcg_p_value(y_true, group_sizes, y_pred, n_permutations=1000):
    model_ndcg = compute_grouped_ndcg(y_true, y_pred, group_sizes)
    
    null_scores = []
    for _ in range(n_permutations):
        permuted = []
        start = 0
        for size in group_sizes:
            end = start + size
            segment = y_pred[start:end]
            permuted.append(np.random.permutation(segment))
            start = end
        permuted_y_pred = np.concatenate(permuted)
        null_scores.append(compute_grouped_ndcg(y_true, permuted_y_pred, group_sizes))

  
    
    null_scores = np.array(null_scores)
    p_val = np.mean(null_scores - model_ndcg) # check if coerent with the original logic
    return model_ndcg, p_val

# Example usage
model_ndcg, pval = compute_ndcg_p_value(y_test, group_test,
                                            ranker.predict(X_test), n_permutations=1000)
print(f"Model NDCG: {model_ndcg:.4f}, p-value: {pval:.4e}")





=== Evaluation on Test Set ===
NDCG@all: 0.9770
NDCG@5  : 0.7963
NDCG@10 : 0.7971
Model NDCG: 0.9770, p-value: -6.5455e-03


In [7]:
import pandas as pd
import numpy as np

# First 10 ASOs from the all_data DataFrame, keep selected columns
raw_data = all_data[['ISIS', 'Cell_line', 'Sequence', 'GC_content', 'Sequence_length', 'Inhibition(%)']].head(10).copy()
# sort by inhibition percentage
raw_data = raw_data.sort_values(by='Inhibition(%)', ascending=False).reset_index(drop=True)

# Feature engineering
raw_data['GC_content'] = raw_data['Sequence'].str.count('[GC]') / raw_data['Sequence'].str.len()
raw_data['Sequence_length'] = raw_data['Sequence'].str.len()

# Select features
feature_cols = ['GC_content', 'Sequence_length']
X_new = raw_data[feature_cols].values

# Predict scores
raw_data['score'] = ranker.predict(X_new)

# Sort by predicted score (descending)
df_sorted = raw_data.sort_values(by='score', ascending=False).reset_index(drop=True)

# Display ranked ASOs
print(df_sorted[['ISIS', 'Sequence', 'Cell_line', 'GC_content', 'Sequence_length', 'Inhibition(%)','score']])


     ISIS          Sequence Cell_line  GC_content  Sequence_length  \
0  651490  CACTTGTACTAGTATG      A431      0.3750               16   
1  651499  GCATTGCTAGTTCAAA      A431      0.3750               16   
2  651529  ACTAATAGCAGTGGAA      A431      0.3750               16   
3  651540  TTAGGAGTCTTTATAG      A431      0.3125               16   
4  651479  GGTGAATATCTTCAAA      A431      0.3125               16   
5  540733  GCTAAAACAAATGCTA      A431      0.3125               16   
6  540747  TATAATGGTGAATATC      A431      0.2500               16   
7  540806  GCATGAAGATTTCTGG      A431      0.4375               16   
8  651530  TGACTAATAGCAGTGG      A431      0.4375               16   
9  651502  CAACTGCATGCACCAA      A431      0.5000               16   

   Inhibition(%)     score  
0           36.0 -0.170388  
1           46.0 -0.170388  
2           27.0 -0.170388  
3           30.0 -0.490447  
4           28.0 -0.490447  
5           33.0 -0.490447  
6            7.0 -0.523849

In [3]:
import numpy as np
import pandas as pd

# Step 1: Create dummy ASOs
np.random.seed(123)
n = 10
dummy_asos = pd.DataFrame({
    'gc_content': np.round(np.random.uniform(0.3, 0.7, n), 3),
    'purine_content': np.round(np.random.uniform(0.4, 0.6, n), 3),
    'sequence_length': np.random.randint(15, 21, n),
    'cell_line': ['A431'] * n  # Optional
})

# Step 2: Select features used in training
X_dummy = dummy_asos[['gc_content', 'purine_content', 'sequence_length']]

# Step 3: Predict ranking scores using your trained model
dummy_asos['score'] = ranker.predict(X_dummy)

# Step 4: Rank ASOs by model score (higher is better)
ranked_asos = dummy_asos.sort_values(by='score', ascending=False).reset_index(drop=True)

print(ranked_asos)


   gc_content  purine_content  sequence_length cell_line     score
0       0.579           0.469               15      A431  0.233315
1       0.588           0.480               18      A431  0.078414
2       0.692           0.436               19      A431 -0.191720
3       0.574           0.435               19      A431 -0.619765
4       0.521           0.412               16      A431 -0.839575
5       0.492           0.506               16      A431 -0.953596
6       0.391           0.488               15      A431 -1.343866
7       0.469           0.548               19      A431 -1.349718
8       0.414           0.546               20      A431 -1.743491
9       0.457           0.506               20      A431 -1.771293
