In [None]:
from notebooks.consts import UPDATED_CSV
from notebooks.preprocessing import preprocess_aso_data

# Run the project's preprocessing helper on UPDATED_CSV; the result is the
# input to the genomic-coordinate step in a later cell.
# NOTE(review): presumably returns a pandas DataFrame — confirm against
# notebooks.preprocessing.
aso_df = preprocess_aso_data(UPDATED_CSV)

In [None]:
from tauso.data.data import get_paths
from tauso.genome.TranscriptMapper import GeneCoordinateMapper

# Look up the resource paths registered under the 'GRCh38' key and build a
# coordinate mapper from the entry stored under 'db'.
# NOTE(review): what the 'db' entry points at (file type, contents) is not
# visible here — confirm in tauso.data.data.
paths = get_paths('GRCh38')
mapper = GeneCoordinateMapper(paths['db'])

In [None]:
from tauso.features.context.ribo_seq import add_genomic_coordinates

# Pass the preprocessed table and the mapper to add_genomic_coordinates; the
# result is the input to populate_ribo_seq in a later cell.
# NOTE(review): presumably returns the table with genomic-coordinate columns
# added (inferred from the function name only) — confirm.
new = add_genomic_coordinates(aso_df, mapper)

In [None]:
from tauso.features.context.ribo_seq import populate_ribo_seq

# populate_ribo_seq returns a pair: the resulting table and `feature_names`,
# which later cells use to select columns from that table.
# Seven flank sizes are requested.
# NOTE(review): the unit of `flanks` (presumably nucleotides) and the meaning
# of the 'human' argument are not visible here — confirm.
new_new_df, feature_names = populate_ribo_seq('human', new, flanks=(30, 50, 100, 200, 400, 600, 800))

In [None]:
from notebooks.features.feature_extraction import save_feature

# Call save_feature once per entry of feature_names, passing the full table
# and the feature's name.
# NOTE(review): where and in what format each feature is written is decided
# inside save_feature and is not visible here.
for feature in feature_names:
    save_feature(new_new_df, feature)

In [None]:
# Work on a copy so the analysis cells below operate on `df` rather than on
# new_new_df itself.
df = new_new_df.copy()

In [None]:
from notebooks.consts import CANONICAL_GENE, CELL_LINE

# A cohort is one (gene, cell line) combination; CANONICAL_GENE and CELL_LINE
# hold the names of the two grouping columns.
cohort_cols = [CANONICAL_GENE, CELL_LINE]

# 1. Tally the rows in each cohort, then keep cohorts with more than 10 samples
cohort_sizes = df.groupby(cohort_cols).size()
cohort_counts = cohort_sizes.reset_index(name='n_samples')
valid_cohorts = cohort_counts.loc[cohort_counts['n_samples'].gt(10)]

In [None]:
from notebooks.consts import INHIBITION
from matplotlib import pyplot as plt
import seaborn as sns

def calculate_cohort_correlations(sub_df):
    """Correlate each column in ``feature_names`` with the target column.

    Uses Spearman (rank) correlation, which is robust to outliers.

    Args:
        sub_df: Rows of a single cohort; must contain every column listed in
            the notebook-level ``feature_names`` plus the ``INHIBITION`` column.

    Returns:
        The result of ``DataFrame.corrwith``: one correlation value per
        feature column, indexed by feature name.
    """
    target = sub_df[INHIBITION]
    feature_block = sub_df[feature_names]
    return feature_block.corrwith(target, method='spearman')

# Keep only rows belonging to cohorts that passed the size filter, then
# compute one row of feature-vs-target correlations per cohort.
df_filtered = df.merge(valid_cohorts[cohort_cols], on=cohort_cols)
cohort_corrs = df_filtered.groupby(cohort_cols).apply(calculate_cohort_correlations)

# Visualization: Cluster Map
# This groups similar cohorts and similar features together.
# sns.clustermap creates its own figure, so no plt.figure() call is made
# beforehand (it would only leave an empty extra figure in the output).
# NaN correlations are replaced by 0 before clustering.
cluster_grid = sns.clustermap(
    cohort_corrs.fillna(0),
    center=0,
    cmap="vlag",
    figsize=(15, 10),
    row_cluster=True,
    col_cluster=True,
)
# Title the figure as a whole: plt.title() would attach the text to whichever
# axes of the grid happens to be current rather than to the plot overall.
cluster_grid.figure.suptitle("Feature-Target Correlation per Cohort", y=1.02)
plt.show()

In [None]:
# Box plots of raw feature values per gene, split by cell line.
# Only the first three entries of feature_names are drawn to keep the output readable.
top_features = feature_names[:3] # Replace with your most important features

for feat in top_features:
    # One figure per feature, drawn on an explicitly created axes.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df_filtered, x=CANONICAL_GENE, y=feat, hue=CELL_LINE, ax=ax)
    ax.set_title(f"Distribution of {feat} across Cohorts")
    ax.tick_params(axis="x", labelrotation=45)
    plt.show()

In [None]:
import pandas as pd
from scipy import stats

# Column layout of the long-format results table. Passing it to the DataFrame
# constructor keeps these columns present even when no (cohort, feature) pair
# survives the filters below; without it an empty result list would produce a
# column-less frame and the 'Correlation' lookup would raise KeyError.
result_columns = ['Gene', 'Cell_Line', 'Feature', 'Correlation', 'P_Value', 'N_Samples']

# Storage for our results: one dict per (cohort, feature) pair
correlation_results = []

# 1. Iterate through each cohort (Gene x Cell Line)
for (gene_val, cell_val), cohort_df in df.groupby([CANONICAL_GENE, CELL_LINE]):

    # Skip tiny cohorts where correlation is meaningless (N < 3)
    if len(cohort_df) < 3:
        continue

    # 2. Iterate through each feature
    for feature in feature_names:
        # Drop rows missing either value, for this feature/target pair only
        clean_data = cohort_df[[feature, INHIBITION]].dropna()

        # The cohort may be large enough overall but too sparse for this feature
        if len(clean_data) < 3:
            continue

        # Pearson correlation and its p-value.
        # Use stats.spearmanr(x, y) if you prefer rank correlation.
        # NOTE(review): a constant column yields an undefined (NaN)
        # correlation; such rows are kept and end up last after sorting.
        r, p_val = stats.pearsonr(clean_data[feature], clean_data[INHIBITION])

        correlation_results.append({
            'Gene': gene_val,
            'Cell_Line': cell_val,
            'Feature': feature,
            'Correlation': r,
            'P_Value': p_val,
            'N_Samples': len(clean_data)
        })

# 3. Create the Results DataFrame (columns fixed so an empty result is valid)
results_df = pd.DataFrame(correlation_results, columns=result_columns)

# 4. Sort by strength of correlation (absolute value), strongest first.
# astype(float) keeps .abs() well-defined when the frame is empty, where the
# column would otherwise have object dtype.
results_df['Abs_Corr'] = results_df['Correlation'].astype(float).abs()
results_df = results_df.sort_values(by='Abs_Corr', ascending=False)

# 5. Simple Print
# Setting pandas options to ensure rows and columns don't get hidden
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print(results_df[result_columns])