In [1]:
import pandas
from consts import *
from file_utils import read_human_genome_fasta_dict

# Load the article dataset; DATA_PATH is provided by the star-import of `consts`.
article_csv_path = DATA_PATH / 'data_from_article_fixed.csv'
all_data = pandas.read_csv(article_csv_path)

In [2]:
from data_genertion.consts import *

In [3]:
# Feature generation

In [None]:
import numpy as np

# Discard rows with no inhibition measurement, then derive the regression target.
all_data_no_nan = all_data.dropna(subset=[INHIBITION]).copy()
# The 100.001 offset presumably keeps the log argument positive for inhibition
# values up to 100 -- TODO confirm the valid range of INHIBITION.
inhibition_values = all_data_no_nan[INHIBITION]
all_data_no_nan.loc[:, 'log_inhibition'] = -np.log(100.001 - inhibition_values)

In [None]:
# Rows measured in human cell lines.
is_human_row = all_data_no_nan[CELL_LINE_ORGANISM] == 'human'
all_data_no_nan_human = all_data_no_nan[is_human_row]

# Unique gene labels are taken from ALL rows (not only the human ones); the
# human restriction is applied later when rows are matched against genes_u.
genes = all_data_no_nan[CANONICAL_GENE].copy()
genes_u = list(set(genes))

# Drop the two labels that are not looked up as genes. list.remove raises
# ValueError if a label is absent, which surfaces unexpected input data.
for non_gene_label in ('HBV', 'negative_control'):
    genes_u.remove(non_gene_label)

In [12]:
from read_human_genome import get_locus_to_data_dict
import pickle
from consts import CACHE_DIR


# Load the per-gene locus data from the on-disk cache when it exists; otherwise
# build it once and persist it for later runs.
# NOTE(review): the cache file name does not depend on genes_u, so a cache built
# for a different gene set would be reused silently -- delete the file to refresh.
# NOTE(review): pickle.load executes arbitrary code; only use a cache file that
# this notebook (or a trusted run of it) produced.
cache_path = CACHE_DIR / 'gene_to_data_simple_cache.pickle'
if cache_path.exists():
    with open(cache_path, 'rb') as cache_file:
        gene_to_data = pickle.load(cache_file)
else:
    gene_to_data = get_locus_to_data_dict(include_introns=True, gene_subset=genes_u)
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(gene_to_data, cache_file)

Time took to read fasta: 10.494274854660034
Length:  639


In [None]:
from asodesigner.util import get_antisense

# Human rows whose gene is in genes_u (the genes for which locus data was requested).
all_data_human_gene = all_data_no_nan_human[all_data_no_nan_human[CANONICAL_GENE].isin(genes_u)].copy()
PREMRNA_FOUND = 'pre_mrna_found'
SENSE_START = 'sense_start'
SENSE_LENGTH = 'sense_length'
SENSE_TYPE = 'sense_type'

# Counters of rows whose sense sequence was / was not located in the pre-mRNA.
found = 0
not_found = 0

# Per-row results are collected in lists and written to the frame once, which
# avoids one .loc write per cell inside the loop.
sense_starts = []
sense_lengths = []
sense_types = []

for _, row in all_data_human_gene.iterrows():
    locus_info = gene_to_data[row[CANONICAL_GENE]]
    antisense = row[SEQUENCE]
    sense = get_antisense(antisense)
    # str.find returns -1 when the sense sequence does not occur in the pre-mRNA.
    idx = locus_info.full_mrna.find(sense)

    # Rows that are not located keep the "NA" type. The exon/intron decision is
    # made only for located rows, using a flag that is local to this row, so a
    # previous row's result can no longer leak into the current one.
    sense_type = "NA"
    if idx != -1:
        found += 1
        # NOTE(review): assumes the pre-mRNA string starts at the first exon's
        # start coordinate (and that strand does not matter) -- confirm against
        # get_locus_to_data_dict.
        genome_corrected_index = idx + locus_info.exon_indices[0][0]
        in_exon = any(
            exon_indices[0] <= genome_corrected_index <= exon_indices[1]
            for exon_indices in locus_info.exon_indices
        )
        sense_type = 'exon' if in_exon else 'intron'
    else:
        not_found += 1

    sense_starts.append(idx)
    sense_lengths.append(len(antisense))
    sense_types.append(sense_type)

all_data_human_gene[SENSE_START] = sense_starts
all_data_human_gene[SENSE_LENGTH] = sense_lengths
all_data_human_gene[SENSE_TYPE] = sense_types

In [None]:
# Keep only the rows whose sense sequence was located in the gene's pre-mRNA
# (SENSE_START is -1 when the lookup failed), then show how many remain.
located_mask = all_data_human_gene[SENSE_START].ne(-1)
all_data_human_gene_premrna = all_data_human_gene[located_mask]
len(all_data_human_gene_premrna)

In [None]:
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection  import train_test_split, cross_val_score
from sklearn.metrics import make_scorer
import pandas as pd
from data_genertion.consts import *

In [None]:
# Fixed seed so the train/test split (and every number derived from it) is
# reproducible across kernel restarts; matches the seed used for the random forest.
RANDOM_STATE = 42

# Start from the rows whose sense sequence was located in the pre-mRNA.
all_data_human_gene_premrna_no_nan = all_data_human_gene_premrna.copy()
filtered = all_data_human_gene_premrna_no_nan
filtered = filtered[filtered['Cell line organism'] == 'human']
print(len(filtered))
# Rows without a target value or without a density cannot be used.
filtered = filtered.dropna(subset=[INHIBITION, DENSITY]).copy()

ONE_HOT_FEATURE_NAMES = [CELL_LINE, TRANSFECTION, MODIFICATION, SENSE_TYPE]
# append more one-hot features 'first_nucleotide', 'second_nucleotide'

# One call encodes all categorical columns; the resulting column order is the
# same as encoding them one by one in list order.
filtered = pd.get_dummies(filtered, columns=ONE_HOT_FEATURE_NAMES)

filtered['log_volume'] = np.log(filtered[VOLUME])
filtered['log_density'] = np.log(filtered[DENSITY])

# more feature generation
feature_base = [SENSE_START, SENSE_LENGTH]
experiment_features = [VOLUME, 'log_volume', 'log_density', TREATMENT_PERIOD]

# get_dummies names each indicator column "<source column>_<value>". Matching on
# that prefix (instead of a substring test) avoids picking up unrelated columns
# whose name merely contains a feature name.
one_hot_encoded_features = [
    [column for column in filtered.columns if str(column).startswith(f"{one_hot_feature}_")]
    for one_hot_feature in ONE_HOT_FEATURE_NAMES
]

# Flatten the list of list to a single big list
flat_one_hot_encoded_features = [feature for sublist in one_hot_encoded_features for feature in sublist]

features = feature_base + experiment_features + flat_one_hot_encoded_features

train, test = train_test_split(filtered, test_size=0.2, random_state=RANDOM_STATE)

X = train[features]
Y = train['log_inhibition']

model = LinearRegression()
model.fit(X, Y)

# Default scoring for a regressor is R^2.
baseline_score = cross_val_score(model, X, Y, cv=5).mean()


def squared_spearman(y_true, y_pred):
    """Return the squared Spearman rank correlation between targets and predictions."""
    return stats.spearmanr(y_true, y_pred).correlation ** 2


spearman_scorer = make_scorer(squared_spearman, greater_is_better=True)
# `spearman_score` is the cross-validated value (a float) that later cells read;
# the scoring function has its own name so it is not shadowed by this result.
spearman_score = cross_val_score(model, X, Y, cv=5, scoring=spearman_scorer, n_jobs=-1).mean()

print(f"Baseline score: {baseline_score}")
print("Spearman score: ", spearman_score)

In [None]:
import matplotlib.pyplot as plt

# Evaluate the fitted linear model on the held-out test split.
y_predicted = model.predict(test[features])
y_expected = test['log_inhibition']

# Only `from scipy import stats` is imported in this notebook, so the
# correlation functions must be reached through the `stats` namespace.
corr, p_value = stats.pearsonr(y_expected, y_predicted)
print("Pearson Corr: ", corr, " p_value: ", p_value)
corr, p_value = stats.spearmanr(y_expected, y_predicted)
print("Spearman Corr: ", corr, " p_value: ", p_value)

# NOTE(review): both axes show the transformed target ('log_inhibition'), not
# raw inhibition, although the labels below say "Inhibition".
plt.scatter(y_predicted, y_expected, alpha=0.2)
# Identity line: points on it are predicted exactly.
x = np.linspace(np.min(y_predicted), np.max(y_predicted), 100)
plt.plot(x, x, color='red')

plt.xlabel('Predicted Inhibition')
plt.ylabel('Actual Inhibition')
plt.title('Predicted vs Actual Inhibition')
plt.show()

In [None]:
from sklearn.model_selection import permutation_test_score

# Permutation test: compare the model's cross-validated R^2 against the scores
# obtained after shuffling the target 100 times.
permutation_result = permutation_test_score(
    model,
    X,
    Y,
    cv=5,
    n_permutations=100,
    n_jobs=-1,
    scoring='r2',
)
score, permutation_scores, pvalue = permutation_result
print(score, permutation_scores, pvalue)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on every filtered row (train and test together) purely to
# rank the features by impurity-based importance.
rf = RandomForestRegressor(random_state=42)
rf.fit(filtered[features], filtered['log_inhibition'])

# Pair each importance with its feature name, most important first.
importances = rf.feature_importances_
importance_table = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
importance_df = importance_table.sort_values(by='Importance', ascending=False)

In [None]:
# Print one line per feature: name left-aligned in 40 columns, importance to 3 significant digits.
for feature_name, importance in zip(importance_df['Feature'], importance_df['Importance']):
    print(f"{feature_name:<40} {importance:.3}")

In [None]:
# Evaluate the model's choice of features by leaving one out and calculating the score

cv = 5
scorer = spearman_scorer
jobs = -1
feature_importances = []


def _spearman_drop_without(columns_to_drop):
    """Return how much the cross-validated Spearman score falls when the given columns are removed.

    The reduced model is scored with `scorer` (the squared-Spearman scorer), so
    the reference must be the Spearman baseline `spearman_score`; subtracting
    from the R^2 `baseline_score` would compare two different metrics.
    """
    score_without = cross_val_score(
        model, X.drop(columns=columns_to_drop), Y, cv=cv, scoring=scorer, n_jobs=jobs
    ).mean()
    return float(spearman_score - score_without)


for feature in feature_base:
    feature_importances.append((feature, _spearman_drop_without([feature])))

print("Done base features")

# Each categorical feature is dropped as a whole group of its indicator columns.
for feature_name, encoded_columns in zip(ONE_HOT_FEATURE_NAMES, one_hot_encoded_features):
    feature_importances.append((feature_name, _spearman_drop_without(encoded_columns)))

# Largest performance drop (most important feature) first.
sorted_data = sorted(feature_importances, key=lambda x: x[1], reverse=True)
for datum in sorted_data:
    print(datum)

print("The baseline is: ", baseline_score)
print("The spearman baseline is: ", spearman_score)

In [None]:
from asodesigner.read_human_genome import get_human_genome_annotation_db, read_human_genome_fasta_dict

# Obtain the human genome annotation database via the project helper, passing
# create_db=False (presumably this opens an existing database instead of
# building one -- TODO confirm in asodesigner.read_human_genome).
db = get_human_genome_annotation_db(create_db=False)

In [None]:
# Look up every gene of interest in the annotation database, keyed by gene name.
gene_to_gene_obj = {gene: db[gene] for gene in genes_u}

In [None]:
import sys
# sys.getsizeof reports only the size of the dict object itself, not of the
# objects it references, so this is a shallow measurement.
sys.getsizeof(gene_to_data)

In [None]:
from pympler import asizeof
# Deep size of gene_to_data: asizeof also follows the objects the dict references.
asizeof.asizeof(gene_to_data)

In [None]:
# NOTE(review): `fasta_dict` is not assigned in any cell visible in this
# notebook, so this raises NameError on a fresh kernel. It presumably came from
# read_human_genome_fasta_dict in an earlier session -- TODO define it or remove this cell.
asizeof.asizeof(fasta_dict)

In [None]:
from consts import HUMAN_DB_BASIC_INTRONS
import gffutils

# Open the gffutils feature database stored at HUMAN_DB_BASIC_INTRONS.
# NOTE(review): this rebinds `db`, replacing the object returned earlier by
# get_human_genome_annotation_db; cells that use `db` now depend on run order.
db = gffutils.FeatureDB(str(HUMAN_DB_BASIC_INTRONS))
# NOTE(review): 'SELECT * ' has no FROM clause, and it is unclear whether
# FeatureDB exposes a `query` method (its documented raw-SQL entry point appears
# to be `execute`) -- TODO confirm or remove this exploratory call.
db.query('SELECT * ')


# gene_feature = db.features_of_type('gene')
