IMPORTS
==================



In [None]:
from Bio import SeqIO
import pandas as pd
from io import StringIO
import os
import pickle

In [None]:
from src.tauso.off_target.Roni.off_target_functions import dna_to_rna_reverse_complement, normalize_chrom, get_min_max_coords, parse_risearch_output, aggregate_off_targets, parse_gtf, parse_fasta, name2accession

from src.tauso.off_target.Roni.get_premRNA_sequences import get_relevant_exp_data, get_premrna_coords, add_original_sequence, final_func_premrna

from src.tauso.off_target.Roni.mutate_cell_line_transcriptome import get_expression_of_cell_line, get_mutations_of_cell_line, mutate, mutation_dict, celline_list, find_shift

DATA
====

Paths

In [None]:
# Locate the project root two directory levels above this file.
# `__file__` is only defined when the code runs as a script/module; inside a
# Jupyter/IPython kernel it does not exist, so fall back to the current
# working directory there instead of failing with a NameError.
try:
    _base_dir = os.path.dirname(__file__)
except NameError:
    _base_dir = os.getcwd()

PROJECT_ROOT = os.path.abspath(os.path.join(_base_dir, "..", ".."))

# Directory holding every input file referenced below.
# NOTE(review): "data_genertion" looks like a misspelling of "data_generation";
# it is kept verbatim because it has to match the directory name on disk.
DATA_DIR = os.path.join(PROJECT_ROOT, "scripts", "data_genertion", "cell_line_expression")

# Reference genome sequence (FASTA) and gene annotation (GTF).
fasta_path = os.path.join(DATA_DIR, "Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa")
gtf_path = os.path.join(DATA_DIR, "gencode.v48.chr_patch_hapl_scaff.annotation.gtf")
# Expression and somatic-mutation tables (CSV).
exp_path = os.path.join(DATA_DIR, "OmicsExpressionProteinCodingGenesTPMLogp1.csv")
mut_path = os.path.join(DATA_DIR, "OmicsSomaticMutations.csv")

Select the cell line(s) to process

In [None]:
# Identifiers of the cell lines to process; edit this list to change the selection.
cell_line_lst = [
    "ACH-000681",
]
# Starts empty; keyed by cell-line identifier once the per-cell-line data is loaded.
cell_line2data = dict()

I. GET SEQUENCES
=

In [None]:
# Step 0 - Expression and mutation data per cell line
# Each selected cell line is mapped to a two-item list: [expression, mutations].
for cell_line in cell_line_lst:
    exp_data = get_expression_of_cell_line(cell_line, exp_path)
    mut_data = get_mutations_of_cell_line(cell_line, mut_path)
    cell_line2data[cell_line] = [exp_data, mut_data]

# Step 1 - Parse FASTA
# Use the FASTA path defined in the "Paths" cell rather than a literal
# placeholder string, which would point at a file that does not exist.
# NOTE(review): the second argument is presumably the location where the parsed
# result is pickled; it is written next to the input file here - confirm against
# parse_fasta's signature and adjust if another cache location is expected.
fasta_pickle_path = fasta_path + ".pkl"
chrom2seq = parse_fasta(fasta_path, fasta_pickle_path)

# Step 2 - Parse GTF
# Same convention as for the FASTA file (see the review note above).
gtf_pickle_path = gtf_path + ".pkl"
t_id2coords = parse_gtf(gtf_path, gtf_pickle_path)

# Step 3 - (optional, not implemented here)
# Produce smaller sequence files; only needed when there is not enough RAM
# to hold the full parsed inputs.

# Step 4 - Extract pre-mRNA sequences
transcriptomes = final_func_premrna(cell_line2data, chrom2seq, t_id2coords)


II. MUTATE SEQUENCES
=

In [None]:

# Suffix shared by the per-cell-line transcriptome input and the mutated output.
addition_exp = '_transcriptome_premRNA.merged.csv'

# All files live inside DATA_DIR. DATA_DIR has no trailing path separator, so
# paths are built with os.path.join; plain string concatenation would glue the
# file name onto the directory name and point outside the directory.
# NOTE(security): pickle.load can execute arbitrary code - only load a pickle
# that was produced locally by a trusted run.
with open(os.path.join(DATA_DIR, '_gtf_annotations.pkl'), 'rb') as f:
    annotations = pickle.load(f)
print('GTF annotations loaded.')

# NOTE(review): this iterates `celline_list` imported from
# mutate_cell_line_transcriptome, not the `cell_line_lst` selected earlier in
# this notebook - confirm that is intended.
for cell_line in celline_list:
    print(f'processing {cell_line}')
    # Per-cell-line inputs: the mutation table and the transcriptome table.
    mut_data = pd.read_csv(os.path.join(DATA_DIR, cell_line + '_mutations.csv'))
    exp_data = pd.read_csv(os.path.join(DATA_DIR, cell_line + addition_exp))

    # Index by transcript so single cells can be read/written with .at[].
    exp_data_indexed = exp_data.set_index('Transcript_ID')
    print('Data loaded successfully.')

    for _, row in mut_data.iterrows():
        mut_dict = mutation_dict(row)
        if mut_dict is None:
            # mutation_dict produced nothing usable for this row.
            continue

        tid = mut_dict['id']
        mut_idx = mut_dict['start']

        if tid not in exp_data_indexed.index:
            # The mutation hits a transcript absent from this transcriptome.
            continue

        # Deliberately best-effort: a failure on one mutation is reported and
        # skipped so the remaining mutations are still applied.
        try:
            shift = find_shift(annotations[tid], mut_idx)
            print(f"{tid} shift: {shift}")

            # Start from the already-mutated sequence when one exists, so that
            # several mutations on the same transcript accumulate.
            if pd.isna(exp_data_indexed.at[tid, 'Mutated Transcript Sequence']):
                seq = exp_data_indexed.at[tid, 'Original Transcript Sequence']
            else:
                seq = exp_data_indexed.at[tid, 'Mutated Transcript Sequence']

            mutated_seq = mutate(mut_dict, shift, seq)
            exp_data_indexed.at[tid, 'Mutated Transcript Sequence'] = mutated_seq

            if seq != mutated_seq:
                print(f'really mutated {tid}')
            else:
                print(f'didnt really do anything... check {tid}')
        except Exception as e:
            print(f"Skipping {tid} due to error: {e}")

    # One output table per cell line, written next to the inputs.
    exp_data_indexed.to_csv(os.path.join(DATA_DIR, cell_line + '.mutated' + addition_exp))
    print(f'Saved {cell_line}')