In [1]:
# Enable IPython's autoreload extension in mode 2, so imported project modules
# are re-imported before each cell executes and edits are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

IMPORTS
==================



In [2]:
import pandas as pd
import os
import pickle
from pyprojroot import here

In [3]:
from src.tauso.off_target.Roni.off_target_pipeline.off_target_functions import parse_gtf, parse_fasta

from src.tauso.off_target.Roni.off_target_pipeline.get_premRNA_sequences import final_func_premrna, enrich_expression_data_with_sequence

from src.tauso.off_target.Roni.off_target_pipeline.mutate_cell_line_transcriptome import get_expression_of_cell_line, get_mutations_of_cell_line, mutate, mutation_dict, celline_list, find_shift, mutate_transcriptome

DATA
====

Paths

In [4]:
# Anchor every location at the project root returned by pyprojroot's here().
PROJECT_ROOT = here()
_roni_parts = ("src", "tauso", "off_target", "Roni")
DATA_DIR = os.path.join(PROJECT_ROOT, *_roni_parts, "data")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, *_roni_parts, "outputs")

# Input files, all located directly inside DATA_DIR
# (order: genome FASTA, GTF annotation, expression table, somatic mutation table).
fasta_path, gtf_path, exp_path, mut_path = (
    os.path.join(DATA_DIR, file_name)
    for file_name in (
        "GRCh38.p13.genome.fa",
        "gencode.v34.chr_patch_hapl_scaff.annotation.gtf",
        "OmicsExpressionTPMLogp1HumanProteinCodingGenes.csv",
        "OmicsSomaticMutations.csv",
    )
)

Select the cell line(s) to process and whether to save the results as CSV

In [5]:
# When True, the processing loop writes each cell line's final table to
# OUTPUT_DIR as "<cell_line>_transcriptome.csv".
save_csv = True

In [6]:
# Identifiers of the cell lines the processing loop iterates over.
cell_line_lst = ["ACH-000681"]
# Results keyed by cell-line ID. The processing loop stores a list per key:
# [expression data, mutation data, mutated transcriptome].
transcriptomes = {}

I. GET SEQUENCES
=

In [None]:
# Step 1 - Parse FASTA
# The second argument is a pickle file path (presumably a cache used by the parser —
# confirm in off_target_functions). It is built from OUTPUT_DIR, like every other output
# in this notebook, so it does not depend on the kernel's working directory.
chrom2seq = parse_fasta(fasta_path, os.path.join(OUTPUT_DIR, "fasta_pickle_output_file_path.pkl"))

# Step 2 - Parse GTF
annotations = parse_gtf(gtf_path, os.path.join(OUTPUT_DIR, "gtf_pickle_output_file_path.pkl"))

In [33]:
# Step 3 - Expression and mutation data per cell line
# For each cell line: load expression + mutation tables, attach transcript sequences,
# convert the original sequence column from T to U, build the mutated transcriptome,
# and optionally save it. transcriptomes[cell_line] ends up as
# [expression data, mutation data, mutated transcriptome].
for cell_line in cell_line_lst:
    exp_data = get_expression_of_cell_line(cell_line, exp_path, OUTPUT_DIR)
    mut_data = get_mutations_of_cell_line(cell_line, mut_path, OUTPUT_DIR)

    # Re-assigning the list (rather than appending to an old one) keeps this cell
    # idempotent when it is re-run in the same kernel.
    transcriptomes[cell_line] = [exp_data, mut_data]
    transcriptomes = enrich_expression_data_with_sequence(transcriptomes, cell_line, chrom2seq, annotations)

    exp_df = transcriptomes[cell_line][0]
    # Literal (non-regex) replacement of every "T" with "U".
    exp_df["Original Transcript Sequence"] = (
        exp_df["Original Transcript Sequence"].astype(str).str.replace("T", "U", regex=False)
    )
    mut_df = transcriptomes[cell_line][1]

    mut_exp_data = mutate_transcriptome(exp_df, mut_df, annotations)
    transcriptomes[cell_line].append(mut_exp_data)

    if save_csv:
        # Write only the cell line just processed. Looping over the whole dict here
        # would rewrite every earlier CSV on each iteration and rebind `cell_line`.
        # NOTE(review): the rendered preview of this table shows Transcript_ID as the
        # index; index=False omits it from the CSV — confirm that is intended.
        csv_path = os.path.join(OUTPUT_DIR, cell_line + "_transcriptome.csv")
        pd.DataFrame(mut_exp_data).to_csv(csv_path, index=False)


Successfully processed ACH-000681. Found 16047 expressed genes.
Obtained mutation data for ACH-000681
Building Gene -> Transcript index...
Processing ACH-000681...
Enrichment complete.
                  Gene  ACH-000681_expression_norm  expression_TPM  \
0        MT-CO3 (4514)                   15.250876    3.899060e+04   
1        MT-CO1 (4512)                   14.970982    3.211450e+04   
2        MT-CO2 (4513)                   14.783070    2.819240e+04   
3       MT-ATP6 (4508)                   14.529565    2.364920e+04   
4        MT-ND4 (4538)                   13.919915    1.549830e+04   
...                ...                         ...             ...   
16042     USH2A (7399)                    0.001412    9.790000e-04   
16043    XKR4 (114786)                    0.001374    9.530000e-04   
16044     NXF2 (56001)                    0.000049    3.400000e-05   
16045  CTAG1A (246100)                    0.000022    1.500000e-05   
16046      SSX2 (6757)                    0.0

In [9]:
# Preview the last stored element (the mutated transcriptome appended by the processing loop)
# for the first configured cell line, instead of repeating a hard-coded ID.
transcriptomes[cell_line_lst[0]][-1].head()

Unnamed: 0_level_0,Gene,ACH-000681_expression_norm,expression_TPM,Original Transcript Sequence,Mutated Transcript Sequence
Transcript_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENST00000362079.2,MT-CO3 (4514),15.250876,38990.6,ATGACCCACCAATCACATGCCTATCATATAGTAAAACCCAGCCCAT...,
ENST00000361624.2,MT-CO1 (4512),14.970982,32114.5,ATGTTCGCCGACCGTTGACTATTCTCTACAAACCACAAAGACATTG...,
ENST00000361739.1,MT-CO2 (4513),14.78307,28192.4,ATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCCCCTA...,
ENST00000361899.2,MT-ATP6 (4508),14.529565,23649.2,ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG...,
ENST00000361381.2,MT-ND4 (4538),13.919915,15498.3,ATGCTAAAACTAATCGTCCCAACAATTATATTACTACCACTGACAT...,


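II. MUTATE SEQUENCES
=

Mutation is applied inside the Step 3 loop of section I (the `mutate_transcriptome` call); this section currently has no code of its own.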

III. OFF-TARGET CALCULATION
=

In [11]:
# Export the first stored element (the expression table) of the first configured cell line.
# The ID is taken from cell_line_lst so the dict key and the file name cannot drift apart.
# NOTE(review): the file is written to the current working directory, not OUTPUT_DIR — confirm intended.
inspected_cell_line = cell_line_lst[0]
exp_yes = transcriptomes[inspected_cell_line][0]
exp_yes.to_csv(f"{inspected_cell_line}.transcriptome.csv", index=False)

In [13]:
# Scratch check: a sequence literal written with U; the cell's last expression displays its length.
# Presumably the U-converted MT-CO3 transcript from the table above — confirm.
hi = "AUGACCCACCAAUCACAUGCCUAUCAUAUAGUAAAACCCAGCCCAUGACCCCUAACAGGGGCCCUCUCAGCCCUCCUAAUGACCUCCGGCCUAGCCAUGUGAUUUCACUUCCACUCCAUAACGCUCCUCAUACUAGGCCUACUAACCAACACACUAACCAUAUACCAAUGAUGGCGCGAUGUAACACGAGAAAGCACAUACCAAGGCCACCACACACCACCUGUCCAAAAAGGCCUUCGAUACGGGAUAAUCCUAUUUAUUACCUCAGAAGUUUUUUUCUUCGCAGGAUUUUUCUGAGCCUUUUACCACUCCAGCCUAGCCCCUACCCCCCAAUUAGGAGGGCACUGGCCCCCAACAGGCAUCACCCCGCUAAAUCCCCUAGAAGUCCCACUCCUAAACACAUCCGUAUUACUCGCAUCAGGAGUAUCAAUCACCUGAGCUCACCAUAGUCUAAUAGAAAACAACCGAAACCAAAUAAUUCAAGCACUGCUUAUUACAAUUUUACUGGGUCUCUAUUUUACCCUCCUACAAGCCUCAGAGUACUUCGAGUCUCCCUUCACCAUUUCCGACGGCAUCUACGGCUCAACAUUUUUUGUAGCCACAGGCUUCCACGGACUUCACGUCAUUAUUGGCUCAACUUUCCUCACUAUCUGCUUCAUCCGCCAACUAAUAUUUCACUUUACAUCCAAACAUCACUUUGGCUUCGAAGCCGCCGCCUGAUACUGGCAUUUUGUAGAUGUGGUUUGACUAUUUCUGUAUGUCUCCAUCUAUUGAUGAGGGUCUU"

len(hi)

784

In [14]:
# Scratch check: a sequence literal written with T; the cell's last expression displays its length.
# Its opening bases match the MT-CO3 row shown in the transcriptome preview above.
bye = "ATGACCCACCAATCACATGCCTATCATATAGTAAAACCCAGCCCATGACCCCTAACAGGGGCCCTCTCAGCCCTCCTAATGACCTCCGGCCTAGCCATGTGATTTCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTAACCAACACACTAACCATATACCAATGATGGCGCGATGTAACACGAGAAAGCACATACCAAGGCCACCACACACCACCTGTCCAAAAAGGCCTTCGATACGGGATAATCCTATTTATTACCTCAGAAGTTTTTTTCTTCGCAGGATTTTTCTGAGCCTTTTACCACTCCAGCCTAGCCCCTACCCCCCAATTAGGAGGGCACTGGCCCCCAACAGGCATCACCCCGCTAAATCCCCTAGAAGTCCCACTCCTAAACACATCCGTATTACTCGCATCAGGAGTATCAATCACCTGAGCTCACCATAGTCTAATAGAAAACAACCGAAACCAAATAATTCAAGCACTGCTTATTACAATTTTACTGGGTCTCTATTTTACCCTCCTACAAGCCTCAGAGTACTTCGAGTCTCCCTTCACCATTTCCGACGGCATCTACGGCTCAACATTTTTTGTAGCCACAGGCTTCCACGGACTTCACGTCATTATTGGCTCAACTTTCCTCACTATCTGCTTCATCCGCCAACTAATATTTCACTTTACATCCAAACATCACTTTGGCTTCGAAGCCGCCGCCTGATACTGGCATTTTGTAGATGTGGTTTGACTATTTCTGTATGTCTCCATCTATTGATGAGGGTCTT"

len(bye)

784