In [1]:
import pandas as pd
import numpy as np

from utils_v2_for_jupyter import *
from features import *

import genomepy

In [7]:
def _split_underscore_column(names, part_labels):
    """Split a column of '_'-delimited identifiers into one column per part.

    Parameters
    ----------
    names : pandas.Series
        String column whose values are '_'-separated parts.
    part_labels : list of str
        Column names to assign to the parts, in order.

    Returns
    -------
    pandas.DataFrame
        One column per label, indexed like ``names``. Parts that are absent
        (header-only input, or identifiers with fewer parts than labels)
        are left as missing values.

    Raises
    ------
    ValueError
        If any identifier has more '_'-separated parts than there are labels.
    """
    parts = names.str.split('_', expand=True)

    if parts.shape[1] > len(part_labels):
        # Assigning the labels below would fail anyway with an opaque
        # "Length mismatch"; fail with a message that names the column.
        raise ValueError(
            f"column {names.name!r} has values with {parts.shape[1]} "
            f"'_'-separated parts, expected at most {len(part_labels)}"
        )

    # An empty column expands to zero columns, and short identifiers expand to
    # fewer columns than labels; pad so the label assignment always succeeds.
    parts = parts.reindex(columns=range(len(part_labels)))
    parts.columns = part_labels
    return parts


def import_clash_df(data="../data/supplementary_files/clash.tsv", drop_irrelevant_columns=True):
    """Load the CLASH table and unpack its composite name columns.

    The ``microRNA_name`` and ``mRNA_name`` columns each hold up to four
    '_'-separated parts. They are replaced by the new columns ``accession``
    and ``mirna_name`` (parts 1 and 3 of ``microRNA_name``) and ``ensg``,
    ``enst`` and ``gene_name`` (parts 1-3 of ``mRNA_name``). The new columns
    are appended after the remaining original columns.

    Parameters
    ----------
    data : str or file-like, default "../data/supplementary_files/clash.tsv"
        Tab-separated input, passed straight to ``pandas.read_csv``.
    drop_irrelevant_columns : bool, default True
        If True, read only the name, sequence, coordinate, ``seed_type`` and
        ``folding_class`` columns; otherwise read every column in the file.

    Returns
    -------
    pandas.DataFrame
        The loaded table with the two name columns expanded as described.

    Raises
    ------
    ValueError
        If a name has more than four '_'-separated parts.
    """
    if drop_irrelevant_columns:
        columns_to_keep = ["microRNA_name", "miRNA_seq", "mRNA_name",
                           "mRNA_start", "mRNA_end_extended", "mRNA_seq_extended", "seed_type", "folding_class"]
    else:
        # usecols=None is read_csv's default and loads every column.
        columns_to_keep = None

    clash_df = pd.read_csv(data, sep="\t", usecols=columns_to_keep)

    # process microRNA_name column: keep accession and miRNA name only
    mirna_parts = _split_underscore_column(
        clash_df['microRNA_name'], ['accession', 'from', 'mirna_name', 'temp'])
    clash_df = pd.concat([clash_df, mirna_parts], axis=1)
    clash_df = clash_df.drop(columns=['microRNA_name', 'temp', 'from'])

    # process mRNA_name column: keep gene id, transcript id and gene name
    mrna_parts = _split_underscore_column(
        clash_df['mRNA_name'], ['ensg', 'enst', 'gene_name', 'temp'])
    clash_df = pd.concat([clash_df, mrna_parts], axis=1)
    clash_df = clash_df.drop(columns=['mRNA_name', 'temp'])

    return clash_df

# Load the complete table (no column filtering) so every field is available
# to the cells that follow.
clash_df = import_clash_df(drop_irrelevant_columns=False)

# Lift both pandas display limits so the preview shows every column.
# NOTE: these are global settings; with max_rows=None any DataFrame displayed
# later in the notebook is rendered in full.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

clash_df.head()

Unnamed: 0,seq_ID,miRNA_start,miRNA_end,miRNA_seq,mRNA_start,mRNA_end_extended,mRNA_seq_extended,chimeras_decompressed,experiments,experiments_list,microRNA_first,two_way_merged,seed_type,num_basepairs,seed_basepairs,folding_energy,5'UTR,CDS,3'UTR,folding_class,conservation_score,log2_target_enrichment,CLASH_single_reads_ovlp,CLASH_cluster_ovlp,PAR_CLIP_cluster_ovlp,accession,mirna_name,ensg,enst,gene_name
0,0727A-1038930_1,1,22,TGAGGTAGTAGGTTGTATAGTT,1791,1890,ATTTGTATCTACGATAAAAATTTTTATACAGAACCTACTGCCTCAA...,31,3,"E2,E3,E4",1,0,noncanonical_seed,20,6,-25.1,,,1.0,III,0.210342,-0.020802,270.0,,,MIMAT0000062,let-7a,ENSG00000113328,ENST00000340828,CCNG1
1,L1HS-1112536_1,1,22,TGAGGTAGTAGGTTGTATAGTT,3857,3928,CAGGAAATACCCGTGCAACCAACTACCTCATATTCCATTCAGAATT...,9,2,"E3,E4",1,0,9-mer,17,6,-24.4,,1.0,,II,,0.628759,24.0,,1.0,MIMAT0000062,let-7a,ENSG00000100697,ENST00000343455,DICER1
2,L2HS-818542_2,1,22,TGAGGTAGTAGGTTGTATAGTT,2385,2434,ACCCGCTATATGACCTGATGCCTTTCCTTCATTAAAGATGATTCTG...,2,1,E4,1,0,noncanonical_seed,19,6,-22.2,,1.0,1.0,III,,0.022816,56.0,1.0,1.0,MIMAT0000062,let-7a,ENSG00000080546,ENST00000436639,SESN1
3,L2HS-1161339_2,1,22,TGAGGTAGTAGGTTGTATAGTT,6570,6623,CAATGACTATGCAACCATACCTTACCACTAAATGTAGTACGCAAAA...,2,1,E4,0,0,noncanonical_seed,18,6,-22.1,,1.0,,III,,-0.007294,7.0,,,MIMAT0000062,let-7a,ENSG00000164190,ENST00000282516,NIPBL
4,L2-407944_2,1,22,TGAGGTAGTAGGTTGTATAGTT,1164,1208,AATACTGGGAAACCTGCTACTTCGTCAGCTAACCAGAAACCTGTG,2,1,E4,1,0,noncanonical_seed,18,6,-21.9,,1.0,,III,,0.026476,6.0,,,MIMAT0000062,let-7a,ENSG00000138785,ENST00000340139,INTS12


In [None]:
# Three column subsets of clash_df, each carrying seq_ID alongside its columns.

# Energy, region flags and conservation for each interaction.
clash_features = clash_df[[
    "seq_ID",
    "folding_energy",
    "5'UTR",
    "CDS",
    "3'UTR",
    "conservation_score",
]]

# Pairing counts, class labels and the two sequences.
clash_basic = clash_df[[
    "seq_ID",
    "num_basepairs",
    "seed_basepairs",
    "folding_class",
    "seed_type",
    "miRNA_seq",
    "mRNA_seq_extended",
]]

# Transcript id plus start/end positions of the target site.
clash_coordinates = clash_df[[
    "seq_ID",
    "enst",
    "mRNA_start",
    "mRNA_end_extended",
]]

In [None]:
# Preview only: display.max_rows is set to None earlier in this notebook, so a
# bare `clash_coordinates` would render every row of the table.
clash_coordinates.head()

In [5]:
# Build a genomepy Annotation object for the genome named "GRCh38".
# NOTE(review): this cell was executed as In [5], i.e. after the
# `genomepy search` (In [3]) and `genomepy install` (In [4]) cells that appear
# below it. Presumably it needs GRCh38 to be installed first — confirm the
# cell order works on Restart & Run All.
a = genomepy.Annotation("GRCh38")



In [3]:
# Ask the genomepy CLI which genome builds match "GRCh38"; the output lists
# one row per provider (GENCODE, Ensembl, UCSC, NCBI) with its accession.
!genomepy search GRCh38

[1mname                 provider accession         tax_id annotation species                                  other_info                              [0m
[0m                                                        n r e k   <- UCSC options (see help)                                                       [0m
[0mGRCh38               GENCODE  GCA_000001405.15    9606     [32m✓[39m      Homo sapiens                             GENCODE annotation + UCSC genome        [0m
[0mGRCh38.p13           Ensembl  GCA_000001405.28    9606     [32m✓[39m      Homo sapiens                             2014-01-Ensembl/2022-11                 [0m
[0mhg38                 UCSC     GCA_000001405.15    9606  [32m✓[39m [32m✓[39m [31m✗[39m [32m✓[39m   Homo sapiens                             Dec. 2013 (GRCh38/hg38)                 [0m
[0mGRCh38               NCBI     GCF_000001405.26    9606     [32m✓[39m      Homo sapiens                             Genome Reference Consortium            

In [4]:
# Install GRCh38 through the genomepy CLI. `-p GENCODE` presumably selects the
# GENCODE provider row from the search output — TODO confirm against
# `genomepy install --help`.
!genomepy install GRCh38 -p GENCODE

[0m