In [1]:
cd ../../..

/run/media/nazif/2F946E411BA61D49/thesis


In [2]:
import pandas as pd
import csv

# pyensembl
from pyensembl import EnsemblRelease
import os
# Point pyensembl's cache at ../data (relative to the current working
# directory). This is set before the EnsemblRelease object is created; the log
# output of this cell shows files being loaded from .../data/pyensembl/.
os.environ["PYENSEMBL_CACHE_DIR"] = "../data"
# Ensembl release 60; the log output of this cell shows GRCh37 files.
ens60 = EnsemblRelease(60)
# Fetch and index the release data used by the transcript lookups in later cells.
# NOTE(review): presumably both calls are cheap when the cache is already
# populated - confirm against the pyensembl documentation.
ens60.download()
ens60.index()

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl60/Homo_sapiens.GRCh37.60.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl60/Homo_sapiens.GRCh37.60.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl60/Homo_sapiens.GRCh37.60.pep.all.fa.gz.pickle


# read CLASH mmc1.txt and save to csv

In [3]:
# Split the raw CLASH supplementary table (mmc1.txt) into two CSV files:
#   clash_column_details.csv - the "#"-prefixed description lines
#   clash_raw.csv            - every other line (tab-separated table rows)
columns = []
data = []
with open("data/raw/clash/mmc1.txt") as f:
    for line in f:
        is_description = line.startswith("#")
        # drop the leading "#" marker before parsing a description line
        text = line[1:] if is_description else line
        row = next(csv.reader([text], delimiter="\t"))
        if is_description:
            columns.append(row)
        else:
            data.append(row)

# removes header text
columns.pop(0)

# Write next to the raw input: the wrangling cell reads
# "data/raw/clash/clash_raw.csv", so exporting to the working directory would
# leave that read pointing at a missing or stale file.
output_dir = "data/raw/clash"

# create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# write the output CSV files
# newline="" is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate "\n" end up with "\r\r\n" row endings.
with open(os.path.join(output_dir, "clash_column_details.csv"), "w", newline="") as f1, \
        open(os.path.join(output_dir, "clash_raw.csv"), "w", newline="") as f2:
    writer1 = csv.writer(f1, delimiter=",")
    writer2 = csv.writer(f2, delimiter=",")
    writer1.writerows(columns)
    writer2.writerows(data)

# wrangling previously exported csv

In [4]:
# Load the table exported from mmc1.txt.
df = pd.read_csv("data/raw/clash/clash_raw.csv")

# CLASH columns that are not used further on
cols_to_drop = [
    "miRNA_start",
    "miRNA_end",
    "chimeras_decompressed",
    "experiments",
    "experiments_list",
    "microRNA_first",
    "two_way_merged",
    "log2_target_enrichment",
    "CLASH_single_reads_ovlp",
    "5'UTR",
    "CDS",
    "3'UTR",
    "conservation_score",
    "CLASH_cluster_ovlp",
    "PAR_CLIP_cluster_ovlp",
]
df = df.drop(columns=cols_to_drop)

# microRNA_name and mRNA_name are "_"-joined fields; expand each one into its
# own set of columns
new_mirna_cols = df["microRNA_name"].str.split("_", expand=True)
new_mirna_cols.columns = ["mirna_accession", "temp1", "mirna_name", "temp2"]

new_mrna_cols = df["mRNA_name"].str.split("_", expand=True)
new_mrna_cols.columns = ["ensg", "enst", "gene_name", "temp3"]

# append both groups of new columns to the right of the existing ones
df = pd.concat([df, new_mirna_cols, new_mrna_cols], axis=1)

# the two source columns and the throw-away split parts
temp_cols = ["microRNA_name", "mRNA_name", "temp1", "temp2", "temp3"]

# CLASH column name -> name used in the rest of the notebook
rename_dict = {
    "seq_ID": "id",
    "miRNA_seq": "mirna_sequence",
    "mRNA_seq_extended": "mrna_sequence",
    "mRNA_start": "true_start",
    "mRNA_end_extended": "true_end",
    "seed_type": "true_seed_type",
    "folding_class": "true_folding_class",
}

df = df.drop(columns=temp_cols).rename(columns=rename_dict)

# augmenting with pyensembl

In [5]:
def get_sequence_slice(row, start_offset=0, end_offset=0):
    """Return the stretch of ``row["full_sequence"]`` from true_start to true_end.

    The arithmetic treats ``true_start`` as 1-based and ``true_end`` as
    inclusive: the slice runs from index ``true_start - 1`` up to, but not
    including, index ``true_end``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing
            "full_sequence", "true_start" and "true_end".
        start_offset: number of extra positions to include before true_start.
        end_offset: number of extra positions to include after true_end.

    Returns:
        The sliced sequence, cut short where the requested window runs past
        either end of the sequence, or None when the row has no sequence
        (None or NaN).
    """
    sequence = row["full_sequence"]
    # Missing sequences appear as None, or as float NaN once inside pandas.
    if sequence is None or isinstance(sequence, float):
        return None
    # Clamp at 0: a negative index would make Python count from the end of the
    # sequence and return the wrong (usually empty) stretch.
    start = max(0, row["true_start"] - 1 - start_offset)
    end = max(0, row["true_end"] + end_offset)
    return sequence[start:end]


def get_upstream_n_nucleotides(row, n):
    """Return up to ``n`` characters of the sequence immediately before true_start.

    ``true_start`` is treated as 1-based, so the returned stretch ends at index
    ``true_start - 2`` inclusive.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing
            "full_sequence" and "true_start".
        n: number of positions to take before the site.

    Returns:
        The upstream stretch. It is shorter than ``n`` when the site lies
        fewer than ``n`` positions from the beginning of the sequence, and
        None when the row has no sequence (None or NaN).
    """
    sequence = row["full_sequence"]
    # Missing sequences appear as None, or as float NaN once inside pandas.
    if sequence is None or isinstance(sequence, float):
        return None
    end = row["true_start"] - 1
    # Clamp at 0: a negative start would be read as "count from the end" and
    # yield an empty string instead of the available leading characters.
    start = max(0, end - n)
    return sequence[start:end]


def get_downstream_n_nucleotides(row, n):
    """Return up to ``n`` characters of the sequence immediately after true_end.

    ``true_end`` is treated as 1-based inclusive, so the returned stretch
    starts at index ``true_end``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing
            "full_sequence" and "true_end".
        n: number of positions to take after the site.

    Returns:
        The downstream stretch. It is shorter than ``n`` when the sequence
        ends first, and None when the row has no sequence (None or NaN).
    """
    sequence = row["full_sequence"]
    # Missing sequences appear as None, or as float NaN once inside pandas;
    # slicing either would raise a TypeError.
    if sequence is None or isinstance(sequence, float):
        return None
    start = row["true_end"]
    return sequence[start:start + n]

In [6]:
# get unique ENSTs
ensts_to_fetch = df.enst.unique().tolist()

# get sequences of ENSTs from ENSEMBL 60 to a dict
# Each transcript is looked up once; `or None` stores None for a falsy
# (missing or empty) sequence, as before.
seq_dict = {
    enst: ens60.transcript_by_id(enst).sequence or None
    for enst in ensts_to_fetch
}

# appending full sequences to the clash df
df["full_sequence"] = df["enst"].map(seq_dict)

# 30 positions before the site, the site itself, and 30 positions after it
df["upstream_30_nucleotides"] = df.apply(
    get_upstream_n_nucleotides, args=(30,), axis=1)
df["sequence_slice"] = df.apply(get_sequence_slice, axis=1)
df["downstream_30_nucleotides"] = df.apply(
    get_downstream_n_nucleotides, args=(30,), axis=1)

In [7]:
ensts = df["enst"].values.tolist()

# Look every distinct transcript up once and reuse the object for all rows
# sharing that ENST, instead of asking pyensembl again for every row.
# dict.fromkeys de-duplicates while keeping first-seen order.
transcripts_by_enst = {
    enst: ens60.transcript_by_id(enst) for enst in dict.fromkeys(ensts)
}
row_transcripts = [transcripts_by_enst[enst] for enst in ensts]

chromosomes = [transcript.contig for transcript in row_transcripts]
starts = [transcript.start for transcript in row_transcripts]
ends = [transcript.end for transcript in row_transcripts]
# NOTE(review): strands and ensembl_sequences are collected but never written
# to df in this cell - confirm whether a strand column was intended.
strands = [transcript.strand for transcript in row_transcripts]
ensembl_sequences = [transcript.sequence for transcript in row_transcripts]

df["transcript_chr"] = chromosomes
df["transcript_start"] = starts
df["transcript_end"] = ends


df.head()

Unnamed: 0,id,mirna_sequence,true_start,true_end,mrna_sequence,true_seed_type,num_basepairs,seed_basepairs,folding_energy,true_folding_class,...,ensg,enst,gene_name,full_sequence,upstream_30_nucleotides,sequence_slice,downstream_30_nucleotides,transcript_chr,transcript_start,transcript_end
0,0727A-1038930_1,TGAGGTAGTAGGTTGTATAGTT,1791,1890,ATTTGTATCTACGATAAAAATTTTTATACAGAACCTACTGCCTCAA...,noncanonical_seed,20,6,-25.1,III,...,ENSG00000113328,ENST00000340828,CCNG1,AGGGCAGGCGCGGCCCCTTCGGCTCCGAGCTGACCCTGATCAGGGC...,TTTAATATTTTTTTCTAGAAAACAGGTGAC,ATTTGTATCTACGATAAAAATTTTTATACAGAACCTACTGCCTCAA...,TATCACTTCGAAAACTTGCTTTCCCACACT,5,162864587,162872022
1,L1HS-1112536_1,TGAGGTAGTAGGTTGTATAGTT,3857,3928,CAGGAAATACCCGTGCAACCAACTACCTCATATTCCATTCAGAATT...,9-mer,17,6,-24.4,II,...,ENSG00000100697,ENST00000343455,DICER1,CGGAGGCGCGGCGCAGGCTGCTGCAGGCCCAGGTGAATGGAGTAAC...,TGCCAAGGAAATCAGCTAAATTACTACAAG,CAGGAAATACCCGTGCAACCAACTACCTCATATTCCATTCAGAATT...,CCCAGCGATGAATGTACTCTCCTGAGTAAT,14,95552566,95623759
2,L2HS-818542_2,TGAGGTAGTAGGTTGTATAGTT,2385,2434,ACCCGCTATATGACCTGATGCCTTTCCTTCATTAAAGATGATTCTG...,noncanonical_seed,19,6,-22.2,III,...,ENSG00000080546,ENST00000436639,SESN1,GATTGCCAGGGCCGCCCTGTGCCCTCTGGCTCGGCGGTGGTGGGCG...,GCAGAACTCCTTTATGCTCTGAGAGCCATT,ACCCGCTATATGACCTGATGCCTTTCCTTCATTAAAGATGATTCTG...,GATCAGCAGATATAGTCTACAAGGGGGAAG,6,109307640,109416022
3,L2HS-1161339_2,TGAGGTAGTAGGTTGTATAGTT,6570,6623,CAATGACTATGCAACCATACCTTACCACTAAATGTAGTACGCAAAA...,noncanonical_seed,18,6,-22.1,III,...,ENSG00000164190,ENST00000282516,NIPBL,TCCGGTCGGCATTTTGTTCTGAGAGGGAGAGACGGAACGAGAGAGA...,AAATAAGACCCCAGCTCATGGTTAAACATG,CAATGACTATGCAACCATACCTTACCACTAAATGTAGTACGCAAAA...,TGGTTATCTGCAATGTTGCAAAAATCCTAG,5,36876861,37066515
4,L2-407944_2,TGAGGTAGTAGGTTGTATAGTT,1164,1208,AATACTGGGAAACCTGCTACTTCGTCAGCTAACCAGAAACCTGTG,noncanonical_seed,18,6,-21.9,III,...,ENSG00000138785,ENST00000340139,INTS12,AGGGACCACCGGGAACAGACGGATCGGCAGGGCGGGGCGGAACGGT...,ACAGCAAAATTGAGTTCAACAACACAAAAC,AATACTGGGAAACCTGCTACTTCGTCAGCTAACCAGAAACCTGTG,GGTTTGACTGGTCTGGCAACATCATCCAAA,4,106603784,106629838


In [9]:
# Export the parsed table. The destination folder is created first so the
# export does not fail on a fresh checkout where it does not exist yet.
parsed_csv_path = "data/processed/clash/clash_parsed.csv"
os.makedirs(os.path.dirname(parsed_csv_path), exist_ok=True)
df.to_csv(parsed_csv_path, index=False)