In [1]:
cd ..

/run/media/nazif/2F946E411BA61D49/thesis


In [2]:
import pandas as pd
import csv

# pyensembl
from pyensembl import EnsemblRelease
import os
os.environ["PYENSEMBL_CACHE_DIR"] = "../data"
ens60 = EnsemblRelease(60)
ens60.download()
ens60.index()

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl60/Homo_sapiens.GRCh37.60.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl60/Homo_sapiens.GRCh37.60.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl60/Homo_sapiens.GRCh37.60.pep.all.fa.gz.pickle


# reading CLASH txt and saving as csv

In [3]:
with open("data/raw/clash/mmc1.txt") as f:

    # iter(f) is the same as (line for line in f)
    lines = iter(f)

    columns = []
    data = []
    for line in lines:
        if line.startswith("#"):
            line = line[1:]
            row = next(csv.reader([line], delimiter="\t"))
            columns.append(row)
        else:
            row = next(csv.reader([line], delimiter="\t"))
            data.append(row)

    # removes header text
    columns.pop(0)

    # create the output directory if it doesn"t exist
    output_dir = "."
    os.makedirs(output_dir, exist_ok=True)

    # write the output CSV files
    with open(os.path.join(output_dir, "data/raw/clash/clash_column_details.csv"), "w") as f1, \
            open(os.path.join(output_dir, "data/raw/clash/clash_raw.csv"), "w") as f2:
        writer1 = csv.writer(f1, delimiter=",")
        writer2 = csv.writer(f2, delimiter=",")
        writer1.writerows(columns)
        writer2.writerows(data)

# wrangling values

In [4]:
df = pd.read_csv("data/raw/clash/clash_raw.csv")

In [5]:
# dropping unused CLASH columns
cols_to_drop = ["miRNA_start", "miRNA_end", "chimeras_decompressed",
                "experiments", "experiments_list", "microRNA_first", "two_way_merged",
                "log2_target_enrichment", "CLASH_single_reads_ovlp",
                "5'UTR", "CDS", "3'UTR", "conservation_score",
                "CLASH_cluster_ovlp", "PAR_CLIP_cluster_ovlp"]

df.drop(columns=cols_to_drop, inplace=True)

In [6]:
# process microRNA_name and mRNA_name columns
new_mirna_cols = df["microRNA_name"].str.split("_", expand=True)
new_mirna_cols.columns = ["mirna_accession", "temp1", "mirna_name", "temp2"]
df = pd.concat([df, new_mirna_cols], axis=1)

new_mrna_cols = df["mRNA_name"].str.split("_", expand=True)
new_mrna_cols.columns = ["ensg", "enst", "gene_name", "temp3"]
df = pd.concat([df, new_mrna_cols], axis=1)

# dropping temporary columns
temp_cols = ["microRNA_name", "mRNA_name", "temp1", "temp2", "temp3"]
df.drop(columns=temp_cols, inplace=True)

In [7]:
# converting start:end biological coordinates into 0-based index
# subtracting 1 from start is enough.

df["mRNA_start"] = df["mRNA_start"] - 1
df["mRNA_end_extended"] = df["mRNA_end_extended"]

In [8]:
# renaming columns
rename_dict = {
    "seq_ID": "id",
    "miRNA_seq": "mirna_sequence",
    "mRNA_seq_extended": "mrna_sequence",
    "mRNA_start": "true_start_index",
    "mRNA_end_extended": "true_end_index",
    "seed_type": "true_seed_type",
    "folding_class": "true_folding_class"
}

df = df.rename(columns=rename_dict)

# augmenting df with full sequences of transcripts using pyensembl

In [9]:
# get sequences of ENSTs from ENSEMBL 60 to a dict
seq_dict = {
    i: ens60.transcript_by_id(i).sequence
    if ens60.transcript_by_id(i).sequence
    else None
    for i in df.enst.unique().tolist()
}

# appending full sequences
df["full_sequence_of_transcript"] = df["enst"].map(seq_dict)

In [10]:
# getting sequence slices from start:end positions
# if slice is closer to the 5' end and can't be extended for 30 nucleotides,
# then return the start position as 0

def get_sequence_slice(row):
    sequence = row["full_sequence_of_transcript"]
    start = max(row["true_start_index"], 0)
    end = row["true_end_index"] 
    return sequence[start:end], start, end


df[["extended_sequence", 'extended_start', "extended_end"]] = df.apply(
    lambda row: get_sequence_slice(row), axis=1, result_type='expand')


df.head()

Unnamed: 0,id,mirna_sequence,true_start_index,true_end_index,mrna_sequence,true_seed_type,num_basepairs,seed_basepairs,folding_energy,true_folding_class,mirna_accession,mirna_name,ensg,enst,gene_name,full_sequence_of_transcript,extended_sequence,extended_start,extended_end
0,0727A-1038930_1,TGAGGTAGTAGGTTGTATAGTT,1790,1890,ATTTGTATCTACGATAAAAATTTTTATACAGAACCTACTGCCTCAA...,noncanonical_seed,20,6,-25.1,III,MIMAT0000062,let-7a,ENSG00000113328,ENST00000340828,CCNG1,AGGGCAGGCGCGGCCCCTTCGGCTCCGAGCTGACCCTGATCAGGGC...,ATTTGTATCTACGATAAAAATTTTTATACAGAACCTACTGCCTCAA...,1790,1890
1,L1HS-1112536_1,TGAGGTAGTAGGTTGTATAGTT,3856,3928,CAGGAAATACCCGTGCAACCAACTACCTCATATTCCATTCAGAATT...,9-mer,17,6,-24.4,II,MIMAT0000062,let-7a,ENSG00000100697,ENST00000343455,DICER1,CGGAGGCGCGGCGCAGGCTGCTGCAGGCCCAGGTGAATGGAGTAAC...,CAGGAAATACCCGTGCAACCAACTACCTCATATTCCATTCAGAATT...,3856,3928
2,L2HS-818542_2,TGAGGTAGTAGGTTGTATAGTT,2384,2434,ACCCGCTATATGACCTGATGCCTTTCCTTCATTAAAGATGATTCTG...,noncanonical_seed,19,6,-22.2,III,MIMAT0000062,let-7a,ENSG00000080546,ENST00000436639,SESN1,GATTGCCAGGGCCGCCCTGTGCCCTCTGGCTCGGCGGTGGTGGGCG...,ACCCGCTATATGACCTGATGCCTTTCCTTCATTAAAGATGATTCTG...,2384,2434
3,L2HS-1161339_2,TGAGGTAGTAGGTTGTATAGTT,6569,6623,CAATGACTATGCAACCATACCTTACCACTAAATGTAGTACGCAAAA...,noncanonical_seed,18,6,-22.1,III,MIMAT0000062,let-7a,ENSG00000164190,ENST00000282516,NIPBL,TCCGGTCGGCATTTTGTTCTGAGAGGGAGAGACGGAACGAGAGAGA...,CAATGACTATGCAACCATACCTTACCACTAAATGTAGTACGCAAAA...,6569,6623
4,L2-407944_2,TGAGGTAGTAGGTTGTATAGTT,1163,1208,AATACTGGGAAACCTGCTACTTCGTCAGCTAACCAGAAACCTGTG,noncanonical_seed,18,6,-21.9,III,MIMAT0000062,let-7a,ENSG00000138785,ENST00000340139,INTS12,AGGGACCACCGGGAACAGACGGATCGGCAGGGCGGGGCGGAACGGT...,AATACTGGGAAACCTGCTACTTCGTCAGCTAACCAGAAACCTGTG,1163,1208


# checking differences between fetched & df sequences

In [11]:
def compare_sequences(df):
    df["substring"] = df.apply(lambda row: row["full_sequence_of_transcript"][row["true_start_index"]:row["true_end_index"]], axis=1)
    df["comparison_result"] = df["substring"] == df["mrna_sequence"]
    return df

print(compare_sequences(df).comparison_result.value_counts())
print("6 rows' sequences are different")



comparison_result
True     18508
False        6
Name: count, dtype: int64
6 rows' sequences are different


In [12]:
compare_sequences(df)[compare_sequences(df)["comparison_result"] == False].gene_name.unique()

array(['DDX21'], dtype=object)

In [13]:
from diff_match_patch import diff_match_patch
from IPython.display import display, HTML


def highlight_differences(string1, string2):
    dmp = diff_match_patch()
    diffs = dmp.diff_main(string1, string2)
    dmp.diff_cleanupSemantic(diffs)

    highlighted_string = ""
    for op, data in diffs:
        if op == 0:  # no change
            highlighted_string += data
        elif op == -1:  # deletion
            highlighted_string += f'<del>{data}</del>'
        elif op == 1:  # insertion
            highlighted_string += f'<ins>{data}</ins>'

    return highlighted_string


# Assuming you have a DataFrame called 'df' with a 'highlighted' column
def add_color(row):
    highlighted = row['highlighted']
    if '<del>' in highlighted:
        highlighted = highlighted.replace('<del>', '<span style="color: red;">')
        highlighted = highlighted.replace('</del>', '</span>')
    if '<ins>' in highlighted:
        highlighted = highlighted.replace('<ins>', '<span style="color: green;">')
        highlighted = highlighted.replace('</ins>', '</span>')
    return highlighted

different_values_df = compare_sequences(df)[compare_sequences(df)["comparison_result"] == False]

# creating difference strings
different_values_df['highlighted'] = different_values_df.apply(lambda row: highlight_differences(row['mrna_sequence'], row['extended_sequence']), axis=1)


# render the differences
different_values_df['highlighted'] = different_values_df.apply(add_color, axis=1)

display(HTML(different_values_df[["highlighted"]].to_html(escape=False)))

print("reds symbolize sequences deleted from df sequences")
print("greens symbolize sequences added by the ENSEMBL60 sequence fetch tool")
print("whites symbolize sequences that are the same")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  different_values_df['highlighted'] = different_values_df.apply(lambda row: highlight_differences(row['mrna_sequence'], row['extended_sequence']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  different_values_df['highlighted'] = different_values_df.apply(add_color, axis=1)


Unnamed: 0,highlighted
4508,ATGGATTTAGCTACTTTTTGGTTCTAAATGAACTTGTTGGGTTAGCTTGAAATAATCTGGCATAAATTAAGAGT
6518,CCCCTGGCCAGATAACTGCCTGATTTCTCAGATATTGGGGTTTTTAGGTAGTCAGTGGCCTAGTTTCTCTGGGAAACATTCCCTAAAGCCACAGTATAGGATCTGTTACAA
8028,AGTAGAGGCCCGAGATGGCATGATTCACGACGCTGGACAGCGATCAGGAGGTGGCAATCTCTGTGGCCACAGAGCAAACCAGTAACATGGAAGGATCCACAAAACAAAG
10039,AGTCGAGGCTTCAGGGGACAGCGGGACGGAAACAGAAGGGAAAGCTGGGTGTTTGCTTTGATGTACCTACCGCATTCAGTAAC
15835,AACTGCCCTCGCCTTAATTTGATCACCACCAGAGATTATTTCTGTACTAGGAGCTCTGATAGCCAGCTCAAATCCTGGAACTACT
17039,TAGAGGAGTTAGTACAAGTGGAGCTGTAAAAAGCGGGAAGCTCTGGCAGCAGCACTGGCTTAAGTTCAAACG


reds symbolize sequences deleted from df sequences
greens symbolize sequences added by the ENSEMBL60 sequence fetch tool
whites symbolize sequences that are the same


# extending both end of sequence for 30 nts 

In [14]:
def get_sequence_slice(row):
    sequence = row["full_sequence_of_transcript"]
    start = max(row["true_start_index"]-30, 0)
    end = row["true_end_index"]+30 
    return sequence[start:end], start, end


df[["extended_sequence", 'extended_start', "extended_end"]] = df.apply(
    lambda row: get_sequence_slice(row), axis=1, result_type='expand')


df.head()

Unnamed: 0,id,mirna_sequence,true_start_index,true_end_index,mrna_sequence,true_seed_type,num_basepairs,seed_basepairs,folding_energy,true_folding_class,...,mirna_name,ensg,enst,gene_name,full_sequence_of_transcript,extended_sequence,extended_start,extended_end,substring,comparison_result
0,0727A-1038930_1,TGAGGTAGTAGGTTGTATAGTT,1790,1890,ATTTGTATCTACGATAAAAATTTTTATACAGAACCTACTGCCTCAA...,noncanonical_seed,20,6,-25.1,III,...,let-7a,ENSG00000113328,ENST00000340828,CCNG1,AGGGCAGGCGCGGCCCCTTCGGCTCCGAGCTGACCCTGATCAGGGC...,TTTAATATTTTTTTCTAGAAAACAGGTGACATTTGTATCTACGATA...,1760,1920,ATTTGTATCTACGATAAAAATTTTTATACAGAACCTACTGCCTCAA...,True
1,L1HS-1112536_1,TGAGGTAGTAGGTTGTATAGTT,3856,3928,CAGGAAATACCCGTGCAACCAACTACCTCATATTCCATTCAGAATT...,9-mer,17,6,-24.4,II,...,let-7a,ENSG00000100697,ENST00000343455,DICER1,CGGAGGCGCGGCGCAGGCTGCTGCAGGCCCAGGTGAATGGAGTAAC...,TGCCAAGGAAATCAGCTAAATTACTACAAGCAGGAAATACCCGTGC...,3826,3958,CAGGAAATACCCGTGCAACCAACTACCTCATATTCCATTCAGAATT...,True
2,L2HS-818542_2,TGAGGTAGTAGGTTGTATAGTT,2384,2434,ACCCGCTATATGACCTGATGCCTTTCCTTCATTAAAGATGATTCTG...,noncanonical_seed,19,6,-22.2,III,...,let-7a,ENSG00000080546,ENST00000436639,SESN1,GATTGCCAGGGCCGCCCTGTGCCCTCTGGCTCGGCGGTGGTGGGCG...,GCAGAACTCCTTTATGCTCTGAGAGCCATTACCCGCTATATGACCT...,2354,2464,ACCCGCTATATGACCTGATGCCTTTCCTTCATTAAAGATGATTCTG...,True
3,L2HS-1161339_2,TGAGGTAGTAGGTTGTATAGTT,6569,6623,CAATGACTATGCAACCATACCTTACCACTAAATGTAGTACGCAAAA...,noncanonical_seed,18,6,-22.1,III,...,let-7a,ENSG00000164190,ENST00000282516,NIPBL,TCCGGTCGGCATTTTGTTCTGAGAGGGAGAGACGGAACGAGAGAGA...,AAATAAGACCCCAGCTCATGGTTAAACATGCAATGACTATGCAACC...,6539,6653,CAATGACTATGCAACCATACCTTACCACTAAATGTAGTACGCAAAA...,True
4,L2-407944_2,TGAGGTAGTAGGTTGTATAGTT,1163,1208,AATACTGGGAAACCTGCTACTTCGTCAGCTAACCAGAAACCTGTG,noncanonical_seed,18,6,-21.9,III,...,let-7a,ENSG00000138785,ENST00000340139,INTS12,AGGGACCACCGGGAACAGACGGATCGGCAGGGCGGGGCGGAACGGT...,ACAGCAAAATTGAGTTCAACAACACAAAACAATACTGGGAAACCTG...,1133,1238,AATACTGGGAAACCTGCTACTTCGTCAGCTAACCAGAAACCTGTG,True


In [15]:
# dropping temp cols and saving to csv

cols_to_drop = ["substring", "comparison_result"]

df.drop(columns=cols_to_drop, inplace=True)
df.head()

Unnamed: 0,id,mirna_sequence,true_start_index,true_end_index,mrna_sequence,true_seed_type,num_basepairs,seed_basepairs,folding_energy,true_folding_class,mirna_accession,mirna_name,ensg,enst,gene_name,full_sequence_of_transcript,extended_sequence,extended_start,extended_end
0,0727A-1038930_1,TGAGGTAGTAGGTTGTATAGTT,1790,1890,ATTTGTATCTACGATAAAAATTTTTATACAGAACCTACTGCCTCAA...,noncanonical_seed,20,6,-25.1,III,MIMAT0000062,let-7a,ENSG00000113328,ENST00000340828,CCNG1,AGGGCAGGCGCGGCCCCTTCGGCTCCGAGCTGACCCTGATCAGGGC...,TTTAATATTTTTTTCTAGAAAACAGGTGACATTTGTATCTACGATA...,1760,1920
1,L1HS-1112536_1,TGAGGTAGTAGGTTGTATAGTT,3856,3928,CAGGAAATACCCGTGCAACCAACTACCTCATATTCCATTCAGAATT...,9-mer,17,6,-24.4,II,MIMAT0000062,let-7a,ENSG00000100697,ENST00000343455,DICER1,CGGAGGCGCGGCGCAGGCTGCTGCAGGCCCAGGTGAATGGAGTAAC...,TGCCAAGGAAATCAGCTAAATTACTACAAGCAGGAAATACCCGTGC...,3826,3958
2,L2HS-818542_2,TGAGGTAGTAGGTTGTATAGTT,2384,2434,ACCCGCTATATGACCTGATGCCTTTCCTTCATTAAAGATGATTCTG...,noncanonical_seed,19,6,-22.2,III,MIMAT0000062,let-7a,ENSG00000080546,ENST00000436639,SESN1,GATTGCCAGGGCCGCCCTGTGCCCTCTGGCTCGGCGGTGGTGGGCG...,GCAGAACTCCTTTATGCTCTGAGAGCCATTACCCGCTATATGACCT...,2354,2464
3,L2HS-1161339_2,TGAGGTAGTAGGTTGTATAGTT,6569,6623,CAATGACTATGCAACCATACCTTACCACTAAATGTAGTACGCAAAA...,noncanonical_seed,18,6,-22.1,III,MIMAT0000062,let-7a,ENSG00000164190,ENST00000282516,NIPBL,TCCGGTCGGCATTTTGTTCTGAGAGGGAGAGACGGAACGAGAGAGA...,AAATAAGACCCCAGCTCATGGTTAAACATGCAATGACTATGCAACC...,6539,6653
4,L2-407944_2,TGAGGTAGTAGGTTGTATAGTT,1163,1208,AATACTGGGAAACCTGCTACTTCGTCAGCTAACCAGAAACCTGTG,noncanonical_seed,18,6,-21.9,III,MIMAT0000062,let-7a,ENSG00000138785,ENST00000340139,INTS12,AGGGACCACCGGGAACAGACGGATCGGCAGGGCGGGGCGGAACGGT...,ACAGCAAAATTGAGTTCAACAACACAAAACAATACTGGGAAACCTG...,1133,1238


In [16]:
df.to_csv("data/processed/clash/clash_parsed.csv", index=False)
