In [1]:
# importing dependencies
from Bio import SeqIO
import pandas as pd


# questions:

Which version of the FASTA file should we use: soft-masked, hard-masked, or raw (unmasked)?

Do we need match diagrams for MRE and seed interactions?

In [11]:
def import_fasta(fasta):
    """Load a FASTA file and return all of its sequences as one string.

    Every record in the file is read and the record sequences are joined
    back to back, in file order, with no separator. Characters are kept
    exactly as stored in the file (no upper/lower-case normalisation).

    NOTE(review): for a multi-record FASTA the joined string contains
    junctions between records — confirm callers only pass single-record files
    or accept k-mers spanning a junction.

    Args:
        fasta (str): path of the FASTA file to read.

    Returns:
        str: concatenation of every record's sequence.
    """
    with open(fasta) as handle:
        joined_sequence = "".join(
            str(record.seq) for record in SeqIO.parse(handle, "fasta")
        )
        print("sequence loaded")
        return joined_sequence


def create_mirna_database(mirna_fasta):
    """Build a DataFrame of human miRNAs and the k-mers derived from them.

    Each FASTA record is back-transcribed (U -> T) and complemented to give
    ``cDNA_seq``. Records whose id starts with "hsa" are kept, the index is
    renumbered from 1, and these slices of ``cDNA_seq`` are added as columns:

        MRE          [1:9]
        8mer         [:8]
        7mer-A1      [:7]
        7mer-m8      [1:8]
        6mer         [1:7]
        offset_6mer  [2:8]

    ``8mer`` and ``7mer-A1`` entries that start with C, T or G are replaced
    by None.

    NOTE(review): ``cDNA_seq`` is the plain complement (it is not reversed),
    so it runs in the same left-to-right order as the miRNA. Confirm that
    matching these slices against a 5'->3' target sequence is intended.

    Args:
        mirna_fasta (str): path of the miRNA FASTA file.

    Returns:
        pandas.DataFrame: columns ``name``, ``miRNA_seq``, ``cDNA_seq``,
        ``MRE``, ``8mer``, ``7mer-A1``, ``7mer-m8``, ``6mer``, ``offset_6mer``.
    """
    records = []
    with open(mirna_fasta) as f:
        for seq_record in SeqIO.parse(f, "fasta"):
            # back_transcribe() turns U into T, complement() takes the complement
            complement = seq_record.seq.back_transcribe().complement()
            records.append(
                (seq_record.id, str(seq_record.seq), str(complement)))

    mirna_df = pd.DataFrame(
        records, columns=["name", "miRNA_seq", "cDNA_seq"])

    # keep human entries only; .copy() makes the filtered frame independent
    # so the column assignments below do not act on a slice of the original
    mirna_df = mirna_df[mirna_df["name"].str.startswith("hsa")].copy()
    # renumber rows starting at 1
    mirna_df.index = range(1, len(mirna_df) + 1)

    # deriving MRE sequences and the kmer columns
    mirna_df["MRE"] = mirna_df["cDNA_seq"].str[1:9]
    mirna_df["8mer"] = mirna_df["cDNA_seq"].str[:8]
    mirna_df["7mer-A1"] = mirna_df["cDNA_seq"].str[:7]
    mirna_df["7mer-m8"] = mirna_df["cDNA_seq"].str[1:8]
    mirna_df["6mer"] = mirna_df["cDNA_seq"].str[1:7]
    mirna_df["offset_6mer"] = mirna_df["cDNA_seq"].str[2:8]

    # Null out 8mer / 7mer-A1 values that start with C, T or G. The result is
    # assigned back to the frame: calling replace(..., inplace=True) on a
    # column selection is chained assignment and is not guaranteed to update
    # the DataFrame.
    for column in ("8mer", "7mer-A1"):
        mirna_df[column] = mirna_df[column].replace(
            {"^[CTG]": None}, regex=True)

    print("miRNA dataframe loaded")
    return mirna_df


def generate_kmers(sequence, k):
    """Lazily yield every window of length ``k`` from ``sequence``.

    Windows are produced left to right, one per start offset, so consecutive
    windows overlap by ``k - 1`` characters. Nothing is yielded when the
    sequence is shorter than ``k``.
    """
    window_count = len(sequence) - k + 1
    yield from (sequence[start:start + k] for start in range(window_count))


def generate_seed_kmer_dictionary(k, mirna_df):
    """Map each seed k-mer of length ``k`` to a miRNA name.

    The seed columns used depend on ``k``:

        6 -> "6mer", then "offset_6mer"
        7 -> "7mer-A1", then "7mer-m8"
        8 -> "8mer"

    Any other ``k`` gives an empty dictionary.

    When several rows (or both columns) contain the same seed, the entry
    written last wins, so each seed maps to exactly one miRNA name.

    Null seeds (None / NaN) are skipped so they never become dictionary keys.

    Args:
        k (int): seed length.
        mirna_df (pandas.DataFrame): frame with a ``name`` column and the seed
            columns listed above for the requested ``k``.

    Returns:
        dict: seed k-mer -> miRNA name.
    """
    seed_columns_by_k = {
        6: ("6mer", "offset_6mer"),
        7: ("7mer-A1", "7mer-m8"),
        8: ("8mer",),
    }

    seed_columns = seed_columns_by_k.get(k, ())
    if not seed_columns:
        return {}

    names = mirna_df["name"].tolist()
    seed_to_name = {}
    for column in seed_columns:
        for seed, name in zip(mirna_df[column].tolist(), names):
            # skip masked-out seeds instead of storing a None/NaN key
            if pd.notna(seed):
                seed_to_name[seed] = name
    return seed_to_name


def find_MRE_positions(kmer_generator, kmer_dictionary, k):
    """Collect every k-mer from the generator that is a key of the dictionary.

    Args:
        kmer_generator: iterable of k-mers, consumed in order; the i-th item
            (counting from 0) is treated as starting at position i + 1.
        kmer_dictionary (dict): k-mer -> miRNA name lookup.
        k (int): k-mer length, used to compute the end position.

    Returns:
        pandas.DataFrame: one row per hit with columns ``kmer``, ``name``,
        ``start`` and ``end``; ``start`` is ``i + 1`` and ``end`` is ``i + k``.
    """
    hits = [
        (kmer, kmer_dictionary[kmer], offset + 1, offset + k)
        for offset, kmer in enumerate(kmer_generator)
        if kmer in kmer_dictionary
    ]
    return pd.DataFrame(hits, columns=["kmer", "name", "start", "end"])


def run_script(k, fasta, mirna_fasta):
    """Find miRNA seed matches of length ``k`` in a target FASTA.

    Loads the target sequence, builds the miRNA table and its seed
    dictionary for ``k``, then scans the target's k-mers against it.

    Args:
        k (int): seed / k-mer length.
        fasta (str): path of the target FASTA file.
        mirna_fasta (str): path of the miRNA FASTA file.

    Returns:
        pandas.DataFrame: the table returned by ``find_MRE_positions``.
    """
    # target side: load the sequence first so its message is printed first
    target_sequence = import_fasta(fasta)

    # miRNA side: table of miRNAs -> seed lookup for this k
    seed_lookup = generate_seed_kmer_dictionary(
        k, create_mirna_database(mirna_fasta))

    # the k-mer generator is lazy; it is consumed inside find_MRE_positions
    return find_MRE_positions(
        generate_kmers(target_sequence, k), seed_lookup, k)


## Files help:

FASTA of GRCh38 chromosome 1 (soft-masked), downloaded from:

https://ftp.ensembl.org/pub/current_fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.1.fa.gz

FASTA of mature miRNA sequences, downloaded from:

https://www.mirbase.org/ftp.shtml

## How to run the script:

1. Set the seed k-mer size `k` (6, 7, or 8 are supported)
2. Point `fasta` to the target FASTA file
3. Point `mirna_fasta` to the miRNA database FASTA file

In [21]:
# seed k-mer size passed to run_script
k = 8
# soft-masked GRCh38 chromosome 1 FASTA (see the "Files help" section)
fasta = "../sequences/grch38/chromosomes/Homo_sapiens.GRCh38.dna_sm.chromosome.1.fa"
# presumably the canonical BCL2 3'UTR sequence (inferred from the file name) — confirm
bcl2_fasta = "../data/bcl2_canonical_3utr.fa"
# mature miRNA FASTA used to build the seed dictionary
mirna_fasta = "../sequences/mirna/mature_miRNAs.fa"

In [29]:
# scan the BCL2 FASTA for k-mers that match a miRNA seed of length k
bcl2_results = run_script(k, bcl2_fasta, mirna_fasta)

# preview the first hits (last expression, so it is rendered as the cell output)
bcl2_results.head()

sequence loaded
miRNA dataframe loaded


Unnamed: 0,kmer,name,start,end
0,ATTGTCAG,hsa-miR-132-3p,64,71
1,ACATCACA,hsa-miR-142-3p,128,135
2,ACACACAC,hsa-miR-6867-5p,133,140
3,ACACACAC,hsa-miR-6867-5p,147,154
4,ACACACAC,hsa-miR-6867-5p,149,156


In [26]:
# scan chromosome 1 with the configured k rather than a hard-coded literal,
# so this cell stays consistent with the BCL2 run when k is changed
chr1_results = run_script(k, fasta, mirna_fasta)

# Total number of seed matches found on chromosome 1. Only the last
# expression of a cell is rendered, so the earlier un-displayed .head()
# call was dropped.
len(chr1_results)

sequence loaded
miRNA dataframe loaded


1544795

# Work in progress cells
