In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO
import glob

from datetime import datetime
from fasta_one_hot_encoder import FastaOneHotEncoder

## Convert fasta files into one hot encoded files

In [2]:
encoder = FastaOneHotEncoder(
    nucleotides="acgt",
    lower = True,
    sparse = False,
    handle_unknown = "ignore")
for fasta in glob.glob("../chroms/*.fa"):
    path = fasta
    chr_tmp = encoder.transform_to_df(path, verbose=True)
    print(path.split("/")[-1].split(".")[0])
    chr_tmp.to_csv("../chroms/oe_chroms/{}.csv".format(path.split("/")[-1].split(".")[0]))


1it [00:44, 44.62s/it]


chr16


1it [00:56, 56.15s/it]


chr13


1it [01:29, 89.00s/it]


chr5


1it [00:40, 40.77s/it]


chr17


1it [01:16, 76.15s/it]


chrX


1it [02:03, 123.56s/it]


chr1


1it [00:28, 28.80s/it]


chr19


1it [00:39, 39.53s/it]


chr18


1it [01:07, 67.58s/it]


chr9


1it [00:23, 23.12s/it]


chr21


1it [01:05, 65.71s/it]


chr10


1it [01:11, 71.04s/it]


chr8


1it [00:28, 28.12s/it]


chrY


1it [00:50, 50.07s/it]


chr15


1it [01:59, 119.54s/it]


chr2


1it [01:06, 66.06s/it]


chr11


1it [01:41, 101.25s/it]


chr3


1it [00:52, 52.26s/it]


chr14


1it [00:32, 32.76s/it]


chr20


1it [01:23, 83.59s/it]


chr6


1it [01:05, 65.31s/it]


chr12


1it [00:24, 24.97s/it]


chr22


1it [01:33, 93.22s/it]


chr4


1it [01:17, 77.94s/it]


chr7


## Create training data from one hot encoded files

In [None]:
def createMLData(chromosomes, step=200, nuc_context=1000):
    bed_cols_names = ["Chromosome", "Start", "End", "Name", "Score", "Strand"]
    pol3_df = pd.read_csv("../data/polr3d.bed", sep="\s+", header=None, names=bed_cols_names)
    for chrom in chromosomes:
        print(chrom+":")
        print("     Creating necessary directories...")
        output_dir1 = "../data/tmp_data/"
        output_dir2 = "../data/chr_trainingData/"
        if not os.path.exists(output_dir1):
            os.makedirs(output_dir1)
        if not os.path.exists(output_dir2):
            os.makedirs(output_dir2)
        #Process chromosome oe df to create training data
        print("     Processing one-hot encoded dataframes...")
        chr_df = pd.read_csv("../chroms/oe_chroms/{}_oe.csv".format(chrom))
        chr_df["Label"] = 0
        pol3_chr_df = pol3_df[pol3_df["Chromosome"] == "{}".format(chrom)]
        for row in range(len(pol3_chr_df)):
            beg_range = pol3_chr_df.iloc[row]["Start"]
            end_range = pol3_chr_df.iloc[row]["End"]
            chr_df.loc[beg_range:end_range, "Label"] = 1

        print("     Creating training data. This may take a while...")
        #Start creating training data
        labels = []
        file_names = []
        final_data = []
        j = 1
        #Get first and last non-N index
        fasta_sequences = SeqIO.parse(open("../chroms/{}.fa".format(chrom)),'fasta')
        for seq in fasta_sequences:
            name, sequence = seq.id, str(seq.seq)
        a_idx = sequence.lower().index("a")
        c_idx = sequence.lower().index("c")
        g_idx = sequence.lower().index("g")
        t_idx = sequence.lower().index("t")
        chr_start_idx = min(a_idx,c_idx,g_idx,t_idx)
        a_idx = sequence.lower().rfind("a")
        c_idx = sequence.lower().rfind("c")
        g_idx = sequence.lower().rfind("g")
        t_idx = sequence.lower().rfind("t")
        chr_end_idx = max(a_idx,c_idx,g_idx,t_idx)
        for i in range(chr_start_idx, chr_end_idx+1, step):
            if i <= chr_end_idx:
                beg_seq = []
                end_seq = []
                start_idx = i - nuc_context
                if start_idx < 0:
                    start_idx = 0
                    n_count = (i - nuc_context) * -1
                    beg_seq = [[0,0,0,0]] * n_count
                end_idx = i+step+nuc_context
                if end_idx > len(chr_df):
                    end_idx = len(chr_df)
                    n_count = (i+step+nuc_context) - len(chr_df)
                    end_seq = [[0,0,0,0]] * n_count

                if beg_seq == [] and end_seq == []:
                    training_seq = chr_df[start_idx:end_idx].drop(columns=["Unnamed: 0", "Label"]).to_numpy()
                elif beg_seq == [] and len(end_seq) != 0:
                    training_seq = chr_df[start_idx:end_idx].drop(columns=["Unnamed: 0", "Label"]).to_numpy() + np.array(end_seq)
                elif len(beg_seq) != 0 and end_seq == []:
                    training_seq = beg_seq + chr_df[start_idx:end_idx].drop(columns=["Unnamed: 0", "Label"]).values.tolist()
                tmp_df = chr_df[start_idx:end_idx]
                grouped_df = tmp_df.groupby("Label").count().reset_index()
                try:
                    if grouped_df[grouped_df["Label"] == 1]["Unnamed: 0"][1] >= 65:
                        labels.append([1])
                    else:
                        labels.append([0])
                except KeyError:
                    labels.append([0])
                training_seq = np.array([training_seq])
                if j == 1: 
                    training_data = training_seq
                else:
                    training_data = np.append(training_data, training_seq, axis=0)
                if j % 10 == 0:
                    np.save("../data/tmp_data/tmp_{}.npy".format(i), training_data)
                    file_names.append("../data/tmp_data/tmp_{}.npy".format(i))
                    j = 1
                else:
                    j+=1
        
        print("     Finalizing training data...")
        fpath ="../data/chr_trainingData/{}_trainingData.npz".format(chrom)
        for npfile in file_names:
            final_data.append(np.load(npfile))

        labels = np.array(labels)
        np.savez(fpath, np.concatenate(final_data), labels)

        dir = '../data/tmp_data/'
        shutil.rmtree(dir)
        
        print("Completed {}!".format(chrom))

In [None]:
training_chroms = ["chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14",
                  "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY"]
testing_chroms = ["chr7", "chr8"]


In [None]:
createMLData(training_chroms, 200, 1000)

In [None]:
createMLData(testing_chroms, 200, 1000)