In [3]:
############################################################################################

from sklearn.model_selection import train_test_split
def df_strat_split(
    df_input, stratify_colname = 'Species', 
    frac_train = 0.6, frac_val = 0.2, frac_test = 0.2, 
    random_state = None):
    '''
    Splits a Pandas dataframe into three subsets 
    (train, val, and test) following fractional ratios provided by the user, 
    where each subset is stratified by the values in a specific column 
    (each subset has the same relative frequency of the values in the column). 
    It performs this splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. 
        Usually this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into 
        train, val, and test data. They must sum to 1.0; the sum is 
        compared with a floating-point tolerance, so combinations such as 
        0.7 / 0.2 / 0.1 (which add up to 0.9999999999999999) are accepted.
    random_state : int, None, or RandomStateInstance
        Value to be passed to both train_test_split() calls. 
        (simply sets a seed to the random generator)

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.

    Raises
    ------
    ValueError
        If the fractions do not add up to 1.0, or if stratify_colname 
        is not a column of df_input.
    '''
    # local import so that this cell does not depend on another cell's imports
    import math

    # -----------------------------------------------------------------------------------
    # tolerance-based comparison: an exact `!= 1.0` rejects valid inputs whose
    # floating-point sum is off by one ulp
    if not math.isclose(frac_train + frac_val + frac_test, 1.0):
        raise ValueError(
            'fractions {}, {}, {} do not add up to 1.0'.format(
                frac_train, frac_val, frac_test))

    if stratify_colname not in df_input.columns:
        raise ValueError(
            '{} is not a column in the dataframe'.format(
                stratify_colname))
    # -----------------------------------------------------------------------------------
    X = df_input                     # contains all columns
    y = df_input[[stratify_colname]] # dataframe of just the column on which to stratify
    # split original dataframe into train and temp dataframes
    df_train, df_temp, _, y_temp = train_test_split(
        X, y, stratify = y, 
        test_size = (1.0 - frac_train), 
        random_state = random_state)
    # split the temp dataframe into val and test dataframes;
    # the test share is expressed relative to the size of the temp dataframe
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, _, _ = train_test_split(
        df_temp, y_temp, stratify = y_temp, 
        test_size = relative_frac_test, 
        random_state = random_state)
    # -----------------------------------------------------------------------------------
    # every input row must end up in exactly one of the three splits
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test
    
############################################################################################
# k-mer encoding the sequences
# function to generate all possible k-mers
def all_kmers(k, alphabet = "ACGT"):
    '''
    Build the list of every string of length k over `alphabet`.

    The strings come out in itertools.product order, i.e. the last position
    varies fastest and letters follow their order in `alphabet`.
    '''
    # k copies of the alphabet, one per position of the k-mer
    positions = [alphabet] * k
    kmers = []
    for letters in product(*positions):
        kmers.append(''.join(letters))
    return kmers
# pre-computed k-mer vocabularies for k = 2, 4, 6 and 8
two_mers = all_kmers(2)
for_mers = all_kmers(4)
six_mers = all_kmers(6)
eig_mers = all_kmers(8)

def allowed_k(segment, search=re.compile(r'^[AGTC]+$').search):
    '''
    Tell whether `segment` is accepted by `search`.

    With the default `search` (compiled once, when the function is defined)
    this is True for non-empty strings made only of the letters A, G, T, C.
    '''
    hit = search(segment)
    return bool(hit)

def k_mer_encode(seq, k_mers, max_len=1600):
    '''
    Encode a sequence as an array of [normalised k-mer index, normalised count] rows.

    Parameters
    ----------
    seq : str
        Sequence to encode; it is sliced into overlapping windows of length k.
    k_mers : list of str
        The k-mer vocabulary (e.g. the output of all_kmers(k)). k is taken from 
        the length of its first entry.
        NOTE: indices and counts are paired by position, which relies on 
        `k_mers` being in sorted order, because np.unique returns the k-mers 
        found in the sequence sorted.
    max_len : int
        Upper bound for the number of rows the result is padded to.

    Returns
    -------
    numpy array with two columns: the position of the k-mer in `k_mers` divided 
    by 4**k, and its count divided by the highest count in the sequence. 
    The array is padded with [0., 0.] rows up to min(4**k, max_len) rows. 
    Arrays that already have more rows than that are returned as they are 
    (no truncation). If the sequence holds no accepted k-mer at all, the result 
    consists of padding rows only.
    '''
    k = len(k_mers[0])
    # number of rows every encoding is padded up to
    tot_len = min(4**k, max_len)
    # cut sequence in k-mers, keeping only the windows accepted by allowed_k
    windows = (seq[i:i+k] for i in range(len(seq) - k + 1))
    chunks = [window for window in windows if allowed_k(window)]
    if not chunks:
        # sequence shorter than k, or no accepted k-mer: np.amax would raise on
        # the empty count array, so return the all-padding encoding instead
        return np.zeros((tot_len, 2))
    # counting and regularizing all unique k-mers in the sequence
    segments, counts = np.unique(chunks, return_counts = True)
    counts = counts / np.amax(counts)
    # find and regularize the index of each k-mer
    indeces = np.where(np.isin(k_mers, segments))[0] / 4**k
    # 2D array with one [k_mer index, count] row per unique k-mer
    arr = np.column_stack((indeces, counts))

    # pad with zero rows so that short encodings all get the same number of rows
    missing = tot_len - len(arr)
    if missing > 0:
        arr = np.vstack((arr, np.zeros((missing, 2))))

    return arr

############################################################################################
# one-hot-encoding the sequences
# Maps every sequence character to a vector of four numbers.
# NOTE(review): the rows for A, G, T and C each add up to 100, whereas all other
# rows use values between 0 and 1 -- confirm that mixing the two scales is intended.
one_hot_dict = {
    'A': [71.9, 17.8, 6.1, 4.2],
    'G': [13.9, 78.8, 4.4, 2.9],
    'T': [5.6, 4.9, 70.8, 18.7],
    'C': [4.8, 4.1, 12.0, 79.1],
    # weight split over two positions
    'Y': [0., 0., 0.5, 0.5],
    'R': [0.5, 0.5, 0., 0.],
    'W': [0.5, 0., 0.5, 0.],
    'S': [0., 0.5, 0., 0.5],
    'K': [0., 0.5, 0.5, 0.],
    'M': [0.5, 0., 0., 0.5],
    # weight split over three positions
    'D': [0.33, 0.33, 0.33, 0.],
    'V': [0.33, 0.33, 0., 0.33],
    'H': [0.33, 0., 0.33, 0.33],
    'B': [0., 0.33, 0.33, 0.33],
    # weight split evenly over all four positions
    'X': [0.25, 0.25, 0.25, 0.25],
    'N': [0.25, 0.25, 0.25, 0.25],
    # all-zero vector; '-' is the padding character used by one_hot_seq
    '-': [0., 0., 0., 0.],
}

def one_hot_seq(sequence, one_hot_dict=one_hot_dict, max_len=1600):
    '''
    Encode a sequence as a numpy array with one row per character.

    The sequence is right-padded with '-' up to `max_len` characters; a sequence
    that is already longer than `max_len` is left at its own length. Each
    character is then replaced by its entry in `one_hot_dict`, so a character
    that is not a key of the dictionary raises KeyError.
    '''
    # a non-positive pad width yields an empty pad string
    pad_width = max_len - len(sequence)
    sequence += '-' * pad_width
    # look up the vector of every character, in sequence order
    rows = []
    for nucleotide in sequence:
        rows.append(one_hot_dict[nucleotide])
    return np.array(rows)

############################################################################################
# One-hot-encoding the labels
# taxonomic rank columns, from the broadest rank to the most specific one
taxa = [
    'Kingdom',
    'Phylum',
    'Class',
    'Order',
    'Family',
    'Genus',
    'Species',
]

def get_taxon_dict(df, taxon):
    '''
    Build the label <-> integer code mappings for one column of `df`.

    Codes are assigned 0, 1, 2, ... in the order in which the distinct values
    of df[taxon] are returned by .unique().

    Returns
    -------
    taxon_dict : dict mapping each distinct label to its integer code
    taxon_dict_lookup : dict mapping each integer code back to its label
    '''
    labels = list(df[taxon].unique())

    # label -> code
    taxon_dict = {label: code for code, label in enumerate(labels)}
    # code -> label, the inverse of the mapping above
    taxon_dict_lookup = dict(zip(taxon_dict.values(), taxon_dict.keys()))

    return taxon_dict, taxon_dict_lookup

# y_train_gen_na = to_categorical(
#     y = train_na[taxon].map(taxon_dict).astype(np.float32))
# np.save('arrays/genus/y_train_gen_na.npy', y_train_gen_na)

# label encoding should be applied to all taxa instead of just one!

In [16]:
# scratch check: sum of the first three entries of the 'C' row of one_hot_dict
4.8+4.1+12.0

20.9

In [17]:
# scratch check: what is left of 100 after the sum above; equals the fourth
# entry (79.1) of the 'C' row of one_hot_dict
100 - 20.9

79.1

In [2]:
import matplotlib.pyplot as plt
import sklearn
from itertools import product
import statistics
import numpy as np
import pandas as pd
import re

import tensorflow as tf
import pathlib
import os
import matplotlib.pyplot as plt

# show numpy floats with 4 digits of precision in printed output
np.set_printoptions(precision=4)

In [5]:
# Load the test dataframe (taxonomy columns plus a 'Sequence' column).
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests the
# CSV was written together with its index -- consider index_col=0; TODO confirm.
df = pd.read_csv('data/test_df.csv')
df

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus,Species,Sequence
0,Bacteria,Proteobacteria,Alphaproteobacteria,Micropepsales,Micropepsaceae,Rhizomicrobium,Rhizomicrobium sp,GGCGGACGGGTGAGTAACACGTGGGAACATGCCTATCGGTTCGGAA...
1,Bacteria,Lentisphaerae,Lentisphaeria,Victivallales,Victivallaceae,Victivallis,Victivallis sp,GGCGGAAGGGTGAGGAACGCGTGAGTAATCTGCCCTCAAGTTGGGA...
2,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola,GGCAAACGGGTGAGTAATATCTGGGGATCTACCCAAAAGAGGGGGA...
3,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Burkholderiaceae,Burkholderia,Burkholderia multivorans,GGCGAACGGGTGAGTAATACATCGGAACATGTCCTGTAGTGGGGGA...
4,Bacteria,Firmicutes,Bacilli,Caryophanales,Paenibacillaceae,Paenibacillus,Paenibacillus mucilaginosus,GGCGGACGGGTGAGTAACACGTAGGCAACCTGCCTGTAAGATCGGG...
...,...,...,...,...,...,...,...,...
995,Bacteria,Proteobacteria,Gammaproteobacteria,Pasteurellales,Pasteurellaceae,Haemophilus,Haemophilus influenzae,GGCGGACGGGTGAGTAATGCTTGGGAATCTGGCTTATGGAGGGGGA...
996,Bacteria,Proteobacteria,Gammaproteobacteria,Vibrionales,Vibrionaceae,Vibrio,Vibrio vulnificus,GGCGGACGGGTGAGTAATGCCTGGGAAATTGCCCTGATGTGGGGGA...
997,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Lactococcus,Lactococcus lactis,AGCGAACGGGTGAGTAACGCGTGGGGAATCTGCCTTTGAGCGGGGG...
998,Bacteria,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,g_Flavobacteriaceae,g_Flavobacteriaceae sp,GGCGCACGGGTGCGTAACGCGTATACAACCTACCCATTACAAAGGA...


In [8]:
# k-mer encode every sequence with the 2-mer, 4-mer and 6-mer vocabularies.
# NOTE(review): df_sample is not defined in any cell visible here; this cell
# relies on kernel state from elsewhere and may fail on Restart & Run All.
two = df_sample['Sequence'].apply(lambda seq: k_mer_encode(seq, two_mers))
fur = df_sample['Sequence'].apply(lambda seq: k_mer_encode(seq, for_mers))
six = df_sample['Sequence'].apply(lambda seq: k_mer_encode(seq, six_mers))
# eig = df_sample['Sequence'].apply(lambda seq: k_mer_encode(seq, eig_mers))

In [10]:
# preview the 4-mer encodings (one array of [index, count] rows per sequence)
fur

13733    [[0.0, 0.3333333333333333], [0.00390625, 0.466...
24679    [[0.0, 0.21428571428571427], [0.00390625, 0.28...
8146     [[0.0, 1.0], [0.00390625, 0.6875], [0.0078125,...
3753     [[0.0, 0.21052631578947367], [0.00390625, 0.31...
16565    [[0.00390625, 0.47058823529411764], [0.0078125...
                               ...                        
16852    [[0.0, 0.05263157894736842], [0.00390625, 0.26...
24674    [[0.0, 0.23529411764705882], [0.00390625, 0.35...
22686    [[0.0, 0.2], [0.00390625, 0.6666666666666666],...
9292     [[0.0, 1.0], [0.00390625, 0.36666666666666664]...
11257    [[0.0, 0.10526315789473684], [0.00390625, 0.47...
Name: Sequence, Length: 1000, dtype: object

In [11]:
# NOTE(review): the saved output below starts with [1., 0., 0., 0.], while
# one_hot_dict as defined in this notebook maps 'A' to [71.9, 17.8, 6.1, 4.2];
# the output looks stale -- re-run after a kernel restart to confirm.
one_hot_seq('ATCGATCGGGG')

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])