# Creating k-mer representations

In [1]:
import sys
sys.path.append('../')
from make_representations.representation_maker import Metagenomic16SRepresentation, FastaRepresentations
from utility.file_utility import FileUtility


## Example of Crohn's disease - parallel representation creation -- with sub-sampling

In [10]:
# Read the FASTQ files.
# fasta_files: the FASTQ files found in the directory
# mapping: the order in which they were read/iterated
fasta_files, mapping = FileUtility.read_fasta_directory(
    '../../crohns_disease/sra_download/samples_fastq',
    'fastq',
)

In [None]:
# Build the representations with the best k-mer sizes and their corresponding
# sampling sizes (according to the bootstrapping results).
#
# Dictionary of k-mer sizes and their sampling values:
#   k-mer size : [sampling values]      (-1 means using all sequences)
#   e.g. 6 : [100, 1000, 2000, 5000, 10000, -1]
sampling_dict = {
    3: [20],
    4: [100],
    5: [500],
    6: [100, 1000, 2000, 5000, 10000, -1],
    7: [5000],
    8: [8000],
}

# number of cores
nump_p = 20

# For the FASTQ files, create one representation for every combination of
# k-mer size and sampling size listed in sampling_dict.
for k, sample_sizes in sampling_dict.items():  # each k-mer size ...
    for N in sample_sizes:                     # ... with each of its sampling sizes
        print(k,'-mers with sampling size ',N)

        # path to save the generated files
        save_prefix = '../../crohns_disease/datasets/dataset_{}-mers_rate_complete1359_seq_{}'.format(k, N)

        # create the k-mer representation with sampling size N for the FASTQ files
        RS = Metagenomic16SRepresentation(fasta_files, mapping, N, nump_p)
        RS.generate_kmers_all(k, save=save_prefix)


3 -mers with sampling size  20


  0%|          | 2/1359 [00:03<34:43,  1.54s/it]  

## Example of Ecological/Organismal Environments -- without sub-sampling

## ECO_18K

In [29]:
# Kept for provenance: the (disabled) code below appears to be how the
# ECO_18K / ECO_180K / fiveGuts FASTA files were originally generated.
# NOTE(review): it calls FastaRep.get_samples before FastaRep is assigned in
# this cell, so it presumably relied on an instance from an earlier session.
#map_type=FileUtility.load_obj('../../datasets/processed_data/eco/map_label_type.pickle')
#eco=['soil', 'marine','bioreactor','freshwater','groundwater','sediment','bioreactor_sludge','food_fermentation','compost','rhizosphere','food','hydrocarbon','marine_sediment','activated_sludge','aquatic','hot_springs','freshwater_sediment','ant_fungus_garden']
#orgs=['human_gut','bovine_gut','mouse_gut','chicken_gut','termite_gut']
#corpus_eco_18K, labels_eco_18K=FastaRep.get_samples(eco,1000)
#corpus_eco_180K, labels_eco_180K=FastaRep.get_samples(eco,10000)
#corpus_5guts, labels_5guts=FastaRep.get_samples(orgs,620)
#FileUtility.create_fasta_file('../../datasets_downloaded/ECO/ECO_18K.fasta', corpus_eco_18K, labels_eco_18K)
#FileUtility.create_fasta_file('../../datasets_downloaded/ECO/ECO_180K.fasta', corpus_eco_180K, labels_eco_180K)
#FileUtility.create_fasta_file('../../datasets_downloaded/5GUTs/fiveGuts.fasta', corpus_5guts, labels_5guts)

# Load the ECO_18K FASTA file; each label is rewritten to its second
# '.'-separated field.
#   FastaRep.labels ==> labels
#   FastaRep.corpus ==> sequence corpus
FastaRep = FastaRepresentations(
    '../../datasets_downloaded/ECO/ECO_18K.fasta',
    label_modifying_func=lambda raw_label: raw_label.split('.')[1],
)
# FastaRep.labels ==> labels
# FastaRep.corpus ==> sequence corpus

In [21]:
# Create and save one restricted k-mer representation of the ECO_18K corpus
# per k, then save the labels once.
out_dir = '../../datasets_downloaded/ECO/ECO_18K_k-mer_representations_labels/'

for k in [3,4,5,6,7,8]:
    print(k,'-mer creation')
    vec_pres = FastaRep.get_vector_rep(FastaRep.corpus, k, restricted=True)
    FileUtility.save_sparse_csr(out_dir+str(k)+'-mer'+'_eco18k_restrictedmer.npz', vec_pres)

# The labels file does not depend on k, so it is written a single time here
# instead of being rewritten with the same arguments on every iteration.
FileUtility.save_list(out_dir+'ECO_18K_labels.txt', FastaRep.labels)

3 -mer creation
4 -mer creation
5 -mer creation
6 -mer creation
7 -mer creation
8 -mer creation


## ECO_180K

In [22]:

# Load the ECO_180K FASTA file; each label is rewritten to its second
# '.'-separated field.
#   FastaRep.labels ==> labels
#   FastaRep.corpus ==> sequence corpus
FastaRep = FastaRepresentations(
    '../../datasets_downloaded/ECO/ECO_180K.fasta',
    label_modifying_func=lambda raw_label: raw_label.split('.')[1],
)

In [23]:
# Create and save one restricted k-mer representation of the ECO_180K corpus
# per k, then save the labels once.
out_dir = '../../datasets_downloaded/ECO/ECO_180K_k-mer_representations_labels/'

for k in [3,4,5,6,7,8]:
    print(k,'-mer creation')
    vec_pres = FastaRep.get_vector_rep(FastaRep.corpus, k, restricted=True)
    FileUtility.save_sparse_csr(out_dir+str(k)+'-mer'+'_eco180k_restrictedmer.npz', vec_pres)

# The labels file does not depend on k, so it is written a single time here
# instead of being rewritten with the same arguments on every iteration.
FileUtility.save_list(out_dir+'ECO_180K_labels.txt', FastaRep.labels)

3 -mer creation
4 -mer creation
5 -mer creation
6 -mer creation
7 -mer creation
8 -mer creation


## 5Guts

In [None]:
# Load the five-guts FASTA file; each label is rewritten to its second
# '.'-separated field.
FastaRep = FastaRepresentations(
    '../../datasets_downloaded/5GUTs/fiveGuts.fasta',
    label_modifying_func=lambda raw_label: raw_label.split('.')[1],
)


In [None]:
# Create and save one restricted k-mer representation of the 5Guts corpus
# per k, then save the labels once.
out_dir = '../../datasets_downloaded/5GUTs/k-mer_representations_labels/'

for k in [3,4,5,6,7,8]:
    print(k,'-mer creation')
    vec_pres = FastaRep.get_vector_rep(FastaRep.corpus, k, restricted=True)
    FileUtility.save_sparse_csr(out_dir+str(k)+'-mer'+'_5Guts_restrictedmer.npz', vec_pres)

# The labels file does not depend on k, so it is written a single time here
# instead of being rewritten with the same arguments on every iteration.
FileUtility.save_list(out_dir+'5Guts_labels.txt', FastaRep.labels)

## Oral bacteria (example of how to do it without sub-sampling)

In [2]:
# Load the oral-bacteria FASTA file; each label is rewritten to its second
# '-'-separated field.
#   FastaRep.labels ==> labels
#   FastaRep.corpus ==> sequence corpus
FastaRep = FastaRepresentations(
    '../../oral_bacteria/oral_bacteria.fasta',
    label_modifying_func=lambda raw_label: raw_label.split('-')[1],
)

In [5]:
# Create and save the restricted 8-mer representation of the oral-bacteria
# corpus, then save the labels.
out_dir = '../../oral_bacteria/k-mer_representations_labels/'

for k in [8]:
    print(k,'-mer creation')
    vec_pres = FastaRep.get_vector_rep(FastaRep.corpus, k, restricted=True)
    FileUtility.save_sparse_csr(out_dir+str(k)+'-mer'+'_oral_bacteria_restrictedmer.npz', vec_pres)

# The loop has a single iteration, so writing the labels after it is
# equivalent to writing them inside it.
FileUtility.save_list(out_dir+'oral_bacteria_labels.txt', FastaRep.labels)

8 -mer creation
