In [9]:
import os
import csv
import pandas as pd
import numpy as np
import statistics
import json
from scipy.fftpack import fft
from Bio import Entrez
from tqdm import tqdm
from Bio import SeqIO

### Setup

Primeiramente é feita a filtragem dos Dados iniciais.

In [None]:
file_paths = os.listdir('./data/')
file_paths = list(set(file_paths) - set(['data.csv', 'sequences']))
print(file_paths)

In [None]:
data = []

for path in file_paths:
    with open('./data/' + path, 'r') as f:
        reader = csv.reader(f, delimiter='\t')

        class_name = path.split('.')[0].split('_')[1]

        for row in reader:
            if (row[6] == '+'):
                data.append([class_name] + row)

In [None]:
header = ['Class', 'Chr', 'Source Annotation', 'Class/Order/Superfamily', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes']

data_df = pd.DataFrame(data, columns=header)
data_df = data_df[['Class', 'Chr', 'Start', 'End']]

data_df.head()

In [None]:
data_df.to_csv('./data/data.csv')

In [None]:
## Checando os possíveis valores dos cromossomos
data_df.Chr.unique()

Em sequência será feita a extração dos genomas por meio de Biopython

In [None]:
id_dict = {"LR618874.1": "Chr_1.txt", "LR618875.1": "Chr_2.txt", "LR618876.1": "Chr_3.txt", "LR618877.1": "Chr_4.txt", 
           "LR618878.1": "Chr_5.txt", "LR618879.1": "Chr_6.txt", "LR618880.1": "Chr_7.txt", "LR618881.1": "Chr_8.txt", 
           "LR618882.1": "Chr_9.txt", "LR618883.1": "Chr_10.txt", "AY506529.1":"Chr_Mt.txt", "X86563.2": "Chr_Pt.txt"}

Entrez.email = "pedro.guilherme2305@usp.br"
for id in tqdm(id_dict, total=len(id_dict)):
    stream = Entrez.efetch(db="nuccore", id=id, rettype="fasta")
    
    with open(f"./data/sequences/{id_dict[id]}", "w") as file:
        file.write(stream.read())
    stream.close()

### Processamento

In [2]:
data_df = pd.read_csv("./data/data.csv")
chromosomes_to_keep = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'Mt', 'Pt']
data_df = data_df.query("Chr in @chromosomes_to_keep") 

In [3]:
aux_list = []

for chromosome in chromosomes_to_keep:
    rows = data_df.query(f"Chr == '{chromosome}'").to_dict(orient="records")
    record = SeqIO.read(f"./data/sequences/Chr_{chromosome}.txt", "fasta")
    for row in tqdm(rows, total=len(rows)):
        aux_dict = dict()

        aux_dict['Chr'] = row['Chr']
        aux_dict['Sequence'] = record[row['Start']:row['End']].seq
        aux_dict['Class'] = row['Class']

        if aux_dict['Sequence'] == '': aux_dict['Sequence'] = np.nan

        aux_list.append(aux_dict)


100%|██████████| 48971/48971 [00:00<00:00, 175792.17it/s]
100%|██████████| 37569/37569 [00:00<00:00, 132603.18it/s]
100%|██████████| 36629/36629 [00:00<00:00, 105458.47it/s]
100%|██████████| 50378/50378 [00:00<00:00, 166314.69it/s]
100%|██████████| 34257/34257 [00:00<00:00, 187684.21it/s]
100%|██████████| 25934/25934 [00:00<00:00, 95889.45it/s] 
100%|██████████| 28016/28016 [00:00<00:00, 202045.81it/s]
100%|██████████| 27986/27986 [00:00<00:00, 192118.00it/s]
100%|██████████| 25350/25350 [00:00<00:00, 114801.89it/s]
100%|██████████| 23244/23244 [00:00<00:00, 238348.50it/s]
100%|██████████| 78/78 [00:00<00:00, 245612.40it/s]
100%|██████████| 30/30 [00:00<00:00, 137218.23it/s]


In [4]:
final_df = pd.DataFrame(aux_list)
final_df = final_df.dropna()
final_df

Unnamed: 0,Chr,Sequence,Class
0,1,"(G, C, G, T, G, G, A, G, C, G, G, T, G, G, C, ...",TIR
1,1,"(G, T, G, C, A, T, C, A, T, A, T, G, T, C, T, ...",TIR
2,1,"(C, A, C, G, A, C, G, G, C, G, A, A, G, C, G, ...",TIR
3,1,"(T, T, C, C, G, C, T, G, A, A, A, G, T, A, A, ...",TIR
4,1,"(T, T, T, C, C, T, A, A, A, T, G, A, T, T, T, ...",TIR
...,...,...,...
338437,Pt,"(T, G, T, A, T, G, A, A, C, A, A, G, G, G, T, ...",Helitron
338438,Pt,"(G, T, A, G, T, T, G, A, A, A, T, A, A, C, G, ...",Helitron
338439,Pt,"(A, T, T, T, A, T, A, A, T, A, C, T, T, C, A, ...",Helitron
338440,Pt,"(A, A, A, A, G, A, T, G, A, A, A, A, A, A, C, ...",Helitron


### Feauture Extraction - Accumulated Nucle Frequency Fourier

In [6]:
def feature_extraction(spectrum, spectrumTwo):
    features = []

    average = sum(spectrum)/len(spectrum)
    features.append(average)
    ###################################
    median = np.median(spectrum)
    features.append(median)
	###################################
    maximum = np.max(spectrum)
    features.append(maximum)
    ###################################
    minimum = np.min(spectrum)
    features.append(minimum)
    ###################################
    peak = (len(spectrum)/3)/(average)
    features.append(peak)
    ###################################
    peak_two = (len(spectrumTwo)/3)/(np.mean(spectrumTwo))
    features.append(peak_two)
    ###################################
    standard_deviation = np.std(spectrum) # standard deviation
    features.append(standard_deviation)
    ###################################
    standard_deviation_pop = statistics.stdev(spectrum) # population sample standard deviation 
    features.append(standard_deviation_pop)
    ###################################
    percentile15 = np.percentile(spectrum, 15)
    features.append(percentile15)
    ###################################
    percentile25 = np.percentile(spectrum, 25)
    features.append(percentile25)
    ###################################
    percentile50 = np.percentile(spectrum, 50)
    features.append(percentile50)
    ###################################
    percentile75 = np.percentile(spectrum, 75)
    features.append(percentile75)
    ###################################
    amplitude = maximum - minimum
    features.append(amplitude)
    ###################################
    # mode = statistics.mode(spectrum)
    ###################################
    variance = statistics.variance(spectrum)
    features.append(variance)
    ###################################
    interquartile_range = np.percentile(spectrum, 75) - np.percentile(spectrum, 25)
    features.append(interquartile_range)
    ###################################
    semi_interquartile_range = (np.percentile(spectrum, 75) - np.percentile(spectrum, 25))/2 
    features.append(semi_interquartile_range)
    ###################################
    coefficient_of_variation = standard_deviation/average
    features.append(coefficient_of_variation)
    ###################################
    skewness = (3 * (average - median))/standard_deviation
    features.append(skewness)   
    ###################################
    kurtosis = (np.percentile(spectrum, 75) - np.percentile(spectrum, 25)) / (2 * (np.percentile(spectrum, 90) - np.percentile(spectrum, 10))) 
    features.append(kurtosis)
    ###################################
    return features


def accumulated_nucle_frequency_fourier(seq):
    
    seq = seq.upper()
    features = []
    spectrum = []
    spectrumTwo = []
    mapping = []
    A = 0
    C = 0
    T = 0
    G = 0
    for i in range(len(seq)):
        if seq[i] == 'A':
            A += 1
            mapping.append(A / (i + 1))
        elif seq[i] == 'C':
            C += 1
            mapping.append(C / (i + 1))
        elif seq[i] == 'T' or seq[i] == 'U':
            T += 1
            mapping.append(T / (i + 1))
        else:
            G += 1
            mapping.append(G / (i + 1))
    Fmap = fft(mapping)
    for i in range(len(mapping)):
        specTotal = (abs(Fmap[i])**2)
        specTwo = (abs(Fmap[i]))
        spectrum.append(specTotal)
        spectrumTwo.append(specTwo)
    
    features = feature_extraction(spectrum, spectrumTwo)

    return features

In [7]:
sequence_list = final_df['Sequence'].to_list()

features_list = []
for seq in tqdm(sequence_list, total=len(sequence_list)):
    features_list.append(accumulated_nucle_frequency_fourier(seq))

features_list = np.array(features_list)

  kurtosis = (np.percentile(spectrum, 75) - np.percentile(spectrum, 25)) / (2 * (np.percentile(spectrum, 90) - np.percentile(spectrum, 10)))
100%|██████████| 338319/338319 [55:47<00:00, 101.08it/s] 


In [8]:
columns = ['average', 'median', 'maximum', 'minimum', 'peak', 'none_levated_peak', 'sample_standard_deviation', 'population_standard_deviation', \
            'percentile15', 'percentile25', 'percentile50', 'percentile75', 'amplitude', 'variance', 'interquartile_range', 'semi_interquartile_range', \
            'coefficient_of_variation', 'skewness', 'kurtosis']

In [14]:
features_dict = {}
for i in tqdm(range(len(columns))):
    features_dict[columns[i]] = list(features_list[:, i])

100%|██████████| 19/19 [00:00<00:00, 43.55it/s]


In [15]:
with open("./data/features.json", 'w') as f: json.dump(features_dict, f)

In [16]:
for column in columns:
    final_df[column] = features_dict[column]

In [17]:
final_df

Unnamed: 0,Chr,Sequence,Class,average,median,maximum,minimum,peak,none_levated_peak,sample_standard_deviation,...,percentile25,percentile50,percentile75,amplitude,variance,interquartile_range,semi_interquartile_range,coefficient_of_variation,skewness,kurtosis
0,1,"(G, C, G, T, G, G, A, G, C, G, G, T, G, G, C, ...",TIR,22.785062,2.628776,4.437555e+03,0.019035,3.408666,39.438801,289.868354,...,0.862362,2.628776,5.812890,4.437535e+03,8.438583e+04,4.950528,2.475264,12.721859,0.208608,0.291036
1,1,"(G, T, G, C, A, T, C, A, T, A, T, G, T, C, T, ...",TIR,18.520799,1.130638,3.946991e+03,0.003330,4.139490,57.273004,259.602728,...,0.561190,1.130638,2.018632,3.946988e+03,6.768787e+04,1.457442,0.728721,14.016821,0.200963,0.263600
2,1,"(C, A, C, G, A, C, G, G, C, G, A, A, G, C, G, ...",TIR,16.990299,0.876251,3.273105e+03,0.002598,4.119998,53.444585,225.235761,...,0.402295,0.876251,1.822468,3.273103e+03,5.097388e+04,1.420173,0.710087,13.256727,0.214629,0.240582
3,1,"(T, T, C, C, G, C, T, G, A, A, A, G, T, A, A, ...",TIR,13.620105,1.285846,1.499464e+03,0.018993,3.157098,27.641093,131.345830,...,0.837292,1.285846,3.023629,1.499445e+03,1.738651e+04,2.186337,1.093168,9.643526,0.281720,0.244503
4,1,"(T, T, T, C, C, T, A, A, A, T, G, A, T, T, T, ...",TIR,12.712800,1.984341,1.020541e+03,0.045592,2.674470,19.088565,100.313441,...,0.967886,1.984341,3.733240,1.020495e+03,1.016242e+04,2.765354,1.382677,7.890743,0.320848,0.223924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338437,Pt,"(T, G, T, A, T, G, A, A, C, A, A, G, G, G, T, ...",Helitron,8.466189,1.112580,5.566899e+02,0.006788,3.031667,19.734616,62.894145,...,0.324996,1.112580,1.898055,5.566831e+02,4.007722e+03,1.573059,0.786529,7.428862,0.350761,0.300157
338438,Pt,"(G, T, A, G, T, T, G, A, A, A, T, A, A, C, G, ...",Helitron,5.634133,0.888755,1.924762e+02,0.015920,2.366528,11.384402,29.924223,...,0.373623,0.888755,1.228398,1.924603e+02,9.184196e+02,0.854776,0.427388,5.311238,0.475739,0.255587
338439,Pt,"(A, T, T, T, A, T, A, A, T, A, C, T, T, C, A, ...",Helitron,18.938916,1.674825,2.716961e+03,0.003399,2.992076,31.623691,207.567355,...,0.732332,1.674825,4.380410,2.716958e+03,4.333914e+04,3.648077,1.824039,10.959833,0.249520,0.298017
338440,Pt,"(A, A, A, A, G, A, T, G, A, A, A, A, A, A, C, ...",Helitron,13.746950,2.016618,4.537654e+02,0.039160,1.212390,8.078962,63.476970,...,0.834743,2.016618,3.416610,4.537262e+02,4.111557e+03,2.581867,1.290934,4.617531,0.554390,0.084025


In [18]:
final_df.to_csv('./data/final.csv')

In [19]:
x = pd.read_csv('./data/final.csv')

  x = pd.read_csv('./data/final.csv')


In [22]:
x = x[[column for column in x.columns if column != 'Unnamed: 0']]
x

Unnamed: 0,Chr,Sequence,Class,average,median,maximum,minimum,peak,none_levated_peak,sample_standard_deviation,...,percentile25,percentile50,percentile75,amplitude,variance,interquartile_range,semi_interquartile_range,coefficient_of_variation,skewness,kurtosis
0,1,GCGTGGAGCGGTGGCGAGAGTAGCGTGTACCCTCCGTAGCAAGAGG...,TIR,22.785062,2.628776,4.437555e+03,0.019035,3.408666,39.438801,289.868354,...,0.862362,2.628776,5.812890,4.437535e+03,8.438583e+04,4.950528,2.475264,12.721859,0.208608,0.291036
1,1,GTGCATCATATGTCTATTCCTGGACCATACCGACCACTGCGGTGTG...,TIR,18.520799,1.130638,3.946991e+03,0.003330,4.139490,57.273004,259.602728,...,0.561190,1.130638,2.018632,3.946988e+03,6.768787e+04,1.457442,0.728721,14.016821,0.200963,0.263600
2,1,CACGACGGCGAAGCGACTACACCTCTAGTTCCTCTAATTAATTAGC...,TIR,16.990299,0.876251,3.273105e+03,0.002598,4.119998,53.444585,225.235761,...,0.402295,0.876251,1.822468,3.273103e+03,5.097388e+04,1.420173,0.710087,13.256727,0.214629,0.240582
3,1,TTCCGCTGAAAGTAAAAGGCGAAGAAGCTCCTAAGGGAGGCTTACA...,TIR,13.620105,1.285846,1.499464e+03,0.018993,3.157098,27.641093,131.345830,...,0.837292,1.285846,3.023629,1.499445e+03,1.738651e+04,2.186337,1.093168,9.643526,0.281720,0.244503
4,1,TTTCCTAAATGATTTTGGTGGTTGAAATGCCCAACACAAATAATTG...,TIR,12.712800,1.984341,1.020541e+03,0.045592,2.674470,19.088565,100.313441,...,0.967886,1.984341,3.733240,1.020495e+03,1.016242e+04,2.765354,1.382677,7.890743,0.320848,0.223924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338314,Pt,TGTATGAACAAGGGTTGATTTTACTTCCGCACTTAGCTACTCTAGG...,Helitron,8.466189,1.112580,5.566899e+02,0.006788,3.031667,19.734616,62.894145,...,0.324996,1.112580,1.898055,5.566831e+02,4.007722e+03,1.573059,0.786529,7.428862,0.350761,0.300157
338315,Pt,GTAGTTGAAATAACGGAATTGGAACTTGTTTGGTCGAGTA,Helitron,5.634133,0.888755,1.924762e+02,0.015920,2.366528,11.384402,29.924223,...,0.373623,0.888755,1.228398,1.924603e+02,9.184196e+02,0.854776,0.427388,5.311238,0.475739,0.255587
338316,Pt,ATTTATAATACTTCAGGAGCTAATGAAACTATTTTAGTCAAATTCA...,Helitron,18.938916,1.674825,2.716961e+03,0.003399,2.992076,31.623691,207.567355,...,0.732332,1.674825,4.380410,2.716958e+03,4.333914e+04,3.648077,1.824039,10.959833,0.249520,0.298017
338317,Pt,AAAAGATGAAAAAACCAAAAAAAGCTCTGCCCTTCCATCTCTTGGA...,Helitron,13.746950,2.016618,4.537654e+02,0.039160,1.212390,8.078962,63.476970,...,0.834743,2.016618,3.416610,4.537262e+02,4.111557e+03,2.581867,1.290934,4.617531,0.554390,0.084025
