In [234]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
import openpyxl
#import plotly.express as px

In [235]:
# Read data
data = pd.read_excel('./Datos_Metagenetica.xlsx', sheet_name='El_cielo', engine='openpyxl').append(pd.read_excel('./Datos_Metagenetica.xlsx', sheet_name='Chamela',engine='openpyxl'))

#mateo
#data = pd.read_excel('./Datos_Metagenetica.xlsx', sheet_name='El_cielo').append(pd.read_excel('./Datos_Metagenetica.xlsx', sheet_name='Chamela'))
# replace 0 with NaN
data = data.replace(0, np.nan)
#drop columns que no se van a usar 
data = data.drop(['Database','.id', 'similarity', 'phylum_final', 
                  'class_final', 'subfamily_final', 'tribe_final',
                   'subspecies_final', 'BASE', 'OTU'], axis=1)
data

Unnamed: 0,Sequence,order_final,family_final,genus_final,species_final,Unnamed: 13
0,aataaacaatataagattttggttattgcctccttcattatcactc...,Coleoptera,Mordellidae,,,
1,aataaataatataagtttttgacttcttcctccttctttaacctta...,Coleoptera,Carabidae,Glyptolenus,,
2,tttaaacaatataagattttgattgttaccaccttcattaactttc...,Coleoptera,Coccinellidae,,,
3,tataaacaatataagattctgacttcttccaccttcattaagatta...,Coleoptera,Mordellidae,,,
4,aataaataatataagattttgactacttcctccgtcacttaccctt...,Coleoptera,Nitidulidae,,,
...,...,...,...,...,...,...
1778,aataaataatataagtttttgacttttacctcctgcattaacactt...,Diptera,Tachinidae,Ischyrophaga,,
1779,aataaataatataagattttgattattaccaccatcaataattata...,Hymenoptera,Ichneumonidae,ichneuMalaiseNA1,,
1780,aataaataacataagattttgattactcccaccttctcttttttta...,Hymenoptera,Ichneumonidae,,,
1781,aataaataatataagtttctgacttcttcccccttctttaattctt...,Lepidoptera,Erebidae,Arugisa,,


# Balance data

In [236]:
def balance_one_tax_data(df, col_tax_to_balance, tax_to_balance, max_samples) -> pd.DataFrame:
    '''
    df: dataframe with all data to balance
    col_tax_to_balance: column name of the tax to balance
    tax_to_balance: tax to balance
    max_samples: maximum number to save of each tax
    return: balanced dataframe
    '''
    col_index = df.columns.get_loc(col_tax_to_balance)
    if col_index + 1 < len(df.columns):
        next_tax_col = df.columns[col_index + 1]
        vc = df[df[col_tax_to_balance] == tax_to_balance][next_tax_col].value_counts()
        q25 = vc.quantile(0.25)
        selected_values = vc[vc > 4 * q25]
        balanced = pd.DataFrame(columns = df.columns)
        for i in selected_values.index:
            if vc[i] > max_samples:
                balanced = pd.concat((balanced, df[df[next_tax_col] == i].sample(max_samples)))
            else:
                balanced = pd.concat((balanced, df[df[next_tax_col] == i]))
    return balanced

print('Original data')
print(data.value_counts('family_final'))
print('Balanced data')
balance_one_tax_data(data, 'order_final', 'Diptera', 50).value_counts('family_final')

Original data
family_final
Erebidae         236
Tachinidae       200
Phoridae         162
Ichneumonidae    154
Cecidomyiidae    143
                ... 
Pyrrhocoridae      1
Eupelmidae         1
Evaniidae          1
Fanniidae          1
f__Triozidae       1
Length: 216, dtype: int64
Balanced data


family_final
Tachinidae         50
Sciaridae          50
Ceratopogonidae    50
Phoridae           50
Cecidomyiidae      50
Sarcophagidae      33
Drosophilidae      32
Chironomidae       31
Dolichopodidae     30
Mycetophilidae     25
Muscidae           24
Syrphidae          24
Chloropidae        22
Stratiomyidae      20
Sphaeroceridae     19
Pipunculidae       15
Tabanidae          14
Lauxaniidae        14
Limoniidae         12
Psychodidae        10
dtype: int64

---
# Encoding & Concatenation

In [237]:
data['Sequence'] = data['Sequence'].apply(lambda x: x.upper())

In [238]:
def sequence_encoding(sequence):
    mapping = {"A": 0, "C": 1, "G": 2, "T": 3}
    encoded_sequence = [mapping[i] for i in sequence]
    return np.eye(4)[encoded_sequence]

In [239]:
elem0 = data['Sequence'].iloc[0]
elem1 = data['Sequence'].iloc[1]
elem0

'AATAAACAATATAAGATTTTGGTTATTGCCTCCTTCATTATCACTCCTTTTAATAAGAAGAATCGTAGAAACCGGTGCAGGTACAGGTTGAACAGTGTACCCCCCGCTGTCATCCAATATTGCCCACAGAGGTGCTTCAGTTGATTTAGCTATTTTTAGACTACATTTAGCTGGTATTTCTTCTATTTTAGGAGCAATTAATTTTATTTCTACAATAATTAATATACGACCCGCAGGAATAACCTTTGACCGAATACCCTTATTTGTCTGAGCTATTGCTATTACTGCCGTACTTCTACTATTATCTCTTCCTGTCTTAGCTGGAGCAATTACTATATTATTAACTGATCGAAATTTAAATACTACCTTTTTTGATCCCGCCGGAGGAGGAGATCCAATCTTATATCAACATCTCTTT'

In [240]:
enc0 = sequence_encoding(elem0)
enc1 = sequence_encoding(elem1)
enc0

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [241]:
enc1.shape

(418, 4)

In [242]:
def long_sequence(seq1, seq2):
    return np.concatenate((seq1, seq2), axis=1)

In [243]:
long_seq = long_sequence(enc0,enc1)
long_seq.shape
#type(long_seq)

(418, 8)

In [244]:
long_seq

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [245]:
def deep_sequence(seq1, seq2):
    #s1 = seq1[np.newaxis, :, :]
    #s2 = seq2[np.newaxis, :, :]
    #sequence = np.concatenate((s1, s2), axis=0)
    sequence = np.dstack((seq1, seq2))
    return sequence

In [246]:
deep_seq = deep_sequence(enc0,enc1)
deep_seq.shape

(418, 4, 2)

In [247]:
deep_seq

array([[[1., 1.],
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[1., 1.],
        [0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        [1., 1.]],

       ...,

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        [1., 1.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        [1., 1.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.],
        [1., 1.]]])

---
# Combinaciones

In [248]:
datos = {'Col1': ['C', 'A', 'R','M', 'E', 'N'],
        'Col2': ['S','S','M','S','M','M']}

df = pd.DataFrame(datos)
#df

In [249]:
import itertools

def combination_list(dataframe,column):
    sequences = dataframe[column].tolist()
    combinations = list(itertools.combinations(sequences, 2))
    return combinations

In [250]:
combinations = combination_list(df,'Col1')
#combinations

In [251]:
def combination_matrix(df,sequence_column,tax_column):
    combinaciones = []
    for sequence1, sequence2 in combination_list(df,sequence_column):
        z = zip(df[df[sequence_column] == sequence1][tax_column], df[df[sequence_column] == sequence2][tax_column])
        for clase1, clase2 in z:
            combinaciones.append([sequence1, sequence2, clase1, clase2])

    df_combinaciones = pd.DataFrame(combinaciones, columns=['Sequence1', 'Sequence2', 'Tax1', 'Tax2'])
    return df_combinaciones

In [252]:
nuevo_df = combination_matrix(df,'Col1','Col2')
#nuevo_df

In [253]:
def tax_comparison(dataframe, tax1, tax2):
    dataframe['Same'] = dataframe[tax1] == dataframe[tax2]
    return dataframe

In [254]:
input_matrix = tax_comparison(nuevo_df,'Tax1','Tax2')
input_matrix

Unnamed: 0,Sequence1,Sequence2,Tax1,Tax2,Same
0,C,A,S,S,True
1,C,R,S,M,False
2,C,M,S,S,True
3,C,E,S,M,False
4,C,N,S,M,False
5,A,R,S,M,False
6,A,M,S,S,True
7,A,E,S,M,False
8,A,N,S,M,False
9,R,M,M,S,False


---
# Final Matrix

In [255]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def final_matrix(data,concat_type):
    
    #Pasamos a letras mayúsculas las cadenas
    data['Sequence'] = data['Sequence'].apply(lambda x: x.upper())
    #Llamamos a la función para filtrar los datos
    balanced_data = balance_one_tax_data(data, 'order_final', 'Diptera', 50)
    #Llamamos a la función para realizar todas las combinaciones
    combinations = combination_matrix(balanced_data,'Sequence',"family_final")
    #Llamamos a la función para determinar si los Taxones son iguales
    DNA_matrix = tax_comparison(combinations,'Tax1','Tax2')
    #Pasamos los valores boolean a integer
    DNA_matrix['Same'] = DNA_matrix['Same'].astype(int)
    
    DNA_matrix['Paired_seq'] = ''  # Creamos una columna vacía para almacenar los resultados
    for index, row in DNA_matrix.iterrows():
        sequence1 = row['Sequence1']
        sequence2 = row['Sequence2']
        encoding1 = sequence_encoding(sequence1)
        encoding2 = sequence_encoding(sequence2)
        vectors_padded = pad_sequences([encoding1, encoding2], padding='post')
        paired_sequences = concat_type(vectors_padded[0], vectors_padded[0])
        DNA_matrix.at[index, 'Paired_seq'] = paired_sequences
    return DNA_matrix

In [256]:
final_matrix = final_matrix(p,long_sequence)
final_matrix

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Sequence1,Sequence2,Tax1,Tax2,Same,Paired_seq
0,AATAAATAATATAAGATTTTGACTTCTTCCTCCTTCTCTAACTCTT...,AATAAATAATATAAGTTTTTGAATACTTCCTCCTTCTTTAACTTTA...,Phoridae,Phoridae,1,"[[1, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 0, ..."
1,AATAAATAATATAAGATTTTGACTTCTTCCTCCTTCTCTAACTCTT...,AATAAATAATATAAGATTTTGAATACTCCCCCCATCATTAACATTA...,Phoridae,Phoridae,1,"[[1, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 0, ..."
2,AATAAATAATATAAGATTTTGACTTCTTCCTCCTTCTCTAACTCTT...,TATAAATAATATAAGATTTTGAATATTACCTCCATCATTAACACTA...,Phoridae,Phoridae,1,"[[1, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 0, ..."
3,AATAAATAATATAAGATTTTGACTTCTTCCTCCTTCTCTAACTCTT...,TATAAATAATATAAGATTCTGAATACTGCCTCCTTCTTTAACCCTA...,Phoridae,Phoridae,1,"[[1, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 0, ..."
4,AATAAATAATATAAGATTTTGACTTCTTCCTCCTTCTCTAACTCTT...,AATAAATAACATAAGATTTTGATTATTACCACCATCTCTAACTTTA...,Phoridae,Phoridae,1,"[[1, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 0, ..."
...,...,...,...,...,...,...
31621,AATAAATAATATAAGATTTTGATTATTACCACCTTCCTTAACTTTA...,AATAAATAATATAAGATTTTGACTATTACCACCATCTTTAACCTTA...,Pipunculidae,Pipunculidae,1,"[[1, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 0, ..."
31622,AATAAATAATATAAGATTTTGATTATTACCACCTTCCTTAACTTTA...,AATAAATAATATAAGATTTTGAATACTTCCCCCTTCCCTTACTCTT...,Pipunculidae,Pipunculidae,1,"[[1, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 0, ..."
31623,AATAAACAATATGAGTTTTTGAATATTACCTCCGTCCCTTACCCTA...,AATAAATAATATAAGATTTTGACTATTACCACCATCTTTAACCTTA...,Pipunculidae,Pipunculidae,1,"[[1, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 0, ..."
31624,AATAAACAATATGAGTTTTTGAATATTACCTCCGTCCCTTACCCTA...,AATAAATAATATAAGATTTTGAATACTTCCCCCTTCCCTTACTCTT...,Pipunculidae,Pipunculidae,1,"[[1, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 0, ..."


In [257]:
final_matrix.dtypes

Sequence1     object
Sequence2     object
Tax1          object
Tax2          object
Same           int64
Paired_seq    object
dtype: object

(31626, 418, 8)

---
# Data preparation for CNN

In [267]:
from sklearn.model_selection import train_test_split

# Obtenemos las secuencias de ADN en una variable 'X' y las etiquetas en una variable 'y'
DNA = np.array(list(final_matrix.loc[:, 'Paired_seq']))
labels = np.array(list(final_matrix.loc[:, 'Same']))
#X

In [268]:
# Divide los datos en conjuntos de entrenamiento y prueba (80% para entrenamiento, 20% para prueba)
DNA_train, DNA_test, labels_train, labels_test = train_test_split(
    DNA, labels, test_size=0.20, random_state=42)

print("Forma de X_train:", DNA_train.shape)
print("Forma de y_train:", labels_train.shape)

print("Forma de X_test:", DNA_test.shape)
print("Forma de y_test:", labels_test.shape)

Forma de X_train: (25300, 418, 8)
Forma de y_train: (25300,)
Forma de X_test: (6326, 418, 8)
Forma de y_test: (6326,)


In [269]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.activations import relu, sigmoid
from tensorflow.keras.optimizers import SGD

"""
CNN = Sequential([
    #Input(shape=(418, 4, 2)),

    Conv1D(filters=32, kernel_size=3, activation=relu, padding='same', input_shape=(418,4,2)),
    MaxPooling1D(pool_size=2),

    Conv1D(filters=64, kernel_size=3, activation=relu),
    MaxPooling1D(pool_size=2),

    Flatten(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])

CNN.summary()
"""
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=3, activation=relu, input_shape=(418, 8)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=3, activation=relu))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation=relu))
model.add(Dense(1, activation=sigmoid))

model.summary()

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_19 (Conv1D)           (None, 416, 32)           800       
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 208, 32)           0         
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 206, 64)           6208      
_________________________________________________________________
max_pooling1d_14 (MaxPooling (None, 103, 64)           0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 6592)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 128)               843904    
_________________________________________________________________
dense_15 (Dense)             (None, 1)               

In [270]:
# Compile: Define training parameters

epochs = 50
lrate = 0.001
decay = lrate / epochs
optim = SGD(learning_rate = lrate, momentum = 0.90, decay = decay, nesterov = True)
model.compile(loss='binary_crossentropy', optimizer=optim, metrics=['binary_accuracy'])

In [271]:
BATCHES = DNA_matrix.shape[0] // 64

model.fit(DNA_train, labels_train, batch_size=BATCHES, epochs=epochs, verbose=2, validation_split=0.30)

Train on 17710 samples, validate on 7590 samples
Epoch 1/50
17710/17710 - 52s - loss: 0.3833 - binary_accuracy: 0.8870 - val_loss: 0.3405 - val_binary_accuracy: 0.8928
Epoch 2/50
17710/17710 - 50s - loss: 0.3438 - binary_accuracy: 0.8913 - val_loss: 0.3404 - val_binary_accuracy: 0.8928
Epoch 3/50
17710/17710 - 50s - loss: 0.3433 - binary_accuracy: 0.8913 - val_loss: 0.3403 - val_binary_accuracy: 0.8928
Epoch 4/50
17710/17710 - 50s - loss: 0.3431 - binary_accuracy: 0.8913 - val_loss: 0.3398 - val_binary_accuracy: 0.8928
Epoch 5/50
17710/17710 - 50s - loss: 0.3429 - binary_accuracy: 0.8913 - val_loss: 0.3396 - val_binary_accuracy: 0.8928
Epoch 6/50
17710/17710 - 50s - loss: 0.3427 - binary_accuracy: 0.8913 - val_loss: 0.3394 - val_binary_accuracy: 0.8928
Epoch 7/50
17710/17710 - 50s - loss: 0.3425 - binary_accuracy: 0.8913 - val_loss: 0.3393 - val_binary_accuracy: 0.8928
Epoch 8/50
17710/17710 - 50s - loss: 0.3424 - binary_accuracy: 0.8913 - val_loss: 0.3391 - val_binary_accuracy: 0.8928

KeyboardInterrupt: 