In [31]:
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
#import plotly.express as px

In [32]:
# Read both sheets of the workbook and stack them row-wise.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and, like append, keeps each sheet's original row index.
data = pd.concat([
    pd.read_excel('./metagenetica.xlsx', sheet_name='El_cielo'),
    pd.read_excel('./metagenetica.xlsx', sheet_name='Chamela'),
])
# Replace 0 with NaN
data = data.replace(0, np.nan)
# Drop the columns that will not be used
data = data.drop(['Database','.id', 'similarity', 'phylum_final', 
                  'class_final', 'subfamily_final', 'tribe_final',
                   'subspecies_final', 'BASE', 'OTU'], axis=1)
data

Unnamed: 0,Sequence,order_final,family_final,genus_final,species_final
0,aataaacaatataagattttggttattgcctccttcattatcactc...,Coleoptera,Mordellidae,,
1,aataaataatataagtttttgacttcttcctccttctttaacctta...,Coleoptera,Carabidae,Glyptolenus,
2,tttaaacaatataagattttgattgttaccaccttcattaactttc...,Coleoptera,Coccinellidae,,
3,tataaacaatataagattctgacttcttccaccttcattaagatta...,Coleoptera,Mordellidae,,
4,aataaataatataagattttgactacttcctccgtcacttaccctt...,Coleoptera,Nitidulidae,,
...,...,...,...,...,...
1778,aataaataatataagtttttgacttttacctcctgcattaacactt...,Diptera,Tachinidae,Ischyrophaga,
1779,aataaataatataagattttgattattaccaccatcaataattata...,Hymenoptera,Ichneumonidae,ichneuMalaiseNA1,
1780,aataaataacataagattttgattactcccaccttctcttttttta...,Hymenoptera,Ichneumonidae,,
1781,aataaataatataagtttctgacttcttcccccttctttaattctt...,Lepidoptera,Erebidae,Arugisa,


---
# Encoding & Concatenation

In [58]:
# Normalise sequences to upper case. The vectorised .str accessor leaves missing
# values as NaN, whereas calling x.upper() on each element fails on a NaN float.
data['Sequence'] = data['Sequence'].str.upper()

In [59]:
def one_hot_encoding(sequence):
    """One-hot encode a nucleotide sequence.

    Parameters
    ----------
    sequence : str or iterable of str
        Nucleotide symbols. Only A, C, G and T are supported; matching is
        case-insensitive.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(sequence), 4)`` whose columns follow the
        order A, C, G, T. An empty sequence yields an array of shape ``(0, 4)``.

    Raises
    ------
    KeyError
        If the sequence contains a symbol other than A, C, G or T.
    """
    mapping = {"A": 0, "C": 1, "G": 2, "T": 3}
    try:
        # Explicit integer dtype so that an empty sequence still indexes cleanly.
        indices = np.array([mapping[base.upper()] for base in sequence], dtype=np.intp)
    except KeyError as err:
        raise KeyError(
            f"unsupported nucleotide {err.args[0]!r}; expected one of A, C, G, T"
        ) from None
    # Row k of the identity matrix is the one-hot vector for class k.
    return np.eye(len(mapping))[indices]

In [60]:
# Take the first two sequences as worked examples for the cells below.
elem0, elem1 = data['Sequence'].iloc[0], data['Sequence'].iloc[1]
elem0

'AATAAACAATATAAGATTTTGGTTATTGCCTCCTTCATTATCACTCCTTTTAATAAGAAGAATCGTAGAAACCGGTGCAGGTACAGGTTGAACAGTGTACCCCCCGCTGTCATCCAATATTGCCCACAGAGGTGCTTCAGTTGATTTAGCTATTTTTAGACTACATTTAGCTGGTATTTCTTCTATTTTAGGAGCAATTAATTTTATTTCTACAATAATTAATATACGACCCGCAGGAATAACCTTTGACCGAATACCCTTATTTGTCTGAGCTATTGCTATTACTGCCGTACTTCTACTATTATCTCTTCCTGTCTTAGCTGGAGCAATTACTATATTATTAACTGATCGAAATTTAAATACTACCTTTTTTGATCCCGCCGGAGGAGGAGATCCAATCTTATATCAACATCTCTTT'

In [61]:
# One-hot encode both example sequences.
enc0, enc1 = (one_hot_encoding(seq) for seq in (elem0, elem1))
enc0

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [62]:
def concatenate_sequence_1(seq1, seq2):
    """Join two arrays side by side by concatenating them along axis 1.

    Both inputs must agree on every dimension except axis 1, otherwise
    ``np.concatenate`` raises ``ValueError``.
    """
    pair = [seq1, seq2]
    return np.concatenate(pair, axis=1)

In [63]:
# Concatenate the two encodings along axis 1 and inspect the resulting shape.
concat_seq1 = concatenate_sequence_1(seq1=enc0, seq2=enc1)
np.shape(concat_seq1)

(418, 8)

In [64]:
# Show the array produced by concatenate_sequence_1 (the bare expression is the cell output).
concat_seq1

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [65]:
def concatenate_sequence_2(seq1, seq2):
    """Stack two 2-D arrays along a new leading axis.

    Each input gets a length-1 leading axis and the two results are joined
    along that axis, so two ``(L, C)`` arrays become one ``(2, L, C)`` array.
    """
    layers = [arr[np.newaxis, :, :] for arr in (seq1, seq2)]
    return np.concatenate(layers, axis=0)

In [66]:
# Stack the two encodings along a new leading axis and inspect the resulting shape.
concat_seq2 = concatenate_sequence_2(seq1=enc0, seq2=enc1)
np.shape(concat_seq2)

(2, 418, 4)

In [67]:
# Show the array produced by concatenate_sequence_2 (the bare expression is the cell output).
concat_seq2

array([[[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.]]])

---
# Combinaciones

In [106]:
# Toy frame for trying out the pairing helpers: six items in Col1, each with a label in Col2.
datos = {
    'Col1': list('CARMEN'),
    'Col2': list('SSMSMM'),
}

df = pd.DataFrame(data=datos)
df

Unnamed: 0,Col1,Col2
0,C,S
1,A,S
2,R,M
3,M,S
4,E,M
5,N,M


In [107]:
# Number of unordered pairs among 6 items. Uses the standard-library math module:
# the np.math alias is deprecated since NumPy 1.25 and removed in NumPy 2.0.
math.comb(6, 2)

15

In [108]:
import itertools

def obtener_combinaciones(dataframe,columna):
    """Return every unordered pair of values from one column.

    Pairs are produced by ``itertools.combinations`` in the column's row
    order and returned as a list of 2-tuples.
    """
    return list(itertools.combinations(dataframe[columna].tolist(), 2))

In [109]:
# All unordered pairs of the values in Col1.
combinaciones = obtener_combinaciones(dataframe=df, columna='Col1')
combinaciones

[('C', 'A'),
 ('C', 'R'),
 ('C', 'M'),
 ('C', 'E'),
 ('C', 'N'),
 ('A', 'R'),
 ('A', 'M'),
 ('A', 'E'),
 ('A', 'N'),
 ('R', 'M'),
 ('R', 'E'),
 ('R', 'N'),
 ('M', 'E'),
 ('M', 'N'),
 ('E', 'N')]

In [110]:
def obtener_df(df,columna1,columna2):
    """Build a frame with one row per unordered pair of rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    columna1 : str
        Column holding the items to pair (e.g. sequences).
    columna2 : str
        Column holding the label/identifier attached to each item.

    Returns
    -------
    pandas.DataFrame
        Columns ``Secuencia1``, ``Secuencia2``, ``Identificador1`` and
        ``Identificador2``; one row for each of the n*(n-1)/2 row pairs, in
        ``itertools.combinations`` order.

    Notes
    -----
    Rows are paired directly instead of looking labels up again by value.
    The value-based lookup scanned the whole frame twice for every pair and
    attached the wrong labels when ``columna1`` contained duplicate values;
    it also silently dropped rows whose ``columna1`` value was NaN.
    """
    # Carry each item together with its own label so the two can never be mismatched.
    filas = zip(df[columna1], df[columna2])
    combinaciones = [
        [secuencia1, secuencia2, clase1, clase2]
        for (secuencia1, clase1), (secuencia2, clase2) in itertools.combinations(filas, 2)
    ]

    df_combinaciones = pd.DataFrame(combinaciones, columns=['Secuencia1', 'Secuencia2', 'Identificador1', 'Identificador2'])
    return df_combinaciones

In [111]:
# Pair up the toy frame's Col1 values together with their Col2 labels.
nuevo_df = obtener_df(df, columna1='Col1', columna2='Col2')
nuevo_df

Unnamed: 0,Secuencia1,Secuencia2,Identificador1,Identificador2
0,C,A,S,S
1,C,R,S,M
2,C,M,S,S
3,C,E,S,M
4,C,N,S,M
5,A,R,S,M
6,A,M,S,S
7,A,E,S,M
8,A,N,S,M
9,R,M,M,S


In [102]:
# Work on the first 1000 rows only.
p = data.iloc[:1000]
p

Unnamed: 0,Sequence,order_final,family_final,genus_final,species_final
0,AATAAACAATATAAGATTTTGGTTATTGCCTCCTTCATTATCACTC...,Coleoptera,Mordellidae,,
1,AATAAATAATATAAGTTTTTGACTTCTTCCTCCTTCTTTAACCTTA...,Coleoptera,Carabidae,Glyptolenus,
2,TTTAAACAATATAAGATTTTGATTGTTACCACCTTCATTAACTTTC...,Coleoptera,Coccinellidae,,
3,TATAAACAATATAAGATTCTGACTTCTTCCACCTTCATTAAGATTA...,Coleoptera,Mordellidae,,
4,AATAAATAATATAAGATTTTGACTACTTCCTCCGTCACTTACCCTT...,Coleoptera,Nitidulidae,,
...,...,...,...,...,...
995,AATAAATAATATGAGATTTTGGTTGCTCCCGCCTTCATTAACTTTA...,Hemiptera,Cicadellidae,,
996,AATAAATAATATAAGATTTTGATTATTACCACCATCTTTAATTACC...,Hymenoptera,Ichneumonidae,Hyposoter,
997,AATAAATAATATAAGATTTTGGTTATTAATTCCTTCTTTAATACTC...,Hymenoptera,Braconidae,Aleiodes,
998,AATAAATAATATAAGATTTTGATTATTACCCCCTTCTTTAATAATT...,Hymenoptera,Ichneumonidae,Hyposoter,


In [104]:
# Pair up the sequences of the subset together with their order_final labels.
comb = obtener_df(p, columna1='Sequence', columna2='order_final')
comb

Unnamed: 0,Secuencia1,Secuencia2,Identificador1,Identificador2
0,AATAAACAATATAAGATTTTGGTTATTGCCTCCTTCATTATCACTC...,AATAAATAATATAAGTTTTTGACTTCTTCCTCCTTCTTTAACCTTA...,Coleoptera,Coleoptera
1,AATAAACAATATAAGATTTTGGTTATTGCCTCCTTCATTATCACTC...,TTTAAACAATATAAGATTTTGATTGTTACCACCTTCATTAACTTTC...,Coleoptera,Coleoptera
2,AATAAACAATATAAGATTTTGGTTATTGCCTCCTTCATTATCACTC...,TATAAACAATATAAGATTCTGACTTCTTCCACCTTCATTAAGATTA...,Coleoptera,Coleoptera
3,AATAAACAATATAAGATTTTGGTTATTGCCTCCTTCATTATCACTC...,AATAAATAATATAAGATTTTGACTACTTCCTCCGTCACTTACCCTT...,Coleoptera,Coleoptera
4,AATAAACAATATAAGATTTTGGTTATTGCCTCCTTCATTATCACTC...,AATAAACAATATAAGATTTTGACTTTTACCTCCTTCTTTAACTTTA...,Coleoptera,Coleoptera
...,...,...,...,...
499495,AATAAATAATATAAGATTTTGATTATTACCACCATCTTTAATTACC...,AATAAATAATATAAGATTTTGATTATTACCCCCTTCTTTAATAATT...,Hymenoptera,Hymenoptera
499496,AATAAATAATATAAGATTTTGATTATTACCACCATCTTTAATTACC...,AATAAATAACATAAGTTTTTGATTATTACCTCCATCATTAATTTTA...,Hymenoptera,Lepidoptera
499497,AATAAATAATATAAGATTTTGGTTATTAATTCCTTCTTTAATACTC...,AATAAATAATATAAGATTTTGATTATTACCCCCTTCTTTAATAATT...,Hymenoptera,Hymenoptera
499498,AATAAATAATATAAGATTTTGGTTATTAATTCCTTCTTTAATACTC...,AATAAATAACATAAGTTTTTGATTATTACCTCCATCATTAATTTTA...,Hymenoptera,Lepidoptera


In [105]:
# Timing notes, presumably for the obtener_df call above (unit not recorded; TODO confirm):
# 3:01
# 3:01.05

In [112]:
def comparar_identificadores(dataframe, columna1, columna2):
    """Flag the rows whose two identifier columns hold the same value.

    Adds (or overwrites) a boolean column ``SC`` on ``dataframe`` itself and
    returns that same object; the caller's frame is modified in place.
    """
    coinciden = dataframe[columna1].eq(dataframe[columna2])
    dataframe['SC'] = coinciden
    return dataframe

In [116]:
# Mark the pairs whose two identifiers match (this also adds the SC column to comb itself).
i = comparar_identificadores(comb, columna1='Identificador1', columna2='Identificador2')
i

Unnamed: 0,Secuencia1,Secuencia2,Identificador1,Identificador2,SC
0,AATAAACAATATAAGATTTTGGTTATTGCCTCCTTCATTATCACTC...,AATAAATAATATAAGTTTTTGACTTCTTCCTCCTTCTTTAACCTTA...,Coleoptera,Coleoptera,True
1,AATAAACAATATAAGATTTTGGTTATTGCCTCCTTCATTATCACTC...,TTTAAACAATATAAGATTTTGATTGTTACCACCTTCATTAACTTTC...,Coleoptera,Coleoptera,True
2,AATAAACAATATAAGATTTTGGTTATTGCCTCCTTCATTATCACTC...,TATAAACAATATAAGATTCTGACTTCTTCCACCTTCATTAAGATTA...,Coleoptera,Coleoptera,True
3,AATAAACAATATAAGATTTTGGTTATTGCCTCCTTCATTATCACTC...,AATAAATAATATAAGATTTTGACTACTTCCTCCGTCACTTACCCTT...,Coleoptera,Coleoptera,True
4,AATAAACAATATAAGATTTTGGTTATTGCCTCCTTCATTATCACTC...,AATAAACAATATAAGATTTTGACTTTTACCTCCTTCTTTAACTTTA...,Coleoptera,Coleoptera,True
...,...,...,...,...,...
499495,AATAAATAATATAAGATTTTGATTATTACCACCATCTTTAATTACC...,AATAAATAATATAAGATTTTGATTATTACCCCCTTCTTTAATAATT...,Hymenoptera,Hymenoptera,True
499496,AATAAATAATATAAGATTTTGATTATTACCACCATCTTTAATTACC...,AATAAATAACATAAGTTTTTGATTATTACCTCCATCATTAATTTTA...,Hymenoptera,Lepidoptera,False
499497,AATAAATAATATAAGATTTTGGTTATTAATTCCTTCTTTAATACTC...,AATAAATAATATAAGATTTTGATTATTACCCCCTTCTTTAATAATT...,Hymenoptera,Hymenoptera,True
499498,AATAAATAATATAAGATTTTGGTTATTAATTCCTTCTTTAATACTC...,AATAAATAACATAAGTTTTTGATTATTACCTCCATCATTAATTTTA...,Hymenoptera,Lepidoptera,False
