### UniProt alignment analysis - Antibiotic Biosynthesis
This notebook takes the DIAMOND output and analyses the alignments between the CDSs from the samples' BGCs and the sequences of proteins associated with antibiotic biosynthesis (from UniProt).

In [3]:
import pandas as pd

# The TSV is read with header=None, so the column labels are attached
# right after loading.
alignment_columns = [
    "qseqid", "sseqid", "pident", "length", "qstart",
    "qend", "sstart", "send", "evalue", "bitscore",
]
df = pd.read_csv(
    "/home/pedro/antismash/BIG-SCAPE/cds_vs_uniprot.tsv",
    sep="\t",
    header=None,
)
df.columns = alignment_columns
df

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore
0,ctg60_15_9201-10740,A0AAP7A715,28.6,140,376,512,387,514,5.330000e-08,57.8
1,ctg60_15_9201-10740,A0A6L8URE3,29.2,120,376,493,849,962,3.870000e-06,52.0
2,ctg60_15_9201-10740,A0AAW4XIQ1,27.4,124,370,493,10751,10866,6.890000e-06,51.2
3,ctg60_15_9201-10740,A0A4R2RPK8,27.9,129,372,493,1461,1583,8.180000e-06,50.8
4,ctg60_15_9201-10740,A0A0V9UQR3,27.4,124,370,493,9710,9825,9.070000e-06,50.8
...,...,...,...,...,...,...,...,...,...,...
394,ctg1_11_8753-10004,A0AA45R3E8,27.9,448,1,410,4,424,1.790000e-15,79.7
395,ctg1_11_8753-10004,C6WPG9,27.9,448,1,410,4,424,1.790000e-15,79.7
396,ctg1_11_8753-10004,A0A344L5G4,26.5,456,4,414,7,429,1.390000e-13,73.9
397,ctg1_11_8753-10004,A0A7Z1AXG1,28.1,381,6,346,9,364,1.060000e-12,71.2


In [9]:
# Show every hit, ordered from the highest to the lowest `pident`.
df.sort_values("pident", ascending=False)

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore
201,ctg34_30_5545-6838,A0A841B2S3,57.8,415,4,417,1,415,3.150000e-174,495.0
205,ctg34_30_5545-6838,A0A229SME2,57.6,415,4,417,1,415,1.520000e-172,491.0
203,ctg34_30_5545-6838,A0A1H5QUE1,57.3,415,4,417,1,415,1.070000e-172,491.0
204,ctg34_30_5545-6838,A0A154MTV3,57.2,416,4,418,1,416,1.470000e-172,491.0
202,ctg34_30_5545-6838,A0A1R0KVW1,57.2,416,4,418,1,416,6.330000e-174,494.0
...,...,...,...,...,...,...,...,...,...,...
283,ctg77_10_15334-17791,A0A1C3NYK1,22.2,590,191,759,2083,2593,2.610000e-10,66.6
282,ctg77_10_15334-17791,A0A385D6T2,21.9,475,228,694,336,746,2.290000e-10,66.6
280,ctg77_10_15334-17791,A0A117Q2I7,21.4,604,191,773,2096,2645,2.600000e-12,73.2
281,ctg77_10_15334-17791,A0A6H9UZV7,21.0,644,191,817,2069,2640,1.730000e-11,70.5


In [4]:
# Selecting the lowest evalue per qseqid.
# idxmin() keeps the first row it meets when several hits of a query share the
# minimum e-value (e.g. values reported as 0.0), so the hits are ranked by
# bitscore beforehand: ties are then resolved in favour of the highest
# bitscore instead of depending on the row order of the input file.
hits_by_bitscore = df.sort_values("bitscore", ascending=False, kind="stable")
best_hit_labels = hits_by_bitscore.groupby("qseqid")["evalue"].idxmin()
# Look the labels up in the original frame; rows come out ordered by qseqid
# (groupby sorts its keys by default).
df_lowest_evalue = df.loc[best_hit_labels]
df_lowest_evalue

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore
366,ctg109_17_12713-15959,A0A1Y0IQG4,38.6,578,2,563,8,569,2.440000e-105,367.0
104,ctg10_24_1297-3028,O31427,30.9,220,22,240,19,213,4.460000e-14,73.9
105,ctg10_30_9256-9949,A0A6G3TKE2,30.5,246,3,226,10,246,4.160000e-20,87.4
206,ctg11_49_7949-8693,A0A6G3TJA6,28.8,243,14,246,15,248,2.050000e-14,72.4
209,ctg11_56_14753-15536,A0A1I0V737,29.2,120,64,176,1655,1773,1.020000e-06,51.6
...,...,...,...,...,...,...,...,...,...,...
66,ctg8_28_24318-44052,A0A239GGA1,36.7,6735,50,6249,1733,8291,0.000000e+00,3896.0
71,ctg8_43_61254-67836,A0A561EVH5,34.4,2162,31,2155,20,2068,0.000000e+00,1029.0
76,ctg8_44_67914-70146,A0A369BA02,37.8,664,1,661,1,648,5.870000e-131,431.0
188,ctg9_14_15337-16498,A0A841AWX8,38.0,374,13,385,8,372,2.720000e-67,219.0


In [10]:
# Show the selected hit of each query, ordered from the highest to the
# lowest `pident`.
df_lowest_evalue.sort_values("pident", ascending=False)

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,sample
201,ctg34_30_5545-6838,A0A841B2S3,57.8,415,4,417,1,415,3.150000e-174,495.0,ctg34_30
33,ctg22_15_11204-11987,A0A1V0U7E9,53.3,257,7,260,4,260,6.080000e-88,263.0,ctg22_15
246,ctg1_42_2702-3905,A0A344L695,50.0,372,8,375,18,381,5.490000e-107,322.0,ctg1_42
256,ctg82_15_22668-26190,A0A2P1CZE6,47.1,922,3,913,4,920,3.600000e-274,859.0,ctg82_15
262,ctg31_18_3120-4284,A0A1R0KYE3,46.5,385,2,382,16,392,1.290000e-101,308.0,ctg31_18
...,...,...,...,...,...,...,...,...,...,...,...
333,ctg1_335_25135-26425,A0A8J8SHW0,24.6,167,145,311,2530,2688,3.630000e-06,51.6,ctg1_335
234,ctg13_33_10904-12845,A0A7W7PYW3,24.2,528,99,610,163,646,8.590000e-20,96.7,ctg13_33
353,ctg1_350_38287-39253,Q4K418,24.0,312,14,312,37,338,7.720000e-08,55.1,ctg1_350
183,ctg9_5_4576-6550,A0A516RKI2,23.1,527,111,620,485,948,5.080000e-18,90.9,ctg9_5


In [5]:
# Derive a "sample" column holding the leading "ctgXXX_XX" part of each
# "qseqid"; expand=False returns the single capture group as a Series.
sample_ids = df_lowest_evalue["qseqid"].str.extract(r'(ctg\d+_\d+)', expand=False)
df_lowest_evalue["sample"] = sample_ids
# Count how many rows carry each extracted identifier.
df_lowest_evalue["sample"].value_counts()

sample
ctg109_17    1
ctg10_24     1
ctg10_30     1
ctg11_49     1
ctg11_56     1
            ..
ctg8_28      1
ctg8_43      1
ctg8_44      1
ctg9_14      1
ctg9_5       1
Name: count, Length: 108, dtype: int64

In [15]:
# Remove the alignment length and the query/subject coordinate columns,
# then display what is left ordered from the highest to the lowest `pident`.
coordinate_columns = ["length", "qstart", "qend", "sstart", "send"]
df_filtered = df_lowest_evalue.drop(columns=coordinate_columns)
df_filtered.sort_values("pident", ascending=False)

Unnamed: 0,qseqid,sseqid,pident,evalue,bitscore,sample
201,ctg34_30_5545-6838,A0A841B2S3,57.8,3.150000e-174,495.0,ctg34_30
33,ctg22_15_11204-11987,A0A1V0U7E9,53.3,6.080000e-88,263.0,ctg22_15
246,ctg1_42_2702-3905,A0A344L695,50.0,5.490000e-107,322.0,ctg1_42
256,ctg82_15_22668-26190,A0A2P1CZE6,47.1,3.600000e-274,859.0,ctg82_15
262,ctg31_18_3120-4284,A0A1R0KYE3,46.5,1.290000e-101,308.0,ctg31_18
...,...,...,...,...,...,...
333,ctg1_335_25135-26425,A0A8J8SHW0,24.6,3.630000e-06,51.6,ctg1_335
234,ctg13_33_10904-12845,A0A7W7PYW3,24.2,8.590000e-20,96.7,ctg13_33
353,ctg1_350_38287-39253,Q4K418,24.0,7.720000e-08,55.1,ctg1_350
183,ctg9_5_4576-6550,A0A516RKI2,23.1,5.080000e-18,90.9,ctg9_5


### Investigating the results from alignment 
This section takes the results from other notebooks and combines them with the alignment analysis.