# Code

## Modules

In [56]:
import pandas as pd
import subprocess
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

## Import data

### Negative data base

In [57]:
# Read negative data
neg_data = pd.read_csv('./new_data_v2/negative_database.csv', sep=',', header=0)
print(neg_data.shape)
print(neg_data.dtypes)
neg_data.head()

(898, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,36104,36243,plus,GACAGACCGACACACGCAGCCGTGTGATGCCGCCGCCGAGGGCAGT...
1,LinJ.01,146413,146531,plus,CGAATTGTGTTCTGCGCATGCCTCTTCTCTGCCGTGCAGCATGCGG...
2,LinJ.01,271364,271651,plus,GAACGCCGCCCTCAATCGCGCGCTGAACTTCACGCGGCGGTCGACC...
3,LinJ.02,17014,17130,plus,GGAGGGGACGCGCGGAGCTGCGATGCGAAAGTGAGAGCAACACAGA...
4,LinJ.02,74330,74460,plus,TGTCACAAGCGACTCGAAGAGGACGAAAAGACACACGGCCGCACAC...


In [58]:
# Get what's needed from the negative data
neg_data_bed = neg_data[["sseqid", "sstart", "send"]].copy()
print(neg_data_bed.head())

    sseqid  sstart    send
0  LinJ.01   36104   36243
1  LinJ.01  146413  146531
2  LinJ.01  271364  271651
3  LinJ.02   17014   17130
4  LinJ.02   74330   74460


In [59]:
# Get the type of data
neg_data_bed.dtypes

sseqid    object
sstart     int64
send       int64
dtype: object

### CDS data base

In [60]:
path_cds = './3.Neg_data_vs_CDS/data/TriTrypDB-68_LinfantumJPCM5_AnnotatedCDSs.fasta'
cds_data = []
with open(path_cds, "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # print(f"{record.description}")  # Here are all the data needed
        start, end = record.description.split("|")[3].split(":")[1].replace("(-)", "").replace("(+)","").strip().split("-")  # start and end coordinates
        sense = record.description.split("|")[3].split(":")[1].split("(")[1].replace(")","").strip()  # sense
        chr_num = record.description.split("|")[3].split(":")[0].split("=")[1]  # chromosome number
        attribute = record.description.split("|")[2].strip()
        # Now let's create a bed file with chr_num, start and end only, with tab sep
        cds_data.append([chr_num, start, end, sense, attribute])
        # print(f"Chromosome: {chr_num}, Start: {start}, End: {end}, Attribute: {attribute}")
print(len(cds_data))
[print(x) for x in cds_data[:5]]

8527
['LinJ.01', '3710', '4711', '-', 'product=Protein of unknown function (DUF2946)']
['LinJ.01', '5804', '7438', '-', 'product=Endonuclease/Exonuclease/phosphatase family']
['LinJ.01', '9038', '11059', '-', 'product=Kinesin-13']
['LinJ.01', '12041', '12601', '-', 'product=hypothetical protein - conserved']
['LinJ.01', '14957', '16954', '-', 'product=carboxylase - putative']


[None, None, None, None, None]

In [61]:
# Transform the list into a dataframe
cds_data = pd.DataFrame(cds_data, columns=["sseqid", "sstart", "send", "sense", "attribute"])
cds_data[['sstart', 'send']] = cds_data[['sstart', 'send']].apply(pd.to_numeric)
print(cds_data.dtypes)
print(cds_data.head())

sseqid       object
sstart        int64
send          int64
sense        object
attribute    object
dtype: object
    sseqid  sstart   send sense  \
0  LinJ.01    3710   4711     -   
1  LinJ.01    5804   7438     -   
2  LinJ.01    9038  11059     -   
3  LinJ.01   12041  12601     -   
4  LinJ.01   14957  16954     -   

                                           attribute  
0      product=Protein of unknown function (DUF2946)  
1  product=Endonuclease/Exonuclease/phosphatase f...  
2                                 product=Kinesin-13  
3           product=hypothetical protein - conserved  
4                     product=carboxylase - putative  


## Compare `neg_data_bed` against `cds_data`

In [62]:
## And check with our data have CDS inside
neg_with_cds = {}
for index, row in neg_data_bed.iterrows():
    overlapping_cds = cds_data[
       (
           ((cds_data['sstart'] >= row['sstart']) & (cds_data['sstart'] <= row['send'])) |
           ((cds_data['send'] <= row['send']) & (cds_data['send'] >= row['sstart']))
       ) &
       (cds_data['sseqid'] == row['sseqid'])
        ]
    if not overlapping_cds.empty:
        neg_with_cds[f'{row['sseqid']}_{row['sstart']}_{row['send']}'] = overlapping_cds
    else:
        continue

In [63]:
for key, value in neg_with_cds.items():
    print(f"Key: {key}")
    if len(value) >= 1:
        for index, row in value.iterrows():
            print(f"\tValue: {list(row)}")

Key: LinJ.04_175855_176491
	Value: ['LinJ.04', 176402, 179428, '-', 'product=hypothetical protein - conserved']
Key: LinJ.04_208748_209410
	Value: ['LinJ.04', 209331, 210374, '-', 'product=hypothetical protein - conserved']
Key: LinJ.06_301842_302407
	Value: ['LinJ.06', 302199, 305753, '+', 'product=hypothetical protein - conserved']
Key: LinJ.06_515364_516028
	Value: ['LinJ.06', 515416, 515787, '-', 'product=Cytochrome b5-like Heme/Steroid binding domain containing protein - putative']
Key: LinJ.08_43301_43873
	Value: ['LinJ.08', 43655, 45412, '+', 'product=hypothetical protein - conserved']
Key: LinJ.08_84310_84882
	Value: ['LinJ.08', 84819, 86387, '+', 'product=ABC1 family - putative']
Key: LinJ.08_288974_290999
	Value: ['LinJ.08', 289983, 290597, '+', 'product=amastin-like protein']
Key: LinJ.08_294597_296614
	Value: ['LinJ.08', 295607, 296212, '+', 'product=amastin-like protein']
Key: LinJ.08_299225_301242
	Value: ['LinJ.08', 300238, 300840, '+', 'product=Amastin surface glycoprot

In [64]:
print(f'Therea are {len(neg_with_cds)}/{len(neg_data)} elements in the negative data that overlap with CDS')
print(f'There are {len(neg_data) - len(neg_with_cds)} elements in the negative data that do not overlap with CDS')

Therea are 184/898 elements in the negative data that overlap with CDS
There are 714 elements in the negative data that do not overlap with CDS


In [65]:
count_cds = 0
for key, value in neg_with_cds.items():
    count_cds += value.shape[0]
print(f'There are {count_cds}/{len(cds_data)} CDS that overlap with the negative data')
print(f'There are {len(cds_data) - count_cds} CDS that do not overlap with the negative data')


There are 191/8527 CDS that overlap with the negative data
There are 8336 CDS that do not overlap with the negative data


## Export as CSV

### Export the relationship with CDS

In [66]:
pre_csv = []
for key, value in neg_with_cds.items():
    for index, row in value.iterrows():
        pre_csv.append([key, f'{row['sseqid']}|{row['sstart']}|{row['send']}|{row['sense']}|{row['attribute']}'])
print(len(pre_csv))
[print(x) for x in pre_csv[:5]]

191
['LinJ.04_175855_176491', 'LinJ.04|176402|179428|-|product=hypothetical protein - conserved']
['LinJ.04_208748_209410', 'LinJ.04|209331|210374|-|product=hypothetical protein - conserved']
['LinJ.06_301842_302407', 'LinJ.06|302199|305753|+|product=hypothetical protein - conserved']
['LinJ.06_515364_516028', 'LinJ.06|515416|515787|-|product=Cytochrome b5-like Heme/Steroid binding domain containing protein - putative']
['LinJ.08_43301_43873', 'LinJ.08|43655|45412|+|product=hypothetical protein - conserved']


[None, None, None, None, None]

In [67]:
pre_csv = pd.DataFrame(pre_csv, columns=["neg_key", "cds_key"])

In [68]:
# Save the data
pre_csv.to_csv('./3.Neg_data_vs_CDS/new_data/neg_with_cds.csv', sep=',', header=True, index=False)

### Export the onlye the rows in neg data that solap with CDS

In [69]:
# Get the elemens that overlap with CDS
neg_data_only_overlaps= []
for key, value in neg_with_cds.items():
    key_chr, key_start, key_end = key.split("_")[0], key.split("_")[1], key.split("_")[2]
    neg_data_only_overlaps.append([key_chr, key_start, key_end])

neg_data_only_overlaps = pd.DataFrame(neg_data_only_overlaps, columns=["sseqid", "sstart", "send"])
neg_data_only_overlaps[['sstart', 'send']] = neg_data_only_overlaps[['sstart', 'send']].apply(pd.to_numeric)
print(neg_data_only_overlaps.dtypes)
print(len(neg_data_only_overlaps))

sseqid    object
sstart     int64
send       int64
dtype: object
184


In [70]:
# Now extract the from neg data the values that are in neg_data_only_solaps and the ones that are now.
new_neg_data_overlaps_with_cds = neg_data.merge(neg_data_only_overlaps, on=["sseqid", "sstart", "send"], how="inner")
new_neg_data_no_overlaps_with_cds = neg_data.merge(neg_data_only_overlaps, on=["sseqid", "sstart", "send"], how="outer", indicator=True).query('_merge == "left_only"').drop('_merge', axis=1)

In [71]:
# Print the data len
print(len(new_neg_data_overlaps_with_cds))
print(len(new_neg_data_no_overlaps_with_cds))

184
714


In [72]:
# Save both to CSV
new_neg_data_overlaps_with_cds.to_csv('./3.Neg_data_vs_CDS/new_data/negative_database_match_CDS.csv', sep=',', header=True, index=False)
new_neg_data_no_overlaps_with_cds.to_csv('./3.Neg_data_vs_CDS/new_data/negative_database_nomatch.csv', sep=',', header=True, index=False)

## Create a GFF file

### Neg data that overlaps with CDS

In [73]:
gff_neg_with_cds = []
column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
for key, value in neg_with_cds.items():
    key_chr, key_start, key_end = key.split("_")[0], key.split("_")[1], key.split("_")[2]
    attr_str = ""
    for _, row in value.iterrows():
        attr_str += f"{row['sseqid']}|{row['sstart']}|{row['send']}|{row['sense']}|{row['attribute']};"
    gff_neg_with_cds.append(
        {
            'seqname': key_chr,
            'source': 'CBM',
            'feature': 'Negative_Repetead_with_CDS',
            'start': key_start,
            'end': key_end,
            'score': '.',
            'strand': '+',
            'frame': '.',
            'attribute': f'Related_to={attr_str}'
        }
    )
print(len(gff_neg_with_cds))
[print(x) for x in gff_neg_with_cds[:5]]

184
{'seqname': 'LinJ.04', 'source': 'CBM', 'feature': 'Negative_Repetead_with_CDS', 'start': '175855', 'end': '176491', 'score': '.', 'strand': '+', 'frame': '.', 'attribute': 'Related_to=LinJ.04|176402|179428|-|product=hypothetical protein - conserved;'}
{'seqname': 'LinJ.04', 'source': 'CBM', 'feature': 'Negative_Repetead_with_CDS', 'start': '208748', 'end': '209410', 'score': '.', 'strand': '+', 'frame': '.', 'attribute': 'Related_to=LinJ.04|209331|210374|-|product=hypothetical protein - conserved;'}
{'seqname': 'LinJ.06', 'source': 'CBM', 'feature': 'Negative_Repetead_with_CDS', 'start': '301842', 'end': '302407', 'score': '.', 'strand': '+', 'frame': '.', 'attribute': 'Related_to=LinJ.06|302199|305753|+|product=hypothetical protein - conserved;'}
{'seqname': 'LinJ.06', 'source': 'CBM', 'feature': 'Negative_Repetead_with_CDS', 'start': '515364', 'end': '516028', 'score': '.', 'strand': '+', 'frame': '.', 'attribute': 'Related_to=LinJ.06|515416|515787|-|product=Cytochrome b5-like H

[None, None, None, None, None]

In [74]:
gff_neg_with_cds = pd.DataFrame(gff_neg_with_cds, columns=column_names)

In [75]:
gff_neg_with_cds.to_csv('./3.Neg_data_vs_CDS/new_data/neg_with_cds.gff', sep='\t', header=False, index=False)

#### Neg data that does not overlap with CDS

In [76]:
gff_neg_no_cds = []
column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
for index, row in new_neg_data_no_overlaps_with_cds.iterrows():
    gff_neg_no_cds.append(
        {
            'seqname': row['sseqid'],
            'source': 'CBM',
            'feature': 'Negative_Repetead_no_CDS',
            'start': row['sstart'],
            'end': row['send'],
            'score': '.',
            'strand': '+',
            'frame': '.',
            'attribute': '.'
        }
    )
print(len(gff_neg_no_cds))
[print(x) for x in gff_neg_no_cds[:5]]

714
{'seqname': 'LinJ.01', 'source': 'CBM', 'feature': 'Negative_Repetead_no_CDS', 'start': 36104, 'end': 36243, 'score': '.', 'strand': '+', 'frame': '.', 'attribute': '.'}
{'seqname': 'LinJ.01', 'source': 'CBM', 'feature': 'Negative_Repetead_no_CDS', 'start': 146413, 'end': 146531, 'score': '.', 'strand': '+', 'frame': '.', 'attribute': '.'}
{'seqname': 'LinJ.01', 'source': 'CBM', 'feature': 'Negative_Repetead_no_CDS', 'start': 271364, 'end': 271651, 'score': '.', 'strand': '+', 'frame': '.', 'attribute': '.'}
{'seqname': 'LinJ.02', 'source': 'CBM', 'feature': 'Negative_Repetead_no_CDS', 'start': 17014, 'end': 17130, 'score': '.', 'strand': '+', 'frame': '.', 'attribute': '.'}
{'seqname': 'LinJ.02', 'source': 'CBM', 'feature': 'Negative_Repetead_no_CDS', 'start': 74330, 'end': 74460, 'score': '.', 'strand': '+', 'frame': '.', 'attribute': '.'}


[None, None, None, None, None]

In [77]:
gff_neg_no_cds = pd.DataFrame(gff_neg_no_cds, columns=column_names)
gff_neg_no_cds.to_csv('./3.Neg_data_vs_CDS/new_data/neg_no_cds.gff', sep='\t', header=False, index=False)