# Code

## Modules

In [45]:
import pandas as pd
import subprocess
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

## Import data

### Negative data base

In [46]:
# Read negative data
neg_data = pd.read_csv('./3.new_data_v2/negative_database.csv', sep=',', header=0)
print(neg_data.shape)
print(neg_data.dtypes)
neg_data.head()

(913, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,36103,36242,plus,AGACAGACCGACACACGCAGCCGTGTGATGCCGCCGCCGAGGGCAG...
1,LinJ.01,113760,114388,plus,CAGCGCCATGCACGACATGGCCGCTGACGTCCGTAGCCCTAACTCG...
2,LinJ.01,146412,146530,plus,GCGAATTGTGTTCTGCGCATGCCTCTTCTCTGCCGTGCAGCATGCG...
3,LinJ.01,261866,262439,plus,CGGACTTGGCAAGTGGCCGCCATCGATGAAAACGCACCATGCCTTT...
4,LinJ.01,271363,271650,plus,CGAACGCCGCCCTCAATCGCGCGCTGAACTTCACGCGGCGGTCGAC...


In [47]:
# Get what's needed from the negative data
neg_data_bed = neg_data[["sseqid", "sstart", "send"]].copy()
print(neg_data_bed.head())

    sseqid  sstart    send
0  LinJ.01   36103   36242
1  LinJ.01  113760  114388
2  LinJ.01  146412  146530
3  LinJ.01  261866  262439
4  LinJ.01  271363  271650


In [48]:
# Get the type of data
neg_data_bed.dtypes

sseqid    object
sstart     int64
send       int64
dtype: object

### CDS data base

In [49]:
path_cds = './4.Neg_data_vs_CDS/data/TriTrypDB-68_LinfantumJPCM5_AnnotatedCDSs.fasta'
cds_data = []
with open(path_cds, "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # print(f"{record.description}")  # Here are all the data needed
        start, end = record.description.split("|")[3].split(":")[1].replace("(-)", "").replace("(+)","").strip().split("-")  # start and end coordinates
        sense = record.description.split("|")[3].split(":")[1].split("(")[1].replace(")","").strip()  # sense
        chr_num = record.description.split("|")[3].split(":")[0].split("=")[1]  # chromosome number
        attribute = record.description.split("|")[2].strip()
        # Now let's create a bed file with chr_num, start and end only, with tab sep
        cds_data.append([chr_num, start, end, sense, attribute])
        # print(f"Chromosome: {chr_num}, Start: {start}, End: {end}, Attribute: {attribute}")
print(len(cds_data))
[print(x) for x in cds_data[:5]]

8527
['LinJ.01', '3710', '4711', '-', 'product=Protein of unknown function (DUF2946)']
['LinJ.01', '5804', '7438', '-', 'product=Endonuclease/Exonuclease/phosphatase family']
['LinJ.01', '9038', '11059', '-', 'product=Kinesin-13']
['LinJ.01', '12041', '12601', '-', 'product=hypothetical protein - conserved']
['LinJ.01', '14957', '16954', '-', 'product=carboxylase - putative']


[None, None, None, None, None]

In [50]:
# Transform the list into a dataframe
cds_data = pd.DataFrame(cds_data, columns=["sseqid", "sstart", "send", "sense", "attribute"])
cds_data[['sstart', 'send']] = cds_data[['sstart', 'send']].apply(pd.to_numeric)
print(cds_data.dtypes)
print(cds_data.head())

sseqid       object
sstart        int64
send          int64
sense        object
attribute    object
dtype: object
    sseqid  sstart   send sense  \
0  LinJ.01    3710   4711     -   
1  LinJ.01    5804   7438     -   
2  LinJ.01    9038  11059     -   
3  LinJ.01   12041  12601     -   
4  LinJ.01   14957  16954     -   

                                           attribute  
0      product=Protein of unknown function (DUF2946)  
1  product=Endonuclease/Exonuclease/phosphatase f...  
2                                 product=Kinesin-13  
3           product=hypothetical protein - conserved  
4                     product=carboxylase - putative  


## Compare `neg_data_bed` against `cds_data`

In [51]:
## And check with our data have CDS inside
neg_with_cds = {}
for index, row in neg_data_bed.iterrows():
    overlapping_cds = cds_data[
       (
           (
               ((cds_data['sstart'] >= row['sstart']) & (cds_data['sstart'] <= row['send'])) |
               ((cds_data['send'] <= row['send']) & (cds_data['send'] >= row['sstart']))
            ) |  # Now let's do for the "neg_data" that are inside the CDS
           (
               (cds_data['sstart'] <= row['sstart']) & (cds_data['send'] >= row['send'])
            )
        ) &
       (cds_data['sseqid'] == row['sseqid'])
        ]
    if not overlapping_cds.empty:
        neg_with_cds[f'{row['sseqid']}_{row['sstart']}_{row['send']}'] = overlapping_cds
    else:
        continue

In [52]:
for key, value in neg_with_cds.items():
    print(f"Key: {key}")
    if len(value) >= 1:
        for index, row in value.iterrows():
            print(f"\tValue: {list(row)}")

Key: LinJ.02_119095_119743
	Value: ['LinJ.02', 119608, 122316, '-', 'product=Glycosyltransferase (GlcNAc) - putative']
Key: LinJ.03_349124_349733
	Value: ['LinJ.03', 349457, 353806, '-', 'product=hypothetical protein - conserved']
Key: LinJ.04_96585_97584
	Value: ['LinJ.04', 97144, 99075, '+', 'product=beta-fructofuranosidase - putative']
Key: LinJ.04_97729_98226
	Value: ['LinJ.04', 97144, 99075, '+', 'product=beta-fructofuranosidase - putative']
Key: LinJ.04_98297_99510
	Value: ['LinJ.04', 97144, 99075, '+', 'product=beta-fructofuranosidase - putative']
Key: LinJ.04_100424_101110
	Value: ['LinJ.04', 100760, 101959, '+', 'product=beta-fructofuranosidase - putative']
Key: LinJ.04_101181_101967
	Value: ['LinJ.04', 100760, 101959, '+', 'product=beta-fructofuranosidase - putative']
Key: LinJ.04_175855_176491
	Value: ['LinJ.04', 176402, 179428, '-', 'product=hypothetical protein - conserved']
Key: LinJ.04_208748_209410
	Value: ['LinJ.04', 209331, 210374, '-', 'product=hypothetical protein -

In [53]:
print(f'Therea are {len(neg_with_cds)}/{len(neg_data)} elements in the negative data that overlap with CDS')
print(f'There are {len(neg_data) - len(neg_with_cds)} elements in the negative data that do not overlap with CDS')

Therea are 231/913 elements in the negative data that overlap with CDS
There are 682 elements in the negative data that do not overlap with CDS


In [54]:
count_cds = 0
for key, value in neg_with_cds.items():
    count_cds += value.shape[0]
print(f'There are {count_cds}/{len(cds_data)} CDS that overlap with the negative data')
print(f'There are {len(cds_data) - count_cds} CDS that do not overlap with the negative data')


There are 239/8527 CDS that overlap with the negative data
There are 8288 CDS that do not overlap with the negative data


## Export as CSV

### Export the relationship with CDS

In [55]:
pre_csv = []
for key, value in neg_with_cds.items():
    for index, row in value.iterrows():
        pre_csv.append([key, f'{row['sseqid']}|{row['sstart']}|{row['send']}|{row['sense']}|{row['attribute']}'])
print(len(pre_csv))
[print(x) for x in pre_csv[:5]]

239
['LinJ.02_119095_119743', 'LinJ.02|119608|122316|-|product=Glycosyltransferase (GlcNAc) - putative']
['LinJ.03_349124_349733', 'LinJ.03|349457|353806|-|product=hypothetical protein - conserved']
['LinJ.04_96585_97584', 'LinJ.04|97144|99075|+|product=beta-fructofuranosidase - putative']
['LinJ.04_97729_98226', 'LinJ.04|97144|99075|+|product=beta-fructofuranosidase - putative']
['LinJ.04_98297_99510', 'LinJ.04|97144|99075|+|product=beta-fructofuranosidase - putative']


[None, None, None, None, None]

In [56]:
pre_csv = pd.DataFrame(pre_csv, columns=["neg_key", "cds_key"])

In [57]:
# Save the data
pre_csv.to_csv('./4.Neg_data_vs_CDS/new_data/neg_with_cds.csv', sep=',', header=True, index=False)

### Export the onlye the rows in neg data that solap with CDS

In [58]:
# Get the elemens that overlap with CDS
neg_data_only_overlaps= []
for key, value in neg_with_cds.items():
    key_chr, key_start, key_end = key.split("_")[0], key.split("_")[1], key.split("_")[2]
    neg_data_only_overlaps.append([key_chr, key_start, key_end])

neg_data_only_overlaps = pd.DataFrame(neg_data_only_overlaps, columns=["sseqid", "sstart", "send"])
neg_data_only_overlaps[['sstart', 'send']] = neg_data_only_overlaps[['sstart', 'send']].apply(pd.to_numeric)
print(neg_data_only_overlaps.dtypes)
print(len(neg_data_only_overlaps))

sseqid    object
sstart     int64
send       int64
dtype: object
231


In [59]:
# Now extract the from neg data the values that are in neg_data_only_solaps and the ones that are now.
new_neg_data_overlaps_with_cds = neg_data.merge(neg_data_only_overlaps, on=["sseqid", "sstart", "send"], how="inner")
new_neg_data_no_overlaps_with_cds = neg_data.merge(neg_data_only_overlaps, on=["sseqid", "sstart", "send"], how="outer", indicator=True).query('_merge == "left_only"').drop('_merge', axis=1)

In [60]:
# Print the data len
print(len(new_neg_data_overlaps_with_cds))
print(len(new_neg_data_no_overlaps_with_cds))

231
682


In [61]:
# Save both to CSV
new_neg_data_overlaps_with_cds.to_csv('./4.Neg_data_vs_CDS/new_data/negative_database_match_CDS.csv', sep=',', header=True, index=False)
new_neg_data_no_overlaps_with_cds.to_csv('./4.Neg_data_vs_CDS/new_data/negative_database_nomatch.csv', sep=',', header=True, index=False)

## Create a GFF file

### Neg data that overlaps with CDS

In [62]:
gff_neg_with_cds = []
column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
for key, value in neg_with_cds.items():
    key_chr, key_start, key_end = key.split("_")[0], key.split("_")[1], key.split("_")[2]
    attr_str = ""
    for _, row in value.iterrows():
        attr_str += f"{row['sseqid']}|{row['sstart']}|{row['send']}|{row['sense']}|{row['attribute']};"
    gff_neg_with_cds.append(
        {
            'seqname': key_chr,
            'source': 'CBM',
            'feature': 'Negative_Repetead_with_CDS',
            'start': key_start,
            'end': key_end,
            'score': '.',
            'strand': '+',
            'frame': '.',
            'attribute': f'Related_to={attr_str}'
        }
    )
print(len(gff_neg_with_cds))
[print(x) for x in gff_neg_with_cds[:5]]

231
{'seqname': 'LinJ.02', 'source': 'CBM', 'feature': 'Negative_Repetead_with_CDS', 'start': '119095', 'end': '119743', 'score': '.', 'strand': '+', 'frame': '.', 'attribute': 'Related_to=LinJ.02|119608|122316|-|product=Glycosyltransferase (GlcNAc) - putative;'}
{'seqname': 'LinJ.03', 'source': 'CBM', 'feature': 'Negative_Repetead_with_CDS', 'start': '349124', 'end': '349733', 'score': '.', 'strand': '+', 'frame': '.', 'attribute': 'Related_to=LinJ.03|349457|353806|-|product=hypothetical protein - conserved;'}
{'seqname': 'LinJ.04', 'source': 'CBM', 'feature': 'Negative_Repetead_with_CDS', 'start': '96585', 'end': '97584', 'score': '.', 'strand': '+', 'frame': '.', 'attribute': 'Related_to=LinJ.04|97144|99075|+|product=beta-fructofuranosidase - putative;'}
{'seqname': 'LinJ.04', 'source': 'CBM', 'feature': 'Negative_Repetead_with_CDS', 'start': '97729', 'end': '98226', 'score': '.', 'strand': '+', 'frame': '.', 'attribute': 'Related_to=LinJ.04|97144|99075|+|product=beta-fructofuranosi

[None, None, None, None, None]

In [63]:
gff_neg_with_cds = pd.DataFrame(gff_neg_with_cds, columns=column_names)

In [64]:
gff_neg_with_cds.to_csv('./4.Neg_data_vs_CDS/new_data/neg_with_cds.gff', sep='\t', header=False, index=False)

#### Neg data that does not overlap with CDS

In [65]:
gff_neg_no_cds = []
column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
for index, row in new_neg_data_no_overlaps_with_cds.iterrows():
    gff_neg_no_cds.append(
        {
            'seqname': row['sseqid'],
            'source': 'CBM',
            'feature': 'Negative_Repetead_no_CDS',
            'start': row['sstart'],
            'end': row['send'],
            'score': '.',
            'strand': '+',
            'frame': '.',
            'attribute': '.'
        }
    )
print(len(gff_neg_no_cds))
[print(x) for x in gff_neg_no_cds[:5]]

682
{'seqname': 'LinJ.01', 'source': 'CBM', 'feature': 'Negative_Repetead_no_CDS', 'start': 36103, 'end': 36242, 'score': '.', 'strand': '+', 'frame': '.', 'attribute': '.'}
{'seqname': 'LinJ.01', 'source': 'CBM', 'feature': 'Negative_Repetead_no_CDS', 'start': 113760, 'end': 114388, 'score': '.', 'strand': '+', 'frame': '.', 'attribute': '.'}
{'seqname': 'LinJ.01', 'source': 'CBM', 'feature': 'Negative_Repetead_no_CDS', 'start': 146412, 'end': 146530, 'score': '.', 'strand': '+', 'frame': '.', 'attribute': '.'}
{'seqname': 'LinJ.01', 'source': 'CBM', 'feature': 'Negative_Repetead_no_CDS', 'start': 261866, 'end': 262439, 'score': '.', 'strand': '+', 'frame': '.', 'attribute': '.'}
{'seqname': 'LinJ.01', 'source': 'CBM', 'feature': 'Negative_Repetead_no_CDS', 'start': 271363, 'end': 271650, 'score': '.', 'strand': '+', 'frame': '.', 'attribute': '.'}


[None, None, None, None, None]

In [66]:
gff_neg_no_cds = pd.DataFrame(gff_neg_no_cds, columns=column_names)
gff_neg_no_cds.to_csv('./4.Neg_data_vs_CDS/new_data/neg_no_cds.gff', sep='\t', header=False, index=False)