# Code

In [53]:
# needed libraries
import pandas as pd
import subprocess

from Bio import SeqIO

## Get bed files

In [44]:
# Input file locations, relative to the notebook's working directory.
# path_cds_fasta: FASTA file that is parsed record by record with SeqIO below.
# path_neg_data: comma-separated table that is loaded with pandas below.
path_cds_fasta = "./data/TriTrypDB-68_LinfantumJPCM5_AnnotatedCDSs.fasta"
path_neg_data = "./data/neg_data_corrected.csv"

In [41]:
# Build one [chromosome, start, end, strand, attribute] record per FASTA entry
# by parsing the "|"-separated description line of each sequence.
pre_bed_cds = []
with open(path_cds_fasta, "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        # Split the description once instead of re-splitting it for every value.
        fields = record.description.split("|")
        attribute = fields[2].strip()  # third field, e.g. "product=Kinesin-13"
        # Fourth field has the shape "<key>=<chromosome>:<start>-<end>(<strand>)".
        location_parts = fields[3].split(":")
        chr_num = location_parts[0].split("=")[1]  # chromosome name after the "="
        span = location_parts[1]  # "<start>-<end>(<strand>)"
        # Drop the strand marker first so its "-" is not mistaken for the range dash.
        start, end = span.replace("(-)", "").replace("(+)", "").strip().split("-")
        sense = span.split("(")[1].replace(")", "").strip()  # "+" or "-"
        # Coordinates are kept as strings here; they are cast to int where needed.
        pre_bed_cds.append([chr_num, start, end, sense, attribute])
print(len(pre_bed_cds))
# Plain loop: a list comprehension used only for printing would also emit a
# meaningless [None, None, ...] value as the cell's output.
for cds_record in pre_bed_cds[:5]:
    print(cds_record)
    

8527
['LinJ.01', '3710', '4711', '-', 'product=Protein of unknown function (DUF2946)']
['LinJ.01', '5804', '7438', '-', 'product=Endonuclease/Exonuclease/phosphatase family']
['LinJ.01', '9038', '11059', '-', 'product=Kinesin-13']
['LinJ.01', '12041', '12601', '-', 'product=hypothetical protein - conserved']
['LinJ.01', '14957', '16954', '-', 'product=carboxylase - putative']


[None, None, None, None, None]

In [42]:
# Write the CDS intervals as a three-column, tab-separated BED file.
path_cds_bed = "./bed_files/linfantum_cds.bed"
# bedops set operations expect input ordered the way sort-bed orders it
# (chromosome name, then numeric start, then numeric end), so sort explicitly
# instead of relying on the record order of the FASTA file.
# Coordinates are stored as strings, hence the int() casts in the sort key.
sorted_cds = sorted(pre_bed_cds, key=lambda cds: (cds[0], int(cds[1]), int(cds[2])))
# NOTE(review): coordinates are written exactly as parsed so they stay comparable
# with the negative-data BED file and with the later merge on (sseqid, sstart, send).
# BED is conventionally 0-based half-open - confirm whether a shift is wanted.
with open(path_cds_bed, "w") as handle:
    handle.writelines(f"{cds[0]}\t{cds[1]}\t{cds[2]}\n" for cds in sorted_cds)

In [None]:
# Destination of the negative-data BED file, and the table it will be built from.
path_neg_data_bed = "./bed_files/neg_data.bed"
# A comma separator and a header on the first row are read_csv's defaults.
neg_data = pd.read_csv(path_neg_data)

In [51]:
# Keep only the three columns a BED record needs (sequence id, start, end),
# as an independent copy of the loaded table.
neg_data_bed = neg_data.loc[:, ["sseqid", "sstart", "send"]].copy()
print(neg_data_bed.head())

    sseqid  sstart    send
0  LinJ.04  175855  176491
1  LinJ.04  208516  208737
2  LinJ.04  208748  209410
3  LinJ.04  331481  332011
4  LinJ.04  362873  363525


In [52]:
# bedops expects its inputs in sort-bed order (chromosome name, then numeric
# start, then numeric end); sort before writing instead of relying on the row
# order of the source CSV. No header and no index: a BED file is just
# tab-separated columns.
# NOTE(review): assumes sstart/send were read as numeric columns and that
# sstart <= send on every row - confirm against the input CSV.
neg_data_bed.sort_values(["sseqid", "sstart", "send"]).to_csv(
    path_neg_data_bed, sep="\t", header=False, index=False
)

## Bedops `--element-of`

In [95]:
# Ask bedops which negative-data intervals are elements of the CDS set
# (first file is the reference, second file is the query).
cmd = f"bedops --element-of {path_neg_data_bed} {path_cds_bed}"
# check=True makes a non-zero exit status raise CalledProcessError instead of
# silently producing an empty table from empty stdout.
# `universal_newlines` is only an alias of `text`, so it is not repeated.
result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True,
                        executable="/usr/bin/bash")
result_out = result.stdout
# One row per non-empty output line; every value is still a string at this point.
result_df = pd.DataFrame([line.split("\t") for line in result_out.split("\n") if line],
                         columns=["sseqid", "sstart", "send"])

In [96]:
# Display the exact command string that was handed to the shell.
cmd

'bedops --element-of ./bed_files/neg_data.bed ./bed_files/linfantum_cds.bed'

In [97]:
# Report how many negative-data intervals bedops kept. This must read `result_df`
# (the negative-data query); `result2_df` belongs to the reverse CDS query and is
# only created further down the notebook, so using it here fails on a fresh kernel
# and reports the wrong count.
print(f"From {neg_data_bed.shape[0]} negative data, {result_df.shape[0]} are inside the CDS")

From 691 negative data, 34 are inside the CDS


In [98]:
# Preview the first negative-data intervals returned by bedops.
result_df.head()

Unnamed: 0,sseqid,sstart,send
0,LinJ.05,109889,110888
1,LinJ.08,282267,282840
2,LinJ.08,322677,323962
3,LinJ.09,482503,483178
4,LinJ.09,483332,484362


In [120]:
# Reverse query: ask bedops which CDS intervals are elements of the negative-data
# set (the CDS file is now the reference, the negative data the query).
cmd2 = f"bedops --element-of {path_cds_bed} {path_neg_data_bed}"
# check=True makes a non-zero exit status raise CalledProcessError instead of
# silently producing an empty table from empty stdout.
# `universal_newlines` is only an alias of `text`, so it is not repeated.
result2 = subprocess.run(cmd2, shell=True, capture_output=True, text=True, check=True,
                         executable="/usr/bin/bash")
result2_out = result2.stdout
# One row per non-empty output line; every value is still a string at this point.
result2_df = pd.DataFrame([line.split("\t") for line in result2_out.split("\n") if line],
                          columns=["sseqid", "sstart", "send"])

In [119]:
# Display the exact command string that was handed to the shell for the reverse query.
cmd2

'bedops --element-of ./bed_files/linfantum_cds.bed ./bed_files/neg_data.bed'

In [117]:
# Summary line: total number of parsed CDS versus the number bedops returned.
print(f"From {len(pre_bed_cds)} CDS, {len(result2_df)} are inside the negative data")

From 8527 CDS, 34 are inside the negative data


In [121]:
# Preview the first CDS intervals that bedops selected against the negative data.
print(result2_df.head()) # the CDS that overlap with the negative data

    sseqid  sstart    send
0  LinJ.08  289983  290597
1  LinJ.08  295607  296212
2  LinJ.08  300238  300840
3  LinJ.08  304863  305465
4  LinJ.08  309447  310049


In [122]:
# Inspect column dtypes: the values were split out of bedops' text output,
# so every column is still a string (object) column.
result2_df.dtypes

sseqid    object
sstart    object
send      object
dtype: object

In [123]:
# Cast both coordinate columns from strings to integers; sseqid is left untouched.
result2_df = result2_df.astype({"sstart": int, "send": int})

In [124]:
# Confirm that the coordinate columns are integer-typed after the cast.
result2_df.dtypes

sseqid    object
sstart     int64
send       int64
dtype: object

In [125]:
# Tabulate the parsed CDS records (a list of 5-item lists) and cast the two
# coordinate columns, which were parsed as strings, to integers in the same step.
pre_bed_cds_df = pd.DataFrame(
    pre_bed_cds,
    columns=["sseqid", "sstart", "send", "sense", "attribute"],
).astype({"sstart": int, "send": int})
print(pre_bed_cds_df.dtypes)
pre_bed_cds_df.head()

sseqid       object
sstart        int64
send          int64
sense        object
attribute    object
dtype: object


Unnamed: 0,sseqid,sstart,send,sense,attribute
0,LinJ.01,3710,4711,-,product=Protein of unknown function (DUF2946)
1,LinJ.01,5804,7438,-,product=Endonuclease/Exonuclease/phosphatase f...
2,LinJ.01,9038,11059,-,product=Kinesin-13
3,LinJ.01,12041,12601,-,product=hypothetical protein - conserved
4,LinJ.01,14957,16954,-,product=carboxylase - putative


In [126]:
# Inner join on the shared (sseqid, sstart, send) key: keeps only the CDS rows
# that bedops returned, which attaches strand and attribute to each hit.
common_rows = pre_bed_cds_df.merge(result2_df, how="inner", on=["sseqid", "sstart", "send"])

In [127]:
# Display the matched CDS rows together with their strand and attribute.
common_rows

Unnamed: 0,sseqid,sstart,send,sense,attribute
0,LinJ.08,289983,290597,+,product=amastin-like protein
1,LinJ.08,295607,296212,+,product=amastin-like protein
2,LinJ.08,300238,300840,+,product=Amastin surface glycoprotein - putative
3,LinJ.08,304863,305465,+,product=Amastin surface glycoprotein - putative
4,LinJ.08,309447,310049,+,product=Amastin surface glycoprotein - putative
5,LinJ.08,313960,314571,+,product=amastin-like protein
6,LinJ.10,469169,469561,-,product=histone H3 - putative
7,LinJ.10,474582,474974,-,product=histone H3 - putative
8,LinJ.12,375028,375513,+,product=hypothetical protein - conserved
9,LinJ.12,383206,383691,+,product=hypothetical protein - conserved
