In [15]:
import pandas as pd
import subprocess
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [10]:
# Load the negative database table; the first row of the CSV is the header.
neg_data = pd.read_csv('./new_data_v2/negative_database.csv', header=0, sep=',')
# Quick sanity check: dimensions first, then per-column dtypes.
print(neg_data.shape, neg_data.dtypes, sep="\n")
neg_data.head()

(898, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,36104,36243,plus,GACAGACCGACACACGCAGCCGTGTGATGCCGCCGCCGAGGGCAGT...
1,LinJ.01,146413,146531,plus,CGAATTGTGTTCTGCGCATGCCTCTTCTCTGCCGTGCAGCATGCGG...
2,LinJ.01,271364,271651,plus,GAACGCCGCCCTCAATCGCGCGCTGAACTTCACGCGGCGGTCGACC...
3,LinJ.02,17014,17130,plus,GGAGGGGACGCGCGGAGCTGCGATGCGAAAGTGAGAGCAACACAGA...
4,LinJ.02,74330,74460,plus,TGTCACAAGCGACTCGAAGAGGACGAAAAGACACACGGCCGCACAC...


In [11]:
# Keep only the three interval columns (sequence id, start, end) as an
# independent copy, so later changes do not touch neg_data.
bed_columns = ["sseqid", "sstart", "send"]
neg_data_bed = neg_data.loc[:, bed_columns].copy()
print(neg_data_bed.head())

    sseqid  sstart    send
0  LinJ.01   36104   36243
1  LinJ.01  146413  146531
2  LinJ.01  271364  271651
3  LinJ.02   17014   17130
4  LinJ.02   74330   74460


In [17]:
# Write the negative-data intervals as a headerless, tab-separated BED-like file.
# bedops expects its inputs in sort-bed order (chromosome lexicographically, then
# start, then end), so the rows are sorted on the way out; neg_data_bed itself is
# left untouched.
# NOTE(review): sstart/send are written unchanged — confirm they follow the same
# coordinate convention as the CDS BED file they are compared against.
path_neg_data_bed = './3.Neg_data_vs_CDS/data/neg_data.bed'
(
    neg_data_bed
    .sort_values(["sseqid", "sstart", "send"])
    .to_csv(path_neg_data_bed, sep='\t', index=False, header=False)
)

In [14]:
# Location of the BED file that is compared against the negative-data BED file
# with bedops. NOTE(review): presumably the L. infantum CDS intervals — this
# notebook does not show how the file was produced; confirm it is sort-bed sorted.
path_cds_bed = './3.Neg_data_vs_CDS/data/linfantum_cds.bed'

In [18]:
# Ask bedops which intervals of the negative-data BED file (the reference, given
# first) are elements of the CDS BED file.
# NOTE(review): no overlap criterion is passed, so bedops applies its default
# threshold — confirm in the bedops documentation that this is the intended one.
cmd = f"bedops --element-of {path_neg_data_bed} {path_cds_bed}"
# text=True already decodes stdout/stderr to str; universal_newlines is only an
# alias for it, so it is not passed a second time.
result = subprocess.run(cmd, shell=True, capture_output=True, text=True, executable="/usr/bin/bash")
# Fail loudly: without this check a missing bedops binary or a malformed input
# produces empty stdout, which would be reported downstream as "0 overlaps".
if result.returncode != 0:
    raise RuntimeError(f"bedops failed (exit {result.returncode}): {result.stderr.strip()}")
result_out = result.stdout
# One row per non-empty output line; fields are tab-separated and kept as strings.
result_df = pd.DataFrame([line.split("\t") for line in result_out.split("\n") if line],
                         columns=["sseqid", "sstart", "send"])

In [20]:
# Preview the first rows of the bedops output parsed into result_df.
result_df.head()

Unnamed: 0,sseqid,sstart,send
0,LinJ.05,109889,110888
1,LinJ.07,63118,63258
2,LinJ.08,282267,282840
3,LinJ.08,322677,323962
4,LinJ.09,482503,483178


In [19]:
# Compare the number of rows in the negative dataset with the number of
# intervals bedops returned for the negative-data-vs-CDS query.
print(f'From {neg_data.shape[0]} negative data, {result_df.shape[0]} are in CDS')

From 898 negative data, 41 are in CDS


In [21]:
# Reverse query: with the CDS BED file as the reference (given first), ask bedops
# which CDS intervals are elements of the negative-data BED file.
# NOTE(review): no overlap criterion is passed, so bedops applies its default
# threshold — confirm in the bedops documentation that this is the intended one.
cmd2 = f"bedops --element-of {path_cds_bed} {path_neg_data_bed}"
# text=True already decodes stdout/stderr to str; universal_newlines is only an
# alias for it, so it is not passed a second time.
result2 = subprocess.run(cmd2, shell=True, capture_output=True, text=True, executable="/usr/bin/bash")
# Fail loudly: without this check a missing bedops binary or a malformed input
# produces empty stdout, which would be reported downstream as "0 CDS".
if result2.returncode != 0:
    raise RuntimeError(f"bedops failed (exit {result2.returncode}): {result2.stderr.strip()}")
result2_out = result2.stdout
# One row per non-empty output line; fields are tab-separated and kept as strings.
result2_df = pd.DataFrame([line.split("\t") for line in result2_out.split("\n") if line],
                          columns=["sseqid", "sstart", "send"])

In [22]:
# Preview the first rows of the reverse bedops query parsed into result2_df.
result2_df.head()

Unnamed: 0,sseqid,sstart,send
0,LinJ.06,515416,515787
1,LinJ.08,289983,290597
2,LinJ.08,295607,296212
3,LinJ.08,300238,300840
4,LinJ.08,304863,305465


In [35]:
# Count the lines of the CDS BED file (one interval per line).
# The file is redirected into wc instead of being piped from cat: in a pipeline
# the exit status is that of wc, so a missing file would silently count as 0.
# With the redirection the shell itself fails and the return code is non-zero.
cmd3 = f"wc -l < {path_cds_bed}"
length_cds = subprocess.run(cmd3, shell=True, capture_output=True, text=True, executable="/usr/bin/bash")
if length_cds.returncode != 0:
    raise RuntimeError(f"could not count lines of {path_cds_bed}: {length_cds.stderr.strip()}")
# wc may pad its output with whitespace; keep only the number (as a string).
length_cds_out = length_cds.stdout.strip()

In [39]:
# Compare the line count of the CDS BED file with the number of CDS intervals
# bedops returned for the CDS-vs-negative-data query.
print(f"From {length_cds_out} CDS, {result2_df.shape[0]} are inside the negative data")

From 8527 CDS, 48 are inside the negative data


In [41]:
# The bedops output was parsed from text, so the coordinate columns hold strings.
# Convert them to numeric dtypes in both result frames.
for bedops_frame in (result_df, result2_df):
    for coord_col in ('sstart', 'send'):
        bedops_frame[coord_col] = pd.to_numeric(bedops_frame[coord_col])

In [42]:
# Load the CDS annotation table; the first row of the CSV is the header.
cds_df = pd.read_csv('./3.Neg_data_vs_CDS/data/linfantum_cds.csv', header=0, sep=',')
# Quick sanity check: dimensions first, then per-column dtypes.
print(cds_df.shape, cds_df.dtypes, sep="\n")
cds_df.head()

(8527, 5)
sseqid       object
sstart        int64
send          int64
sense        object
attribute    object
dtype: object


Unnamed: 0,sseqid,sstart,send,sense,attribute
0,LinJ.01,3710,4711,-,product=Protein of unknown function (DUF2946)
1,LinJ.01,5804,7438,-,product=Endonuclease/Exonuclease/phosphatase f...
2,LinJ.01,9038,11059,-,product=Kinesin-13
3,LinJ.01,12041,12601,-,product=hypothetical protein - conserved
4,LinJ.01,14957,16954,-,product=carboxylase - putative


In [44]:
# Attach the annotation columns of cds_df to the CDS intervals returned by bedops:
# an inner join keeps only rows whose (sseqid, sstart, send) appear in both frames.
join_keys = ['sseqid', 'sstart', 'send']
common_df = cds_df.merge(result2_df, how='inner', on=join_keys)
common_df

Unnamed: 0,sseqid,sstart,send,sense,attribute
0,LinJ.06,515416,515787,-,product=Cytochrome b5-like Heme/Steroid bindin...
1,LinJ.08,289983,290597,+,product=amastin-like protein
2,LinJ.08,295607,296212,+,product=amastin-like protein
3,LinJ.08,300238,300840,+,product=Amastin surface glycoprotein - putative
4,LinJ.08,304863,305465,+,product=Amastin surface glycoprotein - putative
5,LinJ.08,309447,310049,+,product=Amastin surface glycoprotein - putative
6,LinJ.08,313960,314571,+,product=amastin-like protein
7,LinJ.10,469169,469561,-,product=histone H3 - putative
8,LinJ.10,474582,474974,-,product=histone H3 - putative
9,LinJ.12,375028,375513,+,product=hypothetical protein - conserved
