In [15]:
import pandas as pd
import subprocess
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [10]:
neg_data = pd.read_csv('./new_data_v2/negative_database.csv', sep=',', header=0)
print(neg_data.shape)
print(neg_data.dtypes)
neg_data.head()

(898, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,36104,36243,plus,GACAGACCGACACACGCAGCCGTGTGATGCCGCCGCCGAGGGCAGT...
1,LinJ.01,146413,146531,plus,CGAATTGTGTTCTGCGCATGCCTCTTCTCTGCCGTGCAGCATGCGG...
2,LinJ.01,271364,271651,plus,GAACGCCGCCCTCAATCGCGCGCTGAACTTCACGCGGCGGTCGACC...
3,LinJ.02,17014,17130,plus,GGAGGGGACGCGCGGAGCTGCGATGCGAAAGTGAGAGCAACACAGA...
4,LinJ.02,74330,74460,plus,TGTCACAAGCGACTCGAAGAGGACGAAAAGACACACGGCCGCACAC...


In [11]:
neg_data_bed = neg_data[["sseqid", "sstart", "send"]].copy()
print(neg_data_bed.head())

    sseqid  sstart    send
0  LinJ.01   36104   36243
1  LinJ.01  146413  146531
2  LinJ.01  271364  271651
3  LinJ.02   17014   17130
4  LinJ.02   74330   74460


In [17]:
path_neg_data_bed = './3.Neg_data_vs_CDS/data/neg_data.bed'
neg_data_bed.to_csv(path_neg_data_bed, sep='\t', index=False, header=False)

In [14]:
path_cds_bed = './3.Neg_data_vs_CDS/data/linfantum_cds.bed'

In [18]:
cmd = f"bedops --element-of {path_neg_data_bed} {path_cds_bed}"
result = subprocess.run(cmd, shell=True, capture_output=True, text=True, universal_newlines=True, executable="/usr/bin/bash")
result_out = result.stdout
result_df = pd.DataFrame([x.split("\t") for x in result_out.split("\n") if x],
                         columns = ["sseqid", "sstart", "send"])

In [20]:
result_df.head()

Unnamed: 0,sseqid,sstart,send
0,LinJ.05,109889,110888
1,LinJ.07,63118,63258
2,LinJ.08,282267,282840
3,LinJ.08,322677,323962
4,LinJ.09,482503,483178


In [19]:
print(f'From {neg_data.shape[0]} negative data, {result_df.shape[0]} are in CDS')

From 898 negative data, 41 are in CDS


In [21]:
# Let's check which cds overlaps with our data
cmd2 = f"bedops --element-of {path_cds_bed} {path_neg_data_bed}"
result2 = subprocess.run(cmd2, shell=True, capture_output=True, text=True, universal_newlines=True, executable="/usr/bin/bash")
result2_out = result2.stdout
result2_df = pd.DataFrame([x.split("\t") for x in result2_out.split("\n") if x],
                         columns = ["sseqid", "sstart", "send"])

In [22]:
result2_df.head()

Unnamed: 0,sseqid,sstart,send
0,LinJ.06,515416,515787
1,LinJ.08,289983,290597
2,LinJ.08,295607,296212
3,LinJ.08,300238,300840
4,LinJ.08,304863,305465


In [65]:
# And check wich CDS have out data inside (not overlap)
## Load CDS bed
cds_bed = pd.read_csv(path_cds_bed, sep='\t', header=None)
cds_bed.columns = ["sseqid", "sstart", "send"]
cds_bed[['sstart', 'send']] = cds_bed[['sstart', 'send']].astype(int)

## Load negative data bed
neg_data_bed = pd.read_csv(path_neg_data_bed, sep='\t', header=None)
neg_data_bed.columns = ["sseqid", "sstart", "send"]
neg_data_bed[['sstart', 'send']] = neg_data_bed[['sstart', 'send']].astype(int)

## Check which CDS have our data inside
# cds_with_neg_inside = {}
# for index, row in cds_bed.iterrows():
#     overlapping_negs = neg_data_bed[(neg_data_bed['sstart'] >= row['sstart']) & (neg_data_bed['send'] <= row['send'])]
#     if not overlapping_negs.empty:
#         cds_with_neg_inside[f'{row['sseqid']}_{row['sstart']}_{row['send']}'] = overlapping_negs
#     else:
#         continue

## And check with our data have CDS inside
neg_with_cds_inside = {}
for index, row in neg_data_bed.iterrows():
    overlapping_cds = cds_bed[
       ((cds_bed['sstart'] >= row['sstart']) & (cds_bed['sstart'] <= row['send'])) |
       ((cds_bed['send'] <= row['send']) & (cds_bed['send'] >= row['sstart'])) &
        (cds_bed['sseqid'] == row['sseqid'])
        ]
    if not overlapping_cds.empty:
        neg_with_cds_inside[f'{row['sseqid']}_{row['sstart']}_{row['send']}'] = overlapping_cds
    else:
        continue

In [66]:
neg_with_cds_inside

{'LinJ.01_271364_271651':        sseqid  sstart    send
 321   LinJ.04  271575  272033
 1423  LinJ.12  271565  272131
 3192  LinJ.22  271564  275001,
 'LinJ.02_17014_17130':       sseqid  sstart   send
 775  LinJ.08   17117  19006,
 'LinJ.02_74330_74460':        sseqid  sstart   send
 5646  LinJ.31   74391  75416,
 'LinJ.02_96678_97129':        sseqid  sstart    send
 665   LinJ.07   97023  102620
 2563  LinJ.19   96693   98588
 2928  LinJ.21   96983  100294
 6002  LinJ.32   96878  100786
 6428  LinJ.33   96758   98914,
 'LinJ.03_173456_173625':        sseqid  sstart    send
 6449  LinJ.33  173621  173920,
 'LinJ.04_142412_142843':        sseqid  sstart    send
 1879  LinJ.15  142596  142922
 2399  LinJ.18  142675  143589
 3162  LinJ.22  142474  143298
 6440  LinJ.33  142694  144796,
 'LinJ.04_175855_176491':        sseqid  sstart    send
 299   LinJ.04  176402  179428
 1111  LinJ.10  176092  176334
 1889  LinJ.15  176369  177937
 2246  LinJ.17  176358  176906
 5288  LinJ.30  176153  1

In [67]:
len(neg_with_cds_inside)

674

In [68]:
count_cds = 0
for key, value in neg_with_cds_inside.items():
    count_cds += value.shape[0]


3324