In [14]:
# Prepare needed modules
import os
import subprocess

import pandas as pd

In [6]:
# Read the comma-separated results table (its first row holds the column
# names), then show its dimensions, per-column dtypes and leading rows.
data = pd.read_csv(
    "./data/Results_Program_LmSIDERs_30-05-54.csv",
    header=0,
    sep=",",
)
for summary in (data.shape, data.dtypes):
    print(summary)
data.head()

(4886, 14)
qseqid      float64
sseqid       object
pident      float64
length        int64
qstart      float64
qend        float64
sstart        int64
send          int64
evalue      float64
bitscore    float64
qlen        float64
slen        float64
sstrand      object
sseq         object
dtype: object


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,,LinJ.01,,1000,,,1000,1,,,,,minus,GTTCTATCCATCGACCTGCACCTGCACACATGAGCTGCAAAAAGTT...
1,,LinJ.01,,784,,,24876,24093,,,,,minus,CTCCTGTCTGAGAGCGGTGTGGCGCATGGTGCCGCGCCCTTTGCGC...
2,,LinJ.01,,927,,,36297,35371,,,,,minus,GTGGGCCTCTCCGTGTCTCCGTGCCGTCTGCTTTCCCTCTTCTCAA...
3,,LinJ.01,,806,,,40595,39790,,,,,minus,CCCCGCCCCTTGGCTGGCATGGACGGAAATGGACGATGAAGACAGC...
4,,LinJ.01,,927,,,55909,54983,,,,,minus,CCTTGCACAGCGGATTTCCATAGGCTTCTCTCAGCTCAAGGAAATC...


In [8]:
# Keep only the columns used from here on: sequence id, hit coordinates and
# strand. The copy detaches the subset from `data`.
needed_columns = ["sseqid", "sstart", "send", "sstrand"]
data_short = data.loc[:, needed_columns].copy()
for summary in (data_short.shape, data_short.dtypes):
    print(summary)
data_short.head()

(4886, 4)
sseqid     object
sstart      int64
send        int64
sstrand    object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand
0,LinJ.01,1000,1,minus
1,LinJ.01,24876,24093,minus
2,LinJ.01,36297,35371,minus
3,LinJ.01,40595,39790,minus
4,LinJ.01,55909,54983,minus


In [9]:
# Split the hits into two independent frames according to the "sstrand" label
strand = data_short["sstrand"]
data_plus = data_short.loc[strand == "plus"].copy()
data_minus = data_short.loc[strand == "minus"].copy()
print(data_plus.shape)
print(data_minus.shape)

(1936, 4)
(2950, 4)


In [10]:
# Drop the strand label so both frames have the (sseqid, sstart, send) layout.
data_plus = data_plus[["sseqid", "sstart", "send"]].copy()
data_minus = data_minus[["sseqid", "sstart", "send"]].copy()

# Minus-strand rows are listed with sstart > send (see the preview of the
# loaded table). Order every pair as (low, high) using the row-wise min/max
# rather than swapping the two columns: a blind swap would flip the
# coordinates back to start > end if this cell were executed a second time,
# whereas min/max gives the same result however often it runs.
minus_coords = data_minus[["sstart", "send"]]
data_minus = data_minus.assign(
    sstart=minus_coords.min(axis=1),
    send=minus_coords.max(axis=1),
)

In [16]:
# bedops expects its inputs in sort-bed order: sequence name, then start,
# then end. "send" is part of the key so that intervals sharing a start
# coordinate are ordered deterministically as well. The frames are rebound
# rather than sorted in place, so no previously displayed object is mutated.
bed_sort_key = ["sseqid", "sstart", "send"]
data_plus = data_plus.sort_values(by=bed_sort_key)
data_minus = data_minus.sort_values(by=bed_sort_key)

In [17]:
# Write each strand as a headerless, tab-separated three-column file that
# bedops can read.
path_data_plus_bed = "./bedops_data/plus.bed"
path_data_minus_bed = "./bedops_data/minus.bed"
# NOTE(review): the coordinates are written exactly as they appear in the
# input table. BED is conventionally 0-based, half-open - confirm whether the
# start column should be shifted before it is handed to bedops.
for bed_path, strand_df in (
    (path_data_plus_bed, data_plus),
    (path_data_minus_bed, data_minus),
):
    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs(os.path.dirname(bed_path), exist_ok=True)
    strand_df.to_csv(bed_path, sep="\t", index=False, header=False)

In [42]:
# BEDOPS plus in minus: plus-strand elements that bedops reports as
# overlapping a minus-strand element (--element-of with a threshold of 1).
# The command is an argument list run without a shell, so the file paths
# reach bedops verbatim even if they contain spaces or shell metacharacters.
cmd = ["bedops", "--element-of", "1", path_data_plus_bed, path_data_minus_bed]
plus_in_minus = subprocess.check_output(cmd, universal_newlines=True)
# One output line per element: sequence id, start, end (tab-separated);
# the `if x` filter drops the empty string left after the final newline.
plus_in_minus_df = pd.DataFrame([x.split("\t") for x in plus_in_minus.split("\n") if x],
                                columns=["sseqid", "sstart", "send"])
plus_in_minus_df[["sstart", "send"]] = plus_in_minus_df[["sstart", "send"]].apply(pd.to_numeric)  # split() yields strings; convert the coordinates to numeric

print(f"{plus_in_minus_df.shape[0]} elements out of {data_plus.shape[0]} are in minus strand")
print(f"{data_plus.shape[0] - plus_in_minus_df.shape[0]} elements are not in minus strand")
plus_in_minus_df.head()

1895 elements out of 1936 are in minus strand
41 elements are not in minus strand


Unnamed: 0,sseqid,sstart,send
0,LinJ.01,1,1000
1,LinJ.01,23733,24758
2,LinJ.01,35155,36154
3,LinJ.01,39516,40595
4,LinJ.01,54751,55750


In [43]:
# BEDOPS minus in plus: minus-strand elements that bedops reports as
# overlapping a plus-strand element (--element-of with a threshold of 1).
# The command is an argument list run without a shell, so the file paths
# reach bedops verbatim even if they contain spaces or shell metacharacters.
cmd = ["bedops", "--element-of", "1", path_data_minus_bed, path_data_plus_bed]
minus_in_plus = subprocess.check_output(cmd, universal_newlines=True)
# One output line per element: sequence id, start, end (tab-separated);
# the `if x` filter drops the empty string left after the final newline.
minus_in_plus_df = pd.DataFrame([x.split("\t") for x in minus_in_plus.split("\n") if x],
                                columns=["sseqid", "sstart", "send"])
minus_in_plus_df[["sstart", "send"]] = minus_in_plus_df[["sstart", "send"]].apply(pd.to_numeric)  # split() yields strings; convert the coordinates to numeric
print(f"{minus_in_plus_df.shape[0]} elements out of {data_minus.shape[0]} are in plus strand")
print(f"{data_minus.shape[0] - minus_in_plus_df.shape[0]} elements are not in plus strand")
minus_in_plus_df.head()

2909 elements out of 2950 are in plus strand
41 elements are not in plus strand


Unnamed: 0,sseqid,sstart,send
0,LinJ.01,1,1000
1,LinJ.01,24093,24876
2,LinJ.01,35371,36297
3,LinJ.01,39790,40595
4,LinJ.01,54983,55909


In [44]:
# BEDOPS NOT plus in minus: plus-strand elements that bedops does not report
# as overlapping any minus-strand element (--not-element-of, threshold 1).
# The command is an argument list run without a shell, so the file paths
# reach bedops verbatim even if they contain spaces or shell metacharacters.
cmd = ["bedops", "--not-element-of", "1", path_data_plus_bed, path_data_minus_bed]
NOT_plus_in_minus = subprocess.check_output(cmd, universal_newlines=True)
# One output line per element: sequence id, start, end (tab-separated);
# the `if x` filter drops the empty string left after the final newline.
NOT_plus_in_minus_df = pd.DataFrame([x.split("\t") for x in NOT_plus_in_minus.split("\n") if x],
                                    columns=["sseqid", "sstart", "send"])
NOT_plus_in_minus_df[["sstart", "send"]] = NOT_plus_in_minus_df[["sstart", "send"]].apply(pd.to_numeric)  # split() yields strings; convert the coordinates to numeric
print(f"{NOT_plus_in_minus_df.shape[0]} elements out of {data_plus.shape[0]} are not in minus strand")
print(f"{data_plus.shape[0] - NOT_plus_in_minus_df.shape[0]} elements are in minus strand")
NOT_plus_in_minus_df.head()

41 elements out of 1936 are not in minus strand
1895 elements are in minus strand


Unnamed: 0,sseqid,sstart,send
0,LinJ.03,215131,216143
1,LinJ.03,216377,217376
2,LinJ.03,243013,244012
3,LinJ.03,263177,264176
4,LinJ.03,281108,282107


In [45]:
# BEDOPS NOT minus in plus: minus-strand elements that bedops does not report
# as overlapping any plus-strand element (--not-element-of, threshold 1).
# The command is an argument list run without a shell, so the file paths
# reach bedops verbatim even if they contain spaces or shell metacharacters.
cmd = ["bedops", "--not-element-of", "1", path_data_minus_bed, path_data_plus_bed]
NOT_minus_in_plus = subprocess.check_output(cmd, universal_newlines=True)
# One output line per element: sequence id, start, end (tab-separated);
# the `if x` filter drops the empty string left after the final newline.
NOT_minus_in_plus_df = pd.DataFrame([x.split("\t") for x in NOT_minus_in_plus.split("\n") if x],
                                    columns=["sseqid", "sstart", "send"])
NOT_minus_in_plus_df[["sstart", "send"]] = NOT_minus_in_plus_df[["sstart", "send"]].apply(pd.to_numeric)  # split() yields strings; convert the coordinates to numeric
print(f"{NOT_minus_in_plus_df.shape[0]} elements out of {data_minus.shape[0]} are not in plus strand")
print(f"{data_minus.shape[0] - NOT_minus_in_plus_df.shape[0]} elements are in plus strand")
NOT_minus_in_plus_df.head()

41 elements out of 2950 are not in plus strand
2909 elements are in plus strand


Unnamed: 0,sseqid,sstart,send
0,LinJ.02,150383,151369
1,LinJ.04,96585,97584
2,LinJ.04,97729,98226
3,LinJ.04,100424,101110
4,LinJ.05,109889,110888


In [46]:
# Overlapping plus-strand elements tallied per "sseqid", largest count first
plus_in_minus_df.value_counts(subset="sseqid")

sseqid
LinJ.36    135
LinJ.34    125
LinJ.35    121
LinJ.31    109
LinJ.33     95
LinJ.29     93
LinJ.32     79
LinJ.30     79
LinJ.27     70
LinJ.23     64
LinJ.19     53
LinJ.26     53
LinJ.28     51
LinJ.18     48
LinJ.22     48
LinJ.24     48
LinJ.14     44
LinJ.17     44
LinJ.10     42
LinJ.16     41
LinJ.21     40
LinJ.25     39
LinJ.20     38
LinJ.12     37
LinJ.13     35
LinJ.08     34
LinJ.11     33
LinJ.09     32
LinJ.07     28
LinJ.15     27
LinJ.06     26
LinJ.05     24
LinJ.04     19
LinJ.02     18
LinJ.01     15
LinJ.03      8
Name: count, dtype: int64

In [30]:
# Overlapping minus-strand elements tallied per "sseqid", largest count first
minus_in_plus_df.value_counts(subset="sseqid")

sseqid
LinJ.34    326
LinJ.36    197
LinJ.29    183
LinJ.35    182
LinJ.31    149
LinJ.33    145
LinJ.30    122
LinJ.32    113
LinJ.08    100
LinJ.27     99
LinJ.19     88
LinJ.23     85
LinJ.26     77
LinJ.22     71
LinJ.12     71
LinJ.24     69
LinJ.28     66
LinJ.18     61
LinJ.17     57
LinJ.20     54
LinJ.25     53
LinJ.14     52
LinJ.16     52
LinJ.10     50
LinJ.21     49
LinJ.11     45
LinJ.09     42
LinJ.13     42
LinJ.07     39
LinJ.15     34
LinJ.06     32
LinJ.05     28
LinJ.02     27
LinJ.04     24
LinJ.01     16
LinJ.03      9
Name: count, dtype: int64

In [36]:
# Save both overlap tables as headerless, tab-separated files so they can be
# fed back into bedops.
path_plus_in_minus_bed = "./bedops_data/plus_in_minus.bed"
path_minus_in_plus_bed = "./bedops_data/minus_in_plus.bed"
bed_format = {"sep": "\t", "index": False, "header": False}
plus_in_minus_df.to_csv(path_plus_in_minus_bed, **bed_format)
minus_in_plus_df.to_csv(path_minus_in_plus_bed, **bed_format)

In [47]:
# Merge them: run `bedops --merge` over both overlap files and load the
# regions it reports.
# The command is an argument list run without a shell, so the file paths
# reach bedops verbatim even if they contain spaces or shell metacharacters.
cmd = ["bedops", "--merge", path_plus_in_minus_bed, path_minus_in_plus_bed]
merged = subprocess.check_output(cmd, universal_newlines=True)
# One output line per region: sequence id, start, end (tab-separated);
# the `if x` filter drops the empty string left after the final newline.
merged_df = pd.DataFrame([x.split("\t") for x in merged.split("\n") if x],
                         columns=["sseqid", "sstart", "send"])
merged_df[["sstart", "send"]] = merged_df[["sstart", "send"]].apply(pd.to_numeric)  # split() yields strings; convert the coordinates to numeric
print(merged_df.shape)
merged_df.head()

(1878, 3)


Unnamed: 0,sseqid,sstart,send
0,LinJ.01,1,1000
1,LinJ.01,23733,24876
2,LinJ.01,35155,36297
3,LinJ.01,39516,40595
4,LinJ.01,54751,55909


In [74]:
# Inner join on all three columns: keeps the rows of plus_in_minus_df whose
# (sseqid, sstart, send) triple also occurs, unchanged, in merged_df.
interval_key = ["sseqid", "sstart", "send"]
matching_rows = pd.merge(plus_in_minus_df, merged_df, how="inner", on=interval_key)
for summary in (matching_rows.shape, matching_rows.dtypes):
    print(summary)
matching_rows.head()

(542, 3)
sseqid    object
sstart     int64
send       int64
dtype: object


Unnamed: 0,sseqid,sstart,send
0,LinJ.01,1,1000
1,LinJ.01,39516,40595
2,LinJ.01,112660,114388
3,LinJ.01,144999,146653
4,LinJ.01,261440,262439
