In [8]:
# Import needed modules
import numpy as np
import pandas as pd
import subprocess

In [2]:
# Load the SIDER annotation GFF. The file is tab-separated and has no header
# row, so pandas labels the columns with integers starting at 0.
gff_path = "../Data/diff_formats/sider_project.gff"
data_gff = pd.read_csv(gff_path, header=None, sep="\t")
data_gff.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,LinJ.01,CBM-302,SIDER,24093,24758,.,+,.,.
1,LinJ.01,CBM-302,SIDER,35316,35933,.,+,.,.
2,LinJ.01,CBM-302,SIDER,39698,40581,.,+,.,.
3,LinJ.01,CBM-302,SIDER,54885,55529,.,+,.,.
4,LinJ.01,CBM-302,SIDER,72760,73757,.,+,.,.


In [5]:
# Sanity check: confirm that pd.read_csv returned a DataFrame
type(data_gff)

pandas.core.frame.DataFrame

In [6]:
# bedops expects its input in sort-bed order, so sort by chromosome name
# (column 0), then by start (column 3), with end (column 4) breaking ties
# between features that share the same start position.
data_gff_sorted = data_gff.sort_values(by=[0, 3, 4])
data_gff_sorted.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
2793,LinJ.01,CBM-302,SIDER,1,173,.,-,.,.
2794,LinJ.01,CBM-302,SIDER,13302,14301,.,-,.,.
0,LinJ.01,CBM-302,SIDER,24093,24758,.,+,.,.
2795,LinJ.01,CBM-302,SIDER,24199,24791,.,-,.,.
1,LinJ.01,CBM-302,SIDER,35316,35933,.,+,.,.


In [7]:
# Write the interval file that BEDOPS will read: chromosome, start, end and
# strand (GFF columns 0, 3, 4 and 6), tab-separated, without header or index.
# NOTE(review): start/end are copied from the GFF unchanged and strand ends up
# in the fourth field — confirm this matches the BED layout BEDOPS should see.
bed_fields = [0, 3, 4, 6]
bed_path = "./data_gff_sorted.bed"
data_gff_sorted.loc[:, bed_fields].to_csv(bed_path, sep="\t", index=False, header=False)

In [9]:
# Run `bedops --merge` on the sorted BED file and capture its stdout.
# The command is given as an argument list, so no shell is spawned to parse it.
# check_output raises subprocess.CalledProcessError if bedops exits non-zero,
# and universal_newlines=True makes the captured output a str instead of bytes.
data_bedos = subprocess.check_output(
    ["bedops", "--merge", "./data_gff_sorted.bed"],
    universal_newlines=True,
)

In [11]:
# Sanity check: with universal_newlines=True the captured stdout is a str, not bytes
type(data_bedos)

str

In [12]:
# Parse the BEDOPS output (one tab-separated interval per line) into a DataFrame.
# Blank lines are skipped. The three columns are labelled 0-2 explicitly so the
# frame keeps its shape even when the output is empty, and the start/end columns
# are cast to integers instead of being left as strings.
merged_rows = [line.split("\t") for line in data_bedos.splitlines() if line]
data_bedops_df = pd.DataFrame(merged_rows, columns=[0, 1, 2]).astype({1: "int64", 2: "int64"})
data_bedops_df.head()

Unnamed: 0,0,1,2
0,LinJ.01,1,173
1,LinJ.01,13302,14301
2,LinJ.01,24093,24791
3,LinJ.01,35316,36160
4,LinJ.01,39698,40589


In [13]:
# Re-wrap the merged intervals in the nine GFF columns for IGV visualization.
# Columns 0/1/2 of the BEDOPS frame become seqname/start/end; every other field
# is a constant ("." where no value is available, including strand).
gff_columns = ['seqname', 'source', 'feature', 'start', 'end',
               'score', 'strand', 'frame', 'attribute']
GFF_data = (
    data_bedops_df
    .rename(columns={0: 'seqname', 1: 'start', 2: 'end'})
    .assign(
        source="CBM-302",
        feature="SIDER",
        score=".",
        strand=".",
        frame=".",
        attribute=".",
    )
    .loc[:, gff_columns]
)
GFF_data.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,LinJ.01,CBM-302,SIDER,1,173,.,.,.,.
1,LinJ.01,CBM-302,SIDER,13302,14301,.,.,.,.
2,LinJ.01,CBM-302,SIDER,24093,24791,.,.,.,.
3,LinJ.01,CBM-302,SIDER,35316,36160,.,.,.,.
4,LinJ.01,CBM-302,SIDER,39698,40589,.,.,.,.


In [14]:
# Save the merged features as a tab-separated GFF (no header row, no index column)
merged_gff_path = "./data_bedops.gff"
GFF_data.to_csv(merged_gff_path, sep="\t", index=False, header=False)