# BEDFILE transformation and BEDOPS use

The essentail modules used:

In [3]:
# Load the needed modules:
import numpy as np
import pandas as pd

## 1) Reading and filtering data

### Importing data

In [4]:
# Read Data Frame about the "LinJ.01_BLAST_MAIN.csv", which contains the correct coordinates which where launched agains the whole genome:
df1 = pd.read_csv("./LinJ.01_BLAST_MAIN.csv", sep=',', header=None) # No header
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,Seq_1_LinJ.01_plus,LinJ.01,100.0,992,992,278267,0,0,1,992,23937,24928,0.0,1832,plus,AGGCGATGCTGTGAGGAGCATGGAATTGGGAAGCACCGTCACGTCG...
1,Seq_1_LinJ.01_plus,LinJ.01,99.534,644,992,278267,3,0,179,822,55528,54885,0.0,1173,minus,GGCACGCACCTCCATGCGTGGCATCCCAGGGTCCAGCGCCCCCCCC...
2,Seq_1_LinJ.01_plus,LinJ.01,99.192,619,992,278267,3,2,163,780,35933,35316,0.0,1114,minus,GGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCATCCCAGG...
3,Seq_1_LinJ.01_plus,LinJ.01,98.441,449,992,278267,1,5,186,633,75923,76366,0.0,785,plus,ACCTCAGCGTGGCATCCCAGGGTCCAGCGCCCCCCCTCCACCCCCG...
4,Seq_1_LinJ.01_plus,LinJ.01,86.207,174,992,278267,21,3,345,518,137679,137849,2.25e-47,185,plus,GCGCGAGTTAGGGCTACGGACGTCAGCGGCCATGTCGTGCATGGCG...


### Divide data among "minus" and "plus" strands.

In [5]:
# For plus strand
df1_plus = df1[df1[14] == 'plus']  # Using pandas slicing
df1_plus.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,Seq_1_LinJ.01_plus,LinJ.01,100.0,992,992,278267,0,0,1,992,23937,24928,0.0,1832,plus,AGGCGATGCTGTGAGGAGCATGGAATTGGGAAGCACCGTCACGTCG...
3,Seq_1_LinJ.01_plus,LinJ.01,98.441,449,992,278267,1,5,186,633,75923,76366,0.0,785,plus,ACCTCAGCGTGGCATCCCAGGGTCCAGCGCCCCCCCTCCACCCCCG...
4,Seq_1_LinJ.01_plus,LinJ.01,86.207,174,992,278267,21,3,345,518,137679,137849,2.25e-47,185,plus,GCGCGAGTTAGGGCTACGGACGTCAGCGGCCATGTCGTGCATGGCG...
5,Seq_1_LinJ.01_plus,LinJ.01,86.207,174,992,278267,21,3,345,518,205751,205921,2.25e-47,185,plus,GCGCGAGTTAGGGCTACGGACGTCAGCGGCCATGTCGTGCATGGCG...
8,Seq_2_LinJ.01_minus,LinJ.01,99.534,644,998,278267,3,0,179,822,24115,24758,0.0,1173,plus,GGCACGCACCTCCATGCGTGGCATCCCAGGGTCCAGCGCCCCCCCC...


In [6]:
# For the minus st rand
df1_minus = df1[df1[14] == 'minus']
df1_minus.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
1,Seq_1_LinJ.01_plus,LinJ.01,99.534,644,992,278267,3,0,179,822,55528,54885,0.0,1173,minus,GGCACGCACCTCCATGCGTGGCATCCCAGGGTCCAGCGCCCCCCCC...
2,Seq_1_LinJ.01_plus,LinJ.01,99.192,619,992,278267,3,2,163,780,35933,35316,0.0,1114,minus,GGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCATCCCAGG...
6,Seq_1_LinJ.01_plus,LinJ.01,85.632,174,992,278267,22,3,345,518,130370,130200,1.05e-45,180,minus,GCGCGAGTTAGGGCTACGGACGTCAGCGGCCATGTCGTGCATGGCG...
7,Seq_2_LinJ.01_minus,LinJ.01,100.0,998,998,278267,0,0,1,998,55706,54709,0.0,1844,minus,CCGACTGCATGGATGTCCTGCGCGTTCACGTCTACTGCCCGCAGGC...
9,Seq_2_LinJ.01_minus,LinJ.01,99.171,603,998,278267,3,2,179,780,35917,35316,0.0,1085,minus,GGCACGCACCTCCATGCGTGGCATCCCAGGGTCCAGCGCCCCCCCC...


### Sort the data:
Sorting the data by the *start of alignment* sequene, i.e., **row[10]** in the data.

In [7]:
# Sorting "plus" data by column 10.
df1_plus_sorted = df1_plus.sort_values(by=10)
df1_plus_sorted

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
176,Seq_33_LinJ.01_plus,LinJ.01,100.000,1000,1000,278267,0,0,1,1000,6833,7832,0.000000e+00,1847,plus,ACTGGACTGGTAGAGCGTGCCGTAGCCGAGGTAACGCACGTATCGC...
57,Seq_10_LinJ.01_plus,LinJ.01,100.000,1000,1000,278267,0,0,1,1000,6833,7832,0.000000e+00,1847,plus,ACTGGACTGGTAGAGCGTGCCGTAGCCGAGGTAACGCACGTATCGC...
117,Seq_22_LinJ.01_plus,LinJ.01,100.000,1000,1000,278267,0,0,1,1000,16951,17950,0.000000e+00,1847,plus,GCATGGCTTGTTGGGAATCGGTTCTGTTTGTGGGTAAGATGAATGG...
177,Seq_34_LinJ.01_plus,LinJ.01,100.000,1000,1000,278267,0,0,1,1000,16951,17950,0.000000e+00,1847,plus,GCATGGCTTGTTGGGAATCGGTTCTGTTTGTGGGTAAGATGAATGG...
58,Seq_11_LinJ.01_plus,LinJ.01,100.000,1000,1000,278267,0,0,1,1000,16951,17950,0.000000e+00,1847,plus,GCATGGCTTGTTGGGAATCGGTTCTGTTTGTGGGTAAGATGAATGG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,Seq_60_LinJ.01_plus,LinJ.01,86.207,174,1000,278267,21,3,414,587,205751,205921,2.270000e-47,185,plus,GCGCGAGTTAGGGCTACGGACGTCAGCGGCCATGTCGTGCATGGCG...
219,Seq_41_LinJ.01_minus,LinJ.01,86.207,174,939,278267,21,3,414,587,205751,205921,2.120000e-47,185,plus,GCGCGAGTTAGGGCTACGGACGTCAGCGGCCATGTCGTGCATGGCG...
26,Seq_4_LinJ.01_plus,LinJ.01,86.207,174,1000,278267,21,3,434,607,205751,205921,2.270000e-47,185,plus,GCGCGAGTTAGGGCTACGGACGTCAGCGGCCATGTCGTGCATGGCG...
19,Seq_3_LinJ.01_minus,LinJ.01,86.207,174,999,278267,21,3,374,547,205751,205921,2.260000e-47,185,plus,GCGCGAGTTAGGGCTACGGACGTCAGCGGCCATGTCGTGCATGGCG...


In [8]:
# Same with the "minus" strand:
df1_minus_sorted = df1_minus.sort_values(by=10)
df1_minus_sorted

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
204,Seq_39_LinJ.01_plus,LinJ.01,86.207,174,915,278267,21,3,478,648,35750,35577,2.070000e-47,185,minus,GCGCGACGTAGCGCCACGGATGCCGGCGGCCATGTCGTGCATGGCG...
386,Seq_65_LinJ.01_plus,LinJ.01,86.207,174,989,278267,21,3,474,644,35750,35577,2.240000e-47,185,minus,GCGCGACGTAGCGCCACGGATGCCGGCGGCCATGTCGTGCATGGCG...
109,Seq_19_LinJ.01_minus,LinJ.01,85.632,174,949,278267,22,3,416,586,35750,35577,9.990000e-46,180,minus,GCGCGACGTAGCGCCACGGATGCCGGCGGCCATGTCGTGCATGGCG...
160,Seq_29_LinJ.01_plus,LinJ.01,86.207,174,951,278267,21,3,416,586,35750,35577,2.150000e-47,185,minus,GCGCGACGTAGCGCCACGGATGCCGGCGGCCATGTCGTGCATGGCG...
49,Seq_7_LinJ.01_minus,LinJ.01,85.632,174,949,278267,22,3,416,586,35750,35577,9.990000e-46,180,minus,GCGCGACGTAGCGCCACGGATGCCGGCGGCCATGTCGTGCATGGCG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,Seq_54_LinJ.01_minus,LinJ.01,100.000,997,997,278267,0,0,1,997,130850,129854,0.000000e+00,1842,minus,CTCCGTGAGGGCACAGAGAGAGAGAGAGAGCAAAGCCGTCTGGCAG...
235,Seq_44_LinJ.01_minus,LinJ.01,100.000,998,998,278267,0,0,1,998,130850,129853,0.000000e+00,1844,minus,CTCCGTGAGGGCACAGAGAGAGAGAGAGAGCAAAGCCGTCTGGCAG...
191,Seq_38_LinJ.01_minus,LinJ.01,100.000,911,911,278267,0,0,1,911,130854,129944,0.000000e+00,1683,minus,CTCCCTCCGTGAGGGCACAGAGAGAGAGAGAGAGCAAAGCCGTCTG...
363,Seq_62_LinJ.01_minus,LinJ.01,100.000,1000,1000,278267,0,0,1,1000,221391,220392,0.000000e+00,1847,minus,TTCGATCGAACGCGCAGGGTCGACGCCCCCCCCCCCTTATCCGTCC...


## Creation of BED files:

In [9]:
# Save the data in BED files
df1_plus_sorted[[1, 10, 11]].to_csv('df1_plus_sorted.bed', sep='\t', header=False, index=False)
df1_minus_sorted[[1, 10, 11]].to_csv('df1_minus_sorted.bed', sep='\t', header=False, index=False)

In [10]:
# Call bedops --merge in both BED files
!bedops --merge df1_plus_sorted.bed > df1_plus_sorted_merged.bed
!bedops --merge df1_minus_sorted.bed > df1_minus_sorted_merged.bed

In [13]:
# Let's see the difference between the start and end of the merged regions for the plus strand
!gawk 'function abs(x){return x < 0 ? -x : x} BEGIN{OFS="\t"}{print abs($2-$3);}' df1_plus_sorted_merged.bed

999
999
1060
1019
1000
1111
1209


In [14]:
# And now for the minus strand
!gawk 'function abs(x){return x < 0 ? -x : x} BEGIN{OFS="\t"}{print abs($2-$3);}' df1_minus_sorted_merged.bed

173
173
226
132
170
246
999


### Importing output BED files into pandas Data Frames

In [15]:
# Importing data into "pandas" data frames
df2_plus = pd.read_csv("./df1_plus_sorted_merged.bed", sep='\t', header=None)
df2_minus = pd.read_csv("./df1_minus_sorted_merged.bed", sep='\t', header=None)

In [18]:
# View the data:
df2_plus

Unnamed: 0,0,1,2
0,LinJ.01,6833,7832
1,LinJ.01,16951,17950
2,LinJ.01,23868,24928
3,LinJ.01,75645,76664
4,LinJ.01,123005,124005
5,LinJ.01,137095,138206
6,LinJ.01,205077,206286


In [17]:
# Seing the type of data used:
type(df2_plus)

pandas.core.frame.DataFrame

In [19]:
# Same with minus.
df2_minus

Unnamed: 0,0,1,2
0,LinJ.01,35750,35577
1,LinJ.01,55362,55189
2,LinJ.01,113986,113760
3,LinJ.01,125420,125288
4,LinJ.01,130370,130200
5,LinJ.01,130568,130322
6,LinJ.01,221391,220392


## Using blastdbcmd to get the rest of the data

Functon creation to extract the sequence and keeping the coordinates and strand:

In [32]:
import subprocess
def get_data_sequence(data, strand):
    """
    This function gets the sequence of the data from the fasta file. It will keep the Chromosome ID, start coordinate, end coordinate and strand.

    :param data: A pandas data frame with the data read of the BED files.
    :type data: pandas.core.frame.DataFrame

    :param strand: The strand of the sequence. It can be "plus" or "minus".
    :type strand: str
    """
    list = []
    for index, row in data.iterrows():
        if strand == "plus":
            start = row[1]
            end = row[2]
        else: # The start and end are inverted for the minus strand
            start = row[2] 
            end = row[1] 
        sequence = subprocess.check_output("blastdbcmd -db " + "../1-5_chr.fasta" + " -entry " + row[0] + " -range " + str(start) + "-" + str(end) + " -strand " + strand + " -outfmt %s", shell=True, universal_newlines=True)
        list.append(row[0] + "," + str(row[1]) + "," + str(row[2]) + "," + strand + "," + sequence)

    list_split = [row.split(",") for row in list]
    list_split_df = pd.DataFrame(list_split)

    return list_split_df

In [24]:
# Call function on "plus" and "minus" sequence:
plus_data = get_data_sequence(df2_plus, "plus")
minus_data = get_data_sequence(df2_minus, "minus")

In [25]:
# Check plus data
plus_data

Unnamed: 0,0,1,2,3,4
0,LinJ.01,6833,7832,plus,ACTGGACTGGTAGAGCGTGCCGTAGCCGAGGTAACGCACGTATCGC...
1,LinJ.01,16951,17950,plus,GCATGGCTTGTTGGGAATCGGTTCTGTTTGTGGGTAAGATGAATGG...
2,LinJ.01,23868,24928,plus,GCAGGTGCCGGCAGGGGCGTCGTCCGATCCGCCGAGGGAGAGCGAA...
3,LinJ.01,75645,76664,plus,TCAAAGTGGGAGGAGAGCGCCGCTGAGCAGGCAAGCGAGGCAACCT...
4,LinJ.01,123005,124005,plus,CGTTGTTTTGGTTATGTTTGTGTGTGTGTGTGTGTGTATCGGCTTC...
5,LinJ.01,137095,138206,plus,GTCCACGCGTCGGGGGCGGGGGGGAGGGGGGGCATCTGCGGATACC...
6,LinJ.01,205077,206286,plus,CACGCGCGGATGGGGAGTGAGGGGAGGGGGCCGTGCCACCCACACC...


In [23]:
# Check minus data
minus_data

Unnamed: 0,0,1,2,3,4
0,LinJ.01,35750,35577,minus,GCGCGACGTAGCGCCACGGATGCCGGCGGCCATGTCGTGCATGGCG...
1,LinJ.01,55362,55189,minus,GCGCGACGTAGCGCCACGGATGCCGGCGGCCATGTCGTGCATGGCG...
2,LinJ.01,113986,113760,minus,CACACACACACACGCACACACCTCCGTGCGTGGGGGGGGGGTACCT...
3,LinJ.01,125420,125288,minus,TTCGATCGAACGCGCAGGGTCGACGCCCCCCCCCTTATCCGTCCAT...
4,LinJ.01,130370,130200,minus,GCGCGAGTTAGGGCTACGGACGTCAGCGGCCATGTCGTGCATGGCG...
5,LinJ.01,130568,130322,minus,GGCGGCGGCGCACAGACACACACACACACACACACACGCACACACC...
6,LinJ.01,221391,220392,minus,TTCGATCGAACGCGCAGGGTCGACGCCCCCCCCCCCTTATCCGTCC...


In [26]:
# Check the type of the data
type(minus_data)

pandas.core.frame.DataFrame

In [30]:
# Cocaenate the data into a single data frame
all_data = pd.concat([plus_data, minus_data])
all_data.shape

(14, 5)

In [31]:
# Check if it's still a data frame
type(all_data)

pandas.core.frame.DataFrame