# BEDFILE transformation and BEDOPS use

The essential modules used:

In [None]:
# Load the needed modules:
import numpy as np
import pandas as pd

## 1) Reading and filtering data

### Importing data

In [None]:
# Load "LinJ.01_BLAST_MAIN.csv", which holds the correct coordinates that were
# launched against the whole genome. The file has no header row, so pandas
# labels the columns with integers (comma is already the default separator).
df1 = pd.read_csv("./LinJ.01_BLAST_MAIN.csv", header=None)
df1.head()

### Divide the data between the "minus" and "plus" strands.

In [None]:
# Plus strand: keep the rows whose column 14 equals "plus" (boolean selection)
df1_plus = df1.loc[df1[14] == "plus"]
df1_plus.head()

In [None]:
# Minus strand: keep the rows whose column 14 equals "minus"
df1_minus = df1.loc[df1[14] == "minus"]
df1_minus.head()

### Sort the data:
Sort the data by the *start of alignment* coordinate, i.e., column 10 (**row[10]**) in the data.

In [None]:
# Order the "plus" rows by column 10, ascending (returns a new data frame)
df1_plus_sorted = df1_plus.sort_values(by=[10], ascending=True)
df1_plus_sorted

In [None]:
# Order the "minus" rows by column 10, ascending (returns a new data frame)
df1_minus_sorted = df1_minus.sort_values(by=[10], ascending=True)
df1_minus_sorted

## Creation of BED files:

In [None]:
# Write columns 1, 10 and 11 of each sorted frame to a tab-separated BED file,
# without the pandas index and without a header line.
df1_plus_sorted.loc[:, [1, 10, 11]].to_csv("df1_plus_sorted.bed", sep="\t", index=False, header=False)
df1_minus_sorted.loc[:, [1, 10, 11]].to_csv("df1_minus_sorted.bed", sep="\t", index=False, header=False)

In [None]:
# Run "bedops --merge" on both BED files (IPython shell escapes) and redirect
# each result to its own "*_merged.bed" file.
!bedops --merge df1_plus_sorted.bed > df1_plus_sorted_merged.bed
!bedops --merge df1_minus_sorted.bed > df1_minus_sorted_merged.bed

In [None]:
# Print, for every merged region of the plus strand, the absolute difference
# between field 2 and field 3 (start and end) of the merged BED file.
!gawk 'function abs(x){return x < 0 ? -x : x} BEGIN{OFS="\t"}{print abs($2-$3);}' df1_plus_sorted_merged.bed

In [None]:
# Same for the minus strand: absolute difference between field 2 and field 3
# of every line in the merged minus-strand BED file.
!gawk 'function abs(x){return x < 0 ? -x : x} BEGIN{OFS="\t"}{print abs($2-$3);}' df1_minus_sorted_merged.bed

### Importing output BED files into pandas Data Frames

In [None]:
# Load the merged BED files back into pandas. read_table splits on tabs by
# default; the files carry no header line, so columns are labelled 0, 1, 2.
df2_plus = pd.read_table("./df1_plus_sorted_merged.bed", header=None)
df2_minus = pd.read_table("./df1_minus_sorted_merged.bed", header=None)

In [None]:
# Display the merged plus-strand data frame
df2_plus

In [None]:
# Check the type of the object returned by the read
type(df2_plus)

In [None]:
# Display the merged minus-strand data frame
df2_minus

## Using blastdbcmd to get the rest of the data

Create a function that extracts each sequence while keeping its coordinates and strand:

In [None]:
import subprocess
def get_data_sequence(data, strand, db_path="../1-5_chr.fasta"):
    """
    Fetch the sequence of every interval in ``data`` by calling ``blastdbcmd``.

    For each row, ``blastdbcmd`` is run with ``-entry`` set to column 0,
    ``-range`` built from columns 1 and 2, ``-strand`` set to ``strand`` and
    ``-outfmt %s``. The row's ID, both coordinates and the strand are kept
    next to the returned sequence.

    :param data: A pandas data frame with the data read from the BED files;
        columns 0, 1 and 2 are read as sequence ID and the two coordinates.
    :type data: pandas.core.frame.DataFrame

    :param strand: The strand of the sequence. It can be "plus" or "minus".
    :type strand: str

    :param db_path: BLAST database passed to ``blastdbcmd -db``. Defaults to
        the path this notebook has always used.
    :type db_path: str

    :return: A data frame with five string columns (labels 0-4): ID,
        column 1 value, column 2 value, strand and the sequence with all
        newline characters removed. It is empty (but still has the five
        columns) when ``data`` has no rows.
    :rtype: pandas.core.frame.DataFrame

    :raises ValueError: If ``strand`` is neither "plus" nor "minus".
    :raises subprocess.CalledProcessError: If ``blastdbcmd`` exits with a
        non-zero status.
    """
    if strand not in ("plus", "minus"):
        raise ValueError('strand must be "plus" or "minus", got {!r}'.format(strand))

    records = []
    for _, row in data.iterrows():
        seq_id, first, second = str(row[0]), str(row[1]), str(row[2])
        if strand == "plus":
            start, end = first, second
        else:
            # The start and end are inverted for the minus strand.
            # NOTE(review): assumes the minus-strand BED rows store the larger
            # coordinate first - confirm against the merged BED file.
            start, end = second, first

        # An argument list (no shell) passes every value to blastdbcmd verbatim,
        # so IDs or paths with spaces/shell metacharacters cannot break the call.
        sequence = subprocess.check_output(
            [
                "blastdbcmd",
                "-db", db_path,
                "-entry", seq_id,
                "-range", "{}-{}".format(start, end),
                "-strand", strand,
                "-outfmt", "%s",
            ],
            universal_newlines=True,
        )
        # Important: remove the new line characters from the sequence.
        records.append([seq_id, first, second, strand, sequence.replace("\n", "")])

    # Explicit column labels keep the 5-column layout even when there are no rows.
    return pd.DataFrame(records, columns=range(5))

In [None]:
# Retrieve the sequences for the merged "plus" and "minus" intervals
plus_data = get_data_sequence(data=df2_plus, strand="plus")
minus_data = get_data_sequence(data=df2_minus, strand="minus")

In [None]:
# Display the plus-strand result of get_data_sequence
plus_data

In [None]:
# Display the minus-strand result of get_data_sequence
minus_data

In [None]:
# Check the type of the object returned by get_data_sequence
type(minus_data)

In [None]:
# Concatenate the plus and minus results into a single data frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 row index instead
# of repeating the row labels of both inputs (which would make label-based
# selection ambiguous).
all_data = pd.concat([plus_data, minus_data], ignore_index=True)
all_data.shape

In [None]:
# Check that the concatenation result is still a data frame
type(all_data)

Adding the sequence length to the Data Frame:

In [None]:
# Length of every value in column 4 (the sequence), in row order
new_column = list(map(len, all_data[4]))
# Place the lengths as a new column named "New" at position 1 (modifies all_data in place)
all_data.insert(loc=1, column="New", value=new_column)
all_data.head()

In [None]:
# Relabel the columns 0..n-1 so the inserted "New" column gets an integer label too
all_data.columns = range(all_data.shape[1])
all_data.head()

## Output the CSV file:

In [None]:
# Preview the first rows of the final data frame
all_data.head()

In [None]:
# Number of rows and columns of the final data frame
all_data.shape

In [None]:
# Print the summary (column dtypes, non-null counts) of the final data frame
all_data.info()

In [None]:
# Check the Python type of the value stored in the first row, column position 5
type(all_data.iloc[0, 5])

In [None]:

# Empty 16-column frame with exactly one row per row of all_data. The row count
# is taken from all_data instead of being hard-coded, so the assignment below
# works for any number of sequences.
data_to_csv = pd.DataFrame(index=range(len(all_data)), columns=range(16))

# Assign values from the original DataFrame to the specified columns in the new DataFrame
# (source positions 0-5 go to target positions 1, 3, 10, 11, 14 and 15; the
# remaining target columns stay empty).
data_to_csv.iloc[:, [1, 3, 10, 11, 14, 15]] = all_data.iloc[:, [0, 1, 2, 3, 4, 5]]

# Display the new DataFrame
data_to_csv.head()

In [None]:
# Save the data ignoring index and headers. header=False is the documented way
# to suppress the header line and matches the BED export earlier in the notebook.
data_to_csv.to_csv('Last_One.csv', index=False, header=False)