In [9]:
import numpy as np
import pandas as pd
import subprocess

In [5]:
# Load the tab-separated GFF table; its coordinates are used further down to
# pull sequences with the local BLAST extractor, `blastdbcmd`.
gff_path = "./data_bedops.gff"
data = pd.read_csv(
    gff_path,
    sep="\t",
    header=None,  # no header row is read, so columns get integer labels
)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,LinJ.01,CBM-302,SIDER,1,173,.,.,.,.
1,LinJ.01,CBM-302,SIDER,13302,14301,.,.,.,.
2,LinJ.01,CBM-302,SIDER,24093,24791,.,.,.,.
3,LinJ.01,CBM-302,SIDER,35316,36160,.,.,.,.
4,LinJ.01,CBM-302,SIDER,39698,40589,.,.,.,.


In [7]:
# Keep only the columns the extraction step reads: the entry name (0) and the
# start/end coordinates (3, 4), then relabel them 0..2.
kept_columns = [0, 3, 4]
data = data.loc[:, kept_columns]
data.columns = range(len(data.columns))
data.head()

Unnamed: 0,0,1,2
0,LinJ.01,1,173
1,LinJ.01,13302,14301
2,LinJ.01,24093,24791
3,LinJ.01,35316,36160
4,LinJ.01,39698,40589


In [11]:
# Path handed to `blastdbcmd -db` in the sequence-extraction cell below.
# NOTE(review): presumably a BLAST database has been built under this same
# name (e.g. with makeblastdb) — confirm before running.
genome = "../Data/genome/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"

In [12]:
# Now let's implement the sequence extraction: one `blastdbcmd` call per row
# of `data` (column 0 = entry, column 1 = range start, column 2 = range end).
## strand default value will be "plus" if not specified.
# NOTE(review): `list` shadows the Python builtin; the name is kept only
# because the next cell reads it. Rename both cells together when possible.
list = []
for index, row in data.iterrows():
    seq_id, start, end = row[0], row[1], row[2]
    # Arguments are passed as a list (no shell), so entry names or paths that
    # contain spaces or shell metacharacters such as `|` reach blastdbcmd
    # verbatim instead of being interpreted by the shell.
    cmd = [
        "blastdbcmd",
        "-db", genome,
        "-entry", str(seq_id),
        "-range", "{}-{}".format(start, end),
        "-outfmt", "%s",  # output only the sequence
    ]
    # Raises subprocess.CalledProcessError if blastdbcmd exits non-zero.
    sequence = subprocess.check_output(cmd, universal_newlines=True)
    # One comma-joined record per row: entry,start,end,strand,sequence
    list.append(",".join([
        str(seq_id),
        str(start),
        str(end),
        "plus",  # default strand will be "plus"
        sequence,
    ]))

In [13]:
# Every entry of `list` is one comma-joined record; break each record back
# into its fields and assemble the fields into a data frame.
list_split = []
for entry in list:
    list_split.append(entry.split(","))
list_split_df = pd.DataFrame(list_split)
# Important: strip the new line character from the sequence field (column 4).
sequence_column = list_split_df[4]
list_split_df[4] = sequence_column.str.replace('\n', '')

In [15]:
# Length of every sequence string in column 4, as a plain Python list.
new_column = list_split_df[4].map(len).tolist()
# Put the lengths at position 1; the columns to the right shift by one.
list_split_df.insert(loc=1, column="New", value=new_column)

# Relabel the columns 0..n-1 so the temporary "New" label is replaced.
list_split_df.columns = range(list_split_df.shape[1])

# -----------------------------------------------------------------------------
# 7) Correctly modeling the output Data Frame to 16 columns and output as CSV file.
# -----------------------------------------------------------------------------
# data_to_csv = pd.DataFrame(index=range(list_split_df.shape[0]), columns=range(16))  # creates a new Data Frame with 16 columns (range(16)). The number of rows depends on .shape[0]
# data_to_csv.iloc[:, [1, 3, 10, 11, 14, 15]] = list_split_df.iloc[:, [0, 1, 2, 3, 4, 5]]

In [20]:
# Display the table dimensions as (rows, columns) before saving it.
list_split_df.shape

(3009, 6)

In [17]:
# Save the data frame as a comma-separated file, without the index column and
# without a header row. `header=False` is the documented way to omit the
# header (the parameter takes a bool or a list of column names).
list_split_df.to_csv("./sequences.csv", sep=",", index=False, header=False)