## This code parses the DEG gene BLAST (blastp) results from the http://origin.tubic.org/deg/public/index.php/index website.

#### It loops through the protein sequences in the uniprot_df dataframe and parses the DEG output for bacteria, archaea and eukaryotes. For the output of each sequence it records -
#### MaxID -> Highest Identity percent of all hits
#### totalHits -> Total number of hits

#### It stores these values in individual columns of the same uniprot_df dataframe and writes it to the CSV file uniprot_DEG_df.csv.
#### It also creates a new dataframe called deg_df to store all the original parsed information from DEG against each sequence id. Along with the "id" column, it has three other columns, each storing the parsed DEG data as an individual dataframe. These inner dataframes are converted to JSON format, and then the whole dataframe is saved to the CSV file DEG_df_parsed.csv. The code to read this nested dataframe is provided at the end.

In [65]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

import os
import json

In [66]:
#Read the uniprot_df database into dataframe
uniprot_df = pd.read_csv("/Users/zaidur/Documents/Sequence_Project/aeromonasBact/uniprot_df.csv")

In [21]:
#Three functions defined to parse data for bacteria, archaea and eukaryotes. Functions have one sequence argument

###############

def deg_bacteria_parser(sequence, timeout=300):
    """Run a DEG blastp search for ``sequence`` against the bacteria set (DEG10).

    The sequence is posted to the DEG web BLAST endpoint and the hit rows are
    scraped from the ``<tr>``/``<td>`` elements of the returned HTML page.

    Args:
        sequence: Protein sequence text submitted as the ``seq`` form field.
        timeout: Seconds passed to ``requests.post`` as its timeout.

    Returns:
        pandas.DataFrame with one row per scraped hit and the columns
        ``query_id`` ... ``bit_score``. Every cell is the stripped text taken
        from the page (no numeric conversion is done here). The frame is empty
        when the page contains no data rows.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'http://origin.tubic.org/deg/public/index.php/index/blast/index'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'origin.tubic.org',
        'Origin': 'http://origin.tubic.org',
        'Referer': 'http://origin.tubic.org/deg/public/index.php/blast/bacteria',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
    }

    #degac: 'DEG10' -> for 'Bacteria'
    #degac: 'DEG30' -> for 'Archaea'
    #degac: 'DEG20' -> for 'Eukaryotes'
    form_data = {
        'lineage': 'bacteria',
        'degac': 'DEG10',
        'blast': 'blastp',
        'seq': sequence
    }

    columns = ['query_id', 'subject_id', 'pct_identity', 'aln_length', 'n_of_mismatches',
               'gap_openings', 'q_start', 'q_end', 's_start', 's_end', 'e_value', 'bit_score']

    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.post(url, headers=headers, data=form_data, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page as "no hits".
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # The first <tr> is skipped as the header row. Rows that contain no <td>
    # cells carry no hit data and are dropped so they cannot turn into
    # all-None hits or break the DataFrame construction.
    rows = []
    for tr in soup.find_all('tr')[1:]:
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        if cells:
            rows.append(cells)

    return pd.DataFrame(rows, columns=columns)

In [67]:
def deg_archaea_parser(sequence, timeout=300):
    """Run a DEG blastp search for ``sequence`` against the archaea set (DEG30).

    The sequence is posted to the DEG web BLAST endpoint and the hit rows are
    scraped from the ``<tr>``/``<td>`` elements of the returned HTML page.

    Args:
        sequence: Protein sequence text submitted as the ``seq`` form field.
        timeout: Seconds passed to ``requests.post`` as its timeout.

    Returns:
        pandas.DataFrame with one row per scraped hit and the columns
        ``query_id`` ... ``bit_score``. Every cell is the stripped text taken
        from the page (no numeric conversion is done here). The frame is empty
        when the page contains no data rows.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'http://origin.tubic.org/deg/public/index.php/index/blast/index'
    # NOTE(review): the Referer and the 'lineage' field below still say
    # "bacteria" although degac selects DEG30 - confirm the server picks the
    # database from 'degac' alone.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'origin.tubic.org',
        'Origin': 'http://origin.tubic.org',
        'Referer': 'http://origin.tubic.org/deg/public/index.php/blast/bacteria',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
    }

    #degac: 'DEG10' -> for 'Bacteria'
    #degac: 'DEG30' -> for 'Archaea'
    #degac: 'DEG20' -> for 'Eukaryotes'
    form_data = {
        'lineage': 'bacteria',
        'degac': 'DEG30',
        'blast': 'blastp',
        'seq': sequence
    }

    columns = ['query_id', 'subject_id', 'pct_identity', 'aln_length', 'n_of_mismatches',
               'gap_openings', 'q_start', 'q_end', 's_start', 's_end', 'e_value', 'bit_score']

    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.post(url, headers=headers, data=form_data, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page as "no hits".
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # The first <tr> is skipped as the header row. Rows that contain no <td>
    # cells carry no hit data and are dropped so they cannot turn into
    # all-None hits or break the DataFrame construction.
    rows = []
    for tr in soup.find_all('tr')[1:]:
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        if cells:
            rows.append(cells)

    return pd.DataFrame(rows, columns=columns)

In [23]:
def deg_eukaryotes_parser(sequence, timeout=300):
    """Run a DEG blastp search for ``sequence`` against the eukaryotes set (DEG20).

    The sequence is posted to the DEG web BLAST endpoint and the hit rows are
    scraped from the ``<tr>``/``<td>`` elements of the returned HTML page.

    Args:
        sequence: Protein sequence text submitted as the ``seq`` form field.
        timeout: Seconds passed to ``requests.post`` as its timeout.

    Returns:
        pandas.DataFrame with one row per scraped hit and the columns
        ``query_id`` ... ``bit_score``. Every cell is the stripped text taken
        from the page (no numeric conversion is done here). The frame is empty
        when the page contains no data rows.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'http://origin.tubic.org/deg/public/index.php/index/blast/index'
    # NOTE(review): the Referer and the 'lineage' field below still say
    # "bacteria" although degac selects DEG20 - confirm the server picks the
    # database from 'degac' alone.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'origin.tubic.org',
        'Origin': 'http://origin.tubic.org',
        'Referer': 'http://origin.tubic.org/deg/public/index.php/blast/bacteria',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15',
    }

    #degac: 'DEG10' -> for 'Bacteria'
    #degac: 'DEG30' -> for 'Archaea'
    #degac: 'DEG20' -> for 'Eukaryotes'
    form_data = {
        'lineage': 'bacteria',
        'degac': 'DEG20',
        'blast': 'blastp',
        'seq': sequence
    }

    columns = ['query_id', 'subject_id', 'pct_identity', 'aln_length', 'n_of_mismatches',
               'gap_openings', 'q_start', 'q_end', 's_start', 's_end', 'e_value', 'bit_score']

    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.post(url, headers=headers, data=form_data, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page as "no hits".
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # The first <tr> is skipped as the header row. Rows that contain no <td>
    # cells carry no hit data and are dropped so they cannot turn into
    # all-None hits or break the DataFrame construction.
    rows = []
    for tr in soup.find_all('tr')[1:]:
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        if cells:
            rows.append(cells)

    return pd.DataFrame(rows, columns=columns)

In [None]:
"""
This code loops through the uniprot_df dataframe and parses the DEG output for bacteria, archaea and eukaryotes.
For the output of each sequence it takes the -
MaxID -> Highest Identity percent of all hits
totalHits -> Total number of hits
Then it updates the uniprot_df dataframe, adding only those necessary columns

It stores the raw request output in a new nested dataframe called deg_df with these columns-
['id', 'bacteria_df', 'archaea_df', 'eukaryotes_df']
the df column cells, each holds individual dataframes

"""

def _max_identity(hits_df):
    """Return the highest pct_identity among the hits as a number (NaN if there are none)."""
    # The scraped cells are text; taking max() on the strings would compare
    # them character by character and rank '99.5' above '100.0'.
    return pd.to_numeric(hits_df['pct_identity'], errors='coerce').max()


#Create blank columns in uniprot_df for MaxID and totalHits
uniprot_df['DEGBacteria_MaxID'] = ''
uniprot_df['DEGBacteria_totalHits'] = ''
uniprot_df['DEGArchea_MaxID'] = ''
uniprot_df['DEGArchea_totalHits'] = ''
uniprot_df['DEGEukar_MaxID'] = ''
uniprot_df['DEGEukar_totalHits'] = ''

# One record per sequence: its id plus the raw DEG hits of each lineage.
# The hit tables are stored as JSON text right away, so no DataFrame has to be
# written into a single cell of another DataFrame.
deg_records = []

# Iterate over the index labels themselves because .at is label-based.
for idx in uniprot_df.index:
    sequence = uniprot_df.at[idx, 'sequence']

    #################
    #Bacteria - get the hits and store MaxID and totalHits in their columns
    bacteria_df = deg_bacteria_parser(sequence)
    uniprot_df.at[idx, 'DEGBacteria_MaxID'] = _max_identity(bacteria_df)
    uniprot_df.at[idx, 'DEGBacteria_totalHits'] = len(bacteria_df)

    #################
    #Archaea - get the hits and store MaxID and totalHits in their columns
    archaea_df = deg_archaea_parser(sequence)
    uniprot_df.at[idx, 'DEGArchea_MaxID'] = _max_identity(archaea_df)
    uniprot_df.at[idx, 'DEGArchea_totalHits'] = len(archaea_df)

    ##################
    #Eukaryotes - get the hits and store MaxID and totalHits in their columns
    eukaryotes_df = deg_eukaryotes_parser(sequence)
    uniprot_df.at[idx, 'DEGEukar_MaxID'] = _max_identity(eukaryotes_df)
    uniprot_df.at[idx, 'DEGEukar_totalHits'] = len(eukaryotes_df)

    #keep the whole output of all three searches against the sequence id
    deg_records.append({
        'id': uniprot_df.at[idx, 'id'],
        'bacteria_df': bacteria_df.to_json(),
        'archaea_df': archaea_df.to_json(),
        'eukaryotes_df': eukaryotes_df.to_json(),
    })

# DataFrame holding all the DEG parsing output (inner dataframes in JSON format)
# against the sequence id
deg_df = pd.DataFrame(deg_records, columns=['id', 'bacteria_df', 'archaea_df', 'eukaryotes_df'])

# Save the main DataFrame to a CSV file
deg_df.to_csv("/Users/zaidur/Documents/Sequence_Project/aeromonasBact/DEG_df_parsed.csv", index=False)

uniprot_df

In [None]:
# Write uniprot_df to a CSV file in the current working directory.
# The row index is left out of the file.

uniprot_df.to_csv(path_or_buf="uniprot_DEG_df.csv", index=False)

In [55]:
"""
This code is to write the nested dataframe in a csv file. The inner dataframes are first converted into
JSON format and then main dataframe is saved as a csv file

"""

import pandas as pd
import os
import json

# Save dataframes in JSON format.
# Cells that already hold JSON text are left untouched: the parsing cell
# converts these columns too, and calling .to_json() on a string would raise
# AttributeError. This also makes the cell safe to re-run.
for col in ['bacteria_df', 'archaea_df', 'eukaryotes_df']:
    deg_df[col] = deg_df[col].apply(
        lambda cell: cell.to_json() if isinstance(cell, pd.DataFrame) else cell
    )

# Save the main DataFrame to a CSV file
deg_df.to_csv("/Users/zaidur/Documents/Sequence_Project/deg_df_test.csv", index=False)

In [70]:
"""
This code is to read the csv file having nested dataframe. First the main csv file is read in a dataframe.
Then the inner dataframes in JSON format are read into actual dataframes.

"""

from io import StringIO

# Load the main DataFrame from the CSV file
deg_json_df = pd.read_csv("/Users/zaidur/Documents/Sequence_Project/deg_df_test.csv")

# Convert JSON strings back to DataFrames.
# The text is wrapped in StringIO because passing a literal JSON string to
# pd.read_json is deprecated; a file-like object is the supported input.
for col in ['bacteria_df', 'archaea_df', 'eukaryotes_df']:
    deg_json_df[col] = deg_json_df[col].apply(
        lambda json_text: pd.read_json(StringIO(json_text))
    )