# Note3_關於proteo的MAGs分析

## 第一部分: 含有bitscore的篩選分析

### 1.1 使用 positive control 來了解 hmm profiles 的分析情形 (e-value、coverage、bit-score)

In [2]:
# HMM Proteobacteria (edc) positive control: best bit-score, E-value and coverage per profile
import os
import pandas as pd
import re

#------------------------------------------------------------------------
# create a function that returns the best bit-score, E-value and coverage DataFrames for each control
# hmm domtblout name should be ...edc_I_... NOT ..._I_edc_....
def positive_control_df(control_dir, control_names, BitScore_df, Evalue_df, Coverage_df):
    """Summarise the best hit of every positive-control hmmsearch of one strain.

    For each entry of ``control_names`` the file
    ``<control_dir>/<name>.domtblout`` is read as a whitespace-delimited table
    (lines starting with ``#`` are skipped).  Hits are kept when their
    ``E-value`` is below 0.001 and their coverage,
    ``(ali_to - ali_from + 1) / tlen``, is above 0.50; the hit with the highest
    ``score`` supplies the bit-score, E-value and coverage for that profile.

    Args:
        control_dir: Directory containing the ``.domtblout`` files.
        control_names: File names without the ``.domtblout`` extension.
        BitScore_df, Evalue_df, Coverage_df: Ignored.  They are accepted only
            so existing five-argument calls keep working; the returned frames
            are always built from scratch.

    Returns:
        A tuple ``(BitScore_df, Evalue_df, Coverage_df)`` of one-row
        DataFrames.  Columns are the control names with the ``P_.*_edc``
        prefix collapsed to ``P_edc``; the row label is the strain prefix
        found in ``control_names[0]`` plus ``_bit_score`` / ``_Evalue`` /
        ``_coverage``.  A profile whose file is empty or has no hit passing
        the filters holds ``None``.

    Raises:
        IndexError: If ``control_names`` is empty.

    Note:
        A missing file is not handled here; the error raised by
        ``pandas.read_csv`` propagates.  Rows must split into exactly 23
        whitespace-separated fields, otherwise the column assignment fails.
    """
    if len(control_names) == 0:
        raise IndexError("control_names must contain at least one hmmsearch name")

    domtblout_columns = [
        "target_name", "accession", "tlen", "query_name", "accession2", "qlen", "E-value", "score", "bias",
        "num_domains_index", "num_domains_total", "c-Evalue", "i-Evalue", "score2", "bias2",
        "hmm_from", "hmm_to", "ali_from", "ali_to", "env_from", "env_to", "acc", "description",
    ]
    # Greedy on purpose: matches everything from "P_" up to the last "_edc".
    strain_pattern = re.compile(r'P_.*_edc')

    best_bit_scores = {}
    evalue_scores = {}
    coverage_scores = {}

    for control_name in control_names:
        file_path = os.path.join(control_dir, control_name + ".domtblout")
        try:
            # Raw string: '\s' in a normal literal is an invalid escape sequence.
            df = pd.read_csv(file_path, comment="#", sep=r'\s+', header=None)
        except pd.errors.EmptyDataError:
            df = pd.DataFrame()

        # Strain-independent column label, e.g. "P_<strain>_edc13530" -> "P_edc13530".
        profile_name = strain_pattern.sub('P_edc', control_name)

        # Start from "no qualifying hit"; overwritten below when a hit survives.
        best_bit_scores[profile_name] = None
        evalue_scores[profile_name] = None
        coverage_scores[profile_name] = None

        if df.empty:
            continue

        df.columns = domtblout_columns
        df["coverage"] = (df["ali_to"] - df["ali_from"] + 1) / df["tlen"]

        significant_hits = df[(df["E-value"] < 0.001) & (df["coverage"] > 0.50)].sort_values(
            by="score", ascending=False
        )
        if significant_hits.empty:
            continue

        best_hit = significant_hits.iloc[0]
        best_bit_scores[profile_name] = best_hit["score"]
        evalue_scores[profile_name] = best_hit["E-value"]
        coverage_scores[profile_name] = best_hit["coverage"]

    # Row label comes from the first control name; fall back to the raw name
    # when it does not follow the "P_<strain>_edc" convention.
    strain_match = strain_pattern.search(control_names[0])
    strain_label = strain_match.group(0) if strain_match else control_names[0]

    BitScore_df = pd.DataFrame(best_bit_scores, index=[strain_label + '_bit_score'])
    Evalue_df = pd.DataFrame(evalue_scores, index=[strain_label + '_Evalue'])
    Coverage_df = pd.DataFrame(coverage_scores, index=[strain_label + '_coverage'])

    return BitScore_df, Evalue_df, Coverage_df


#---------------------------------------------------------------
# Run positive_control_df once per positive-control strain.
#
# Every strain uses the same directory layout and the same set of HMM
# profiles, so the inputs are generated instead of being typed out per strain.

# Profile suffixes appended to "P_<strain>_edc" (13540 is left out, as in the original lists)
EDC_PROFILE_SUFFIXES = [
    '13525_I_', '13530', '13535', '13545', '13550', '13555', '13560',
    '13565', '13570_I_', '13575', '13580_I_', '13585', '13590', '13595',
]

# Strain directory names under Proteo_HMM_PositiveControl, in processing order
POSITIVE_CONTROL_STRAINS = [
    "Altererythrobacter_estronivo_MH-B5_1",
    "Novosphingobium_sp_ES21",
    "Novosphingobium_tardaugens_NBRC16725",
    "Sphingobium_estronivorans_AXB",
    "Sphingomonas_sp_KC8",
]

positive_control_results = {}
for strain in POSITIVE_CONTROL_STRAINS:
    # Directory that contains this strain's "domtblout" files
    control_dir = "../data/raw/Proteo_HMM_PositiveControl/" + strain + "/domtblout/"

    # hmmsearch output names (without the file extension)
    control_names = ['P_' + strain + '_edc' + suffix for suffix in EDC_PROFILE_SUFFIXES]

    # The three DataFrame arguments are placeholders required by the signature
    positive_control_results[strain] = positive_control_df(
        control_dir, control_names, pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
    )

# Unpack into the per-strain names used by the merge step below
B5_BitScore, B5_Evalue, B5_coverage = positive_control_results["Altererythrobacter_estronivo_MH-B5_1"]
ES21_BitScore, ES21_Evalue, ES21_coverage = positive_control_results["Novosphingobium_sp_ES21"]
NBRC16725_BitScore, NBRC16725_Evalue, NBRC16725_coverage = positive_control_results["Novosphingobium_tardaugens_NBRC16725"]
AXB_BitScore, AXB_Evalue, AXB_coverage = positive_control_results["Sphingobium_estronivorans_AXB"]
KC8_BitScore, KC8_Evalue, KC8_coverage = positive_control_results["Sphingomonas_sp_KC8"]


#-------------------------------------------------------------------------------------------------------
# Stack the per-strain tables and append one summary row to each of them

# Bit-score: five strains plus the per-profile minimum
Proteo_PositiveBitScore_df = pd.concat(
    [B5_BitScore, ES21_BitScore, NBRC16725_BitScore, AXB_BitScore, KC8_BitScore],
    axis=0,
)
# Column-wise minimum as a one-row frame labelled 'Min_BitScore'
Proteo_MinBitScore_df = Proteo_PositiveBitScore_df.min().to_frame(name='Min_BitScore').T
Proteo_PositiveBitScore_df = pd.concat(
    [Proteo_PositiveBitScore_df, Proteo_MinBitScore_df], axis=0
)
Proteo_PositiveBitScore_df.to_csv('../data/processed/Final/ForReader/ControlData/Proteo_edc/Proteo_PositiveBitScore.csv')


# E-value: five strains plus the per-profile maximum
Proteo_PositiveEvalue_df = pd.concat(
    [B5_Evalue, ES21_Evalue, NBRC16725_Evalue, AXB_Evalue, KC8_Evalue],
    axis=0,
)
# Column-wise maximum as a one-row frame labelled 'Max_Evalue'
Proteo_MaxEvalue_df = Proteo_PositiveEvalue_df.max().to_frame(name='Max_Evalue').T
Proteo_PositiveEvalue_df = pd.concat(
    [Proteo_PositiveEvalue_df, Proteo_MaxEvalue_df], axis=0
)
Proteo_PositiveEvalue_df.to_csv('../data/processed/Final/ForReader/ControlData/Proteo_edc/Proteo_PositiveEvalue.csv')


# Coverage: five strains plus the per-profile minimum
Proteo_Positivecoverage_df = pd.concat(
    [B5_coverage, ES21_coverage, NBRC16725_coverage, AXB_coverage, KC8_coverage],
    axis=0,
)
# Column-wise minimum as a one-row frame labelled 'Min_coverage'
Proteo_Mincoverage_df = Proteo_Positivecoverage_df.min().to_frame(name='Min_coverage').T
Proteo_Positivecoverage_df = pd.concat(
    [Proteo_Positivecoverage_df, Proteo_Mincoverage_df], axis=0
)
Proteo_Positivecoverage_df.to_csv('../data/processed/Final/ForReader/ControlData/Proteo_edc/Proteo_PositiveCoverage.csv')


# Show the three tables
for summary_table in (Proteo_PositiveBitScore_df, Proteo_PositiveEvalue_df, Proteo_Positivecoverage_df):
    display(summary_table)
print('Done')

Unnamed: 0,P_edc13525_I_,P_edc13530,P_edc13535,P_edc13545,P_edc13550,P_edc13555,P_edc13560,P_edc13565,P_edc13570_I_,P_edc13575,P_edc13580_I_,P_edc13585,P_edc13590,P_edc13595
P_Altererythrobacter_estronivo_MH-B5_1_edc_bit_score,622.1,216.7,664.5,387.9,672.7,644.7,555.2,512.9,501.1,175.4,2001.2,714.2,213.2,214.4
P_Novosphingobium_sp_ES21_edc_bit_score,613.9,207.5,663.5,373.5,657.8,616.4,547.0,459.4,477.4,178.0,1968.2,706.2,185.3,202.9
P_Novosphingobium_tardaugens_NBRC16725_edc_bit_score,626.4,209.7,680.8,394.8,671.6,648.2,556.8,513.9,499.1,191.8,1997.6,717.6,215.0,221.8
P_Sphingobium_estronivorans_AXB_edc_bit_score,614.0,216.0,652.0,379.4,656.3,631.3,556.3,510.7,508.9,177.3,1971.9,720.8,218.0,194.7
P_Sphingomonas_sp_KC8_edc_bit_score,609.7,195.2,672.4,370.1,645.2,613.0,519.0,527.2,491.1,183.0,1937.0,702.1,222.9,204.2
Min_BitScore,609.7,195.2,652.0,370.1,645.2,613.0,519.0,459.4,477.4,175.4,1937.0,702.1,185.3,194.7


Unnamed: 0,P_edc13525_I_,P_edc13530,P_edc13535,P_edc13545,P_edc13550,P_edc13555,P_edc13560,P_edc13565,P_edc13570_I_,P_edc13575,P_edc13580_I_,P_edc13585,P_edc13590,P_edc13595
P_Altererythrobacter_estronivo_MH-B5_1_edc_Evalue,3.9e-190,8.9e-68,2.4999999999999998e-203,1.3e-119,1.4e-205,6.1e-197,2.9e-170,4.7e-157,8.6e-154,4e-55,0.0,2.8e-218,1.1e-66,4.3e-67
P_Novosphingobium_sp_ES21_edc_Evalue,1.3e-187,6.1999999999999995e-65,4.8999999999999994e-203,3.2e-115,4.4999999999999995e-201,2.4e-188,9.2e-168,8.1e-141,1.4e-146,6.2e-56,0.0,7.499999999999999e-216,4.2e-58,1.5e-63
P_Novosphingobium_tardaugens_NBRC16725_edc_Evalue,1.9000000000000002e-191,1.0999999999999999e-65,2.6e-208,9e-122,2.8e-205,4.7e-198,8.5e-171,2.1e-157,3.1000000000000005e-153,3.3000000000000004e-60,0.0,2.4e-219,2.8e-67,2.2000000000000003e-69
P_Sphingobium_estronivorans_AXB_edc_Evalue,1.0999999999999999e-187,1.5e-67,1.6e-199,5.2000000000000004e-117,1.3e-200,6.799999999999999e-193,1.3e-170,2.2000000000000002e-156,3.5000000000000005e-156,1.1e-55,0.0,2.8000000000000003e-220,3.8e-68,5.3e-61
P_Sphingomonas_sp_KC8_edc_Evalue,2.4e-186,3.9e-61,1e-205,3.8e-114,3.3e-197,2.6e-187,3.3e-159,2.2e-161,9.9e-151,2e-57,0.0,1.4e-214,1.2e-69,6.7e-64
Max_Evalue,2.4e-186,3.9e-61,1.6e-199,3.8e-114,3.3e-197,2.6e-187,3.3e-159,8.1e-141,1.4e-146,4e-55,0.0,1.4e-214,4.2e-58,5.3e-61


Unnamed: 0,P_edc13525_I_,P_edc13530,P_edc13535,P_edc13545,P_edc13550,P_edc13555,P_edc13560,P_edc13565,P_edc13570_I_,P_edc13575,P_edc13580_I_,P_edc13585,P_edc13590,P_edc13595
P_Altererythrobacter_estronivo_MH-B5_1_edc_coverage,0.992424,1.0,0.942708,0.854478,0.976378,0.912393,0.973244,0.976064,0.993548,0.955556,0.99566,0.949367,0.94012,0.935897
P_Novosphingobium_sp_ES21_edc_coverage,0.992424,0.993007,0.935401,0.854478,0.966234,0.916488,0.95082,0.975806,0.981013,0.955556,0.993929,0.949367,0.905882,0.89375
P_Novosphingobium_tardaugens_NBRC16725_edc_coverage,0.992424,0.95302,0.947644,0.854478,0.976378,0.922246,0.973244,0.978552,0.993548,0.955556,0.995656,0.949367,0.934132,0.928105
P_Sphingobium_estronivorans_AXB_edc_coverage,0.9801,0.946667,0.942708,0.854962,0.976378,0.922747,0.963455,0.98153,0.971698,0.962687,0.984563,0.949367,0.930233,0.915584
P_Sphingomonas_sp_KC8_edc_coverage,0.987469,0.986111,0.94026,0.863636,0.976378,0.915948,0.97651,0.981627,0.91018,0.956204,0.983677,0.949367,0.936047,0.90566
Min_coverage,0.9801,0.946667,0.935401,0.854478,0.966234,0.912393,0.95082,0.975806,0.91018,0.955556,0.983677,0.949367,0.905882,0.89375


Done


### 1.2 使用 negative control 來獲得合適的bit score

In [9]:
# HMM Proteobacteria (edc) negative control bit-score threshold
import os
import pandas as pd
import re


#------------------------------------------------------------------------
# create a function that returns the best bit-score DataFrame for each negative control
# hmm domtblout name should be ...edc_I_... NOT ..._I_edc_....
def negative_control_df(control_dir, control_names, BitScore_df):
    """Return the best bit-score of every negative-control hmmsearch of one strain.

    For each entry of ``control_names`` the file
    ``<control_dir>/<name>.domtblout`` is read as a whitespace-delimited table
    (lines starting with ``#`` are skipped).  Hits are kept when their
    ``E-value`` is below 0.001 and their coverage,
    ``(ali_to - ali_from + 1) / tlen``, is above 0.50; the highest ``score``
    among them is recorded for that profile.

    Args:
        control_dir: Directory containing the ``.domtblout`` files.
        control_names: File names without the ``.domtblout`` extension.
        BitScore_df: Ignored.  Accepted only so existing three-argument calls
            keep working; the returned frame is always built from scratch.

    Returns:
        A one-row DataFrame.  Columns are the control names with the
        ``P_.*_edc`` prefix collapsed to ``P_edc``; the row label is the strain
        prefix found in ``control_names[0]`` plus ``_bit_score``.  A profile
        whose file is empty or has no hit passing the filters holds ``0``.

    Raises:
        IndexError: If ``control_names`` is empty.

    Note:
        A missing file is not handled here; the error raised by
        ``pandas.read_csv`` propagates.  Rows must split into exactly 23
        whitespace-separated fields, otherwise the column assignment fails.
    """
    if len(control_names) == 0:
        raise IndexError("control_names must contain at least one hmmsearch name")

    domtblout_columns = [
        "target_name", "accession", "tlen", "query_name", "accession2", "qlen", "E-value", "score", "bias",
        "num_domains_index", "num_domains_total", "c-Evalue", "i-Evalue", "score2", "bias2",
        "hmm_from", "hmm_to", "ali_from", "ali_to", "env_from", "env_to", "acc", "description",
    ]
    # Greedy on purpose: matches everything from "P_" up to the last "_edc".
    strain_pattern = re.compile(r'P_.*_edc')

    best_bit_scores = {}

    for control_name in control_names:
        file_path = os.path.join(control_dir, control_name + ".domtblout")
        try:
            # Raw string: '\s' in a normal literal is an invalid escape sequence.
            df = pd.read_csv(file_path, comment="#", sep=r'\s+', header=None)
        except pd.errors.EmptyDataError:
            df = pd.DataFrame()

        # Strain-independent column label, e.g. "P_<strain>_edc13530" -> "P_edc13530".
        query_name = strain_pattern.sub('P_edc', control_name)

        # A negative control without any qualifying hit scores 0.
        best_bit_scores[query_name] = 0

        if df.empty:
            continue

        df.columns = domtblout_columns
        df["coverage"] = (df["ali_to"] - df["ali_from"] + 1) / df["tlen"]

        significant_hits = df[(df["E-value"] < 0.001) & (df["coverage"] > 0.50)].sort_values(
            by="score", ascending=False
        )
        if not significant_hits.empty:
            best_bit_scores[query_name] = significant_hits.iloc[0]["score"]

    # Row label comes from the first control name; fall back to the raw name
    # when it does not follow the "P_<strain>_edc" convention.
    strain_match = strain_pattern.search(control_names[0])
    strain_label = strain_match.group(0) if strain_match else control_names[0]

    BitScore_df = pd.DataFrame(best_bit_scores, index=[strain_label + '_bit_score'])

    return BitScore_df


#---------------------------
# Run negative_control_df once per negative-control genome.
#
# NOTE: the description text must be stripped from each genome's protein FASTA
# before running hmmsearch, because spaces in it break the whitespace parsing.

# Profile suffixes appended to "<prefix>_edc" (13540 is left out, as in the original lists)
EDC_PROFILE_SUFFIXES = [
    '13525_I_', '13530', '13535', '13545', '13550', '13555', '13560',
    '13565', '13570_I_', '13575', '13580_I_', '13585', '13590', '13595',
]

# (directory name, file-name prefix) per genome, in processing order.
# The two differ for DOC21: directory "..._cluster", files "..._cluster1_...".
NEGATIVE_CONTROL_GENOMES = [
    ("P_Comamonas_thiooxidans_CNB1", "P_Comamonas_thiooxidans_CNB1"),
    ("P_Novosphingobium_sp_strain_Chol11", "P_Novosphingobium_sp_strain_Chol11"),
    ("P_Pseudomonas_putida_DOC21_cluster", "P_Pseudomonas_putida_DOC21_cluster1"),
    ("P_Pseudomonas_stutzeri_Chol-1", "P_Pseudomonas_stutzeri_Chol-1"),
]

negative_control_results = {}
for genome_dir, name_prefix in NEGATIVE_CONTROL_GENOMES:
    # Directory that contains this genome's "domtblout" files
    control_dir = "../data/raw/Proteo_HMM_NegativeControl/" + genome_dir + "/"

    # hmmsearch output names (without the file extension)
    control_names = [name_prefix + '_edc' + suffix for suffix in EDC_PROFILE_SUFFIXES]

    # The DataFrame argument is a placeholder required by the signature
    negative_control_results[genome_dir] = negative_control_df(control_dir, control_names, pd.DataFrame())

# Per-genome names used by the merge step below
CNB1_BitScore = negative_control_results["P_Comamonas_thiooxidans_CNB1"]
Chol11_BitScore = negative_control_results["P_Novosphingobium_sp_strain_Chol11"]
DOC21_BitScore = negative_control_results["P_Pseudomonas_putida_DOC21_cluster"]
Chol01_BitScore = negative_control_results["P_Pseudomonas_stutzeri_Chol-1"]


#-------------------------------------------------------------------------------------------------------
# Stack the four negative-control bit-score tables
Proteo_Negative_df = pd.concat(
    [CNB1_BitScore, Chol11_BitScore, DOC21_BitScore, Chol01_BitScore],
    axis=0,
)

# Highest negative-control bit-score per profile, one row per profile,
# in a single column named 'Criteria_Bitscore'
Proteo_BitScore_Criteria = Proteo_Negative_df.max().to_frame(name='Criteria_Bitscore')

# Same values as a single row, so they can be appended to the stacked table
Proteo_BitScore_Criteria_T = Proteo_BitScore_Criteria.T

# Append the criteria row and export
Proteo_Negative_df = pd.concat([Proteo_Negative_df, Proteo_BitScore_Criteria_T], axis=0)
Proteo_Negative_df.to_csv('../data/processed/Final/ForReader/ControlData/Proteo_edc/Proteo_NegativeBitscore.csv')

# Show the criteria and the full table
for negative_table in (Proteo_BitScore_Criteria, Proteo_Negative_df):
    display(negative_table)
print('done')

Unnamed: 0,Criteria_Bitscore
P_edc13525_I_,127.5
P_edc13530,0.0
P_edc13535,397.4
P_edc13545,122.5
P_edc13550,188.1
P_edc13555,31.5
P_edc13560,211.8
P_edc13565,253.4
P_edc13570_I_,222.9
P_edc13575,45.0


Unnamed: 0,P_edc13525_I_,P_edc13530,P_edc13535,P_edc13545,P_edc13550,P_edc13555,P_edc13560,P_edc13565,P_edc13570_I_,P_edc13575,P_edc13580_I_,P_edc13585,P_edc13590,P_edc13595
P_Comamonas_thiooxidans_CNB1_edc_bit_score,0.0,0.0,391.7,122.5,188.1,0.0,211.8,235.9,217.4,22.6,1496.4,254.9,0.0,91.7
P_Novosphingobium_sp_strain_Chol11_edc_bit_score,127.5,0.0,0.0,111.0,0.0,31.5,200.1,249.1,222.9,45.0,0.0,228.2,0.0,19.0
P_Pseudomonas_putida_DOC21_cluster1_edc_bit_score,0.0,0.0,395.2,109.1,136.3,0.0,112.1,212.6,0.0,0.0,0.0,243.1,0.0,99.7
P_Pseudomonas_stutzeri_Chol-1_edc_bit_score,105.1,0.0,397.4,114.8,135.0,0.0,203.1,253.4,221.4,35.9,0.0,241.4,51.3,99.5
Criteria_Bitscore,127.5,0.0,397.4,122.5,188.1,31.5,211.8,253.4,222.9,45.0,1496.4,254.9,51.3,99.7


done


### 2. 使用上述的 bit score 來篩選 MAGs 的 output data (不含有 hits 的篩選)

In [10]:
# Proteo_HMM_MAGs
# 需執行前一個cell程式

import os
import pandas as pd
import re

# edc cluster profiles searched against the MAGs
# Directory that contains the MAG "domtblout" files (description text must be removed beforehand)
domtblout_dir = "../data/raw/Proteo_HMM_MAGs_domtblout/"

# Target names of the passing hits per hmmsearch (None when a profile has no passing hit)
MAGs_Hits = {}
MAGs_Hits_name = []

# Per-profile bit-score thresholds as a float Series (index = hmmsearch name)
Proteo_BitScore_Criteria_S = Proteo_BitScore_Criteria['Criteria_Bitscore'].astype(float)

# Column names of a domtblout table
columns = ["target_name", "accession", "tlen", "query_name", "accession2", "qlen", "E-value", "score", "bias",
           "num_domains_index", "num_domains_total", "c-Evalue", "i-Evalue", "score2", "bias2", "hmm_from",
           "hmm_to", "ali_from", "ali_to", "env_from", "env_to", "acc", "description"]

# Passing hits are collected per profile and concatenated once after the loop,
# instead of growing a DataFrame with pd.concat on every iteration.
hits_per_profile = []

# Loop over the domtblout files and filter by E-value, coverage and bit-score.
# Profile names and thresholds come from Proteo_BitScore_Criteria_S.
for hmmsearch, threshold in Proteo_BitScore_Criteria_S.items():
    file_path = os.path.join(domtblout_dir, hmmsearch + ".domtblout")
    try:
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        df = pd.read_csv(file_path, comment="#", sep=r'\s+', header=None)
    except pd.errors.EmptyDataError:
        # A file without data rows means no hits for this profile
        # (same handling as in the control functions).
        MAGs_Hits_name = None
        MAGs_Hits[hmmsearch] = MAGs_Hits_name
        continue

    df.columns = columns

    # Coverage of each hit: aligned span divided by tlen
    df["coverage"] = (df["ali_to"] - df["ali_from"] + 1) / df["tlen"]

    # Keep hits passing the E-value, coverage and per-profile bit-score filters
    significant_hits = df[(df["E-value"] < 0.001) & (df["coverage"] > 0.50) & (df["score"] > threshold)]

    if significant_hits.empty:
        MAGs_Hits_name = None
    else:
        MAGs_Hits_name = significant_hits["target_name"].tolist()
        hits_per_profile.append(significant_hits)
    MAGs_Hits[hmmsearch] = MAGs_Hits_name

# The original code prepended each profile's hits, so the last profile comes first
if hits_per_profile:
    All_edc_Hits_df = pd.concat(hits_per_profile[::-1], axis=0)
else:
    All_edc_Hits_df = pd.DataFrame(columns=columns + ["coverage"])

# done
print('done')
print('unique query name: ', All_edc_Hits_df['query_name'].unique(), ' / ', len(All_edc_Hits_df['query_name'].unique()))
All_edc_Hits_df.to_csv('../data/processed/All_edc_Hits_df_bitscore.csv')
All_edc_Hits_df

done
unique query name:  ['Q_P_13595' 'Q_P_13590' 'Q_P_13585' 'Q_P_I_edcC_13580' 'Q_P_13575'
 'Q_P_I_edcB_13570' 'Q_P_13565' 'Q_P_13560' 'Q_P_13555' 'Q_P_13550'
 'Q_P_13545' 'Q_P_13535' 'Q_P_13530' 'Q_P_I_edcA_13525']  /  14


Unnamed: 0,target_name,accession,tlen,query_name,accession2,qlen,E-value,score,bias,num_domains_index,...,bias2,hmm_from,hmm_to,ali_from,ali_to,env_from,env_to,acc,description,coverage
0,3300024060.a:Ga0209987_10000324_8,-,161,Q_P_13595,-,145,1.400000e-51,186.5,0.0,1,...,0.0,1,145,15,160,15,160,0.99,-,0.906832
1,3300024431.a:Ga0209988_10001403_11,-,161,Q_P_13595,-,145,1.400000e-51,186.5,0.0,1,...,0.0,1,145,15,160,15,160,0.99,-,0.906832
2,3300012090.a:Ga0153956_1000496_23,-,161,Q_P_13595,-,145,2.200000e-51,185.8,0.2,1,...,0.2,3,145,16,160,14,160,0.97,-,0.900621
3,3300026193.a:Ga0208129_1000342_13,-,162,Q_P_13595,-,145,2.800000e-51,185.5,0.1,1,...,0.1,1,145,16,161,16,161,0.99,-,0.901235
4,3300026202.a:Ga0207984_1001029_7,-,162,Q_P_13595,-,145,2.800000e-51,185.5,0.1,1,...,0.1,1,145,16,161,16,161,0.99,-,0.901235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16530,3300014205.a:Ga0172380_10055494_2,-,412,Q_P_I_edcA_13525,-,393,4.700000e-33,127.6,0.0,1,...,0.0,12,369,21,383,16,409,0.83,-,0.881068
16531,3300026118.a:Ga0207675_100018219_6,-,408,Q_P_I_edcA_13525,-,393,4.800000e-33,127.6,0.0,1,...,0.0,69,390,83,400,9,403,0.87,-,0.779412
16532,3300026980.a:Ga0207829_101107_6,-,408,Q_P_I_edcA_13525,-,393,4.800000e-33,127.6,0.0,1,...,0.0,77,368,98,380,89,391,0.92,-,0.693627
16551,3300015206.a:Ga0167644_1000100_80,-,409,Q_P_I_edcA_13525,-,393,4.800000e-33,127.6,0.0,1,...,0.0,70,351,86,368,74,382,0.91,-,0.691932


#### 2.1 獲取positive的target name並連結MAGsID，整理出一個類似heatmap的table，再以寬鬆hits數量進行篩選(>=8)

In [11]:
# Got target name and query name
import os
import pandas as pd
import re

# Reload the bit-score-filtered hits and keep only the protein id and the profile name
All_edc_Hits_df = pd.read_csv('../data/processed/All_edc_Hits_df_bitscore.csv')
All_edc_Hits_TargetAndQuery = All_edc_Hits_df.loc[:, ['target_name', 'query_name']]

# Lookup table used to attach a MAGsID to each target_name
TarToMAGs_edc = pd.read_csv('../data/interim/edc_All_TarToMAGsID.csv')

# Left join: every hit is kept; MAGsID is null where the lookup has no entry
edc_hits_TargetAndMAGsID = All_edc_Hits_TargetAndQuery.merge(
    TarToMAGs_edc, on='target_name', how='left'
)

# Report whether any hit is left without a MAGsID
print('Any Null in TargetToMAGsID: ', edc_hits_TargetAndMAGsID['MAGsID'].isnull().any())

# Count table (heatmap-like): one row per MAGsID, one column per query_name
edc_hits_heatmap = pd.crosstab(
    edc_hits_TargetAndMAGsID['query_name'],
    edc_hits_TargetAndMAGsID['MAGsID'],
    dropna=False,
).T
# edc_hits_heatmap.to_csv('../data/processed/edc_hits_heatmap_bitscore.csv')

# Count the non-zero values in hmm profiles hit row to calculate the number of different HMM profiles that have hits in a given MAG. 
def count_nonzero(row):
    """Return how many entries of *row* are not equal to zero.

    Applied row-wise to the heatmap, this is the number of distinct HMM
    profiles that have at least one hit in a given MAG.
    """
    return int((row != 0).sum())

# Number of distinct HMM profiles with at least one hit, per MAG (per row).
num_hits = edc_hits_heatmap.apply(count_nonzero, axis=1)
edc_hits_heatmap['num_hits'] = num_hits

# Order MAGs from most to fewest profiles hit.
edc_hits_heatmap = edc_hits_heatmap.sort_values(by="num_hits", ascending=False)

# Loose criterion: keep MAGs hit by at least 8 profiles (no required genes).
# Original author's note on MAG counts per cutoff:
# >7 = 1020, >8 = 597, >9 = 294, >10 = 111; start with 8, i.e. more than
# half of the query genes.
# The MAG identifier (row index) becomes the first column, named 'genome_id'.
edc_hits_FinalFilter = (
    edc_hits_heatmap[edc_hits_heatmap['num_hits'] >= 8]
    .rename_axis('genome_id')
    .reset_index()
)

# Only the MAG identifier and its hit count are needed downstream.
edc_Positive_MAGsID = edc_hits_FinalFilter[['genome_id', 'num_hits']]

# done
print('done')
edc_Positive_MAGsID

Any Null in TargetToMAGsID:  False
done


query_name,genome_id,num_hits
0,3300026101_4,14
1,3300020369_2,14
2,3300024344_6,13
3,3300027685_11,13
4,3300024431_16,13
...,...,...
592,3300010158_16,8
593,3300020432_23,8
594,3300024514_1,8
595,3300020477_54,8


##### 2.1.1 連結loose positive MAGs到 metagenomes 總資料中並進行處理成最終能給R繪製 stacked bar chart 的資料格式

In [12]:
# 需先執行上一個cell

# Load the per-genome metadata table.
metagenmoes_df = pd.read_csv('../data/external/genome_metadata_editForAnalysis_NotReference.csv')

# Left join so every positive MAG is kept even if it has no metadata row.
edc_Positive_metagenomes_df = pd.merge(edc_Positive_MAGsID, metagenmoes_df, on='genome_id', how='left')

# Optional checks kept from the original notebook:
# print(edc_Positive_metagenomes_df['ecosystem_category'].unique())
# print(edc_Positive_metagenomes_df['ecosystem_type'].unique())
# display(edc_Positive_metagenomes_df[edc_Positive_metagenomes_df['ecosystem_type'] == 'Bacteria']) #Digestive system, Anaerobic

# Keep only the columns needed for plotting. The explicit .copy() makes the
# result an independent frame, so the column assignments below do not act on
# a slice of the merged table (avoids SettingWithCopyWarning).
edc_Positive_metagenomes_df = edc_Positive_metagenomes_df[
    ['genome_id', 'metagenome_id', 'taxonomy', 'ecosystem', 'ecosystem_category', 'num_hits', 'longitude', 'latitude']
].copy()

# Pull the phylum (p__...) and class (c__...) labels out of the taxonomy
# string. Raw strings keep "\w" a regex escape rather than a string escape.
edc_Positive_metagenomes_df['Phylum'] = edc_Positive_metagenomes_df['taxonomy'].str.extract(r'(p__\w+)', expand=False)
edc_Positive_metagenomes_df['Class'] = edc_Positive_metagenomes_df['taxonomy'].str.extract(r'(c__\w+)', expand=False)

# Final column layout; 'taxonomy' is not listed, so reindex drops it.
edc_Positive_metagenomes_df = edc_Positive_metagenomes_df.reindex(
    columns=['genome_id', 'metagenome_id', 'num_hits', 'Phylum', 'Class', 'ecosystem', 'ecosystem_category', 'longitude', 'latitude']
)
edc_Positive_metagenomes_df['Homologous_cluster'] = 'Proteo_edc_cluster'

# Check the phylum labels and which columns contain nulls.
print(edc_Positive_metagenomes_df['Phylum'].unique())
print('Any Null in edc_Positive_metagenomes_df:\n', edc_Positive_metagenomes_df.isnull().any())

# Write the table for R (the row index is written as the first column).
edc_Positive_metagenomes_df.to_csv('../data/processed/Final/Proteo/edc_PositiveHits_ForR_loose.csv')
print('doen')
print('Number of p__UBP10:', edc_Positive_metagenomes_df[edc_Positive_metagenomes_df['Phylum'] == 'p__UBP10'].shape[0])
edc_Positive_metagenomes_df

['p__Proteobacteria' 'p__Myxococcota' 'p__Actinobacteriota' 'p__UBP10'
 'p__Chloroflexota' 'p__Methylomirabilota' 'p__Entotheonellota'
 'p__Desulfobacterota' 'p__SAR324' 'p__Spirochaetota']
Any Null in edc_Positive_metagenomes_df:
 genome_id             False
metagenome_id         False
num_hits              False
Phylum                False
Class                  True
ecosystem             False
ecosystem_category    False
longitude              True
latitude               True
Homologous_cluster    False
dtype: bool
doen
Number of p__UBP10: 48


Unnamed: 0,genome_id,metagenome_id,num_hits,Phylum,Class,ecosystem,ecosystem_category,longitude,latitude,Homologous_cluster
0,3300026101_4,3300026101,14,p__Proteobacteria,c__Gammaproteobacteria,Aquatic,Marine,-55.3027,9.7046,Proteo_edc_cluster
1,3300020369_2,3300020369,14,p__Proteobacteria,c__Gammaproteobacteria,Aquatic,Marine,-3.2659,-20.3356,Proteo_edc_cluster
2,3300024344_6,3300024344,13,p__Myxococcota,c__UBA9160,Aquatic,Marine,25.4868,36.5264,Proteo_edc_cluster
3,3300027685_11,3300027685,13,p__Myxococcota,c__UBA9160,Aquatic,Marine,-44.9905,-37.9728,Proteo_edc_cluster
4,3300024431_16,3300024431,13,p__Myxococcota,c__UBA9160,Aquatic,Marine,-177.6630,-32.8507,Proteo_edc_cluster
...,...,...,...,...,...,...,...,...,...,...
592,3300010158_16,3300010158,8,p__Actinobacteriota,c__Acidimicrobiia,Aquatic,Freshwater,-72.5000,46.8319,Proteo_edc_cluster
593,3300020432_23,3300020432,8,p__Actinobacteriota,c__Acidimicrobiia,Aquatic,Marine,-88.4916,25.7109,Proteo_edc_cluster
594,3300024514_1,3300024514,8,p__Proteobacteria,c__Alphaproteobacteria,Aquatic,Freshwater,-81.6053,31.4271,Proteo_edc_cluster
595,3300020477_54,3300020477,8,p__Actinobacteriota,c__Acidimicrobiia,Aquatic,Marine,-142.5770,-8.9103,Proteo_edc_cluster


#### 2.2 獲取positive的target name並連結MAGsID，整理出一個類似heatmap的table，再以嚴格hits數量進行篩選(>=8, edcA, edcB, edcC)

In [11]:
import os
import pandas as pd
import re


# Load the bit-score-filtered hit table and keep only the two columns that
# relate each hit protein (target_name) to the HMM profile (query_name).
All_edc_Hits_df = pd.read_csv('../data/processed/All_edc_Hits_df_bitscore.csv')
All_edc_Hits_TargetAndQuery = All_edc_Hits_df[['target_name', 'query_name']]

# Lookup table that maps each target_name to its MAGsID.
TarToMAGs_edc = pd.read_csv('../data/interim/edc_All_TarToMAGsID.csv')

# Left join: every hit is kept even when the lookup has no matching target.
edc_hits_TargetAndMAGsID = All_edc_Hits_TargetAndQuery.merge(
    TarToMAGs_edc, on='target_name', how='left'
)

# Report whether any hit failed to map to a MAG.
print('Any Null in TargetToMAGsID: ', edc_hits_TargetAndMAGsID['MAGsID'].isnull().any())

# Count hits per (query_name, MAGsID) pair, then transpose so that rows are
# MAGs and columns are HMM profiles (a heatmap-like table).
edc_hits_heatmap = pd.crosstab(
    edc_hits_TargetAndMAGsID['query_name'],
    edc_hits_TargetAndMAGsID['MAGsID'],
    dropna=False,
).T
# edc_hits_heatmap.to_csv('../data/processed/edc_hits_heatmap_bitscore.csv')

# Count the non-zero values in hmm profiles hit row to calculate the number of different HMM profiles that have hits in a given MAG. 
def count_nonzero(row):
    """Return how many entries of *row* are not equal to zero.

    Applied row-wise to the heatmap, this is the number of distinct HMM
    profiles that have at least one hit in a given MAG.
    """
    return int((row != 0).sum())

# Number of distinct HMM profiles with at least one hit, per MAG (per row).
num_hits = edc_hits_heatmap.apply(count_nonzero, axis=1)
edc_hits_heatmap['num_hits'] = num_hits

# Order MAGs from most to fewest profiles hit.
edc_hits_heatmap = edc_hits_heatmap.sort_values(by="num_hits", ascending=False)

# Strict criterion: at least 8 profiles hit AND a hit for each of the three
# required profiles (edcA, edcB, edcC).
# Original author's note: 34 hits (NOTE(review): may be stale -- confirm
# against the current output).
# The MAG identifier (row index) becomes the first column, named 'genome_id'.
edc_hits_FinalFilter = (
    edc_hits_heatmap[
        (edc_hits_heatmap['num_hits'] >= 8)
        & (edc_hits_heatmap[['Q_P_I_edcC_13580', 'Q_P_I_edcA_13525', 'Q_P_I_edcB_13570']] != 0).all(axis=1)
    ]
    .rename_axis('genome_id')
    .reset_index()
)

# Only the MAG identifier and its hit count are needed downstream.
edc_Positive_MAGsID = edc_hits_FinalFilter[['genome_id', 'num_hits']]

# done
print('done')
edc_Positive_MAGsID

Any Null in TargetToMAGsID:  False
done


query_name,genome_id,num_hits
0,3300020369_2,13
1,3300026101_4,13
2,3300024344_6,12
3,3300027685_11,12
4,3300025886_13,11
5,3300017444_33,11
6,3300006913_5,11
7,3300020460_9,10
8,3300020463_34,10
9,3300021343_7,10


##### 2.2.1 連結critical_positive MAGs到 metagenomes 總資料中並進行處理成最終能給R繪製 stacked bar chart 的資料格式

In [12]:
# 需先執行上一個cell
import os
import pandas as pd
import re

# Load the per-genome metadata table.
metagenmoes_df = pd.read_csv('../data/external/genome_metadata_editForAnalysis_NotReference.csv')

# Left join so every positive MAG is kept even if it has no metadata row.
edc_Positive_metagenomes_df = pd.merge(edc_Positive_MAGsID, metagenmoes_df, on='genome_id', how='left')

# Optional checks kept from the original notebook:
# print(edc_Positive_metagenomes_df['ecosystem_category'].unique())
# print(edc_Positive_metagenomes_df['ecosystem_type'].unique())
# display(edc_Positive_metagenomes_df[edc_Positive_metagenomes_df['ecosystem_type'] == 'Bacteria']) #Digestive system, Anaerobic

# Keep only the columns needed for plotting. The explicit .copy() makes the
# result an independent frame, so the column assignments below do not act on
# a slice of the merged table (avoids SettingWithCopyWarning).
edc_Positive_metagenomes_df = edc_Positive_metagenomes_df[
    ['genome_id', 'metagenome_id', 'taxonomy', 'ecosystem', 'ecosystem_category', 'num_hits', 'longitude', 'latitude']
].copy()

# Pull the phylum (p__...) and class (c__...) labels out of the taxonomy
# string. Raw strings keep "\w" a regex escape rather than a string escape.
edc_Positive_metagenomes_df['Phylum'] = edc_Positive_metagenomes_df['taxonomy'].str.extract(r'(p__\w+)', expand=False)
edc_Positive_metagenomes_df['Class'] = edc_Positive_metagenomes_df['taxonomy'].str.extract(r'(c__\w+)', expand=False)

# Final column layout; 'taxonomy' is not listed, so reindex drops it.
edc_Positive_metagenomes_df = edc_Positive_metagenomes_df.reindex(
    columns=['genome_id', 'metagenome_id', 'num_hits', 'Phylum', 'Class', 'ecosystem', 'ecosystem_category', 'longitude', 'latitude']
)
edc_Positive_metagenomes_df['Homologous_cluster'] = 'Proteo_edc_cluster'

# Check the phylum labels and which columns contain nulls.
print(edc_Positive_metagenomes_df['Phylum'].unique())
print('Any Null in edc_Positive_metagenomes_df:\n', edc_Positive_metagenomes_df.isnull().any())

# Write the table for R (the row index is written as the first column).
edc_Positive_metagenomes_df.to_csv('../data/processed/Final/Proteo/edc_PositiveHits_ForR_critical.csv')
print('doen')
edc_Positive_metagenomes_df

['p__Proteobacteria' 'p__Myxococcota']
Any Null in edc_Positive_metagenomes_df:
 genome_id             False
metagenome_id         False
num_hits              False
Phylum                False
Class                 False
ecosystem             False
ecosystem_category    False
longitude             False
latitude              False
Homologous_cluster    False
dtype: bool
doen


Unnamed: 0,genome_id,metagenome_id,num_hits,Phylum,Class,ecosystem,ecosystem_category,longitude,latitude,Homologous_cluster
0,3300020369_2,3300020369,13,p__Proteobacteria,c__Gammaproteobacteria,Aquatic,Marine,-3.2659,-20.3356,Proteo_edc_cluster
1,3300026101_4,3300026101,13,p__Proteobacteria,c__Gammaproteobacteria,Aquatic,Marine,-55.3027,9.7046,Proteo_edc_cluster
2,3300024344_6,3300024344,12,p__Myxococcota,c__UBA9160,Aquatic,Marine,25.4868,36.5264,Proteo_edc_cluster
3,3300027685_11,3300027685,12,p__Myxococcota,c__UBA9160,Aquatic,Marine,-44.9905,-37.9728,Proteo_edc_cluster
4,3300025886_13,3300025886,11,p__Myxococcota,c__UBA9160,Aquatic,Marine,7.9,54.1842,Proteo_edc_cluster
5,3300017444_33,3300017444,11,p__Myxococcota,c__UBA9160,Aquatic,Thermal springs,-125.513,60.1987,Proteo_edc_cluster
6,3300006913_5,3300006913,11,p__Proteobacteria,c__Gammaproteobacteria,Host-associated,Annelida,12.3946,51.3216,Proteo_edc_cluster
7,3300020460_9,3300020460,10,p__Proteobacteria,c__Gammaproteobacteria,Aquatic,Marine,37.2517,23.4217,Proteo_edc_cluster
8,3300020463_34,3300020463,10,p__Proteobacteria,c__Alphaproteobacteria,Aquatic,Marine,-87.0917,-32.7806,Proteo_edc_cluster
9,3300021343_7,3300021343,10,p__Proteobacteria,c__Gammaproteobacteria,Aquatic,Marine,-81.2145,29.6703,Proteo_edc_cluster


# 不執行

#### 2.3 獲取positive的target name並連結MAGsID，整理出一個類似heatmap的table，再以原先討論的hits數量進行篩選來看差異(>=10)

In [3]:
# Got target name and query name
import os
import pandas as pd
import re

# Load the bit-score-filtered hit table and keep only the two columns that
# relate each hit protein (target_name) to the HMM profile (query_name).
All_edc_Hits_df = pd.read_csv('../data/processed/All_edc_Hits_df_bitscore.csv')
All_edc_Hits_TargetAndQuery = All_edc_Hits_df[['target_name', 'query_name']]

# Lookup table that maps each target_name to its MAGsID.
TarToMAGs_edc = pd.read_csv('../data/interim/edc_All_TarToMAGsID.csv')

# Left join: every hit is kept even when the lookup has no matching target.
edc_hits_TargetAndMAGsID = All_edc_Hits_TargetAndQuery.merge(
    TarToMAGs_edc, on='target_name', how='left'
)

# Report whether any hit failed to map to a MAG.
print('Any Null: ', edc_hits_TargetAndMAGsID['MAGsID'].isnull().any())

# Count hits per (query_name, MAGsID) pair, then transpose so that rows are
# MAGs and columns are HMM profiles (a heatmap-like table).
edc_hits_heatmap = pd.crosstab(
    edc_hits_TargetAndMAGsID['query_name'],
    edc_hits_TargetAndMAGsID['MAGsID'],
    dropna=False,
).T
# edc_hits_heatmap.to_csv('../data/processed/edc_hits_heatmap_bitscore.csv')

# Count the non-zero values in hmm profiles hit row to calculate the number of different HMM profiles that have hits in a given MAG. 
def count_nonzero(row):
    """Return how many entries of *row* are not equal to zero.

    Applied row-wise to the heatmap, this is the number of distinct HMM
    profiles that have at least one hit in a given MAG.
    """
    return int((row != 0).sum())

# Number of distinct HMM profiles with at least one hit, per MAG (per row).
num_hits = edc_hits_heatmap.apply(count_nonzero, axis=1)
edc_hits_heatmap['num_hits'] = num_hits

# Order MAGs from most to fewest profiles hit.
edc_hits_heatmap = edc_hits_heatmap.sort_values(by="num_hits", ascending=False)

# Keep MAGs hit by at least 10 profiles; no required genes in this variant.
# Original author's note: 111 hits (NOTE(review): may be stale -- confirm
# against the current output).
# The MAG identifier (row index) becomes the first column, named 'genome_id'.
edc_hits_FinalFilter = (
    edc_hits_heatmap[edc_hits_heatmap['num_hits'] >= 10]
    .rename_axis('genome_id')
    .reset_index()
)

# Only the MAG identifier and its hit count are needed downstream.
edc_Positive_MAGsID = edc_hits_FinalFilter[['genome_id', 'num_hits']]

# done
print('done')
edc_Positive_MAGsID

Any Null:  False
done


query_name,genome_id,num_hits
0,3300020369_2,14
1,3300026101_4,14
2,3300027685_11,13
3,3300024344_6,13
4,3300024431_16,13
...,...,...
106,3300027838_39,10
107,3300026091_10,10
108,3300006083_2,10
109,3300007519_68,10


##### 2.3.1 連結 10 Hits positive (>=10) MAGs到 metagenomes 總資料中並進行處理成最終能給R繪製 stacked bar chart 的資料格式

In [26]:
# 需先執行上一個cell
import os
import pandas as pd
import re

# Load the per-genome metadata table. Its 'ecosystem' column is renamed to
# 'taxonomy' because that column is parsed for p__/c__ labels below.
metagenmoes_df = pd.read_csv('../data/external/Paper_genome_metadata.csv')
metagenmoes_df = metagenmoes_df.rename(columns={'ecosystem': 'taxonomy'})

# Left join so every positive MAG is kept even if it has no metadata row.
edc_Positive_metagenomes_df = pd.merge(edc_Positive_MAGsID, metagenmoes_df, on='genome_id', how='left')

# Optional checks kept from the original notebook:
# print(edc_Positive_metagenomes_df['ecosystem_category'].unique())
# print(edc_Positive_metagenomes_df['ecosystem_type'].unique())
# display(edc_Positive_metagenomes_df[edc_Positive_metagenomes_df['ecosystem_type'] == 'Bacteria']) #Digestive system, Anaerobic

# Keep only the columns needed. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not act on a slice
# of the merged table (avoids SettingWithCopyWarning).
edc_Positive_metagenomes_df = edc_Positive_metagenomes_df[['genome_id', 'taxonomy', 'ecosystem_type', 'num_hits']].copy()

# Pull the phylum (p__...) and class (c__...) labels out of the taxonomy
# string. Raw strings keep "\w" a regex escape rather than a string escape.
edc_Positive_metagenomes_df['Phylum'] = edc_Positive_metagenomes_df['taxonomy'].str.extract(r'(p__\w+)', expand=False)
edc_Positive_metagenomes_df['Class'] = edc_Positive_metagenomes_df['taxonomy'].str.extract(r'(c__\w+)', expand=False)

# Lookup table joined on 'ecosystem_type'; it supplies the 'ecosystem' column.
ForEcoLookup = pd.read_csv("../data/interim/EcosystemToEco_type.csv")
edc_addEco_df = pd.merge(edc_Positive_metagenomes_df, ForEcoLookup, on='ecosystem_type', how='left')

# To list rows with missing values after the lookup:
# edc_addEco_df[edc_addEco_df.isna().any(axis=1)]

# Collapse fine-grained ecosystem_type labels into broader plotting labels.
# This runs after the lookup merge, so the merge still sees the raw labels.
edc_addEco_df['ecosystem_type'] = edc_addEco_df['ecosystem_type'].replace({
    'Mycelium': 'Fungi',
    'Rhizosphere': 'Plant',
    'Industrial wastewater': 'Wastewater',
    "Nutrient removal": "Wastewater",
    "Rhizoplane": "Plant",
    "Activated Sludge": "Wastewater",
    "Defined media": "Lab enrichment",
})
print(edc_addEco_df['ecosystem_type'].unique())

# Final column layout; 'taxonomy' is not listed, so reindex drops it.
edc_addEco_df = edc_addEco_df.reindex(columns=['genome_id', 'num_hits', 'Phylum', 'Class', 'ecosystem', 'ecosystem_type'])
edc_addEco_df['Homologous_cluster'] = 'Proteo_edc_cluster'

# check phylum data
# print(edc_addEco_df['Phylum'].unique())

# Write the table for R (the row index is written as the first column).
edc_addEco_df.to_csv('../data/processed/Final/Proteo/edc_PositiveHits_ForR_10Hits.csv')
print('doen')
edc_addEco_df

['Marine' 'Fungi' 'Thermal spring' 'Soil' 'Freshwater' 'Annelida' 'Plant'
 'Cnidaria' 'Wastewater' 'Lab enrichment' 'Sediment']
doen


Unnamed: 0,genome_id,num_hits,Phylum,Class,ecosystem,ecosystem_type,Homologous_cluster
0,3300020369_2,14,p__Proteobacteria,c__Gammaproteobacteria,Aquatic,Marine,Proteo_edc_cluster
1,3300026101_4,14,p__Proteobacteria,c__Gammaproteobacteria,Aquatic,Marine,Proteo_edc_cluster
2,3300027685_11,13,p__Myxococcota,c__UBA9160,Aquatic,Marine,Proteo_edc_cluster
3,3300024344_6,13,p__Myxococcota,c__UBA9160,Aquatic,Marine,Proteo_edc_cluster
4,3300024431_16,13,p__Myxococcota,c__UBA9160,Aquatic,Marine,Proteo_edc_cluster
...,...,...,...,...,...,...,...
106,3300027838_39,10,p__UBP10,c__GR,Aquatic,Marine,Proteo_edc_cluster
107,3300026091_10,10,p__Actinobacteriota,c__Acidimicrobiia,Aquatic,Marine,Proteo_edc_cluster
108,3300006083_2,10,p__Actinobacteriota,c__Acidimicrobiia,Aquatic,Marine,Proteo_edc_cluster
109,3300007519_68,10,p__Actinobacteriota,c__Acidimicrobiia,Aquatic,Freshwater,Proteo_edc_cluster


## 第二部分: 舊版參考前人標準 (e-value coverage)，不含有bit score
DOI: https://doi.org/10.1128/mBio.00166-16

### 1. 用一致的e-value與coverage進行篩選

In [2]:
import os
import pandas as pd
import re

# Proteo MAGs reference filtering
# Directory containing the hmmsearch "domtblout" files. The description text
# of the genome protein FASTA must be removed beforehand, because embedded
# spaces break the whitespace-delimited parsing used below.
domtblout_dir = "../data/raw/Proteo_HMM_MAGs_domtblout/"

# Names of the hmmsearches (without file extension).
# Original author's note: "delet aedC,Q,R".
hmmsearches = ['MAGs_P_13530', 'MAGs_P_13535', 'MAGs_P_13540', 'MAGs_P_13545', 'MAGs_P_13550', 'MAGs_P_13555', 'MAGs_P_13560', 'MAGs_P_13565', 'MAGs_P_13575',
               'MAGs_P_13585', 'MAGs_P_13590', 'MAGs_P_13595', 'MAGs_P_I_edcA_13525', 'MAGs_P_I_edcB_13570', 'MAGs_P_I_edcC_13580']

# Column names assigned to every parsed domtblout table.
columns = ["target_name", "accession", "tlen", "query_name", "accession2", "qlen", "E-value", "score", "bias",
           "num_domains_index", "num_domains_total", "c-Evalue", "i-Evalue", "score2", "bias2", "hmm_from",
           "hmm_to", "ali_from", "ali_to", "env_from", "env_to", "acc", "description"]

MAGs_Hits = {}        # hmmsearch name -> list of significant target names (None if none)
MAGs_Hits_name = []
hit_frames = []       # per-search tables of significant hits, in search order

# Parse each domtblout file and keep hits passing the e-value/coverage cut-offs.
for hmmsearch in hmmsearches:
    file_path = os.path.join(domtblout_dir, hmmsearch + ".domtblout")
    try:
        # Raw string so "\s" is a regex escape, not a string escape.
        df = pd.read_csv(file_path, comment="#", sep=r'\s+', header=None)
    except pd.errors.EmptyDataError:
        # A file with no data rows means this profile has no hits at all.
        MAGs_Hits_name = None
        MAGs_Hits[hmmsearch] = MAGs_Hits_name
        continue

    df.columns = columns
    # Fraction of the target sequence spanned by the alignment.
    df["coverage"] = (df["ali_to"] - df["ali_from"] + 1) / df["tlen"]

    # Keep hits with E-value <= 1e-25 and coverage > 0.30.
    significant_hits_Ref = df[(df["E-value"] <= 1e-25) & (df["coverage"] > 0.30)]

    if significant_hits_Ref.empty:
        MAGs_Hits_name = None
    else:
        MAGs_Hits_name = significant_hits_Ref["target_name"].tolist()
        hit_frames.append(significant_hits_Ref)
    MAGs_Hits[hmmsearch] = MAGs_Hits_name

# Concatenate once instead of growing a frame inside the loop. The tables are
# reversed so the last search comes first, which is the row order produced by
# the previous implementation (it prepended each new table). Row indices are
# kept as parsed, so the written CSV has the same index column as before.
if hit_frames:
    All_edc_Hits_df = pd.concat(hit_frames[::-1], axis=0)
else:
    All_edc_Hits_df = pd.DataFrame(columns=columns + ["coverage"])

All_edc_Hits_df.to_csv('../data/processed/All_edc_Hits_df_refrence.csv')

# done
print('done')
All_edc_Hits_df

done


Unnamed: 0,target_name,accession,tlen,query_name,accession2,qlen,E-value,score,bias,num_domains_index,...,bias2,hmm_from,hmm_to,ali_from,ali_to,env_from,env_to,acc,description,coverage
0,3300020463.a:Ga0211676_10006390_8,-,1162,Q_P_I_edcC_13580,-,1145,0.000000e+00,1994.3,0.0,1,...,0.0,1,1143,12,1142,12,1144,0.99,-,0.973322
1,3300020369.a:Ga0211709_10000577_18,-,1185,Q_P_I_edcC_13580,-,1145,0.000000e+00,1991.4,0.0,1,...,0.0,1,1144,20,1167,20,1168,0.98,-,0.968776
2,3300014833.a:Ga0119870_1000327_7,-,1168,Q_P_I_edcC_13580,-,1145,0.000000e+00,1871.7,0.0,1,...,0.0,1,1144,6,1154,6,1155,0.98,-,0.983733
3,3300024431.a:Ga0209988_10007392_4,-,1169,Q_P_I_edcC_13580,-,1145,0.000000e+00,1735.0,0.0,1,...,0.0,1,1143,14,1162,14,1164,0.98,-,0.982891
4,3300017444.a:Ga0185300_10000551_81,-,1185,Q_P_I_edcC_13580,-,1145,0.000000e+00,1721.2,0.0,1,...,0.0,1,1144,37,1182,37,1183,0.98,-,0.967089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,3300020403.a:Ga0211532_10000028_18,-,149,Q_P_13530,-,144,1.000000e-25,102.6,0.4,1,...,0.4,6,134,16,135,13,147,0.87,-,0.805369
372,3300020417.a:Ga0211528_10001553_12,-,149,Q_P_13530,-,144,1.000000e-25,102.6,0.4,1,...,0.4,6,134,16,135,13,147,0.87,-,0.805369
373,3300020418.a:Ga0211557_10001031_4,-,149,Q_P_13530,-,144,1.000000e-25,102.6,0.4,1,...,0.4,6,134,16,135,13,147,0.87,-,0.805369
374,3300020453.a:Ga0211550_10000005_13,-,149,Q_P_13530,-,144,1.000000e-25,102.6,0.4,1,...,0.4,6,134,16,135,13,147,0.87,-,0.805369


### 2. 獲取 positive的 target name 並連結 MAGs 並整理出一個類似heatmap的table，再以 >= 10 hits數量進行篩選，並有三個必須基因的限制(edcA, edcB, edcC)

In [4]:
import os
import pandas as pd
import re


# Load the e-value/coverage-filtered hit table and keep only the two columns
# that relate each hit protein (target_name) to the HMM profile (query_name).
All_edc_Hits_df = pd.read_csv('../data/processed/All_edc_Hits_df_refrence.csv')
All_edc_Hits_TargetAndQuery = All_edc_Hits_df[['target_name', 'query_name']]

# Lookup table that maps each target_name to its MAGsID.
TarToMAGs_edc = pd.read_csv('../data/interim/edc_All_TarToMAGsID.csv')

# Left join: every hit is kept even when the lookup has no matching target.
edc_hits_TargetAndMAGsID = All_edc_Hits_TargetAndQuery.merge(
    TarToMAGs_edc, on='target_name', how='left'
)

# Report whether any hit failed to map to a MAG.
print('Any Null: ', edc_hits_TargetAndMAGsID['MAGsID'].isnull().any())

# Count hits per (query_name, MAGsID) pair, then transpose so that rows are
# MAGs and columns are HMM profiles (a heatmap-like table).
edc_hits_heatmap = pd.crosstab(
    edc_hits_TargetAndMAGsID['query_name'],
    edc_hits_TargetAndMAGsID['MAGsID'],
    dropna=False,
).T
# edc_hits_heatmap.to_csv('../data/processed/edc_hits_heatmap_bitscore.csv')

# Count the non-zero values in hmm profiles hit row to calculate the number of different HMM profiles that have hits in a given MAG. 
def count_nonzero(row):
    """Return how many entries of *row* are not equal to zero.

    Applied row-wise to the heatmap, this is the number of distinct HMM
    profiles that have at least one hit in a given MAG.
    """
    return int((row != 0).sum())

# Number of distinct HMM profiles with at least one hit, per MAG (per row).
num_hits = edc_hits_heatmap.apply(count_nonzero, axis=1)
edc_hits_heatmap['num_hits'] = num_hits

# Order MAGs from most to fewest profiles hit.
edc_hits_heatmap = edc_hits_heatmap.sort_values(by="num_hits", ascending=False)

# Keep MAGs hit by at least 10 profiles AND with a hit for each of the three
# required profiles (edcA, edcB, edcC).
# The MAG identifier (row index) becomes the first column, named 'genome_id'.
edc_hits_FinalFilter = (
    edc_hits_heatmap[
        (edc_hits_heatmap['num_hits'] >= 10)
        & (edc_hits_heatmap[['Q_P_I_edcC_13580', 'Q_P_I_edcA_13525', 'Q_P_I_edcB_13570']] != 0).all(axis=1)
    ]
    .rename_axis('genome_id')
    .reset_index()
)

# Only the MAG identifier and its hit count are needed downstream.
edc_Positive_MAGsID = edc_hits_FinalFilter[['genome_id', 'num_hits']]

# done
print('done')
edc_Positive_MAGsID

Any Null:  False
done


query_name,genome_id,num_hits
0,3300020369_2,15
1,3300012940_39,15
2,3300020193_17,15
3,3300024344_6,15
4,3300027951_10,14
...,...,...
286,3300025617_10,10
287,3300027851_25,10
288,3300025590_3,10
289,3300025594_3,10


### 3. 連結 Reference_positive MAGs到 metagenomes 總資料中並進行處理成最終能給R繪製 stacked bar chart 的資料格式

In [29]:
# 需先執行上一個cell
import os
import pandas as pd
import re

# Load the per-genome metadata table. Its 'ecosystem' column is renamed to
# 'taxonomy' because that column is parsed for p__/c__ labels below.
metagenmoes_df = pd.read_csv('../data/external/Paper_genome_metadata.csv')
metagenmoes_df = metagenmoes_df.rename(columns={'ecosystem': 'taxonomy'})

# Left join so every positive MAG is kept even if it has no metadata row.
edc_Positive_metagenomes_df = pd.merge(edc_Positive_MAGsID, metagenmoes_df, on='genome_id', how='left')

# Optional checks kept from the original notebook:
# print(edc_Positive_metagenomes_df['ecosystem_category'].unique())
# print(edc_Positive_metagenomes_df['ecosystem_type'].unique())
# display(edc_Positive_metagenomes_df[edc_Positive_metagenomes_df['ecosystem_type'] == 'Bacteria']) #Anaerobic

# Keep only the columns needed. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not act on a slice
# of the merged table (avoids SettingWithCopyWarning).
edc_Positive_metagenomes_df = edc_Positive_metagenomes_df[['genome_id', 'taxonomy', 'ecosystem_type', 'num_hits']].copy()

# Pull the phylum (p__...) and class (c__...) labels out of the taxonomy
# string. Raw strings keep "\w" a regex escape rather than a string escape.
edc_Positive_metagenomes_df['Phylum'] = edc_Positive_metagenomes_df['taxonomy'].str.extract(r'(p__\w+)', expand=False)
edc_Positive_metagenomes_df['Class'] = edc_Positive_metagenomes_df['taxonomy'].str.extract(r'(c__\w+)', expand=False)

# Lookup table joined on 'ecosystem_type'; it supplies the 'ecosystem' column.
ForEcoLookup = pd.read_csv("../data/interim/EcosystemToEco_type.csv")
edc_addEco_df = pd.merge(edc_Positive_metagenomes_df, ForEcoLookup, on='ecosystem_type', how='left')

# To list rows with missing values after the lookup:
# edc_addEco_df[edc_addEco_df.isna().any(axis=1)]

# Collapse fine-grained ecosystem_type labels into broader plotting labels.
# This runs after the lookup merge, so the merge still sees the raw labels.
edc_addEco_df['ecosystem_type'] = edc_addEco_df['ecosystem_type'].replace({
    "Activated Sludge": "Wastewater",
    "Nutrient removal": "Wastewater",
    "Rhizoplane": "Plant",
    'Mycelium': 'Fungi',
    'Peat moss': "Plant",
    'Rhizosphere': 'Plant',
    "Defined media": "Lab enrichment",
    'Continuous culture': 'Bioreactor',
    'Geologic': 'Soil',
    'Tetrachloroethylene and derivatives': 'Bioremediation',
    'Red algae': 'Algae',
    'Green algae': 'Algae',
    'Ant dump': 'Arthropoda',
    'Simulated communities (microbial mixture)': 'Lab enrichment',
    'Anaerobic digestor': 'Wastewater',
    'Composting': 'Solid waste',
    'Phylloplane': "Plant",
    'City': 'Built environment',
})
print(edc_addEco_df['ecosystem_type'].unique())

# Final column layout; 'taxonomy' is not listed, so reindex drops it.
edc_addEco_df = edc_addEco_df.reindex(columns=['genome_id', 'num_hits', 'Phylum', 'Class', 'ecosystem', 'ecosystem_type'])
edc_addEco_df['Homologous_cluster'] = 'Proteo_edc_cluster'

# check phylum data
# print(edc_addEco_df['Phylum'].unique())

# Write the table for R (the row index is written as the first column).
edc_addEco_df.to_csv('../data/processed/Final/Proteo/edc_PositiveHits_ForR_Reference.csv')
print('doen')
edc_addEco_df

['Marine' 'Soil' 'Freshwater' 'Porifera' 'Thermal spring' 'Wastewater'
 'Plant' 'Annelida' 'Built environment' 'Deep subsurface' 'Fungi'
 'Non-marine Saline and Alkaline' 'Lab enrichment' 'Bioreactor'
 'Bioremediation' 'Algae' 'Sediment' 'Arthropoda' 'Mammal' 'Solid waste'
 'Insecta']
doen


Unnamed: 0,genome_id,num_hits,Phylum,Class,ecosystem,ecosystem_type,Homologous_cluster
0,3300020369_2,15,p__Proteobacteria,c__Gammaproteobacteria,Aquatic,Marine,Proteo_edc_cluster
1,3300012940_39,15,p__Proteobacteria,c__Gammaproteobacteria,Terrestrial,Soil,Proteo_edc_cluster
2,3300020193_17,15,p__Myxococcota,c__UBA9160,Aquatic,Freshwater,Proteo_edc_cluster
3,3300024344_6,15,p__Myxococcota,c__UBA9160,Aquatic,Marine,Proteo_edc_cluster
4,3300027951_10,14,p__Actinobacteriota,c__Acidimicrobiia,Host-associated,Porifera,Proteo_edc_cluster
...,...,...,...,...,...,...,...
286,3300025617_10,10,p__Proteobacteria,c__Gammaproteobacteria,Aquatic,Marine,Proteo_edc_cluster
287,3300027851_25,10,p__Proteobacteria,c__Gammaproteobacteria,Aquatic,Freshwater,Proteo_edc_cluster
288,3300025590_3,10,p__Actinobacteriota,c__Actinobacteria,Aquatic,Marine,Proteo_edc_cluster
289,3300025594_3,10,p__Actinobacteriota,c__Actinobacteria,Aquatic,Marine,Proteo_edc_cluster


# 補充部分

In [44]:
# # 
# TarToMAGs_edc2560 = pd.read_csv('../data/interim/edc_135_2560_TargetMAGsID.csv')
# TarToMAGs_edc6595 = pd.read_csv('../data/interim/edc_135_6595_TargetMAGsID.csv')
# TarToMAGs_edc_All = pd.concat([TarToMAGs_edc2560, TarToMAGs_edc6595], axis=0)
# TarToMAGs_edc_All = TarToMAGs_edc_All.drop_duplicates(subset='target_name')

# TarToMAGs_edc_All.to_csv('../data/interim/edc_All_TarToMAGsID.csv')
# # TarToMAGs_aed_All.to_csv('../data/interim/aed_TargetToMAGsID_all.csv')


In [30]:
# Look up one MAG by genome_id in whichever positive-MAG table is currently
# in memory (the result depends on which filter cell was run last).
edc_Positive_MAGsID.loc[edc_Positive_MAGsID['genome_id'] == '3300026863_7']

query_name,genome_id,num_hits


In [33]:
# Show the positive MAGs whose ecosystem_type is labelled 'Anaerobic'.
edc_Positive_metagenomes_df.loc[edc_Positive_metagenomes_df['ecosystem_type'] == 'Anaerobic']

Unnamed: 0,genome_id,taxonomy,ecosystem_type,num_hits,Phylum,Class
280,3300026195_9,d__Bacteria;p__Actinobacteriota;c__Actinobacte...,Anaerobic,10,p__Actinobacteriota,c__Actinobacteria
281,3300026194_9,d__Bacteria;p__Actinobacteriota;c__Actinobacte...,Anaerobic,10,p__Actinobacteriota,c__Actinobacteria


In [39]:
# Reload the metadata table; its 'ecosystem' column is renamed to 'taxonomy'.
metagenmoes_df = pd.read_csv('../data/external/Paper_genome_metadata.csv').rename(columns={'ecosystem': 'taxonomy'})

# Left join the current positive MAGs onto the metadata.
edc_Positive_metagenomes_df = edc_Positive_MAGsID.merge(metagenmoes_df, on='genome_id', how='left')

# List the distinct ecosystem labels present among the positive MAGs.
for label_column in ('ecosystem_category', 'ecosystem_type'):
    print(edc_Positive_metagenomes_df[label_column].unique())

# Show rows whose ecosystem_type is 'Unclassified', then those that are 'Anaerobic'.
display(edc_Positive_metagenomes_df.loc[edc_Positive_metagenomes_df['ecosystem_type'] == 'Unclassified'])

edc_Positive_metagenomes_df.loc[edc_Positive_metagenomes_df['ecosystem_type'] == 'Anaerobic']

['Aquatic' 'Terrestrial' 'Host-associated' 'Wastewater' 'Plant'
 'Digestive system' 'Built environment' 'Fungi' 'Lab enrichment'
 'Bioreactor' 'Anaerobic' 'Bioremediation' 'Algae' 'Arthropoda' 'Modeled'
 'Solid waste' 'Unclassified']
['Marine' 'Soil' 'Freshwater' 'Porifera' 'Thermal spring'
 'Activated Sludge' 'Nutrient removal' 'Rhizoplane' 'Annelida' 'City'
 'Deep subsurface' 'Mycelium' 'Peat moss' 'Non-marine Saline and Alkaline'
 'Rhizosphere' 'Defined media' 'Continuous culture' 'Bioreactor'
 'Geologic' 'Tetrachloroethylene and derivatives' 'Red algae'
 'Green algae' 'Sediment' 'Ant dump'
 'Simulated communities (microbial mixture)' 'Mammal' 'Anaerobic digestor'
 'Composting' 'Arthropoda' 'Insecta' 'Phylloplane']


Unnamed: 0,genome_id,num_hits,metagenome_id,genome_length,num_contigs,n50,num_16s,num_5s,num_23s,num_trna,...,contamination,quality_score,mimag_quality,otu_id,taxonomy,ecosystem_category,ecosystem_type,habitat,longitude,latitude


Unnamed: 0,genome_id,num_hits,metagenome_id,genome_length,num_contigs,n50,num_16s,num_5s,num_23s,num_trna,...,contamination,quality_score,mimag_quality,otu_id,taxonomy,ecosystem_category,ecosystem_type,habitat,longitude,latitude


In [10]:
# Look up one MAG by genome_id in whichever positive-MAG table is currently
# in memory (the result depends on which filter cell was run last).
edc_Positive_MAGsID.loc[edc_Positive_MAGsID['genome_id'] == '3300026863_7']

query_name,genome_id,num_hits
272,3300026863_7,9
