# Note2_關於actino 的MAGs分析

## 第一部分: 含有bitscore的篩選分析

### 1.1 使用 positive control 來了解 hmm profiles 的分析情形 (e-value、coverage、bit-score)

In [1]:
# HMM actino positive control bit-score threshold
import os
import pandas as pd
import re

#------------------------------------------------------------------------
# Build the best bit-score, E-value and coverage tables of one positive-control strain.
# In the domtblout file names the strain prefix must directly precede "aed"
# (e.g. ..._aedA_I_RS26385, NOT ..._I_aedA_RS26385).
def positive_control_df(control_dir, control_names, BitScore_df, Evalue_df, Coverage_df):
    """Summarise the best significant hit of every control hmmsearch of one strain.

    For each name in ``control_names`` the file ``<control_dir>/<name>.domtblout``
    is read as a whitespace-separated table (lines starting with ``#`` are
    skipped) and given the 23 domtblout column names used below.  Rows are
    kept when ``E-value < 0.001`` and ``(ali_to - ali_from + 1) / tlen > 0.50``;
    among those, the row with the highest ``score`` supplies the reported
    bit-score, E-value and coverage (the first such row when scores tie).

    Parameters
    ----------
    control_dir : str
        Directory holding the ``.domtblout`` files.
    control_names : list of str
        File names without the ``.domtblout`` extension.  The part of the
        first name matching ``A_.*_aed`` is used to label the single row of
        each returned table; in every name that same pattern is replaced by
        ``A_aed`` to form the column label.
    BitScore_df, Evalue_df, Coverage_df
        Ignored.  Kept only so that existing calls keep working.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(bit_score, e_value, coverage)``, each with one row labelled
        ``<strain>_bit_score`` / ``<strain>_Evalue`` / ``<strain>_coverage``
        and one column per control name.  A cell is ``None`` when the file
        has no data rows or no row passes the filter.

    Raises
    ------
    ValueError
        If ``control_names`` is empty or its first entry does not match
        ``A_.*_aed``.
    """
    if not control_names:
        raise ValueError("control_names must contain at least one hmmsearch name")

    # The strain prefix (e.g. "A_Dietzia_sp_B32_aed") labels the row of each table
    strain_pattern = re.compile(r'A_.*_aed')
    strain_match = strain_pattern.search(control_names[0])
    if strain_match is None:
        raise ValueError(
            "cannot derive the strain name: %r does not match 'A_.*_aed'" % (control_names[0],)
        )
    strain_name = strain_match.group(0)

    domtblout_columns = [
        "target_name", "accession", "tlen", "query_name", "accession2", "qlen",
        "E-value", "score", "bias", "num_domains_index", "num_domains_total",
        "c-Evalue", "i-Evalue", "score2", "bias2", "hmm_from", "hmm_to",
        "ali_from", "ali_to", "env_from", "env_to", "acc", "description",
    ]

    best_bit_scores = {}
    evalue_scores = {}
    coverage_scores = {}

    for control_name in control_names:
        # Strain-independent column label, e.g. "A_aedA_I_RS26385"
        query_name = strain_pattern.sub('A_aed', control_name)

        # Start from "no significant hit"; overwritten below when one is found
        best_bit_scores[query_name] = None
        evalue_scores[query_name] = None
        coverage_scores[query_name] = None

        file_path = os.path.join(control_dir, control_name + ".domtblout")
        try:
            df = pd.read_csv(file_path, comment="#", sep=r'\s+', header=None)
        except pd.errors.EmptyDataError:
            # The file holds nothing but comment lines
            continue
        if df.empty:
            continue

        df.columns = domtblout_columns
        # Fraction of the target sequence covered by the alignment
        df["coverage"] = (df["ali_to"] - df["ali_from"] + 1) / df["tlen"]

        significant_hits = df[(df["E-value"] < 0.001) & (df["coverage"] > 0.50)]
        if significant_hits.empty:
            continue

        best_hit = significant_hits.loc[significant_hits["score"].idxmax()]
        best_bit_scores[query_name] = best_hit["score"]
        evalue_scores[query_name] = best_hit["E-value"]
        coverage_scores[query_name] = best_hit["coverage"]

    BitScore_df = pd.DataFrame(best_bit_scores, index=[strain_name + '_bit_score'])
    Evalue_df = pd.DataFrame(evalue_scores, index=[strain_name + '_Evalue'])
    Coverage_df = pd.DataFrame(coverage_scores, index=[strain_name + '_coverage'])

    return BitScore_df, Evalue_df, Coverage_df


#---------------------------------------------------------------
# Load the five positive-control strains.
# Every strain was searched with the same 15 aed HMM profiles, so each file
# name (without the ".domtblout" extension) is "<strain>_<gene suffix>".
# aedC_RS26365, aedQ_RS26445 and aedR_RS26450 are deliberately left out
# (original note: they are not homologous in all strains).
_positive_control_genes = [
    'aedA_I_RS26385', 'aedB_I_RS26395', 'aedD_RS26370', 'aedE_RS26375',
    'aedF_I_RS26380', 'aedG_I_RS26390', 'aedH_I_RS26400', 'aedI_RS26405',
    'aedJ_I_RS26410', 'aedK_I_RS26415', 'aedL_RS26420', 'aedM_RS26425',
    'aedN_RS26430', 'aedO_RS26435', 'aedP_RS26440',
]

# The last three arguments of positive_control_df are overwritten inside the
# function, so fresh empty frames are passed as placeholders.

# A_Dietzia_sp_B32
control_dir = "../data/raw/Actino_HMM_Control/Positive/A_Dietzia_sp_B32/"
control_names = ['A_Dietzia_sp_B32_' + gene for gene in _positive_control_genes]
B32_BitScore, B32_Evalue, B32_coverage = positive_control_df(
    control_dir, control_names, pd.DataFrame(), pd.DataFrame(), pd.DataFrame())

# A_Mycobacteroides_chelonae_S00154
control_dir = "../data/raw/Actino_HMM_Control/Positive/A_Mycobacteroides_chelonae_S00154/"
control_names = ['A_Mycobacteroides_chelonae_S00154_' + gene for gene in _positive_control_genes]
S00154_BitScore, S00154_Evalue, S00154_coverage = positive_control_df(
    control_dir, control_names, pd.DataFrame(), pd.DataFrame(), pd.DataFrame())

# A_Rhodococcus_equi_DSSKP_R_001
control_dir = "../data/raw/Actino_HMM_Control/Positive/A_Rhodococcus_equi_DSSKP_R_001/"
control_names = ['A_Rhodococcus_equi_DSSKP_R_001_' + gene for gene in _positive_control_genes]
R_001_BitScore, R_001_Evalue, R_001_coverage = positive_control_df(
    control_dir, control_names, pd.DataFrame(), pd.DataFrame(), pd.DataFrame())

# A_Rhodococcus_sp_B50
control_dir = "../data/raw/Actino_HMM_Control/Positive/A_Rhodococcus_sp_B50/"
control_names = ['A_Rhodococcus_sp_B50_' + gene for gene in _positive_control_genes]
B50_BitScore, B50_Evalue, B50_coverage = positive_control_df(
    control_dir, control_names, pd.DataFrame(), pd.DataFrame(), pd.DataFrame())

# A_Tomitella_gaofuii_HY172
control_dir = "../data/raw/Actino_HMM_Control/Positive/A_Tomitella_gaofuii_HY172/"
control_names = ['A_Tomitella_gaofuii_HY172_' + gene for gene in _positive_control_genes]
HY172_BitScore, HY172_Evalue, HY172_coverage = positive_control_df(
    control_dir, control_names, pd.DataFrame(), pd.DataFrame(), pd.DataFrame())

#-------------------------------------------------------------------------------------------------------
# Stack the per-strain tables, append one summary row to each and save them

# Bit-score: the summary row is the lowest best bit-score seen among the positive controls
Actino_PositiveBitScore_df = pd.concat(
    [B32_BitScore, S00154_BitScore, R_001_BitScore, B50_BitScore, HY172_BitScore], axis=0)
# column-wise minimum -> one-row frame labelled 'PositiveControl_Min_BitScore'
Actino_MiniBitScore_df = Actino_PositiveBitScore_df.min().to_frame('PositiveControl_Min_BitScore').T
Actino_PositiveBitScore_df = pd.concat([Actino_PositiveBitScore_df, Actino_MiniBitScore_df], axis=0)
Actino_PositiveBitScore_df.to_csv('../data/processed/Final/ForReader/ControlData/Actino_aed/Actino_PositiveBitScore.csv')


# E-value: the summary row is the largest E-value seen among the positive controls
Actino_PositiveEvalue_df = pd.concat(
    [B32_Evalue, S00154_Evalue, R_001_Evalue, B50_Evalue, HY172_Evalue], axis=0)
# column-wise maximum -> one-row frame labelled 'PositiveControl_Max_Evalue'
Actino_MaxEvalue_df = Actino_PositiveEvalue_df.max().to_frame('PositiveControl_Max_Evalue').T
Actino_PositiveEvalue_df = pd.concat([Actino_PositiveEvalue_df, Actino_MaxEvalue_df], axis=0)
Actino_PositiveEvalue_df.to_csv('../data/processed/Final/ForReader/ControlData/Actino_aed/Actino_PositiveEvalue.csv')


# Coverage: the summary row is the lowest coverage seen among the positive controls
Actino_Positivecoverage_df = pd.concat(
    [B32_coverage, S00154_coverage, R_001_coverage, B50_coverage, HY172_coverage], axis=0)
# column-wise minimum -> one-row frame labelled 'PositiveControl_Min_coverage'
Actino_Minicoverage_df = Actino_Positivecoverage_df.min().to_frame('PositiveControl_Min_coverage').T
Actino_Positivecoverage_df = pd.concat([Actino_Positivecoverage_df, Actino_Minicoverage_df], axis=0)
Actino_Positivecoverage_df.to_csv('../data/processed/Final/ForReader/ControlData/Actino_aed/Actino_PositiveCoverage.csv')


# Show the three tables in the notebook
display(Actino_PositiveBitScore_df)
display(Actino_PositiveEvalue_df)
display(Actino_Positivecoverage_df)
print('Done')

Unnamed: 0,A_aedA_I_RS26385,A_aedB_I_RS26395,A_aedD_RS26370,A_aedE_RS26375,A_aedF_I_RS26380,A_aedG_I_RS26390,A_aedH_I_RS26400,A_aedI_RS26405,A_aedJ_I_RS26410,A_aedK_I_RS26415,A_aedL_RS26420,A_aedM_RS26425,A_aedN_RS26430,A_aedO_RS26435,A_aedP_RS26440
A_Dietzia_sp_B32_aed_bit_score,793.5,588.3,450.9,596.0,771.9,555.5,552.7,220.7,965.6,814.7,261.4,533.0,845.5,420.8,755.6
A_Mycobacteroides_chelonae_S00154_aed_bit_score,726.7,569.5,432.5,586.1,736.0,552.4,518.3,217.1,942.7,801.3,,532.7,656.6,126.6,439.4
A_Rhodococcus_equi_DSSKP_R_001_aed_bit_score,809.4,609.6,464.9,613.3,783.8,575.4,560.9,230.4,1007.8,838.8,273.9,547.4,865.6,443.7,784.6
A_Rhodococcus_sp_B50_aed_bit_score,804.4,612.8,466.2,614.9,785.2,577.2,562.7,233.0,1018.5,840.5,272.7,549.3,868.3,445.9,777.8
A_Tomitella_gaofuii_HY172_aed_bit_score,783.3,594.9,283.2,580.8,763.5,537.9,545.0,218.9,933.9,795.1,257.3,432.4,827.1,412.0,729.9
PositiveControl_Min_BitScore,726.7,569.5,283.2,580.8,736.0,537.9,518.3,217.1,933.9,795.1,257.3,432.4,656.6,126.6,439.4


Unnamed: 0,A_aedA_I_RS26385,A_aedB_I_RS26395,A_aedD_RS26370,A_aedE_RS26375,A_aedF_I_RS26380,A_aedG_I_RS26390,A_aedH_I_RS26400,A_aedI_RS26405,A_aedJ_I_RS26410,A_aedK_I_RS26415,A_aedL_RS26420,A_aedM_RS26425,A_aedN_RS26430,A_aedO_RS26435,A_aedP_RS26440
A_Dietzia_sp_B32_aed_Evalue,3.3999999999999996e-242,2.3e-180,8.1e-139,1.1e-182,1.1e-235,1.9e-170,8.6e-170,4.1000000000000004e-69,3.1999999999999995e-294,8.6e-249,1.1e-81,8.4e-164,3.3e-258,1.1e-129,7.900000000000001e-231
A_Mycobacteroides_chelonae_S00154_aed_Evalue,5.8e-222,1e-174,2.9999999999999998e-133,1.1e-179,7.4e-225,1.5e-169,2.4000000000000005e-159,4.5e-68,2.4000000000000003e-287,8.8e-245,,9e-164,6.599999999999999e-201,7.6e-40,7.9e-135
A_Rhodococcus_equi_DSSKP_R_001_aed_Evalue,5.7e-247,8.4e-187,5.0999999999999995e-143,7.200000000000001e-188,2.7999999999999997e-239,1.8999999999999997e-176,3.1999999999999996e-172,4.4e-72,5.7e-307,4.6e-256,1.8e-85,3.7e-168,2.9e-264,1.2e-136,1.3e-239
A_Rhodococcus_sp_B50_aed_Evalue,1.8e-245,8.600000000000001e-188,2e-143,2.4e-188,9.999999999999999e-240,5.5000000000000005e-177,8.7e-173,7.000000000000001e-73,0.0,1.4e-256,4.2999999999999994e-85,9.9e-169,4.4e-265,2.7e-137,1.6e-237
A_Tomitella_gaofuii_HY172_aed_Evalue,4.4e-239,2.2e-182,1.4e-87,5e-178,3.7000000000000002e-233,4.4e-165,2e-167,1.4e-68,1.2999999999999999e-284,7.500000000000001e-243,2.1e-80,4e-133,1.1999999999999998e-252,5.4e-127,4.9e-223
PositiveControl_Max_Evalue,5.8e-222,1e-174,1.4e-87,5e-178,7.4e-225,4.4e-165,2.4000000000000005e-159,4.5e-68,1.2999999999999999e-284,7.500000000000001e-243,2.1e-80,4e-133,6.599999999999999e-201,7.6e-40,7.9e-135


Unnamed: 0,A_aedA_I_RS26385,A_aedB_I_RS26395,A_aedD_RS26370,A_aedE_RS26375,A_aedF_I_RS26380,A_aedG_I_RS26390,A_aedH_I_RS26400,A_aedI_RS26405,A_aedJ_I_RS26410,A_aedK_I_RS26415,A_aedL_RS26420,A_aedM_RS26425,A_aedN_RS26430,A_aedO_RS26435,A_aedP_RS26440
A_Dietzia_sp_B32_aed_coverage,1.0,0.99,0.930502,0.954984,0.966321,0.910543,1.0,0.992308,0.942164,0.98977,0.904762,0.988679,1.0,0.663793,0.984615
A_Mycobacteroides_chelonae_S00154_aed_coverage,0.994975,0.97377,0.995902,0.973856,0.997375,1.0,0.988462,1.0,0.992141,0.994859,,0.981273,0.992366,0.714286,0.929095
A_Rhodococcus_equi_DSSKP_R_001_aed_coverage,0.97543,0.993333,0.899628,0.993333,0.97644,0.916933,0.934783,0.992308,0.988235,0.98977,0.917241,0.988679,0.997468,0.68915,1.0
A_Rhodococcus_sp_B50_aed_coverage,0.977887,0.993333,0.899628,0.990033,0.97644,0.916933,0.934783,0.992308,0.992126,0.98977,0.796407,0.988679,0.997468,0.899614,1.0
A_Tomitella_gaofuii_HY172_aed_coverage,0.958937,1.0,0.914729,0.996656,0.97644,0.914013,0.724719,0.984733,0.970817,0.98977,0.815951,0.966292,0.997468,0.692308,1.0
PositiveControl_Min_coverage,0.958937,0.97377,0.899628,0.954984,0.966321,0.910543,0.724719,0.984733,0.942164,0.98977,0.796407,0.966292,0.992366,0.663793,0.929095


Done


### 1.2 使用 negative control 來獲得合適的bit score

In [9]:
# HMM actino negative control bit-score threshold
import os
import pandas as pd
import re

#------------------------------------------------------------------------
# Build the best bit-score table of one negative-control strain.
# In the domtblout file names the strain prefix must directly precede "aed"
# (e.g. ..._aedA_I_RS26385, NOT ..._I_aedA_RS26385).
def negative_control_df(control_dir, control_names, BitScore_df):
    """Return the best significant bit-score of every control hmmsearch of one strain.

    For each name in ``control_names`` the file ``<control_dir>/<name>.domtblout``
    is read as a whitespace-separated table (lines starting with ``#`` are
    skipped) and given the 23 domtblout column names used below.  Rows are
    kept when ``E-value < 0.001`` and ``(ali_to - ali_from + 1) / tlen > 0.50``;
    the highest ``score`` among them is reported.

    Parameters
    ----------
    control_dir : str
        Directory holding the ``.domtblout`` files.
    control_names : list of str
        File names without the ``.domtblout`` extension.  The part of the
        first name matching ``A_.*_aed`` is used to label the single row of
        the returned table; in every name that same pattern is replaced by
        ``A_aed`` to form the column label.
    BitScore_df
        Ignored.  Kept only so that existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``<strain>_bit_score`` and one column per control
        name.  A cell is ``0`` when the file has no data rows or no row
        passes the filter.

    Raises
    ------
    ValueError
        If ``control_names`` is empty or its first entry does not match
        ``A_.*_aed``.
    """
    if not control_names:
        raise ValueError("control_names must contain at least one hmmsearch name")

    # The strain prefix (e.g. "A_Rhodococcus_jostii_RHA1_aed") labels the table row
    strain_pattern = re.compile(r'A_.*_aed')
    strain_match = strain_pattern.search(control_names[0])
    if strain_match is None:
        raise ValueError(
            "cannot derive the strain name: %r does not match 'A_.*_aed'" % (control_names[0],)
        )
    strain_name = strain_match.group(0)

    domtblout_columns = [
        "target_name", "accession", "tlen", "query_name", "accession2", "qlen",
        "E-value", "score", "bias", "num_domains_index", "num_domains_total",
        "c-Evalue", "i-Evalue", "score2", "bias2", "hmm_from", "hmm_to",
        "ali_from", "ali_to", "env_from", "env_to", "acc", "description",
    ]

    best_bit_scores = {}

    for control_name in control_names:
        # Strain-independent column label, e.g. "A_aedA_I_RS26385"
        query_name = strain_pattern.sub('A_aed', control_name)

        # Start from "no significant hit" (score 0); overwritten below when one is found
        best_bit_scores[query_name] = 0

        file_path = os.path.join(control_dir, control_name + ".domtblout")
        try:
            df = pd.read_csv(file_path, comment="#", sep=r'\s+', header=None)
        except pd.errors.EmptyDataError:
            # The file holds nothing but comment lines
            continue
        if df.empty:
            continue

        df.columns = domtblout_columns
        # Fraction of the target sequence covered by the alignment
        df["coverage"] = (df["ali_to"] - df["ali_from"] + 1) / df["tlen"]

        significant_hits = df[(df["E-value"] < 0.001) & (df["coverage"] > 0.50)]
        if not significant_hits.empty:
            best_bit_scores[query_name] = significant_hits["score"].max()

    BitScore_df = pd.DataFrame(best_bit_scores, index=[strain_name + '_bit_score'])

    return BitScore_df


#---------------------------------------------------------------
# Load the two negative-control strains.
# Both strains were searched with the same 15 aed HMM profiles, so each file
# name (without the ".domtblout" extension) is "<strain>_<gene suffix>".
# aedC_RS26365, aedQ_RS26445 and aedR_RS26450 are deliberately left out
# (original note: they are not homologous in all strains).
_negative_control_genes = [
    'aedA_I_RS26385', 'aedB_I_RS26395', 'aedD_RS26370', 'aedE_RS26375',
    'aedF_I_RS26380', 'aedG_I_RS26390', 'aedH_I_RS26400', 'aedI_RS26405',
    'aedJ_I_RS26410', 'aedK_I_RS26415', 'aedL_RS26420', 'aedM_RS26425',
    'aedN_RS26430', 'aedO_RS26435', 'aedP_RS26440',
]

# The last argument of negative_control_df is overwritten inside the
# function, so a fresh empty frame is passed as a placeholder.

# A_Rhodococcus_jostii_RHA1
control_dir = "../data/raw/Actino_HMM_Control/Negative/A_Rhodococcus_jostii_RHA1/domtblout/"
control_names = ['A_Rhodococcus_jostii_RHA1_' + gene for gene in _negative_control_genes]
RHA1_BitScore = negative_control_df(control_dir, control_names, pd.DataFrame())

# A_Mycobacterium_tuberculosis_H37Rv
control_dir = "../data/raw/Actino_HMM_Control/Negative/A_Mycobacterium_tuberculosis_H37Rv/domtblout"
control_names = ['A_Mycobacterium_tuberculosis_H37Rv_' + gene for gene in _negative_control_genes]
H37Rv_BitScore = negative_control_df(control_dir, control_names, pd.DataFrame())


#-------------------------------------------------------------------------------------------------------
# Stack the two negative-control tables
Actino_Negative_df = pd.concat([RHA1_BitScore, H37Rv_BitScore], axis=0)

# Per-profile cut-off: the highest bit-score reached by any negative control.
# Kept as a one-column frame ('Criteria_Bitscore') indexed by profile name.
Actino_BitScore_Criteria = Actino_Negative_df.max().to_frame('Criteria_Bitscore')

# The same values as a single row, so they can be appended to the table
Actino_BitScore_Criteria_T = Actino_BitScore_Criteria.T

# Append the cut-off row and save the table
Actino_Negative_df = pd.concat([Actino_Negative_df, Actino_BitScore_Criteria_T], axis=0)
Actino_Negative_df.to_csv('../data/processed/Final/ForReader/ControlData/Actino_aed/Actino_NegativeBitscore.csv')

# Show the results in the notebook
display(Actino_BitScore_Criteria)
display(Actino_Negative_df)
print('done')

Unnamed: 0,Criteria_Bitscore
A_aedA_I_RS26385,195.4
A_aedB_I_RS26395,220.3
A_aedD_RS26370,128.9
A_aedE_RS26375,232.1
A_aedF_I_RS26380,617.3
A_aedG_I_RS26390,176.2
A_aedH_I_RS26400,270.8
A_aedI_RS26405,25.4
A_aedJ_I_RS26410,328.3
A_aedK_I_RS26415,321.5


Unnamed: 0,A_aedA_I_RS26385,A_aedB_I_RS26395,A_aedD_RS26370,A_aedE_RS26375,A_aedF_I_RS26380,A_aedG_I_RS26390,A_aedH_I_RS26400,A_aedI_RS26405,A_aedJ_I_RS26410,A_aedK_I_RS26415,A_aedL_RS26420,A_aedM_RS26425,A_aedN_RS26430,A_aedO_RS26435,A_aedP_RS26440
A_Rhodococcus_jostii_RHA1_aed_bit_score,189.0,220.3,126.7,232.1,617.3,176.2,270.8,25.4,328.3,321.5,106.7,98.3,221.1,58.9,181.3
A_Mycobacterium_tuberculosis_H37Rv_aed_bit_score,195.4,213.7,128.9,223.7,568.5,0.0,30.5,0.0,252.0,304.2,20.3,94.6,203.3,55.3,149.9
Criteria_Bitscore,195.4,220.3,128.9,232.1,617.3,176.2,270.8,25.4,328.3,321.5,106.7,98.3,221.1,58.9,181.3


done


### 2. 使用上述的 bit score 來篩選 MAGs 的 output data (不含有 hits 的篩選)

In [6]:
# Actino_HMM_MAGs
# 需先進行前步驟的cell (1. 使用 negative control 來獲得合適的bit score)
import os
import pandas as pd
import re

#-------------------------------
# Aed Cluster to MAGs
# NOTE: the negative-control cell must be run first; it defines
# Actino_BitScore_Criteria, which supplies the per-profile bit-score cut-offs.

# Directory that contains the MAG "domtblout" files. The description text must
# already be removed so that every row has exactly 23 whitespace-separated fields.
domtblout_dir = "../data/raw/Actino_HMM_MAGs_domtblout/"

# Target names of the accepted hits, keyed by hmm profile (None when nothing passes)
MAGs_Hits = {}
MAGs_Hits_name = []

# Cut-offs as a float Series: index = hmm profile name, value = bit-score threshold
Actino_BitScore_Criteria_S = Actino_BitScore_Criteria['Criteria_Bitscore'].astype(float)

# Column names of a domtblout table
columns = ["target_name", "accession", "tlen", "query_name", "accession2", "qlen", "E-value", "score", "bias",
           "num_domains_index", "num_domains_total", "c-Evalue", "i-Evalue", "score2", "bias2", "hmm_from",
           "hmm_to", "ali_from", "ali_to", "env_from", "env_to", "acc", "description"]

# Accepted hits of each profile; concatenated once after the loop instead of
# growing a DataFrame (and concatenating with an empty one) on every iteration
_hit_tables = []

# Filter every profile's hits by E-value, coverage and its own bit-score cut-off
for hmmsearch, threshold in Actino_BitScore_Criteria_S.items():
    file_path = os.path.join(domtblout_dir, hmmsearch + ".domtblout")
    try:
        df = pd.read_csv(file_path, comment="#", sep=r'\s+', header=None)
    except pd.errors.EmptyDataError:
        # The file holds nothing but comment lines: this profile has no hit
        MAGs_Hits_name = None
        MAGs_Hits[hmmsearch] = MAGs_Hits_name
        continue
    df.columns = columns

    # Fraction of the target sequence covered by the alignment
    df["coverage"] = (df["ali_to"] - df["ali_from"] + 1) / df["tlen"]

    significant_hits = df[(df["E-value"] < 0.001) & (df["coverage"] > 0.50) & (df["score"] > threshold)]

    if significant_hits.empty:
        MAGs_Hits_name = None
    else:
        MAGs_Hits_name = significant_hits["target_name"].tolist()
        _hit_tables.append(significant_hits)
    MAGs_Hits[hmmsearch] = MAGs_Hits_name

# The table lists the most recently processed profile first (same row order as
# prepending inside the loop); each profile keeps its original row labels.
if _hit_tables:
    All_aed_Hits_df = pd.concat(_hit_tables[::-1], axis=0)
else:
    All_aed_Hits_df = pd.DataFrame(columns=columns + ["coverage"])

# done
print('done')
print('unique query name/numbers: ', All_aed_Hits_df['query_name'].unique(), ' / ', len(All_aed_Hits_df['query_name'].unique()))
All_aed_Hits_df.to_csv('../data/processed/All_aed_Hits_df_bitscore.csv')
All_aed_Hits_df

done
unique query name/numbers:  ['Q_A_aedP_RS26440' 'Q_A_aedO_RS26435' 'Q_A_aedN_RS26430'
 'Q_A_aedM_RS26425' 'Q_A_aedL_RS26420' 'Q_A_I_aedK_RS26415'
 'Q_A_I_aedJ_RS26410' 'Q_A_aedI_RS26405' 'Q_A_I_aedH_RS26400'
 'Q_A_I_aedG_RS26390' 'Q_A_I_aedF_RS26380' 'Q_A_aedE_RS26375'
 'Q_A_aedD_RS26370' 'Q_A_I_aedB_RS26395' 'Q_A_I_aedA_RS26385']  /  15


Unnamed: 0,target_name,accession,tlen,query_name,accession2,qlen,E-value,score,bias,num_domains_index,...,bias2,hmm_from,hmm_to,ali_from,ali_to,env_from,env_to,acc,description,coverage
0,3300009702.a:Ga0114931_10050376_3,-,376,Q_A_aedP_RS26440,-,385,6.300000e-116,400.0,2.4,1,...,2.4,1,372,1,374,1,375,0.99,-,0.994681
1,3300009702.a:Ga0114931_10022461_4,-,376,Q_A_aedP_RS26440,-,385,7.400000e-116,399.8,5.1,1,...,5.1,1,372,1,374,1,375,0.99,-,0.994681
2,3300027968.a:Ga0209061_1000039_348,-,378,Q_A_aedP_RS26440,-,385,2.300000e-112,388.3,3.1,1,...,3.1,1,372,2,375,2,377,0.98,-,0.989418
3,3300027662.a:Ga0208565_1003173_4,-,380,Q_A_aedP_RS26440,-,385,1.300000e-111,385.8,3.6,1,...,3.6,1,375,2,378,2,379,0.99,-,0.992105
4,3300017444.a:Ga0185300_10000551_65,-,375,Q_A_aedP_RS26440,-,385,3.500000e-111,384.4,3.0,1,...,3.0,1,372,1,374,1,374,0.99,-,0.997333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,3300014092.a:Ga0121367_100110_13,-,405,Q_A_I_aedA_RS26385,-,398,6.200000e-54,196.3,0.0,1,...,0.0,6,395,8,398,3,401,0.93,-,0.965432
240,3300027968.a:Ga0209061_1000140_88,-,412,Q_A_I_aedA_RS26385,-,398,6.800000e-54,196.1,0.0,1,...,0.0,15,396,17,405,4,407,0.89,-,0.944175
241,3300017532.a:Ga0185302_10010386_3,-,394,Q_A_I_aedA_RS26385,-,398,7.700000e-54,196.0,0.2,1,...,0.2,7,394,5,389,2,393,0.94,-,0.977157
242,3300025186.a:Ga0208056_100934_7,-,410,Q_A_I_aedA_RS26385,-,398,8.400000e-54,195.8,0.0,1,...,0.0,4,394,8,407,5,409,0.90,-,0.975610


#### 2.1 獲取positive的target name並連結MAGsID，整理出一個類似heatmap的table，再以寬鬆hits數量進行篩選(>=8)

In [7]:
# Got target name and query name
import os
import pandas as pd
import re

# Load the hits that passed the bit-score filter
All_aed_Hits_df = pd.read_csv('../data/processed/All_aed_Hits_df_bitscore.csv')

# Only the hit protein (target) and the HMM profile (query) are needed here
All_aed_Hits_TargetAndQuery = All_aed_Hits_df[['target_name', 'query_name']]

# Lookup table used to attach a MAGsID to each target_name
TarToMAGs_aed = pd.read_csv('../data/interim/Actino_aed/aed_All_TarToMAGsID.csv')

# Left join: every hit is kept, with the lookup columns added where target_name matches
aed_hits_TargetAndMAGsID = All_aed_Hits_TargetAndQuery.merge(
    TarToMAGs_aed, how='left', on='target_name')

# Report whether any hit is left without a MAGsID
print('Any Null in TargetToMAGsID: ', aed_hits_TargetAndMAGsID['MAGsID'].isna().any())

# Count table (heatmap-like): one row per MAGsID, one column per query_name
aed_hits_heatmap = pd.crosstab(
    index=aed_hits_TargetAndMAGsID['query_name'],
    columns=aed_hits_TargetAndMAGsID['MAGsID'],
    dropna=False,
).T
# Optional export of the count table (disabled):
# aed_hits_heatmap.to_csv('../data/processed/aed_hits_heatmap_bitscore.csv')

# The number of different HMM profiles with a hit in a MAG is the number of
# non-zero cells in that MAG's row of the count table.
def count_nonzero(row):
    """Return how many entries of ``row`` are not equal to zero."""
    return int((row != 0).sum())

# Per MAG (row): number of HMM profiles with at least one hit.
num_hits = aed_hits_heatmap.apply(count_nonzero, axis=1)

# Attach the count and order the MAGs from most to fewest profiles hit.
aed_hits_heatmap = (
    aed_hits_heatmap
    .assign(num_hits=num_hits)
    .sort_values(by="num_hits", ascending=False)
)

# Loose filter: keep MAGs hit by at least 8 HMM profiles (no required genes).
# Author's counts at other cut-offs: >7 = 326, >8 = 167, >9 = 70, >10 = 17;
# 8 was judged appropriate because it is more than half of the query genes.
enough_hits = aed_hits_heatmap['num_hits'] >= 8
aed_hits_FinalFilter = aed_hits_heatmap.loc[enough_hits]

# Turn the (unnamed) MAG index into an ordinary first column 'genome_id'.
aed_hits_FinalFilter = (
    aed_hits_FinalFilter
    .rename_axis(None)
    .reset_index()
    .rename(columns={'index': 'genome_id'})
)

# Keep only the MAG identifier and its profile-hit count.
aed_Positive_MAGsID = aed_hits_FinalFilter[['genome_id', 'num_hits']]

print('done')
aed_Positive_MAGsID

Any Null in TargetToMAGsID:  False
done


query_name,genome_id,num_hits
0,3300024344_6,13
1,3300017444_33,12
2,3300027902_34,12
3,3300027902_36,12
4,3300027649_17,12
...,...,...
162,3300020193_29,8
163,3300010158_16,8
164,3300027827_15,8
165,3300017452_8,8


##### 2.1.1 連結positive MAGs到 metagenomes 總資料中，並處理成最終能給R繪製 stacked bar chart 的資料格式

In [8]:
# Requires the previous cell to have been run first (uses aed_Positive_MAGsID).
# Load the per-genome metadata table.
metagenmoes_df = pd.read_csv('../data/external/genome_metadata_editForAnalysis_NotReference.csv')

# Attach metadata to every positive MAG; the left join keeps MAGs that have
# no metadata row (their metadata columns become NaN).
aed_Positive_metagenomes_df = pd.merge(aed_Positive_MAGsID, metagenmoes_df, on='genome_id', how='left')

# Keep only the columns needed below. The explicit copy makes this frame
# independent of the merge result, so adding columns to it does not trigger
# pandas' SettingWithCopyWarning.
aed_Positive_metagenomes_df = aed_Positive_metagenomes_df[
    ['genome_id', 'metagenome_id', 'taxonomy', 'ecosystem', 'ecosystem_category', 'num_hits', 'longitude', 'latitude']
].copy()

# Extract the phylum ("p__...") and class ("c__...") labels from the taxonomy
# string. Raw strings are used so that "\w" reaches the regex engine unchanged
# instead of being treated as an (invalid) string escape sequence.
aed_Positive_metagenomes_df['Phylum'] = aed_Positive_metagenomes_df['taxonomy'].str.extract(r'(p__\w+)', expand=False)
aed_Positive_metagenomes_df['Class'] = aed_Positive_metagenomes_df['taxonomy'].str.extract(r'(c__\w+)', expand=False)

# Final column layout; 'taxonomy' is dropped simply by not being listed.
aed_Positive_metagenomes_df = aed_Positive_metagenomes_df.reindex(
    columns=['genome_id', 'metagenome_id', 'num_hits', 'Phylum', 'Class', 'ecosystem', 'ecosystem_category', 'longitude', 'latitude']
)
aed_Positive_metagenomes_df['Homologous_cluster'] = 'Actino_aed_cluster'

# Sanity checks: which phyla are present, and which columns contain NaN.
print(aed_Positive_metagenomes_df['Phylum'].unique())
print('Any Null in aed_Positive_metagenomes_df:\n', aed_Positive_metagenomes_df.isnull().any())

# Write the table consumed by the R plotting step.
aed_Positive_metagenomes_df.to_csv('../data/processed/Final/Actino/aed_PositiveHits_ForR_loose.csv')
print('Number of p__UBP10: ', aed_Positive_metagenomes_df[aed_Positive_metagenomes_df['Phylum'] == 'p__UBP10'].shape[0])
print('done')

['p__Myxococcota' 'p__Actinobacteriota' 'p__UBP10' 'p__Proteobacteria'
 'p__Chloroflexota']
Any Null in aed_Positive_metagenomes_df:
 genome_id             False
metagenome_id         False
num_hits              False
Phylum                False
Class                 False
ecosystem             False
ecosystem_category    False
longitude              True
latitude               True
Homologous_cluster    False
dtype: bool
Number of p__UBP10:  7
done


#### 不執行:2.2 獲取positive的target name並連結MAGsID，整理出一個類似heatmap的table，再以嚴格hits數量進行篩選(>=8, aedA, aedB, aedJ)

In [24]:
# Got target name and query name
import os
import pandas as pd
import re

# Load the table of HMM hits (the bit-score set, judging by the file name).
All_aed_Hits_df = pd.read_csv('../data/processed/All_aed_Hits_df_bitscore.csv')

# Only the protein (target) and HMM profile (query) identifiers are needed.
All_aed_Hits_TargetAndQuery = All_aed_Hits_df.loc[:, ['target_name', 'query_name']]

# Lookup table used to attach a MAGsID to each target protein.
TarToMAGs_aed = pd.read_csv('../data/interim/Actino_aed/aed_All_TarToMAGsID.csv')

# Left join: every hit is kept, even when its protein has no lookup entry.
aed_hits_TargetAndMAGsID = All_aed_Hits_TargetAndQuery.merge(
    TarToMAGs_aed, how='left', on='target_name'
)

# True here means at least one hit could not be assigned to a MAG.
has_unmapped_hits = aed_hits_TargetAndMAGsID['MAGsID'].isna().any()
print('Any Null in TargetToMAGsID: ', has_unmapped_hits)

# Heatmap-like count table; after transposing, rows are MAGs and columns are
# HMM profiles, each cell holding the number of hits for that pair.
profile_by_mag = pd.crosstab(
    aed_hits_TargetAndMAGsID['query_name'],
    aed_hits_TargetAndMAGsID['MAGsID'],
    dropna=False,
)
aed_hits_heatmap = profile_by_mag.T
# Optional export:
# aed_hits_heatmap.to_csv('../data/processed/aed_hits_heatmap_bitscore.csv')

# For one MAG (one heatmap row), the number of non-zero cells equals the number
# of different HMM profiles that have at least one hit in that MAG.
def count_nonzero(row):
    """Return how many entries of ``row`` are not equal to zero."""
    return int((row != 0).sum())

# Per MAG (row): number of HMM profiles with at least one hit.
num_hits = aed_hits_heatmap.apply(count_nonzero, axis=1)

# Attach the count and order the MAGs from most to fewest profiles hit.
aed_hits_heatmap = (
    aed_hits_heatmap
    .assign(num_hits=num_hits)
    .sort_values(by="num_hits", ascending=False)
)

# Strict filter: at least 8 profiles hit AND a hit for each required gene
# (aedA, aedB, aedJ).
required_profiles = ['Q_A_I_aedA_RS26385', 'Q_A_I_aedB_RS26395', 'Q_A_I_aedJ_RS26410']
enough_hits = aed_hits_heatmap['num_hits'] >= 8
has_required = (aed_hits_heatmap[required_profiles] != 0).all(axis=1)
aed_hits_FinalFilter = aed_hits_heatmap.loc[enough_hits & has_required]
# Author's note: 4 hits; 3300024344_6, 3300017444_33, 3300027902_34, 3300025886_13

# Turn the (unnamed) MAG index into an ordinary first column 'genome_id'.
aed_hits_FinalFilter = (
    aed_hits_FinalFilter
    .rename_axis(None)
    .reset_index()
    .rename(columns={'index': 'genome_id'})
)

# Keep only the MAG identifier and its profile-hit count.
aed_Positive_MAGsID = aed_hits_FinalFilter[['genome_id', 'num_hits']]

print('done')
aed_Positive_MAGsID

Any Null in TargetToMAGsID:  False
done


query_name,genome_id,num_hits
0,3300024344_6,13
1,3300017444_33,12
2,3300027902_34,12
3,3300025886_13,11


##### 2.2.1 連結critical_positive MAGs到 metagenomes 總資料中，並處理成最終能給R繪製 stacked bar chart 的資料格式

In [25]:
# Requires the previous cell to have been run first (uses aed_Positive_MAGsID).
# Load the per-genome metadata table.
metagenmoes_df = pd.read_csv('../data/external/genome_metadata_editForAnalysis_NotReference.csv')

# Attach metadata to every positive MAG; the left join keeps MAGs that have
# no metadata row (their metadata columns become NaN).
aed_Positive_metagenomes_df = pd.merge(aed_Positive_MAGsID, metagenmoes_df, on='genome_id', how='left')

# Keep only the columns needed below. The explicit copy makes this frame
# independent of the merge result, so adding columns to it does not trigger
# pandas' SettingWithCopyWarning.
aed_Positive_metagenomes_df = aed_Positive_metagenomes_df[
    ['genome_id', 'metagenome_id', 'taxonomy', 'ecosystem', 'ecosystem_category', 'num_hits', 'longitude', 'latitude']
].copy()

# Extract the phylum ("p__...") and class ("c__...") labels from the taxonomy
# string. Raw strings are used so that "\w" reaches the regex engine unchanged
# instead of being treated as an (invalid) string escape sequence.
aed_Positive_metagenomes_df['Phylum'] = aed_Positive_metagenomes_df['taxonomy'].str.extract(r'(p__\w+)', expand=False)
aed_Positive_metagenomes_df['Class'] = aed_Positive_metagenomes_df['taxonomy'].str.extract(r'(c__\w+)', expand=False)

# Final column layout; 'taxonomy' is dropped simply by not being listed.
aed_Positive_metagenomes_df = aed_Positive_metagenomes_df.reindex(
    columns=['genome_id', 'metagenome_id', 'num_hits', 'Phylum', 'Class', 'ecosystem', 'ecosystem_category', 'longitude', 'latitude']
)
aed_Positive_metagenomes_df['Homologous_cluster'] = 'Actino_aed_cluster'

# Sanity checks: which phyla are present, and which columns contain NaN.
print(aed_Positive_metagenomes_df['Phylum'].unique())
print('Any Null in aed_Positive_metagenomes_df:\n', aed_Positive_metagenomes_df.isnull().any())

# Write the table consumed by the R plotting step.
aed_Positive_metagenomes_df.to_csv('../data/processed/Final/Actino/aed_PositiveHits_ForR_critical.csv')
print('done')
aed_Positive_metagenomes_df

['p__Myxococcota' 'p__Actinobacteriota']
Any Null in aed_Positive_metagenomes_df:
 genome_id             False
metagenome_id         False
num_hits              False
Phylum                False
Class                 False
ecosystem             False
ecosystem_category    False
longitude             False
latitude              False
Homologous_cluster    False
dtype: bool
done


Unnamed: 0,genome_id,metagenome_id,num_hits,Phylum,Class,ecosystem,ecosystem_category,longitude,latitude,Homologous_cluster
0,3300024344_6,3300024344,13,p__Myxococcota,c__UBA9160,Aquatic,Marine,25.4868,36.5264,Actino_aed_cluster
1,3300017444_33,3300017444,12,p__Myxococcota,c__UBA9160,Aquatic,Thermal springs,-125.513,60.1987,Actino_aed_cluster
2,3300027902_34,3300027902,12,p__Actinobacteriota,c__Acidimicrobiia,Aquatic,Freshwater,-86.23,41.7,Actino_aed_cluster
3,3300025886_13,3300025886,11,p__Myxococcota,c__UBA9160,Aquatic,Marine,7.9,54.1842,Actino_aed_cluster


# 不執行

#### 2.3 獲取positive的target name並連結MAGsID，整理出一個類似heatmap的table，再以原先討論的hits數量進行篩選來看差異(>=10)

In [4]:
# Got target name and query name
import os
import pandas as pd
import re

# Load the table of HMM hits (the bit-score set, judging by the file name).
All_aed_Hits_df = pd.read_csv('../data/processed/All_aed_Hits_df_bitscore.csv')

# Only the protein (target) and HMM profile (query) identifiers are needed.
All_aed_Hits_TargetAndQuery = All_aed_Hits_df.loc[:, ['target_name', 'query_name']]

# Lookup table used to attach a MAGsID to each target protein.
TarToMAGs_aed = pd.read_csv('../data/interim/Actino_aed/aed_All_TarToMAGsID.csv')

# Left join: every hit is kept, even when its protein has no lookup entry.
aed_hits_TargetAndMAGsID = All_aed_Hits_TargetAndQuery.merge(
    TarToMAGs_aed, how='left', on='target_name'
)

# True here means at least one hit could not be assigned to a MAG.
has_unmapped_hits = aed_hits_TargetAndMAGsID['MAGsID'].isna().any()
print('Any Null: ', has_unmapped_hits)

# Heatmap-like count table; after transposing, rows are MAGs and columns are
# HMM profiles, each cell holding the number of hits for that pair.
profile_by_mag = pd.crosstab(
    aed_hits_TargetAndMAGsID['query_name'],
    aed_hits_TargetAndMAGsID['MAGsID'],
    dropna=False,
)
aed_hits_heatmap = profile_by_mag.T
# Optional export:
# aed_hits_heatmap.to_csv('../data/processed/aed_hits_heatmap_bitscore.csv')

# For one MAG (one heatmap row), the number of non-zero cells equals the number
# of different HMM profiles that have at least one hit in that MAG.
def count_nonzero(row):
    """Return how many entries of ``row`` are not equal to zero."""
    return int((row != 0).sum())

# Per MAG (row): number of HMM profiles with at least one hit.
num_hits = aed_hits_heatmap.apply(count_nonzero, axis=1)

# Attach the count and order the MAGs from most to fewest profiles hit.
aed_hits_heatmap = (
    aed_hits_heatmap
    .assign(num_hits=num_hits)
    .sort_values(by="num_hits", ascending=False)
)

# Keep MAGs hit by at least 10 HMM profiles (no required genes in this variant).
enough_hits = aed_hits_heatmap['num_hits'] >= 10
aed_hits_FinalFilter = aed_hits_heatmap.loc[enough_hits]
# Author's note: 17 hits

# Turn the (unnamed) MAG index into an ordinary first column 'genome_id'.
aed_hits_FinalFilter = (
    aed_hits_FinalFilter
    .rename_axis(None)
    .reset_index()
    .rename(columns={'index': 'genome_id'})
)

# Keep only the MAG identifier and its profile-hit count.
aed_Positive_MAGsID = aed_hits_FinalFilter[['genome_id', 'num_hits']]

print('done')
aed_Positive_MAGsID

Any Null:  False
done


query_name,genome_id,num_hits
0,3300024344_6,13
1,3300027902_36,12
2,3300017444_33,12
3,3300027902_34,12
4,3300027649_17,12
5,3300005529_56,11
6,3300007722_64,11
7,3300027627_9,11
8,3300025886_13,11
9,3300027983_50,11


##### 2.3.1 連結 10 Hits positive (>=10) MAGs到 metagenomes 總資料中，並處理成最終能給R繪製 stacked bar chart 的資料格式

In [57]:
# Requires the previous cell to have been run first (uses aed_Positive_MAGsID).
# Load the per-genome metadata table.
metagenmoes_df = pd.read_csv('../data/external/Paper_genome_metadata.csv')
# NOTE(review): the column named 'ecosystem' in this file is used below as the
# taxonomy string (phylum/class labels are extracted from it) — confirm.
metagenmoes_df = metagenmoes_df.rename(columns={'ecosystem': 'taxonomy'})

# Attach metadata to every positive MAG; the left join keeps MAGs that have
# no metadata row.
aed_Positive_metagenomes_df = pd.merge(aed_Positive_MAGsID, metagenmoes_df, on='genome_id', how='left')

# Keep only the columns needed below. The explicit copy makes this frame
# independent of the merge result, so adding columns to it does not trigger
# pandas' SettingWithCopyWarning.
aed_Positive_metagenomes_df = aed_Positive_metagenomes_df[['genome_id', 'taxonomy', 'ecosystem_type', 'num_hits']].copy()

# Extract the phylum ("p__...") and class ("c__...") labels from the taxonomy
# string. Raw strings are used so that "\w" reaches the regex engine unchanged
# instead of being treated as an (invalid) string escape sequence.
aed_Positive_metagenomes_df['Phylum'] = aed_Positive_metagenomes_df['taxonomy'].str.extract(r'(p__\w+)', expand=False)
aed_Positive_metagenomes_df['Class'] = aed_Positive_metagenomes_df['taxonomy'].str.extract(r'(c__\w+)', expand=False)

# Lookup table keyed on ecosystem_type; the merge brings in the remaining
# lookup columns (the 'ecosystem' column selected below comes from here).
ForEcoLookup = pd.read_csv("../data/interim/EcosystemToEco_type.csv")
aed_addEco_df = pd.merge(aed_Positive_metagenomes_df, ForEcoLookup, on='ecosystem_type', how='left')

# Relabel some ecosystem_type values. This is done after the lookup merge, so
# the merge itself still matches on the original labels.
aed_addEco_df['ecosystem_type'] = aed_addEco_df['ecosystem_type'].replace({'Rhizoplane': 'Plant', 'Defined media': "Lab enrichment", 'Nutrient removal': 'Wastewater', 'Activated Sludge': 'Wastewater'})
print(aed_addEco_df['ecosystem_type'].unique())

# Final column layout; 'taxonomy' is dropped simply by not being listed.
aed_addEco_df = aed_addEco_df.reindex(columns=['genome_id', 'num_hits', 'Phylum', 'Class', 'ecosystem', 'ecosystem_type'])
aed_addEco_df['Homologous_cluster'] = 'Actino_aed_cluster'

# Write the table consumed by the R plotting step.
aed_addEco_df.to_csv('../data/processed/Final/Actino/aed_PositiveHits_ForR_10Hits.csv')
print('done')
aed_addEco_df

['Marine' 'Freshwater' 'Thermal spring' 'Soil' 'Wastewater']
done


Unnamed: 0,genome_id,num_hits,Phylum,Class,ecosystem,ecosystem_type,Homologous_cluster
0,3300024344_6,13,p__Myxococcota,c__UBA9160,Aquatic,Marine,Actino_aed_cluster
1,3300027902_36,12,p__Actinobacteriota,c__Acidimicrobiia,Aquatic,Freshwater,Actino_aed_cluster
2,3300017444_33,12,p__Myxococcota,c__UBA9160,Aquatic,Thermal spring,Actino_aed_cluster
3,3300027902_34,12,p__Actinobacteriota,c__Acidimicrobiia,Aquatic,Freshwater,Actino_aed_cluster
4,3300027649_17,12,p__Actinobacteriota,c__Acidimicrobiia,Aquatic,Freshwater,Actino_aed_cluster
5,3300005529_56,11,p__Actinobacteriota,c__Acidimicrobiia,Terrestrial,Soil,Actino_aed_cluster
6,3300007722_64,11,p__Actinobacteriota,c__Acidimicrobiia,Aquatic,Freshwater,Actino_aed_cluster
7,3300027627_9,11,p__Actinobacteriota,c__Acidimicrobiia,Aquatic,Freshwater,Actino_aed_cluster
8,3300025886_13,11,p__Myxococcota,c__UBA9160,Aquatic,Marine,Actino_aed_cluster
9,3300027983_50,11,p__Actinobacteriota,c__Acidimicrobiia,Aquatic,Freshwater,Actino_aed_cluster


## 第二部分: 舊版參考前人標準 (e-value coverage)，不含有bit score
DOI: https://doi.org/10.1128/mBio.00166-16


### 1. 用一致的e-value與coverage進行篩選

In [58]:
# Actino MAGs reference filtering
# Directory that contains the "domtblout" files. The description text must be
# removed from the genome protein FASTA headers beforehand, because the spaces
# in it break the whitespace-delimited parsing below.
domtblout_dir = "../data/raw/Actino_HMM_MAGs_domtblout/"

# hmmsearch output file names (without the file extension).
hmmsearches = ["MAGs_A_aedC_RS26365", "MAGs_A_aedD_RS26370", "MAGs_A_aedE_RS26375", "MAGs_A_aedI_RS26405", "MAGs_A_aedL_RS26420", "MAGs_A_aedM_RS26425", "MAGs_A_aedN_RS26430",
               "MAGs_A_aedO_RS26435", "MAGs_A_aedP_RS26440", "MAGs_A_aedQ_RS26445", "MAGs_A_aedR_RS26450", "MAGs_A_I_aedA_RS26385",
               "MAGs_A_I_aedB_RS26395", "MAGs_A_I_aedF_RS26380", "MAGs_A_I_aedG_RS26390", "MAGs_A_I_aedH_RS26400", "MAGs_A_I_aedJ_RS26410", "MAGs_A_I_aedK_RS26415"]

# Column names assigned to each parsed domtblout table, in file order.
columns = ["target_name", "accession", "tlen", "query_name", "accession2", "qlen", "E-value", "score", "bias",
           "num_domains_index", "num_domains_total", "c-Evalue", "i-Evalue", "score2", "bias2", "hmm_from",
           "hmm_to", "ali_from", "ali_to", "env_from", "env_to", "acc", "description"]

# hmmsearch name -> list of significant target names (None when there are none).
MAGs_Hits = {}
MAGs_Hits_name = []

# Per-profile tables of significant hits; they are combined once after the
# loop instead of re-concatenating a growing frame on every iteration.
hit_tables = []

for hmmsearch in hmmsearches:
    file_path = os.path.join(domtblout_dir, hmmsearch + ".domtblout")
    try:
        # Raw string so "\s" reaches the regex engine as written.
        df = pd.read_csv(file_path, comment="#", sep=r'\s+', header=None)
    except pd.errors.EmptyDataError:
        # Nothing but comment lines: this profile produced no hits at all.
        MAGs_Hits_name = None
        MAGs_Hits[hmmsearch] = MAGs_Hits_name
        continue

    df.columns = columns
    # Fraction of the target sequence covered by the aligned region.
    df["coverage"] = (df["ali_to"] - df["ali_from"] + 1) / df["tlen"]

    # Keep hits passing both thresholds (no sorting is applied).
    significant_hits_Ref = df[(df["E-value"] <= 1e-25) & (df["coverage"] > 0.30)]

    # Record the significant target names; an empty result is stored as None.
    MAGs_Hits_name = significant_hits_Ref["target_name"].tolist() or None
    MAGs_Hits[hmmsearch] = MAGs_Hits_name

    hit_tables.append(significant_hits_Ref)

# Stack the tables in reverse loop order (last profile first) so the row order
# of the written CSV matches what prepending each table in turn would give.
if hit_tables:
    All_aed_Hits_df = pd.concat(hit_tables[::-1], axis=0)
else:
    All_aed_Hits_df = pd.DataFrame(columns=columns + ["coverage"])

All_aed_Hits_df.to_csv('../data/processed/All_aed_Hits_df_reference.csv')

# done
print('done')
All_aed_Hits_df

done


Unnamed: 0,target_name,accession,tlen,query_name,accession2,qlen,E-value,score,bias,num_domains_index,...,bias2,hmm_from,hmm_to,ali_from,ali_to,env_from,env_to,acc,description,coverage
0,3300024344.a:Ga0209992_10000019_32,-,396,Q_A_I_aedK_RS26415,-,389,2.900000e-179,608.4,3.0,1,...,3.0,2,384,6,391,5,395,0.98,-,0.974747
1,3300009540.a:Ga0073899_10003596_14,-,362,Q_A_I_aedK_RS26415,-,389,5.000000e-166,564.9,6.8,1,...,6.8,26,386,1,361,1,361,1.00,-,0.997238
2,3300017449.a:Ga0185342_1000806_2,-,388,Q_A_I_aedK_RS26415,-,389,8.600000e-142,485.1,1.2,1,...,1.2,2,386,3,387,2,387,0.97,-,0.992268
3,3300017971.a:Ga0180438_10046985_3,-,388,Q_A_I_aedK_RS26415,-,389,2.100000e-141,483.8,0.5,1,...,0.5,2,386,3,387,2,387,0.96,-,0.992268
4,3300007722.a:Ga0105051_10000035_61,-,387,Q_A_I_aedK_RS26415,-,389,4.800000e-141,482.6,1.5,1,...,1.5,3,387,5,385,3,386,0.98,-,0.984496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4652,3300026288.a:Ga0209572_1000144_104,-,168,Q_A_aedC_RS26365,-,319,7.100000e-26,103.5,0.4,1,...,0.4,151,319,1,167,1,167,0.94,-,0.994048
4653,3300026303.a:Ga0209793_1000105_103,-,168,Q_A_aedC_RS26365,-,319,7.100000e-26,103.5,0.4,1,...,0.4,151,319,1,167,1,167,0.94,-,0.994048
4654,3300012973.a:Ga0123351_1000371_12,-,345,Q_A_aedC_RS26365,-,319,9.500000e-26,103.1,0.0,1,...,0.0,51,298,54,322,31,338,0.82,-,0.779710
4655,3300012833.a:Ga0160489_100006_226,-,357,Q_A_aedC_RS26365,-,319,9.900000e-26,103.0,0.2,1,...,0.2,37,306,34,337,6,350,0.73,-,0.851541


### 2. 獲取 positive的 target name 並連結 MAGs 並整理出一個類似heatmap的table，再以>= 10 hits數量進行篩選，並有7個必須基因的限制(aedA, aedB, aedG, aedH, aedK, aedF, aedJ)

In [2]:
# 須先進行前一個 cell 程式
import os
import pandas as pd
import re


# Load the table of HMM hits (the reference-criteria set, judging by the file name).
All_aed_Hits_df = pd.read_csv('../data/processed/All_aed_Hits_df_reference.csv')

# Only the protein (target) and HMM profile (query) identifiers are needed.
All_aed_Hits_TargetAndQuery = All_aed_Hits_df.loc[:, ['target_name', 'query_name']]

# Lookup table used to attach a MAGsID to each target protein.
TarToMAGs_aed = pd.read_csv('../data/interim/Actino_aed/aed_All_TarToMAGsID.csv')

# Left join: every hit is kept, even when its protein has no lookup entry.
aed_hits_TargetAndMAGsID = All_aed_Hits_TargetAndQuery.merge(
    TarToMAGs_aed, how='left', on='target_name'
)

# True here means at least one hit could not be assigned to a MAG.
unmapped_mask = aed_hits_TargetAndMAGsID['MAGsID'].isna()
print('Any Null: ', unmapped_mask.any())

# Rows whose MAGsID is missing, kept for inspection.
null = aed_hits_TargetAndMAGsID[unmapped_mask]

# Heatmap-like count table; after transposing, rows are MAGs and columns are
# HMM profiles, each cell holding the number of hits for that pair.
profile_by_mag = pd.crosstab(
    aed_hits_TargetAndMAGsID['query_name'],
    aed_hits_TargetAndMAGsID['MAGsID'],
    dropna=False,
)
aed_hits_heatmap = profile_by_mag.T
# Optional export:
# aed_hits_heatmap.to_csv('../data/processed/aed_hits_heatmap_bitscore.csv')

# For one MAG (one heatmap row), the number of non-zero cells equals the number
# of different HMM profiles that have at least one hit in that MAG.
def count_nonzero(row):
    """Return how many entries of ``row`` are not equal to zero."""
    return int((row != 0).sum())

# Per MAG (row): number of HMM profiles with at least one hit.
num_hits = aed_hits_heatmap.apply(count_nonzero, axis=1)

# Attach the count and order the MAGs from most to fewest profiles hit.
aed_hits_heatmap = (
    aed_hits_heatmap
    .assign(num_hits=num_hits)
    .sort_values(by="num_hits", ascending=False)
)

# Keep MAGs hit by at least 10 profiles that also have a hit for each of the
# seven required genes (aedA, aedB, aedJ, aedF, aedG, aedH, aedK).
required_profiles = [
    'Q_A_I_aedA_RS26385',
    'Q_A_I_aedB_RS26395',
    'Q_A_I_aedJ_RS26410',
    'Q_A_I_aedF_RS26380',
    'Q_A_I_aedG_RS26390',
    'Q_A_I_aedH_RS26400',
    'Q_A_I_aedK_RS26415',
]
enough_hits = aed_hits_heatmap['num_hits'] >= 10
has_required = (aed_hits_heatmap[required_profiles] != 0).all(axis=1)
aed_hits_FinalFilter = aed_hits_heatmap.loc[enough_hits & has_required]

# Turn the (unnamed) MAG index into an ordinary first column 'genome_id'.
aed_hits_FinalFilter = (
    aed_hits_FinalFilter
    .rename_axis(None)
    .reset_index()
    .rename(columns={'index': 'genome_id'})
)

# Keep only the MAG identifier and its profile-hit count.
aed_Positive_MAGsID = aed_hits_FinalFilter[['genome_id', 'num_hits']]

print('done')
aed_Positive_MAGsID

Any Null:  False
done


query_name,genome_id,num_hits
0,3300027951_10,15
1,3300017448_15,14
2,3300009540_25,14
3,3300018412_3,14
4,3300006913_5,14
...,...,...
71,3300027708_8,11
72,3300027965_24,11
73,3300025519_2,11
74,3300026167_17,11


### 3. 連結Reference_positive MAGs到 metagenomes 總資料中，並處理成最終能給R繪製 stacked bar chart 的資料格式

In [3]:
# Requires the previous cell to have been run first (uses aed_Positive_MAGsID).
# Load the per-genome metadata table.
metagenmoes_df = pd.read_csv('../data/external/Paper_genome_metadata.csv')
# NOTE(review): the column named 'ecosystem' in this file is used below as the
# taxonomy string (phylum/class labels are extracted from it) — confirm.
metagenmoes_df = metagenmoes_df.rename(columns={'ecosystem': 'taxonomy'})

# Attach metadata to every positive MAG; the left join keeps MAGs that have
# no metadata row.
aed_Positive_metagenomes_df = pd.merge(aed_Positive_MAGsID, metagenmoes_df, on='genome_id', how='left')

# Keep only the columns needed below. The explicit copy makes this frame
# independent of the merge result, so adding columns to it does not trigger
# pandas' SettingWithCopyWarning.
aed_Positive_metagenomes_df = aed_Positive_metagenomes_df[['genome_id', 'taxonomy', 'ecosystem_type', 'num_hits']].copy()

# Extract the phylum ("p__...") and class ("c__...") labels from the taxonomy
# string. Raw strings are used so that "\w" reaches the regex engine unchanged
# instead of being treated as an (invalid) string escape sequence.
aed_Positive_metagenomes_df['Phylum'] = aed_Positive_metagenomes_df['taxonomy'].str.extract(r'(p__\w+)', expand=False)
aed_Positive_metagenomes_df['Class'] = aed_Positive_metagenomes_df['taxonomy'].str.extract(r'(c__\w+)', expand=False)

# Lookup table keyed on ecosystem_type; the merge brings in the remaining
# lookup columns (the 'ecosystem' column selected below comes from here).
ForEcoLookup = pd.read_csv("../data/interim/EcosystemToEco_type.csv")
aed_addEco_df = pd.merge(aed_Positive_metagenomes_df, ForEcoLookup, on='ecosystem_type', how='left')

# Rows with any missing value after the merges, kept for inspection (display
# aed_addEco_df_nan_rows in its own cell to look at them).
mask = aed_addEco_df.isna()
aed_addEco_df_nan_rows = aed_addEco_df[mask.any(axis=1)]

# Relabel some ecosystem_type values. This is done after the lookup merge, so
# the merge itself still matches on the original labels.
aed_addEco_df['ecosystem_type'] = aed_addEco_df['ecosystem_type'].replace({'Activated Sludge': 'Wastewater', 'City': 'Built environment', 'Rhizoplane': 'Plant', 'Anaerobic': 'Bioreactor',
                                                                           'Defined media': 'Lab enrichment', 'Tetrachloroethylene and derivatives': 'Bioremediation', 'Mycelium': 'Fungi'})
print(aed_addEco_df['ecosystem_type'].unique())

# Final column layout; 'taxonomy' is dropped simply by not being listed.
aed_addEco_df = aed_addEco_df.reindex(columns=['genome_id', 'num_hits', 'Phylum', 'Class', 'ecosystem', 'ecosystem_type'])
aed_addEco_df['Homologous_cluster'] = 'Actino_aed_cluster'

# Write the table consumed by the R plotting step.
aed_addEco_df.to_csv('../data/processed/Final/Actino/aed_PositiveHits_ForR_reference.csv')
print('done')
aed_addEco_df

['Porifera' 'Soil' 'Wastewater' 'Freshwater' 'Annelida' 'Cnidaria'
 'Non-marine Saline and Alkaline' 'Marine' 'Fungi' 'Built environment'
 'Plant' 'Bioreactor' 'Lab enrichment' 'Bioremediation' 'Sediment']
done


Unnamed: 0,genome_id,num_hits,Phylum,Class,ecosystem,ecosystem_type,Homologous_cluster
0,3300027951_10,15,p__Actinobacteriota,c__Acidimicrobiia,Host-associated,Porifera,Actino_aed_cluster
1,3300017448_15,14,p__Actinobacteriota,c__Actinobacteria,Terrestrial,Soil,Actino_aed_cluster
2,3300009540_25,14,p__Actinobacteriota,c__Acidimicrobiia,Engineered,Wastewater,Actino_aed_cluster
3,3300018412_3,14,p__Proteobacteria,c__Alphaproteobacteria,Aquatic,Freshwater,Actino_aed_cluster
4,3300006913_5,14,p__Proteobacteria,c__Gammaproteobacteria,Host-associated,Annelida,Actino_aed_cluster
...,...,...,...,...,...,...,...
71,3300027708_8,11,p__Proteobacteria,c__Alphaproteobacteria,Aquatic,Freshwater,Actino_aed_cluster
72,3300027965_24,11,p__Proteobacteria,c__Alphaproteobacteria,Terrestrial,Soil,Actino_aed_cluster
73,3300025519_2,11,p__Proteobacteria,c__Gammaproteobacteria,Engineered,Lab enrichment,Actino_aed_cluster
74,3300026167_17,11,p__Actinobacteriota,c__Acidimicrobiia,Terrestrial,Soil,Actino_aed_cluster


# 補充資料

In [15]:
# print(aed_hits_heatmap.columns)

Index(['Q_A_I_aedA_RS26385', 'Q_A_I_aedB_RS26395', 'Q_A_I_aedF_RS26380',
       'Q_A_I_aedG_RS26390', 'Q_A_I_aedH_RS26400', 'Q_A_I_aedJ_RS26410',
       'Q_A_I_aedK_RS26415', 'Q_A_aedD_RS26370', 'Q_A_aedE_RS26375',
       'Q_A_aedI_RS26405', 'Q_A_aedL_RS26420', 'Q_A_aedM_RS26425',
       'Q_A_aedN_RS26430', 'Q_A_aedO_RS26435', 'Q_A_aedP_RS26440', 'num_hits'],
      dtype='object', name='query_name')
