  # meta_step.04.MergeFile & SelectRepResult
  ## - author:Xiao Nan
  ## - begin date: 2024.3.12

In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import os
import glob

## 合并表格

In [4]:
folder_path = '../blast_info/blastp_result'
# all_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.csv')]
all_files = glob.glob(f'{folder_path}/*.csv')
print(type(all_files))
print(all_files[:10])
print(len(all_files))
# all_files = all_files[:10]

<class 'list'>
['../blast_info/blastp_result/batch_108.csv', '../blast_info/blastp_result/batch_178.csv', '../blast_info/blastp_result/batch_27.csv', '../blast_info/blastp_result/batch_80.csv', '../blast_info/blastp_result/batch_81.csv', '../blast_info/blastp_result/batch_53.csv', '../blast_info/blastp_result/batch_150.csv', '../blast_info/blastp_result/batch_26.csv', '../blast_info/blastp_result/batch_204.csv', '../blast_info/blastp_result/batch_54.csv']
229


In [5]:
# 初始化一个列表来收集所有行的数据
data = []

In [6]:
for file_path in all_files:
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split(',')
            try:  # 使用try-except来避免转换错误
                qseqid = parts[0]
                sseqid = parts[-3]
                pident = float(parts[-2].strip())
                evalue = float(parts[-1])
                stitle = ','.join(parts[1:-3])  # 重新合并stitle部分
            except ValueError:
                print(f"Error converting line in file '{file_path}', line : {line}")
                continue
            # merged_df = merged_df.append({"qseqid": qseqid, "stitle": stitle, "sseqid": sseqid, "pident": pident, "evalue": evalue}, ignore_index=True)
            data.append([qseqid, stitle, sseqid, pident, evalue])

Error converting line in file '../blast_info/blastp_result/merged_blastp_results.csv', line : qseqid,stitle,sseqid,pident,evalue

Error converting line in file '../blast_info/blastp_result/final_blastp_results.csv', line : qseqid,stitle,pident,evalue



In [7]:
# 一次性将列表转换为DataFrame
merged_df = pd.DataFrame(data, columns=["qseqid", "stitle", "sseqid", "pident", "evalue"])
merged_df.head()

Unnamed: 0,qseqid,stitle,sseqid,pident,evalue
0,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus illinoisensis],ref|WP_127536894.1|,100.0,0.0
1,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus xylanexedens],ref|WP_154893995.1|,96.421,0.0
2,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus sp. CC-CFT742],ref|WP_286457057.1|,93.053,0.0
3,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,hypothetical protein CHI14_01605 [Paenibacillu...,gb|PAF33837.1|,92.842,0.0
4,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus sp. 7516],ref|WP_170947979.1|,92.842,0.0


In [8]:
# 统计一下合并的后的结果
print(merged_df.shape)
unique_qseqid_count = merged_df['qseqid'].nunique()
print(f"unique_qseqid_count: {unique_qseqid_count}")
unique_stitles = merged_df['stitle'].unique()
print(f"unique_stitles_count: {len(unique_stitles)}")
print(unique_stitles[:10])

(9550535, 5)
unique_qseqid_count: 217235
unique_stitles_count: 1098752
['VanZ family protein [Paenibacillus illinoisensis]'
 'VanZ family protein [Paenibacillus xylanexedens]'
 'VanZ family protein [Paenibacillus sp. CC-CFT742]'
 'hypothetical protein CHI14_01605 [Paenibacillus sp. 7516]'
 'VanZ family protein [Paenibacillus sp. 7516]'
 'MULTISPECIES: VanZ family protein [Paenibacillus]'
 'VanZ family protein [Paenibacillus sp. 7523-1]'
 'VanZ family protein [Paenibacillus sp.]'
 'hypothetical protein [Heliophilum fasciatum]'
 'tetratricopeptide repeat protein [Syntrophaceae bacterium]']


In [9]:
# 存储合并后的CSV文件
merged_df.to_csv(f"{folder_path}/merged_blastp_results.csv", index=False)

In [11]:
# 筛选数据
filtered_df = merged_df[~merged_df['stitle'].str.contains('hypothetical|PREDICTED|putative', case=False)]
filtered_df = filtered_df[filtered_df['evalue'] <= 1e-5]

In [12]:
# pident进行条件筛选

# filtered_df = filtered_df[filtered_df['pident'] >= 70]
# 筛选出stitle中包含"cas"或"crispr"的行（不区分大小写）
contains_keywords_df = filtered_df[filtered_df['stitle'].str.contains('Cas|CRISPR', case=False, na=False)] # case=False参数使得匹配不区分大小写

# 筛选出其他情况且pident >= 70的行
other_conditions_df = filtered_df[~filtered_df['stitle'].str.contains('Cas|CRISPR', case=False, na=False) & (filtered_df['pident'] >= 70)]

# 合并这两部分DataFrame
final_filtered_df = pd.concat([contains_keywords_df, other_conditions_df])

# 重置索引，因为合并后的DataFrame可能会有重复的索引
final_filtered_df.reset_index(drop=True, inplace=True)

In [13]:
# 对筛选后的数据进行统计
print(filtered_df.shape)
filtered_unique_qseqid_count = filtered_df['qseqid'].nunique()
filtered_unique_stitles = filtered_df['stitle'].unique()
print(f"filtered_unique_qseqid_count: {filtered_unique_qseqid_count}")
print(f"filtered_unique_stitles_count: {len(filtered_unique_stitles)}")
print(filtered_unique_stitles[:10])

(6312347, 5)
filtered_unique_qseqid_count: 177232
filtered_unique_stitles_count: 852655
['VanZ family protein [Paenibacillus illinoisensis]'
 'VanZ family protein [Paenibacillus xylanexedens]'
 'VanZ family protein [Paenibacillus sp. CC-CFT742]'
 'VanZ family protein [Paenibacillus sp. 7516]'
 'MULTISPECIES: VanZ family protein [Paenibacillus]'
 'VanZ family protein [Paenibacillus sp. 7523-1]'
 'VanZ family protein [Paenibacillus sp.]'
 'tetratricopeptide repeat protein [Syntrophaceae bacterium]'
 'tetratricopeptide repeat protein [Dehalococcoidales bacterium]'
 'CRISPR-associated helicase/endonuclease Cas3 [Effusibacillus lacus]']


In [17]:
filtered_df.head(20)

Unnamed: 0,qseqid,stitle,sseqid,pident,evalue
0,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus illinoisensis],ref|WP_127536894.1|,100.0,0.0
1,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus xylanexedens],ref|WP_154893995.1|,96.421,0.0
2,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus sp. CC-CFT742],ref|WP_286457057.1|,93.053,0.0
4,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus sp. 7516],ref|WP_170947979.1|,92.842,0.0
5,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,MULTISPECIES: VanZ family protein [Paenibacillus],ref|WP_162009950.1|,92.632,0.0
6,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus illinoisensis],ref|WP_110823032.1|,92.421,0.0
7,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus illinoisensis],gb|MBY0218045.1|,92.0,0.0
8,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus sp. 7523-1],ref|WP_095357566.1|,92.0,0.0
9,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus sp.],gb|MBM6387176.1|,92.0,0.0
11,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_SL...,tetratricopeptide repeat protein [Syntrophacea...,tpg|HIC91513.1|,23.064,9.580000000000001e-27


In [16]:
# 对每个qseqid只保留最优的行（evalue更小和pident更大）
final_df = filtered_df.sort_values(by=['qseqid', 'evalue', 'pident'], ascending=[True, True, False])
print(final_df.shape)
final_df.head(20)

(6312347, 5)


Unnamed: 0,qseqid,stitle,sseqid,pident,evalue
6209845,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"""""""""""""""Chain A, Cas13bt3 [Planctomycetota bact...",pdb|7VTN|A,26.62,2.7600000000000003e-33
6767004,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"""""""Chain A, Cas13bt3 [Planctomycetota bacteriu...",pdb|7VTN|A,26.62,2.7600000000000003e-33
7055215,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"""""""""""""""Chain A""",""" Cas13bt3 [Planctomycetota bacterium]""""""""""""""",26.62,2.7600000000000003e-33
7401056,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"""Chain A, Cas13bt3 [Planctomycetota bacterium]""",pdb|7VTN|A,26.62,2.7600000000000003e-33
7703774,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"""""""""""""""Chain A","Cas13bt3 [Planctomycetota bacterium]""""""""""""""",26.62,2.7600000000000003e-33
8050672,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"Chain A, Cas13bt3 [Planctomycetota bacterium]",pdb|7VTN|A,26.62,2.7600000000000003e-33
6209846,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"""""""""""""""Chain A, Cas13bt3 [Planctomycetota bact...",pdb|7VTI|A,26.266,3.07e-31
6767005,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"""""""Chain A, Cas13bt3 [Planctomycetota bacteriu...",pdb|7VTI|A,26.266,3.07e-31
7401057,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"""Chain A, Cas13bt3 [Planctomycetota bacterium]""",pdb|7VTI|A,26.266,3.07e-31
8050673,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"Chain A, Cas13bt3 [Planctomycetota bacterium]",pdb|7VTI|A,26.266,3.07e-31


In [18]:
final_df = final_df.drop_duplicates(subset=['qseqid'], keep='first')
print(final_df.shape)
final_df.head(20)

(177232, 5)


Unnamed: 0,qseqid,stitle,sseqid,pident,evalue
6209845,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"""""""""""""""Chain A, Cas13bt3 [Planctomycetota bact...",pdb|7VTN|A,26.62,2.7600000000000003e-33
110716,known_Cas13Bt-A13_JGI_Ga0246100_LiWei2022CellD...,"Chain A, Cas13bt3 [Planctomycetota bacterium]",pdb|7VTN|A,26.99,1.2100000000000001e-33
59381,known_Cas13Bt-A17_JGI_SalMarSW160370MG_LiWei20...,"Chain A, Cas13bt3 [Planctomycetota bacterium]",pdb|7VTN|A,32.838,3.19e-74
9452511,known_Cas13Bt-A18_JGI_SalMarWE160370MG_LiWei20...,"Chain A, Cas13bt3 [Planctomycetota bacterium]",pdb|7VTN|A,34.474,9.09e-106
537898,known_Cas13Bt-A2_NCBI-Prokaryotes_GCA_01693289...,"Chain A, Cas13bt3 [Planctomycetota bacterium]",pdb|7VTN|A,39.268,1.4699999999999998e-146
7007334,known_Cas13a12_Lwa2Cas13a_WP_021746774.1,Cas13a/puromycin N-acetyltransferase fusion pr...,gb|QID24124.1|,100.0,0.0
6209831,known_Cas13a17_HheCas13a_CRZ35554.1,type VI-A CRISPR-associated RNA-guided ribonuc...,ref|WP_103203632.1|,100.0,0.0
361893,known_Cas13a18_EreCas13a_WP_055061018.1,type VI-A CRISPR-associated RNA-guided ribonuc...,gb|MCH3945239.1|,99.777,0.0
8347833,known_Cas13a20_BmaCas13a_WP_062808098.1,type VI-A CRISPR-associated RNA-guided ribonuc...,ref|WP_062808098.1|,100.0,0.0
122052,known_Cas13a21_LspCas13a_WP_021744063.1,type VI-A CRISPR-associated RNA-guided ribonuc...,ref|WP_021744063.1|,100.0,0.0


In [19]:
# 输出最后的csv文件（只要：qseqid、stitle、pident、evalue四列）
final_df[['qseqid', 'stitle', 'pident', 'evalue']].to_csv(f"{folder_path}/final_blastp_results.csv", index=False)

In [20]:
# 打印统计结果，以供检查
print(f"Initial shape: {merged_df.shape}")
print(f"Filtered shape: {filtered_df.shape}")
print(f"Final shape: {final_df.shape}")

Initial shape: (9550535, 5)
Filtered shape: (6312347, 5)
Final shape: (177232, 5)


## 测试和后面的表格合并

In [21]:
df_has_known = pd.read_pickle('../blast_info/df_has_known.pkl.gz')
print(df_has_known.shape)
df_has_known.head()

(585, 21)


Unnamed: 0,rep,member,SeqRecord:rep,SeqRecord:member,crispr_id:rep,crispr_id:member,crispr_info=member:crispr_id,crispr_info=member:representative_repeat_seq,crispr_info=member:repeat_mismatch,crispr_info=member:representative_repeat_length,...,crispr_info=member:spacer_seq,crispr_info=member:representative_spacer_length,crispr_info=member:spacer_length,crispr_info=member:position,crispr_info=member:%identity,crispr_info=member:left_flank,crispr_info=member:spacer_length:std,crispr_info=member:repeat_mismatch_count,crispr_info=member:repeat_mismatch_count:mean,crispr_info=member:repeat_mismatch_count:std
898543,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"(M, G, N, I, S, G, E, K, I, G, I, K, M, D, N, ...","(M, G, N, I, S, G, E, K, I, G, I, K, M, D, N, ...",,,,,,,...,,,,,,,,,,
898544,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,mG.JGI.192505_00.fna.gz_Ga0265293_10004442_4,"(M, G, N, I, S, G, E, K, I, G, I, K, M, D, N, ...","(M, G, N, I, S, G, E, K, I, G, I, K, M, D, N, ...",,mG.JGI.192505_00.fna.gz+Ga0265293_10004442+125,mG.JGI.192505_00.fna.gz+Ga0265293_10004442+125,GCTGTGATTACCCTGCAAATCGAGGGCTGCTCCAGC,...................-................,36.0,...,CCATTGGGTACAGAGCATCCATTGGGGACG,29.0,30.0,3685.0,97.0,TTATGCTGCT,0.48795,0.0,0.0,0.0
898545,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,mG.JGI.173879_01.fna.gz_Ga0212093_1029592_2,"(M, G, N, I, S, G, E, K, I, G, I, K, M, D, N, ...","(M, G, I, D, Y, S, L, T, S, D, C, Y, R, G, I, ...",,mG.JGI.173879_01.fna.gz+Ga0212093_1029592+323,mG.JGI.173879_01.fna.gz+Ga0212093_1029592+323,GGCTGGAGCAGCCCTCGATTTGCAGGGTAATCACAGC,.....................................,37.0,...,CAGTAGACATCCTGTCCGTTAATCTTGGC,29.0,29.0,3475.0,100.0,GCGATAGAGC,0.0,0.0,0.0,0.0
898546,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,mG.JGI.75245_01.fna.gz_Ga0101944_10026827_2,"(M, G, N, I, S, G, E, K, I, G, I, K, M, D, N, ...","(M, G, I, D, Y, S, L, T, S, D, C, Y, R, G, I, ...",,mG.JGI.75245_01.fna.gz+Ga0101944_10026827+136,mG.JGI.75245_01.fna.gz+Ga0101944_10026827+136,GGCTGGAGCAGCCCTCGATTTGCAGGGTAATCACAGC,.....................................,37.0,...,CAGTAGACATCCTGTCCGTTAATCTTGGC,29.0,29.0,3476.0,100.0,GCGATAGAGC,0.0,0.0,0.0,0.0
898547,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,mG.JGI.76967_01.fna.gz_Ga0073932_1026561_2,"(M, G, N, I, S, G, E, K, I, G, I, K, M, D, N, ...","(M, G, I, D, Y, S, L, T, S, D, C, Y, R, G, I, ...",,mG.JGI.76967_01.fna.gz+Ga0073932_1026561+88,mG.JGI.76967_01.fna.gz+Ga0073932_1026561+88,GGCTGGAGCAGCCCTCGATTTGCAGGGTAATCACAGC,.....................................,37.0,...,CAGTAGACATCCTGTCCGTTAATCTTGGC,29.0,29.0,3473.0,100.0,GCGATAGAGC,0.0,0.0,0.0,0.0


In [22]:
df_blastp = pd.read_csv("../blast_info/blastp_result/final_blastp_results.csv")
print(df_blastp.shape)
df_blastp.head()

(177232, 4)


Unnamed: 0,qseqid,stitle,pident,evalue
0,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"""""""""""""""Chain A, Cas13bt3 [Planctomycetota bact...",26.62,2.7600000000000003e-33
1,known_Cas13Bt-A13_JGI_Ga0246100_LiWei2022CellD...,"Chain A, Cas13bt3 [Planctomycetota bacterium]",26.99,1.2100000000000001e-33
2,known_Cas13Bt-A17_JGI_SalMarSW160370MG_LiWei20...,"Chain A, Cas13bt3 [Planctomycetota bacterium]",32.838,3.19e-74
3,known_Cas13Bt-A18_JGI_SalMarWE160370MG_LiWei20...,"Chain A, Cas13bt3 [Planctomycetota bacterium]",34.474,9.09e-106
4,known_Cas13Bt-A2_NCBI-Prokaryotes_GCA_01693289...,"Chain A, Cas13bt3 [Planctomycetota bacterium]",39.268,1.4699999999999998e-146


In [23]:
add_blast_df = pd.merge(df_has_known, df_blastp, left_on='member', right_on='qseqid', how='left')
print(add_blast_df.shape)
print(f"'qseqid'列中非NA的数量为:{add_blast_df['qseqid'].dropna().loc[lambda x: x != 0].count()}")
# add_blast_df.head(20)
add_blast_df.to_csv('merged_final_blastp_and_patent_info.csv', index=False)

(585, 25)
'qseqid'列中非NA的数量为:43


### 因为合并之后的结果，有注释的太少了，看看原始的

In [24]:
df_all = pd.read_csv("../blast_info/blastp_result/merged_blastp_results.csv")
print(df_all.shape)
df_all.head()

(9550535, 5)


Unnamed: 0,qseqid,stitle,sseqid,pident,evalue
0,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus illinoisensis],ref|WP_127536894.1|,100.0,0.0
1,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus xylanexedens],ref|WP_154893995.1|,96.421,0.0
2,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus sp. CC-CFT742],ref|WP_286457057.1|,93.053,0.0
3,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,hypothetical protein CHI14_01605 [Paenibacillu...,gb|PAF33837.1|,92.842,0.0
4,mG.blast.ref_prok_rep_genomes_199.fna.gz_NZ_BI...,VanZ family protein [Paenibacillus sp. 7516],ref|WP_170947979.1|,92.842,0.0


In [25]:
merge_blast_df = pd.merge(df_has_known, df_all, left_on='member', right_on='qseqid', how='left')
# final_df = final_df.drop_duplicates(subset=['qseqid'], keep='first')
merge_blast_df = merge_blast_df.drop_duplicates(subset=['qseqid'], keep='first')
print(merge_blast_df.shape)
print(f"'qseqid'列中非NA的数量为:{merge_blast_df['qseqid'].dropna().loc[lambda x: x != 0].count()}")
merge_blast_df.head()

(103, 26)
'qseqid'列中非NA的数量为:102


Unnamed: 0,rep,member,SeqRecord:rep,SeqRecord:member,crispr_id:rep,crispr_id:member,crispr_info=member:crispr_id,crispr_info=member:representative_repeat_seq,crispr_info=member:repeat_mismatch,crispr_info=member:representative_repeat_length,...,crispr_info=member:left_flank,crispr_info=member:spacer_length:std,crispr_info=member:repeat_mismatch_count,crispr_info=member:repeat_mismatch_count:mean,crispr_info=member:repeat_mismatch_count:std,qseqid,stitle,sseqid,pident,evalue
0,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,"(M, G, N, I, S, G, E, K, I, G, I, K, M, D, N, ...","(M, G, N, I, S, G, E, K, I, G, I, K, M, D, N, ...",,,,,,,...,,,,,,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,hypothetical protein [Sedimentisphaerales bact...,gb|MBN1795829.1|,54.745,0.0
42,known_Cas13Bt-A11_JGI_Munlanlewell138R_LiWei20...,mG.JGI.192505_00.fna.gz_Ga0265293_10004442_4,"(M, G, N, I, S, G, E, K, I, G, I, K, M, D, N, ...","(M, G, N, I, S, G, E, K, I, G, I, K, M, D, N, ...",,mG.JGI.192505_00.fna.gz+Ga0265293_10004442+125,mG.JGI.192505_00.fna.gz+Ga0265293_10004442+125,GCTGTGATTACCCTGCAAATCGAGGGCTGCTCCAGC,...................-................,36.0,...,TTATGCTGCT,0.48795,0.0,0.0,0.0,,,,,
49,known_Cas13Bt-A13_JGI_Ga0246100_LiWei2022CellD...,known_Cas13Bt-A13_JGI_Ga0246100_LiWei2022CellD...,"(M, E, K, Y, L, I, K, N, F, E, G, I, N, K, S, ...","(M, E, K, Y, L, I, K, N, F, E, G, I, N, K, S, ...",,,,,,,...,,,,,,known_Cas13Bt-A13_JGI_Ga0246100_LiWei2022CellD...,hypothetical protein [Sedimentisphaerales bact...,gb|MBN1795829.1|,46.525,0.0
141,known_Cas13Bt-A17_JGI_SalMarSW160370MG_LiWei20...,known_Cas13Bt-A17_JGI_SalMarSW160370MG_LiWei20...,"(M, Q, T, A, T, Q, E, Q, K, Q, K, Q, S, I, Y, ...","(M, Q, T, A, T, Q, E, Q, K, Q, K, Q, S, I, Y, ...",,,,,,,...,,,,,,known_Cas13Bt-A17_JGI_SalMarSW160370MG_LiWei20...,hypothetical protein [Sedimentisphaerales bact...,gb|MBN1806721.1|,30.296,1.05e-96
253,known_Cas13Bt-A18_JGI_SalMarWE160370MG_LiWei20...,known_Cas13Bt-A18_JGI_SalMarWE160370MG_LiWei20...,"(M, N, P, V, D, I, K, E, A, S, K, K, A, V, Y, ...","(M, N, P, V, D, I, K, E, A, S, K, K, A, V, Y, ...",,,,,,,...,,,,,,known_Cas13Bt-A18_JGI_SalMarWE160370MG_LiWei20...,hypothetical protein [Sedimentisphaerales bact...,gb|MBN1806721.1|,35.956,2.2e-154


In [26]:
merge_blast_df.to_csv('merged_all_blastp_and_patent_info.csv', index=False)