#  Helper notebook to dms_workflow
### 1- Link FASTA sequences and respective IDs from Protein Gym sheets
### 2- Export FASTA files in the appropriate path to be used by dms_workflow


In [None]:
# == NATIVE MODULES
import subprocess
# == INSTALLED MODULES
import pandas as pd
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from Bio.Seq import Seq

In [2]:
def capture_fasta_from_df(df, seq_col_name, id_col_name):
	df.index = df[id_col_name]
	df = df.drop(columns=[id_col_name])
	source_dict_from_df = df[seq_col_name].to_dict()
	return source_dict_from_df

def create_SeqRecord(sequence_dict):
	records = []
	for seq_id in sequence_dict:
		records.append(
			SeqRecord(
				Seq(sequence_dict[seq_id]),
				id=seq_id,
				description='',
				name=seq_id
			)
		)
	return records

In [3]:
# == FILEPATHS ==
source_table_path = "../protein_gym/proteingym_metadata.csv"
sequence_col_name = "target_seq"
id_col_name = "UniProt_ID"
output_dir = "/groups/doudna/projects/daniel_projects/prywes_n/pgym_input_data/fasta"

In [4]:
source_table = pd.read_csv(source_table_path)
match_dict = capture_fasta_from_df(source_table, sequence_col_name, id_col_name)
records = create_SeqRecord(match_dict)

In [5]:
cmd = f"mkdir {output_dir}"
subprocess.run(cmd, shell=True)
# Go through each record gathered from the source table
for record in records:
	filepath = f"{output_dir}/{record.id}.fasta"
	# Write the fasta output to user-provided path
	SeqIO.write(record, filepath, "fasta")
	print(filepath)

mkdir: cannot create directory ‘/groups/doudna/projects/daniel_projects/prywes_n/pgym_input_data/fasta’: File exists


/groups/doudna/projects/daniel_projects/prywes_n/pgym_input_data/fasta/A0A140D2T1_ZIKV.fasta
/groups/doudna/projects/daniel_projects/prywes_n/pgym_input_data/fasta/A0A192B1T2_9HIV1.fasta
/groups/doudna/projects/daniel_projects/prywes_n/pgym_input_data/fasta/A0A1I9GEU1_NEIME.fasta
/groups/doudna/projects/daniel_projects/prywes_n/pgym_input_data/fasta/A0A2Z5U3Z0_9INFA.fasta
/groups/doudna/projects/daniel_projects/prywes_n/pgym_input_data/fasta/A4_HUMAN.fasta
/groups/doudna/projects/daniel_projects/prywes_n/pgym_input_data/fasta/A4D664_9INFA.fasta
/groups/doudna/projects/daniel_projects/prywes_n/pgym_input_data/fasta/A4GRB6_PSEAI.fasta
/groups/doudna/projects/daniel_projects/prywes_n/pgym_input_data/fasta/AACC1_PSEAI.fasta
/groups/doudna/projects/daniel_projects/prywes_n/pgym_input_data/fasta/ADRB2_HUMAN.fasta
/groups/doudna/projects/daniel_projects/prywes_n/pgym_input_data/fasta/AMIE_PSEAE.fasta
/groups/doudna/projects/daniel_projects/prywes_n/pgym_input_data/fasta/LGK_LIPST.fasta
/group

In [22]:
source_table.head()

Unnamed: 0_level_0,DMS_id,DMS_filename,UniProt_ID,taxon,target_seq,seq_len,includes_multiple_mutants,DMS_total_number_mutants,DMS_number_single_mutants,DMS_number_multiple_mutants,...,MSA_N_eff,MSA_Neff_L,MSA_Neff_L_category,MSA_num_significant,MSA_num_significant_L,raw_DMS_filename,raw_DMS_phenotype_name,raw_DMS_directionality,raw_DMS_mutant_column,weight_file_name
UniProt_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A140D2T1_ZIKV,A0A140D2T1_ZIKV_Sourisseau_growth_2019,A0A140D2T1_ZIKV_Sourisseau_growth_2019.csv,A0A140D2T1_ZIKV,Virus,MKNPKKKSGGFRIVNMLKRGVARVNPLGGLKRLPAGLLLGHGPIRM...,3423,False,9576,9576,0,...,1357.9,2.732193,medium,329,0.661972,A0A140D2T1_ZIKV_Sourisseau_growth_2019.csv,effect,1,mutant,A0A140D2T1_ZIKV_theta_0.01.npy
A0A192B1T2_9HIV1,A0A192B1T2_9HIV1_Haddox_2018,A0A192B1T2_9HIV1_Haddox_2018.csv,A0A192B1T2_9HIV1,Virus,MRVKGIQMNSQHLLRWGIMILGMIMICSVAGNLWVTVYYGVPVWKD...,852,False,12577,12577,0,...,36319.9,43.237976,medium,2382,2.835714,A0A192B1T2_9HIV1_Haddox_2018.csv,fitness,1,mutant,A0A192B1T2_9HIV1_theta_0.01.npy
A0A1I9GEU1_NEIME,A0A1I9GEU1_NEIME_Kennouche_2019,A0A1I9GEU1_NEIME_Kennouche_2019.csv,A0A1I9GEU1_NEIME,Prokaryote,FTLIELMIVIAIVGILAAVALPAYQDYTARAQVSEAILLAEGQKSA...,161,False,922,922,0,...,2183.6,15.823188,medium,72,0.521739,A0A1I9GEU1_NEIME_Kennouche_2019.csv,piliation_log2_ratio,1,mutants,A0A1I9GEU1_NEIME_theta_0.2.npy
A0A2Z5U3Z0_9INFA,A0A2Z5U3Z0_9INFA_Doud_2016,A0A2Z5U3Z0_9INFA_Doud_2016.csv,A0A2Z5U3Z0_9INFA,Virus,MKAKLLVLLYAFVATDADTICIGYHANNSTDTVDTILEKNVAVTHS...,565,False,10715,10715,0,...,9809.4,17.93309,medium,925,1.691042,A0A2Z5U3Z0_9INFA_Doud_2016.csv,transformed_pref,1,mutant,A0A2Z5U3Z0_9INFA_theta_0.01.npy
A0A2Z5U3Z0_9INFA,A0A2Z5U3Z0_9INFA_Wu_2014,A0A2Z5U3Z0_9INFA_Wu_2014.csv,A0A2Z5U3Z0_9INFA,Virus,MKAKLLVLLYAFVATDADTICIGYHANNSTDTVDTILEKNVAVTHS...,565,False,2350,2350,0,...,9809.4,17.93309,medium,925,1.691042,A0A2Z5U3Z0_9INFA_Wu_2014.csv,RF Index,1,mutant,A0A2Z5U3Z0_9INFA_theta_0.01.npy
