From f4861e711f3efd934cae7cd4647dead3ead8cf09 Mon Sep 17 00:00:00 2001 From: Vladimir Grigoriants Date: Wed, 27 Sep 2023 13:10:50 +0400 Subject: [PATCH 01/36] Add folder HW4_Grigoriants, create README.md --- HW4_Grigoriants/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 HW4_Grigoriants/README.md diff --git a/HW4_Grigoriants/README.md b/HW4_Grigoriants/README.md new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/HW4_Grigoriants/README.md @@ -0,0 +1 @@ + From c3b919cccef914ea93586a2ff7a5ca9041d9859d Mon Sep 17 00:00:00 2001 From: VovaGrig Date: Fri, 29 Sep 2023 13:57:45 +0300 Subject: [PATCH 02/36] Add protein_tools.py with run_protein_tools and check_for_motif functions inside --- HW4_Grigoriants/protein_tools.py | 41 ++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 HW4_Grigoriants/protein_tools.py diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py new file mode 100644 index 0000000..bf06915 --- /dev/null +++ b/HW4_Grigoriants/protein_tools.py @@ -0,0 +1,41 @@ +def check_for_motifs(sequences, motif): + start = 0 + nl = "\n" # used for user-friendly output + all_positions = [] + for sequence in sequences: + if motif in sequence: + positions = [] + while True: + start = sequence.find(motif, start) + if start == -1: + break + positions.append(start) + start += 1 # use += len(motif) not to count overlapping matches + all_positions.append(positions) + pos_for_print = ", ".join(str(x) for x in positions) + print(f"Sequence: {sequence}") + print(f"Motif: {motif}") + print( + f"Motif is present in protein sequence starting at positions: {pos_for_print}{nl}" + ) + else: + all_positions.append([]) + print(f"Sequence: {sequence}") + print(f"Motif: {motif}") + print(f"Motif is not present in protein sequence{nl}") + return all_positions + + +procedures_to_functions = {"check_for_motifs": check_for_motifs} + + +def run_protein_tools(*args, **kwargs): + sequences = list(args) + 
procedure = kwargs["procedure"] + if procedure not in procedures_to_functions.keys(): + raise ValueError("Wrong procedure") + procedure_arguments = {} + procedure_arguments["sequences"] = sequences + if procedure == "check_for_motifs": + procedure_arguments["motif"] = kwargs["motif"] + return procedures_to_functions[procedure](**procedure_arguments) From bc24a411248e1c73ed401ceab4983cea6a0e334b Mon Sep 17 00:00:00 2001 From: Ekaterina Shitik Date: Fri, 29 Sep 2023 13:34:14 +0200 Subject: [PATCH 03/36] Add 'search_for_alt_frames' function --- HW4_Grigoriants/protein_tools.py | 36 ++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index bf06915..f78b59c 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -26,6 +26,42 @@ def check_for_motifs(sequences, motif): return all_positions +def search_for_alt_frames(sequences: str, alt_st_codon: str, num_position=0): + """ + Search for alternative frames in a protein sequences + + Without an alt_st_codon argument search for frames that start with methionine ('M') + To search frames with alternative start codon add alt_st_codon argument + In alt_st_codon argument use one-letter code + + The function ignores the last three amino acids in sequences + + Arguments: + - sequences (tuple(str) or list(str)): sequences to check + - alt_st_codon (str): the name of an amino acid that is encoded by alternative start codon (Optional) + Example: alt_st_codon = 'I' + + Return: + - dictionary: the number of a sequence and a collection of alternative frames + """ + if len(alt_st_codon) > 1: + raise ValueError('Invalid start codon!') + alternative_frames = {} + for sequence in sequences: + for amino_acid in sequence[1:-3]: + num_position += 1 + if (amino_acid == alt_st_codon or + amino_acid == alt_st_codon.swapcase()): + key = sequences.index(sequence) + 1 + if key in alternative_frames: + alternative_frames[key] 
+= sequence[num_position:] + ' ' + else: + alternative_frames[key] = sequence[num_position:] + ' ' + num_position = 0 + for key, value in alternative_frames.items(): + print(key, value) + + procedures_to_functions = {"check_for_motifs": check_for_motifs} From f81d442a56d5a1c631e2f5337a881c49ef52738b Mon Sep 17 00:00:00 2001 From: Ekaterina Shitik Date: Fri, 29 Sep 2023 13:36:44 +0200 Subject: [PATCH 04/36] Add 'convert_to_nucl_acids' function --- HW4_Grigoriants/protein_tools.py | 64 ++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index f78b59c..762ea54 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -62,6 +62,70 @@ def search_for_alt_frames(sequences: str, alt_st_codon: str, num_position=0): print(key, value) +def convert_to_nucl_acids(sequences: str, nucl_acids: str): + """ + Convert protein sequences to RNA or DNA sequences. + + Use the most frequent codons in human. 
The source - https://www.genscript.com/tools/codon-frequency-table + All nucleic acids (DNA and RNA) are showed in 5'-3' direction + + Arguments: + - sequences (tuple(str) or list(str)): sequences to convert + - nucl_acids (str): the nucleic acid that is prefered + Example: nucl_acids = 'RNA' - convert to RNA + nucl_acids = 'DNA' - convert to DNA + nucl_acids = 'both' - convert to RNA and DNA + Return: + - dictionary: a collection of alternative frames + If nucl_acids = 'RNA' or nucl_acids = 'DNA' output a collection of frames + If nucl_acids = 'both' output the name of a nucleic acid and a collection of frames + """ + alphabet = {'F': 'UUU', 'f': 'uuu', + 'L': 'CUG', 'l': 'cug', + 'I': 'AUU', 'i': 'auu', + 'M': 'AUG', 'm': 'aug', + 'V': 'GUG', 'v': 'gug', + 'P': 'CCG', 'p': 'ccg', + 'T': 'ACC', 't': 'acc', + 'A': 'GCG', 'a': 'gcg', + 'Y': 'UAU', 'y': 'uau', + 'H': 'CAU', 'h': 'cau', + 'Q': 'CAG', 'q': 'cag', + 'N': 'AAC', 'n': 'aac', + 'K': 'AAA', 'k': 'aaa', + 'D': 'GAU', 'd': 'gau', + 'E': 'GAA', 'e': 'gaa', + 'C': 'UGC', 'c': 'ugc', + 'W': 'UGG', 'w': 'ugg', + 'R': 'CGU', 'r': 'cgu', + 'S': 'AGC', 's': 'agc', + 'G': 'GGC', 'g': 'ggc', + } + if nucl_acids not in {'DNA', 'RNA', 'both'}: + raise ValueError('Invalid nucl_acids argument!') + rule_of_translation = sequences[0].maketrans(alphabet) + rule_of_transcription = sequences[0].maketrans('AaUuCcGg', 'TtAaGgCc') + nucl_acid_seqs = {} + for sequence in sequences: + rna_seq = sequence.translate(rule_of_translation) + reverse_dna_seq = rna_seq.translate(rule_of_transcription)[::-1] + if 'RNA' in nucl_acid_seqs.keys(): + nucl_acid_seqs['RNA'] += rna_seq + ' ' + else: + nucl_acid_seqs['RNA'] = rna_seq + ' ' + if 'DNA' in nucl_acid_seqs.keys(): + nucl_acid_seqs['DNA'] += reverse_dna_seq + ' ' + else: + nucl_acid_seqs['DNA'] = reverse_dna_seq + ' ' + if nucl_acids == 'RNA': + return nucl_acid_seqs['RNA'] + elif nucl_acids == 'DNA': + return nucl_acid_seqs['DNA'] + elif nucl_acids == 'both': + for key, value in 
nucl_acid_seqs.items(): + print(key, value) + + procedures_to_functions = {"check_for_motifs": check_for_motifs} From cbeb58a4feea1015c3ca7558bcaf24a32000781f Mon Sep 17 00:00:00 2001 From: Ekaterina Shitik Date: Fri, 29 Sep 2023 13:43:56 +0200 Subject: [PATCH 05/36] Add conditions in 'main' function --- HW4_Grigoriants/protein_tools.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index 762ea54..3eb3cb9 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -126,7 +126,10 @@ def convert_to_nucl_acids(sequences: str, nucl_acids: str): print(key, value) -procedures_to_functions = {"check_for_motifs": check_for_motifs} +procedures_to_functions = {"check_for_motifs": check_for_motifs, + 'search_for_alt_frames': search_for_alt_frames, + 'convert_to_nucl_acids': convert_to_nucl_acids + } def run_protein_tools(*args, **kwargs): @@ -138,4 +141,18 @@ def run_protein_tools(*args, **kwargs): procedure_arguments["sequences"] = sequences if procedure == "check_for_motifs": procedure_arguments["motif"] = kwargs["motif"] - return procedures_to_functions[procedure](**procedure_arguments) + return procedures_to_functions[procedure](**procedure_arguments) + elif procedure == 'search_for_alt_frames': + if 'alt_st_codon' not in kwargs.keys(): + procedure_arguments['alt_st_codon'] = 'M' + else: + procedure_arguments['alt_st_codon'] = kwargs['alt_st_codon'] + procedure_arguments['sequences'] = sequences + return procedures_to_functions[procedure](**procedure_arguments) + elif procedure == 'convert_to_nucl_acids': + if 'nucl_acids' not in kwargs.keys(): + raise ValueError('Add type of nucl_acids!') + else: + procedure_arguments['nucl_acids'] = kwargs['nucl_acids'] + procedure_arguments['sequences'] = sequences + return procedures_to_functions[procedure](**procedure_arguments) From d91cfd4850ef1b005dd70d3354c1c88c94d744d7 Mon Sep 17 00:00:00 2001 
From: VovaGrig Date: Fri, 29 Sep 2023 15:21:20 +0300 Subject: [PATCH 06/36] Add minor fix to protein_tools.py --- HW4_Grigoriants/protein_tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index bf06915..1aa99ce 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -1,6 +1,6 @@ def check_for_motifs(sequences, motif): start = 0 - nl = "\n" # used for user-friendly output + new_line = "\n" # used for user-friendly output all_positions = [] for sequence in sequences: if motif in sequence: @@ -16,13 +16,13 @@ def check_for_motifs(sequences, motif): print(f"Sequence: {sequence}") print(f"Motif: {motif}") print( - f"Motif is present in protein sequence starting at positions: {pos_for_print}{nl}" + f"Motif is present in protein sequence starting at positions: {pos_for_print}{new_line}" ) else: all_positions.append([]) print(f"Sequence: {sequence}") print(f"Motif: {motif}") - print(f"Motif is not present in protein sequence{nl}") + print(f"Motif is not present in protein sequence{new_line}") return all_positions From 39b8acd4cd4d237f4e224fc3ba21d9f9f18a9661 Mon Sep 17 00:00:00 2001 From: VovaGrig Date: Fri, 29 Sep 2023 19:02:08 +0300 Subject: [PATCH 07/36] Add check_and_parse_user_input in protein_tools.py, add fixes --- HW4_Grigoriants/protein_tools.py | 217 +++++++++++++++++++------------ 1 file changed, 131 insertions(+), 86 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index b4da468..8f610b9 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -1,32 +1,30 @@ def check_for_motifs(sequences, motif): - start = 0 - new_line = "\n" # used for user-friendly output - all_positions = [] + # new_line = "\n" # used for user-friendly output + all_positions = {} for sequence in sequences: + start = 0 + positions = [] if motif in sequence: - positions = [] while True: 
start = sequence.find(motif, start) if start == -1: break positions.append(start) start += 1 # use += len(motif) not to count overlapping matches - all_positions.append(positions) - pos_for_print = ", ".join(str(x) for x in positions) - print(f"Sequence: {sequence}") - print(f"Motif: {motif}") - print( - f"Motif is present in protein sequence starting at positions: {pos_for_print}{new_line}" - ) - else: - all_positions.append([]) - print(f"Sequence: {sequence}") - print(f"Motif: {motif}") - print(f"Motif is not present in protein sequence{new_line}") + # pos_for_print = ", ".join(str(x) for x in positions) + # print(f"Sequence: {sequence}") + # print(f"Motif: {motif}") + # print( + # f"Motif is present in protein sequence starting at positions: {pos_for_print}{new_line}" + # ) + all_positions[sequence] = positions + # print(f"Sequence: {sequence}") + # print(f"Motif: {motif}") + # print(f"Motif is not present in protein sequence{new_line}") return all_positions -def search_for_alt_frames(sequences: str, alt_st_codon: str, num_position=0): +def search_for_alt_frames(sequences: str, alt_st_codon: str): """ Search for alternative frames in a protein sequences @@ -44,22 +42,23 @@ def search_for_alt_frames(sequences: str, alt_st_codon: str, num_position=0): Return: - dictionary: the number of a sequence and a collection of alternative frames """ - if len(alt_st_codon) > 1: - raise ValueError('Invalid start codon!') + # if len(alt_st_codon) > 1: + # raise ValueError("Invalid start codon!") alternative_frames = {} + num_position = 0 for sequence in sequences: for amino_acid in sequence[1:-3]: num_position += 1 - if (amino_acid == alt_st_codon or - amino_acid == alt_st_codon.swapcase()): + if amino_acid == alt_st_codon or amino_acid == alt_st_codon.swapcase(): key = sequences.index(sequence) + 1 if key in alternative_frames: - alternative_frames[key] += sequence[num_position:] + ' ' + alternative_frames[key] += sequence[num_position:] + " " else: - alternative_frames[key] 
= sequence[num_position:] + ' ' + alternative_frames[key] = sequence[num_position:] + " " num_position = 0 - for key, value in alternative_frames.items(): - print(key, value) + # for key, value in alternative_frames.items(): + # print(key, value) + return alternative_frames def convert_to_nucl_acids(sequences: str, nucl_acids: str): @@ -80,79 +79,125 @@ def convert_to_nucl_acids(sequences: str, nucl_acids: str): If nucl_acids = 'RNA' or nucl_acids = 'DNA' output a collection of frames If nucl_acids = 'both' output the name of a nucleic acid and a collection of frames """ - alphabet = {'F': 'UUU', 'f': 'uuu', - 'L': 'CUG', 'l': 'cug', - 'I': 'AUU', 'i': 'auu', - 'M': 'AUG', 'm': 'aug', - 'V': 'GUG', 'v': 'gug', - 'P': 'CCG', 'p': 'ccg', - 'T': 'ACC', 't': 'acc', - 'A': 'GCG', 'a': 'gcg', - 'Y': 'UAU', 'y': 'uau', - 'H': 'CAU', 'h': 'cau', - 'Q': 'CAG', 'q': 'cag', - 'N': 'AAC', 'n': 'aac', - 'K': 'AAA', 'k': 'aaa', - 'D': 'GAU', 'd': 'gau', - 'E': 'GAA', 'e': 'gaa', - 'C': 'UGC', 'c': 'ugc', - 'W': 'UGG', 'w': 'ugg', - 'R': 'CGU', 'r': 'cgu', - 'S': 'AGC', 's': 'agc', - 'G': 'GGC', 'g': 'ggc', - } - if nucl_acids not in {'DNA', 'RNA', 'both'}: - raise ValueError('Invalid nucl_acids argument!') + # if nucl_acids not in {"DNA", "RNA", "both"}: + # raise ValueError("Invalid nucl_acids argument!") rule_of_translation = sequences[0].maketrans(alphabet) - rule_of_transcription = sequences[0].maketrans('AaUuCcGg', 'TtAaGgCc') - nucl_acid_seqs = {} + rule_of_transcription = sequences[0].maketrans("AaUuCcGg", "TtAaGgCc") + nucl_acid_seqs = {"RNA": [], "DNA": []} for sequence in sequences: rna_seq = sequence.translate(rule_of_translation) reverse_dna_seq = rna_seq.translate(rule_of_transcription)[::-1] - if 'RNA' in nucl_acid_seqs.keys(): - nucl_acid_seqs['RNA'] += rna_seq + ' ' - else: - nucl_acid_seqs['RNA'] = rna_seq + ' ' - if 'DNA' in nucl_acid_seqs.keys(): - nucl_acid_seqs['DNA'] += reverse_dna_seq + ' ' - else: - nucl_acid_seqs['DNA'] = reverse_dna_seq + ' ' - if 
nucl_acids == 'RNA': - return nucl_acid_seqs['RNA'] - elif nucl_acids == 'DNA': - return nucl_acid_seqs['DNA'] - elif nucl_acids == 'both': - for key, value in nucl_acid_seqs.items(): - print(key, value) + # if "RNA" in nucl_acid_seqs.keys(): + # nucl_acid_seqs["RNA"] += rna_seq + " " + # else: + # nucl_acid_seqs["RNA"] = rna_seq + " " + if nucl_acids == "RNA": + nucl_acid_seqs["RNA"].append(rna_seq) + if sequence == sequences[-1]: + del nucl_acid_seqs["DNA"] + if nucl_acids == "DNA": + nucl_acid_seqs["DNA"].append(reverse_dna_seq) + if sequence == sequences[-1]: + del nucl_acid_seqs["RNA"] + if nucl_acids == "both": + nucl_acid_seqs["RNA"].append(rna_seq) + nucl_acid_seqs["DNA"].append(reverse_dna_seq) + # if "DNA" in nucl_acid_seqs.keys(): + # nucl_acid_seqs["DNA"] += reverse_dna_seq + " " + # else: + # nucl_acid_seqs["DNA"] = reverse_dna_seq + " " + # if nucl_acids == "RNA": + # return nucl_acid_seqs["RNA"] + # elif nucl_acids == "DNA": + # return nucl_acid_seqs["DNA"] + # elif nucl_acids == "both": + # for key, value in nucl_acid_seqs.items(): + # print(key, value) + return nucl_acid_seqs -procedures_to_functions = {"check_for_motifs": check_for_motifs, - 'search_for_alt_frames': search_for_alt_frames, - 'convert_to_nucl_acids': convert_to_nucl_acids - } +procedures_to_functions = { + "check_for_motifs": check_for_motifs, + "search_for_alt_frames": search_for_alt_frames, + "convert_to_nucl_acids": convert_to_nucl_acids, +} +alphabet = { + "F": "UUU", + "f": "uuu", + "L": "CUG", + "l": "cug", + "I": "AUU", + "i": "auu", + "M": "AUG", + "m": "aug", + "V": "GUG", + "v": "gug", + "P": "CCG", + "p": "ccg", + "T": "ACC", + "t": "acc", + "A": "GCG", + "a": "gcg", + "Y": "UAU", + "y": "uau", + "H": "CAU", + "h": "cau", + "Q": "CAG", + "q": "cag", + "N": "AAC", + "n": "aac", + "K": "AAA", + "k": "aaa", + "D": "GAU", + "d": "gau", + "E": "GAA", + "e": "gaa", + "C": "UGC", + "c": "ugc", + "W": "UGG", + "w": "ugg", + "R": "CGU", + "r": "cgu", + "S": "AGC", + "s": "agc", + 
"G": "GGC", + "g": "ggc", +} -def run_protein_tools(*args, **kwargs): + +def check_and_parse_user_input(*args, **kwargs): + if len(args) == 0: + raise ValueError("No sequences provided") sequences = list(args) + for sequence in sequences: + if not all(letters in "".join(alphabet.keys()) for letters in sequence): + raise ValueError("Invalid sequence given") procedure = kwargs["procedure"] + procedure_arguments = {} if procedure not in procedures_to_functions.keys(): raise ValueError("Wrong procedure") - procedure_arguments = {} - procedure_arguments["sequences"] = sequences if procedure == "check_for_motifs": + if "motif" not in kwargs.keys(): + raise ValueError("Please provide desired motif") procedure_arguments["motif"] = kwargs["motif"] - return procedures_to_functions[procedure](**procedure_arguments) - elif procedure == 'search_for_alt_frames': - if 'alt_st_codon' not in kwargs.keys(): - procedure_arguments['alt_st_codon'] = 'M' + elif procedure == "search_for_alt_frames": + if "alt_st_codon" not in kwargs.keys(): + procedure_arguments["alt_st_codon"] = "M" else: - procedure_arguments['alt_st_codon'] = kwargs['alt_st_codon'] - procedure_arguments['sequences'] = sequences - return procedures_to_functions[procedure](**procedure_arguments) - elif procedure == 'convert_to_nucl_acids': - if 'nucl_acids' not in kwargs.keys(): - raise ValueError('Add type of nucl_acids!') - else: - procedure_arguments['nucl_acids'] = kwargs['nucl_acids'] - procedure_arguments['sequences'] = sequences - return procedures_to_functions[procedure](**procedure_arguments) + if len(kwargs["alt_st_codon"]) > 1: + raise ValueError("Invalid start codon!") + procedure_arguments["alt_st_codon"] = kwargs["alt_st_codon"] + elif procedure == "convert_to_nucl_acids": + if "nucl_acids" not in kwargs.keys(): + raise ValueError("Please provide desired type of nucl_acids") + if kwargs["nucl_acids"] not in {"DNA", "RNA", "both"}: + raise ValueError("Invalid nucl_acids argument") + 
procedure_arguments["nucl_acids"] = kwargs["nucl_acids"] + procedure_arguments["sequences"] = sequences + return procedure_arguments, procedure + + +def run_protein_tools(*args, **kwargs): + procedure_arguments, procedure = check_and_parse_user_input(*args, **kwargs) + return procedures_to_functions[procedure](**procedure_arguments) From 29fd75253370379202830e0175941b70f6eaa2f4 Mon Sep 17 00:00:00 2001 From: VovaGrig Date: Fri, 29 Sep 2023 19:21:43 +0300 Subject: [PATCH 08/36] Add minor fixes in protein_tools.py --- HW4_Grigoriants/protein_tools.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index 8f610b9..08e5512 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -24,32 +24,32 @@ def check_for_motifs(sequences, motif): return all_positions -def search_for_alt_frames(sequences: str, alt_st_codon: str): +def search_for_alt_frames(sequences: str, alt_start_aa: str): """ Search for alternative frames in a protein sequences - Without an alt_st_codon argument search for frames that start with methionine ('M') - To search frames with alternative start codon add alt_st_codon argument - In alt_st_codon argument use one-letter code + Without an alt_start_aa argument search for frames that start with methionine ('M') + To search frames with alternative start codon add alt_start_aa argument + In alt_start_aa argument use one-letter code The function ignores the last three amino acids in sequences Arguments: - sequences (tuple(str) or list(str)): sequences to check - - alt_st_codon (str): the name of an amino acid that is encoded by alternative start codon (Optional) - Example: alt_st_codon = 'I' + - alt_start_aa (str): the name of an amino acid that is encoded by alternative start codon (Optional) + Example: alt_start_aa = 'I' Return: - dictionary: the number of a sequence and a collection of alternative frames """ - # if 
len(alt_st_codon) > 1: + # if len(alt_start_aa) > 1: # raise ValueError("Invalid start codon!") alternative_frames = {} num_position = 0 for sequence in sequences: for amino_acid in sequence[1:-3]: num_position += 1 - if amino_acid == alt_st_codon or amino_acid == alt_st_codon.swapcase(): + if amino_acid == alt_start_aa or amino_acid == alt_start_aa.swapcase(): key = sequences.index(sequence) + 1 if key in alternative_frames: alternative_frames[key] += sequence[num_position:] + " " @@ -182,12 +182,12 @@ def check_and_parse_user_input(*args, **kwargs): raise ValueError("Please provide desired motif") procedure_arguments["motif"] = kwargs["motif"] elif procedure == "search_for_alt_frames": - if "alt_st_codon" not in kwargs.keys(): - procedure_arguments["alt_st_codon"] = "M" + if "alt_start_aa" not in kwargs.keys(): + procedure_arguments["alt_start_aa"] = "M" else: - if len(kwargs["alt_st_codon"]) > 1: - raise ValueError("Invalid start codon!") - procedure_arguments["alt_st_codon"] = kwargs["alt_st_codon"] + if len(kwargs["alt_start_aa"]) > 1: + raise ValueError("Invalid start AA!") + procedure_arguments["alt_start_aa"] = kwargs["alt_start_aa"] elif procedure == "convert_to_nucl_acids": if "nucl_acids" not in kwargs.keys(): raise ValueError("Please provide desired type of nucl_acids") From de4e146ad1ab9b08717a165d9d92355433e78d1f Mon Sep 17 00:00:00 2001 From: VovaGrig Date: Fri, 29 Sep 2023 19:26:31 +0300 Subject: [PATCH 09/36] Add check_and_parse_user_input in protein_tools.py, add fixes --- HW4_Grigoriants/protein_tools.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index 08e5512..ea3bf08 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -1,5 +1,5 @@ def check_for_motifs(sequences, motif): - # new_line = "\n" # used for user-friendly output + new_line = "\n" # used for user-friendly output all_positions = {} 
for sequence in sequences: start = 0 @@ -11,16 +11,16 @@ def check_for_motifs(sequences, motif): break positions.append(start) start += 1 # use += len(motif) not to count overlapping matches - # pos_for_print = ", ".join(str(x) for x in positions) - # print(f"Sequence: {sequence}") - # print(f"Motif: {motif}") - # print( - # f"Motif is present in protein sequence starting at positions: {pos_for_print}{new_line}" - # ) + pos_for_print = ", ".join(str(x) for x in positions) + print(f"Sequence: {sequence}") + print(f"Motif: {motif}") + print( + f"Motif is present in protein sequence starting at positions: {pos_for_print}{new_line}" + ) all_positions[sequence] = positions - # print(f"Sequence: {sequence}") - # print(f"Motif: {motif}") - # print(f"Motif is not present in protein sequence{new_line}") + print(f"Sequence: {sequence}") + print(f"Motif: {motif}") + print(f"Motif is not present in protein sequence{new_line}") return all_positions @@ -81,7 +81,7 @@ def convert_to_nucl_acids(sequences: str, nucl_acids: str): """ # if nucl_acids not in {"DNA", "RNA", "both"}: # raise ValueError("Invalid nucl_acids argument!") - rule_of_translation = sequences[0].maketrans(alphabet) + rule_of_translation = sequences[0].maketrans(translation_rule) rule_of_transcription = sequences[0].maketrans("AaUuCcGg", "TtAaGgCc") nucl_acid_seqs = {"RNA": [], "DNA": []} for sequence in sequences: @@ -122,7 +122,7 @@ def convert_to_nucl_acids(sequences: str, nucl_acids: str): "convert_to_nucl_acids": convert_to_nucl_acids, } -alphabet = { +translation_rule = { "F": "UUU", "f": "uuu", "L": "CUG", @@ -171,7 +171,7 @@ def check_and_parse_user_input(*args, **kwargs): raise ValueError("No sequences provided") sequences = list(args) for sequence in sequences: - if not all(letters in "".join(alphabet.keys()) for letters in sequence): + if not all(letters in "".join(translation_rule.keys()) for letters in sequence): raise ValueError("Invalid sequence given") procedure = kwargs["procedure"] 
procedure_arguments = {} From 620a551f08c16193d49bb558c7bbe452baaa3d2b Mon Sep 17 00:00:00 2001 From: Vlada Tuliavko Date: Fri, 29 Sep 2023 21:52:08 +0300 Subject: [PATCH 10/36] Add three_one_letter_code and define_molecular_weight functions and fixes --- HW4_Grigoriants/protein_tools.py | 123 ++++++++++++++++++++++++++++--- 1 file changed, 113 insertions(+), 10 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index ea3bf08..815f2e8 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -1,3 +1,32 @@ +def three_one_letter_code(sequences): + inversed_sequences = [] + for sequence in sequences: + inversed_sequence = "" + if "-" not in sequence: + for letter in sequence: + inversed_sequence += amino_acids[letter] + "-" + inversed_sequence = inversed_sequence[:-1] + inversed_sequences.append(inversed_sequence) + else: + aa_splitted = sequence.split("-") + for aa in aa_splitted: + inversed_sequence += list(amino_acids.keys())[ + list(amino_acids.values()).index(aa) + ] + inversed_sequences.append(inversed_sequence) + return inversed_sequences + + +def define_molecular_weight(sequences): + sequences_weights = [] + for sequence in sequences: + sequence_weight = 0 + for letter in sequence: + sequence_weight += amino_acid_weights[letter] + sequences_weights.append(sequence_weight) + return sequences_weights + + def check_for_motifs(sequences, motif): new_line = "\n" # used for user-friendly output all_positions = {} @@ -10,17 +39,19 @@ def check_for_motifs(sequences, motif): if start == -1: break positions.append(start) - start += 1 # use += len(motif) not to count overlapping matches + # use += len(motif) not to count overlapping matches + start += 1 pos_for_print = ", ".join(str(x) for x in positions) print(f"Sequence: {sequence}") print(f"Motif: {motif}") print( f"Motif is present in protein sequence starting at positions: {pos_for_print}{new_line}" ) + else: + print(f"Sequence: {sequence}") + 
print(f"Motif: {motif}") + print(f"Motif is not present in protein sequence{new_line}") all_positions[sequence] = positions - print(f"Sequence: {sequence}") - print(f"Motif: {motif}") - print(f"Motif is not present in protein sequence{new_line}") return all_positions @@ -72,8 +103,8 @@ def convert_to_nucl_acids(sequences: str, nucl_acids: str): - sequences (tuple(str) or list(str)): sequences to convert - nucl_acids (str): the nucleic acid that is prefered Example: nucl_acids = 'RNA' - convert to RNA - nucl_acids = 'DNA' - convert to DNA - nucl_acids = 'both' - convert to RNA and DNA + nucl_acids = 'DNA' - convert to DNA + nucl_acids = 'both' - convert to RNA and DNA Return: - dictionary: a collection of alternative frames If nucl_acids = 'RNA' or nucl_acids = 'DNA' output a collection of frames @@ -120,6 +151,50 @@ def convert_to_nucl_acids(sequences: str, nucl_acids: str): "check_for_motifs": check_for_motifs, "search_for_alt_frames": search_for_alt_frames, "convert_to_nucl_acids": convert_to_nucl_acids, + "three_one_letter_code": three_one_letter_code, + "define_molecular_weight": define_molecular_weight, +} +amino_acids = { + "A": "Ala", + "C": "Cys", + "D": "Asp", + "E": "Glu", + "F": "Phe", + "G": "Gly", + "H": "His", + "I": "Ile", + "K": "Lys", + "L": "Leu", + "M": "Met", + "N": "Asn", + "P": "Pro", + "Q": "Gln", + "R": "Arg", + "S": "Ser", + "T": "Thr", + "V": "Val", + "W": "Trp", + "Y": "Tyr", + "a": "ala", + "c": "cys", + "d": "asp", + "e": "glu", + "f": "phe", + "g": "gly", + "h": "his", + "i": "ile", + "k": "lys", + "l": "leu", + "m": "met", + "n": "asn", + "p": "pro", + "q": "gln", + "r": "arg", + "s": "ser", + "t": "thr", + "v": "val", + "w": "trp", + "y": "tyr", } translation_rule = { @@ -165,18 +240,46 @@ def convert_to_nucl_acids(sequences: str, nucl_acids: str): "g": "ggc", } +amino_acid_weights = { + "A": 89.09, + "C": 121.16, + "D": 133.10, + "E": 147.13, + "F": 165.19, + "G": 75.07, + "H": 155.16, + "I": 131.17, + "K": 146.19, + "L": 131.17, + 
"M": 149.21, + "N": 132.12, + "P": 115.13, + "Q": 146.15, + "R": 174.20, + "S": 105.09, + "T": 119.12, + "V": 117.15, + "W": 204.23, + "Y": 181.19, +} + def check_and_parse_user_input(*args, **kwargs): if len(args) == 0: raise ValueError("No sequences provided") + procedure = kwargs["procedure"] + if procedure not in procedures_to_functions.keys(): + raise ValueError("Wrong procedure") sequences = list(args) + allowed_inputs = set(amino_acids.keys()).union( + set(amino_acids.values()).union(set("-")) + ) + if procedure != "three_one_letter_code": + allowed_inputs.remove("-") for sequence in sequences: - if not all(letters in "".join(translation_rule.keys()) for letters in sequence): + if not all(letters in allowed_inputs for letters in sequence): raise ValueError("Invalid sequence given") - procedure = kwargs["procedure"] procedure_arguments = {} - if procedure not in procedures_to_functions.keys(): - raise ValueError("Wrong procedure") if procedure == "check_for_motifs": if "motif" not in kwargs.keys(): raise ValueError("Please provide desired motif") From 93d2d5f36c4c001edb3d7b80966deef42dfb7fea Mon Sep 17 00:00:00 2001 From: VovaGrig Date: Sat, 30 Sep 2023 12:37:04 +0300 Subject: [PATCH 11/36] Add minor fixes in protein_tools.py --- HW4_Grigoriants/protein_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index ea3bf08..98b2b88 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -17,10 +17,10 @@ def check_for_motifs(sequences, motif): print( f"Motif is present in protein sequence starting at positions: {pos_for_print}{new_line}" ) - all_positions[sequence] = positions print(f"Sequence: {sequence}") print(f"Motif: {motif}") print(f"Motif is not present in protein sequence{new_line}") + all_positions[sequence] = positions return all_positions From d731697b430a2b3b18ad5924da0c2740551a432d Mon Sep 17 00:00:00 2001 From: VovaGrig Date: Sat, 
30 Sep 2023 15:29:43 +0300 Subject: [PATCH 12/36] Add minor fixes in protein_tools.py --- HW4_Grigoriants/protein_tools.py | 47 ++++++++++++-------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index 815f2e8..67366a3 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -33,6 +33,8 @@ def check_for_motifs(sequences, motif): for sequence in sequences: start = 0 positions = [] + print(f"Sequence: {sequence}") + print(f"Motif: {motif}") if motif in sequence: while True: start = sequence.find(motif, start) @@ -42,14 +44,10 @@ def check_for_motifs(sequences, motif): # use += len(motif) not to count overlapping matches start += 1 pos_for_print = ", ".join(str(x) for x in positions) - print(f"Sequence: {sequence}") - print(f"Motif: {motif}") print( f"Motif is present in protein sequence starting at positions: {pos_for_print}{new_line}" ) else: - print(f"Sequence: {sequence}") - print(f"Motif: {motif}") print(f"Motif is not present in protein sequence{new_line}") all_positions[sequence] = positions return all_positions @@ -92,7 +90,7 @@ def search_for_alt_frames(sequences: str, alt_start_aa: str): return alternative_frames -def convert_to_nucl_acids(sequences: str, nucl_acids: str): +def convert_to_nucl_acids(sequences: list, nucl_acids: str): """ Convert protein sequences to RNA or DNA sequences. 
@@ -110,18 +108,12 @@ def convert_to_nucl_acids(sequences: str, nucl_acids: str): If nucl_acids = 'RNA' or nucl_acids = 'DNA' output a collection of frames If nucl_acids = 'both' output the name of a nucleic acid and a collection of frames """ - # if nucl_acids not in {"DNA", "RNA", "both"}: - # raise ValueError("Invalid nucl_acids argument!") rule_of_translation = sequences[0].maketrans(translation_rule) rule_of_transcription = sequences[0].maketrans("AaUuCcGg", "TtAaGgCc") nucl_acid_seqs = {"RNA": [], "DNA": []} for sequence in sequences: rna_seq = sequence.translate(rule_of_translation) reverse_dna_seq = rna_seq.translate(rule_of_transcription)[::-1] - # if "RNA" in nucl_acid_seqs.keys(): - # nucl_acid_seqs["RNA"] += rna_seq + " " - # else: - # nucl_acid_seqs["RNA"] = rna_seq + " " if nucl_acids == "RNA": nucl_acid_seqs["RNA"].append(rna_seq) if sequence == sequences[-1]: @@ -133,17 +125,6 @@ def convert_to_nucl_acids(sequences: str, nucl_acids: str): if nucl_acids == "both": nucl_acid_seqs["RNA"].append(rna_seq) nucl_acid_seqs["DNA"].append(reverse_dna_seq) - # if "DNA" in nucl_acid_seqs.keys(): - # nucl_acid_seqs["DNA"] += reverse_dna_seq + " " - # else: - # nucl_acid_seqs["DNA"] = reverse_dna_seq + " " - # if nucl_acids == "RNA": - # return nucl_acid_seqs["RNA"] - # elif nucl_acids == "DNA": - # return nucl_acid_seqs["DNA"] - # elif nucl_acids == "both": - # for key, value in nucl_acid_seqs.items(): - # print(key, value) return nucl_acid_seqs @@ -264,21 +245,29 @@ def convert_to_nucl_acids(sequences: str, nucl_acids: str): } -def check_and_parse_user_input(*args, **kwargs): - if len(args) == 0: +def check_and_parse_user_input(sequences, **kwargs): + if len(sequences) == 0: raise ValueError("No sequences provided") procedure = kwargs["procedure"] if procedure not in procedures_to_functions.keys(): raise ValueError("Wrong procedure") - sequences = list(args) allowed_inputs = set(amino_acids.keys()).union( set(amino_acids.values()).union(set("-")) ) if procedure 
!= "three_one_letter_code": allowed_inputs.remove("-") + allowed_inputs -= set(amino_acids.values()) for sequence in sequences: - if not all(letters in allowed_inputs for letters in sequence): - raise ValueError("Invalid sequence given") + allowed_inputs_seq = allowed_inputs + if procedure == "three_one_letter_code" and "-" in sequence: + allowed_inputs_seq -= set(amino_acids.keys()) + if not all( + aminoacids in allowed_inputs_seq for aminoacids in sequence.split("-") + ): + raise ValueError("Invalid sequence given") + else: + if not all(aminoacids in allowed_inputs_seq for aminoacids in sequence): + raise ValueError("Invalid sequence given") procedure_arguments = {} if procedure == "check_for_motifs": if "motif" not in kwargs.keys(): @@ -301,6 +290,6 @@ def check_and_parse_user_input(*args, **kwargs): return procedure_arguments, procedure -def run_protein_tools(*args, **kwargs): - procedure_arguments, procedure = check_and_parse_user_input(*args, **kwargs) +def run_protein_tools(sequences=[], **kwargs): + procedure_arguments, procedure = check_and_parse_user_input(sequences, **kwargs) return procedures_to_functions[procedure](**procedure_arguments) From e67042900ef2f2ee899e87256250215cb09c29a6 Mon Sep 17 00:00:00 2001 From: Ekaterina Shitik Date: Sat, 30 Sep 2023 14:39:50 +0200 Subject: [PATCH 13/36] Add minor changes to 'convert_to_nucl_acids' function --- HW4_Grigoriants/protein_tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index 67366a3..181421e 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -113,18 +113,18 @@ def convert_to_nucl_acids(sequences: list, nucl_acids: str): nucl_acid_seqs = {"RNA": [], "DNA": []} for sequence in sequences: rna_seq = sequence.translate(rule_of_translation) - reverse_dna_seq = rna_seq.translate(rule_of_transcription)[::-1] + dna_seq = rna_seq.translate(rule_of_transcription) if nucl_acids == 
"RNA": nucl_acid_seqs["RNA"].append(rna_seq) if sequence == sequences[-1]: del nucl_acid_seqs["DNA"] if nucl_acids == "DNA": - nucl_acid_seqs["DNA"].append(reverse_dna_seq) + nucl_acid_seqs["DNA"].append(dna_seq) if sequence == sequences[-1]: del nucl_acid_seqs["RNA"] if nucl_acids == "both": nucl_acid_seqs["RNA"].append(rna_seq) - nucl_acid_seqs["DNA"].append(reverse_dna_seq) + nucl_acid_seqs["DNA"].append(dna_seq) return nucl_acid_seqs From fe41d8514cd139b32fdd8bb00fc57d2e0455a1fd Mon Sep 17 00:00:00 2001 From: Ekaterina Shitik Date: Sat, 30 Sep 2023 16:00:03 +0200 Subject: [PATCH 14/36] Change transcription rule in 'convert_to_nucl_acids' function --- HW4_Grigoriants/protein_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index 181421e..a3d37bc 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -109,7 +109,7 @@ def convert_to_nucl_acids(sequences: list, nucl_acids: str): If nucl_acids = 'both' output the name of a nucleic acid and a collection of frames """ rule_of_translation = sequences[0].maketrans(translation_rule) - rule_of_transcription = sequences[0].maketrans("AaUuCcGg", "TtAaGgCc") + rule_of_transcription = sequences[0].maketrans("Uu", "Tt") nucl_acid_seqs = {"RNA": [], "DNA": []} for sequence in sequences: rna_seq = sequence.translate(rule_of_translation) From c8e982387dd29336428671997234a88312e9bc0d Mon Sep 17 00:00:00 2001 From: Ekaterina Shitik Date: Sat, 30 Sep 2023 16:03:49 +0200 Subject: [PATCH 15/36] Correct inaccuracies in the dockstring of 'convert_to_nucl_acids' --- HW4_Grigoriants/protein_tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index a3d37bc..c0c227d 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -104,9 +104,9 @@ def convert_to_nucl_acids(sequences: list, 
nucl_acids: str): nucl_acids = 'DNA' - convert to DNA nucl_acids = 'both' - convert to RNA and DNA Return: - - dictionary: a collection of alternative frames - If nucl_acids = 'RNA' or nucl_acids = 'DNA' output a collection of frames - If nucl_acids = 'both' output the name of a nucleic acid and a collection of frames + - dictionary: a collection of nucleic acids sequences + If nucl_acids = 'RNA' or nucl_acids = 'DNA' output a collection of sequences + If nucl_acids = 'both' output the name of a nucleic acid and a collection of sequences """ rule_of_translation = sequences[0].maketrans(translation_rule) rule_of_transcription = sequences[0].maketrans("Uu", "Tt") From cb03cf4073ede885c1207a0c4995a2c3636b9314 Mon Sep 17 00:00:00 2001 From: Ekaterina Shitik Date: Sat, 30 Sep 2023 16:20:25 +0200 Subject: [PATCH 16/36] Change inaccuracies in the dockstring of 'convert_to_nucl_acids' --- HW4_Grigoriants/protein_tools.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index c0c227d..c8fcef5 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -104,9 +104,7 @@ def convert_to_nucl_acids(sequences: list, nucl_acids: str): nucl_acids = 'DNA' - convert to DNA nucl_acids = 'both' - convert to RNA and DNA Return: - - dictionary: a collection of nucleic acids sequences - If nucl_acids = 'RNA' or nucl_acids = 'DNA' output a collection of sequences - If nucl_acids = 'both' output the name of a nucleic acid and a collection of sequences + - dictionary: output the name of nucleic acid and a collection of sequences """ rule_of_translation = sequences[0].maketrans(translation_rule) rule_of_transcription = sequences[0].maketrans("Uu", "Tt") From b193a6b8a6a24d392a8b7ef5d6ccf1cbcb0daa9d Mon Sep 17 00:00:00 2001 From: Ekaterina Shitik Date: Sat, 30 Sep 2023 16:26:50 +0200 Subject: [PATCH 17/36] Change annotation of 'search_for_alt_frames' function --- 
HW4_Grigoriants/protein_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index c8fcef5..76397ee 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -53,7 +53,7 @@ def check_for_motifs(sequences, motif): return all_positions -def search_for_alt_frames(sequences: str, alt_start_aa: str): +def search_for_alt_frames(sequences: list, alt_start_aa: str): """ Search for alternative frames in a protein sequences From f53914a10e3cafb397704a77f3e0513c01a0c565 Mon Sep 17 00:00:00 2001 From: VovaGrig Date: Sat, 30 Sep 2023 18:46:06 +0300 Subject: [PATCH 18/36] Add minor fixes in protein_tools.py --- HW4_Grigoriants/protein_tools.py | 49 +++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index 67366a3..d7621df 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -27,7 +27,26 @@ def define_molecular_weight(sequences): return sequences_weights -def check_for_motifs(sequences, motif): +def check_for_motifs(sequences, motif, overlapping): + """ + Search for motifs - conserved amino acids residues in protein sequence + + Search for one motif at a time + Search is letter case sensitive + Use one-letter aminoacids code for desired sequences and motifs + Positions of AA in sequences are counted from 0 + By default, overlapping matches are counted (see ) + + + Arguments: + - sequences (tuple(str), list(str)): sequences to check for given motif within + - motif (str): desired motif to check presense in every given sequence + Example: sequences = ["AMGAGW", "GAWSGRAGA"] + motif = "GA" + Return: + - dictionary: sequences as keys (str), starting positions for presented motif (list) as values + Example: {'AMGAGW': [2], 'GAWSGRAGA': [0, 7]} + """ new_line = "\n" # used for user-friendly output all_positions = {} for 
sequence in sequences: @@ -41,11 +60,14 @@ def check_for_motifs(sequences, motif): if start == -1: break positions.append(start) - # use += len(motif) not to count overlapping matches - start += 1 - pos_for_print = ", ".join(str(x) for x in positions) + if overlapping: + start += 1 + else: + start += len(motif) + print_pos = ", ".join(str(x) for x in positions) + print_pos = f'{print_pos}{new_line}' print( - f"Motif is present in protein sequence starting at positions: {pos_for_print}{new_line}" + f"Motif is present in protein sequence starting at positions: {print_pos}" ) else: print(f"Motif is not present in protein sequence{new_line}") @@ -71,22 +93,17 @@ def search_for_alt_frames(sequences: str, alt_start_aa: str): Return: - dictionary: the number of a sequence and a collection of alternative frames """ - # if len(alt_start_aa) > 1: - # raise ValueError("Invalid start codon!") alternative_frames = {} num_position = 0 for sequence in sequences: + alternative_frames[sequence] = [] for amino_acid in sequence[1:-3]: + alt_frame = "" num_position += 1 if amino_acid == alt_start_aa or amino_acid == alt_start_aa.swapcase(): - key = sequences.index(sequence) + 1 - if key in alternative_frames: - alternative_frames[key] += sequence[num_position:] + " " - else: - alternative_frames[key] = sequence[num_position:] + " " + alt_frame += sequence[num_position:] + alternative_frames[sequence].append(alt_frame) num_position = 0 - # for key, value in alternative_frames.items(): - # print(key, value) return alternative_frames @@ -273,6 +290,10 @@ def check_and_parse_user_input(sequences, **kwargs): if "motif" not in kwargs.keys(): raise ValueError("Please provide desired motif") procedure_arguments["motif"] = kwargs["motif"] + if "overlapping" not in kwargs.keys(): + procedure_arguments["overlapping"] = True + else: + procedure_arguments["overlapping"] = kwargs["overlapping"] elif procedure == "search_for_alt_frames": if "alt_start_aa" not in kwargs.keys(): 
procedure_arguments["alt_start_aa"] = "M" From 2ce8ada6c92ba1e6ebf23f776b9f5a184b0cbcf6 Mon Sep 17 00:00:00 2001 From: EkaterinShitik <144039338+EkaterinShitik@users.noreply.github.com> Date: Sat, 30 Sep 2023 18:05:38 +0200 Subject: [PATCH 19/36] Add plan of README.md --- HW4_Grigoriants/README.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/HW4_Grigoriants/README.md b/HW4_Grigoriants/README.md index 8b13789..f6cbaac 100644 --- a/HW4_Grigoriants/README.md +++ b/HW4_Grigoriants/README.md @@ -1 +1,29 @@ +# Protein_tools.py +## A tool to work with protein sequences +*Proteins* are under +**Protein_tools.py** is an open-source program that facilitates operating with protein sequences + + + +## Usage + +three_one_letter_code - convert one-letter code sequences to three-letter code sequences and vice-versa + +define_molecular_weight - determine the exact molecular weight of input protein sequences + +check_for_motifs - search for the motif of interest in input protein sequences + +search_for_alt_frames - search for alternative frames that start with methyonine or other non-canonical start amino acids + +convert_to_nucl_acids - covert protein sequences to DNA and RNA + +run_protein_tools + +## Options + +## Examples + +## Troubleshooting + +## Contacts From 18c1a760fb584496d75e4b3fb3dba704e6e303a8 Mon Sep 17 00:00:00 2001 From: EkaterinShitik <144039338+EkaterinShitik@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:31:41 +0200 Subject: [PATCH 20/36] Complete 'Usage' --- HW4_Grigoriants/README.md | 58 +++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/HW4_Grigoriants/README.md b/HW4_Grigoriants/README.md index f6cbaac..aa7553d 100644 --- a/HW4_Grigoriants/README.md +++ b/HW4_Grigoriants/README.md @@ -1,29 +1,69 @@ # Protein_tools.py ## A tool to work with protein sequences -*Proteins* are under -**Protein_tools.py** is an open-source program that facilitates operating with protein 
sequences +*Proteins* are under the constant focus of scientists. Currently, there are an enormous amount of tools to operate with nucleotide sequences, however, the same ones for proteins are extremely rare. +`Protein_tools.py` is an open-source program that facilitates working with protein sequences. + +*В моём представлении здесь должна быть картинка* ## Usage +The programm is based on `run_protein_tools` function that takes the list of **one-letter amino acid sequences**, name of procedure and relevant arguments. If you have three-letter amino acid sequences you could convert them by using `three_one_letter_code` procedure. + +To start with the program run the following command: -three_one_letter_code - convert one-letter code sequences to three-letter code sequences and vice-versa +`run_protein_tools([sequence_1, sequence_2 ..., sequence_n], procedure, ...)` -define_molecular_weight - determine the exact molecular weight of input protein sequences +Where: +- [sequence_1, sequence_2 ..., sequence_n] - a list of protein sequences +- procedure - a type of procedure to use that is inputed in *string* type +- ... 
- an additional argument that is to be inputed in *string* type -check_for_motifs - search for the motif of interest in input protein sequences +## Options -search_for_alt_frames - search for alternative frames that start with methyonine or other non-canonical start amino acids +The program has five types of procedures: -convert_to_nucl_acids - covert protein sequences to DNA and RNA +#### `three_one_letter_code` -run_protein_tools +- The main aim - to convert three-letter amino acid sequences to one-letter ones and vice-versa +- An additional argument: no -## Options +#### `define_molecular_weight` + +- The main aim - to determine the exact molecular weight of protein sequences +- An additional argument: no + +`check_for_motifs` - to search for the motif of interest in protein sequences + +`search_for_alt_frames` - to look for alternative frames that start with methyonine or other non-canonical start amino acids + +`convert_to_nucl_acids` - covert protein sequences to DNA and RNA +**Requirments** + +Use only sequences that are encoded with one-letter. Если у вас трёхбуквенный код используйте наше функции для конвертации +Трёхбуквенный код также используется для конвертации. 
Он разделён дефисами ## Examples ## Troubleshooting ## Contacts +Authors: + +Vladimir Grigoriants + +Tulyavko Vlada + +Ekaterina Shitik (EkaterinShitik) + + +**Список процедур:** + +- `transcribe` — напечатать транскрибированную последовательность* +- `reverse` — напечатать перевёрнутую последовательность +- `complement` — напечатать комплементарную последовательность +- `reverse_complement` — напечатать обратную комплементарную последовательность +- `gc_count` — посчитать содержание нуклеотидов *G* и *C* в процентах + +\* Обратная транскрипция в рамках данной процедуры также учитывается (РНК в ДНК) From ea3be7ed76fa1d34c8e33f3495522c262c6fac46 Mon Sep 17 00:00:00 2001 From: EkaterinShitik <144039338+EkaterinShitik@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:47:05 +0200 Subject: [PATCH 21/36] Add preliminary 'Options' --- HW4_Grigoriants/README.md | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/HW4_Grigoriants/README.md b/HW4_Grigoriants/README.md index aa7553d..62c974b 100644 --- a/HW4_Grigoriants/README.md +++ b/HW4_Grigoriants/README.md @@ -24,26 +24,33 @@ Where: The program has five types of procedures: -#### `three_one_letter_code` + `three_one_letter_code` - The main aim - to convert three-letter amino acid sequences to one-letter ones and vice-versa - An additional argument: no -#### `define_molecular_weight` + `define_molecular_weight` - The main aim - to determine the exact molecular weight of protein sequences - An additional argument: no -`check_for_motifs` - to search for the motif of interest in protein sequences + `check_for_motifs` -`search_for_alt_frames` - to look for alternative frames that start with methyonine or other non-canonical start amino acids +- The main aim - to search for the motif of interest in protein sequences +- An additional argument: motif (*str*) -`convert_to_nucl_acids` - covert protein sequences to DNA and RNA + `search_for_alt_frames` -**Requirments** +- The main 
aim - to look for alternative frames that start with methyonine or other non-canonical start amino acids +- An additional argument: alt_start_aa (*str*) +- Use alt_start_aa only for non-canonical start amino acids +- Without alt_start_aa the procedure find alternative frames that start with methyonine + +`convert_to_nucl_acids` +- Convert protein sequences to DNA, RNA or both nucleic acid sequences +- The program use the most frequent codons in human that could be found [here](https://www.genscript.com/tools/codon-frequency-table) +- An additional argument: nucl_acids (*str*) -Use only sequences that are encoded with one-letter. Если у вас трёхбуквенный код используйте наше функции для конвертации -Трёхбуквенный код также используется для конвертации. Он разделён дефисами ## Examples ## Troubleshooting @@ -56,14 +63,3 @@ Vladimir Grigoriants Tulyavko Vlada Ekaterina Shitik (EkaterinShitik) - - -**Список процедур:** - -- `transcribe` — напечатать транскрибированную последовательность* -- `reverse` — напечатать перевёрнутую последовательность -- `complement` — напечатать комплементарную последовательность -- `reverse_complement` — напечатать обратную комплементарную последовательность -- `gc_count` — посчитать содержание нуклеотидов *G* и *C* в процентах - -\* Обратная транскрипция в рамках данной процедуры также учитывается (РНК в ДНК) From a1c1c23cd4522deb9089b290eabbbfb4d53510fe Mon Sep 17 00:00:00 2001 From: EkaterinShitik <144039338+EkaterinShitik@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:10:48 +0200 Subject: [PATCH 22/36] Add preliminary 'Examples' --- HW4_Grigoriants/README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/HW4_Grigoriants/README.md b/HW4_Grigoriants/README.md index 62c974b..8865262 100644 --- a/HW4_Grigoriants/README.md +++ b/HW4_Grigoriants/README.md @@ -47,11 +47,20 @@ The program has five types of procedures: - Without alt_start_aa the procedure find alternative frames that start with methyonine 
`convert_to_nucl_acids` -- Convert protein sequences to DNA, RNA or both nucleic acid sequences +- The main aim - to convert protein sequences to DNA, RNA or both nucleic acid sequences - The program use the most frequent codons in human that could be found [here](https://www.genscript.com/tools/codon-frequency-table) - An additional argument: nucl_acids (*str*) + ## Examples +```python +run_protein_tools(['met-Asn-Tyr', 'Ile-Ala-Ala'], procedure = 'three_one_letter_code') # ['mNY', 'IAA'] +run_protein_tools(['mNY','IAA'], procedure = 'three_one_letter_code') # ['met-Asn-Tyr', 'Ile-Ala-Ala'] +run_protein_tools(['MNY','IAA'], procedure = 'define_molecular_weight') # [462.52000000000004, 309.35] +``` +```python +run_protein_tools(['mNY','IAA'], procedure = 'three_one_letter_code') # ['met-Asn-Tyr', 'Ile-Ala-Ala'] +``` ## Troubleshooting From 454d703dcfa545100c2858740b5f706b32104736 Mon Sep 17 00:00:00 2001 From: EkaterinShitik <144039338+EkaterinShitik@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:40:46 +0200 Subject: [PATCH 23/36] Complete 'Examples' --- HW4_Grigoriants/README.md | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/HW4_Grigoriants/README.md b/HW4_Grigoriants/README.md index 8865262..6f43f17 100644 --- a/HW4_Grigoriants/README.md +++ b/HW4_Grigoriants/README.md @@ -42,8 +42,9 @@ The program has five types of procedures: `search_for_alt_frames` - The main aim - to look for alternative frames that start with methyonine or other non-canonical start amino acids +- Ignores the last three amino acids due to the insignicance of proteins of this length - An additional argument: alt_start_aa (*str*) -- Use alt_start_aa only for non-canonical start amino acids +- Use alt_start_aa **only for non-canonical start amino acids** - Without alt_start_aa the procedure find alternative frames that start with methyonine `convert_to_nucl_acids` @@ -54,12 +55,33 @@ The program has five types of procedures: ## 
Examples ```python -run_protein_tools(['met-Asn-Tyr', 'Ile-Ala-Ala'], procedure = 'three_one_letter_code') # ['mNY', 'IAA'] -run_protein_tools(['mNY','IAA'], procedure = 'three_one_letter_code') # ['met-Asn-Tyr', 'Ile-Ala-Ala'] -run_protein_tools(['MNY','IAA'], procedure = 'define_molecular_weight') # [462.52000000000004, 309.35] -``` -```python -run_protein_tools(['mNY','IAA'], procedure = 'three_one_letter_code') # ['met-Asn-Tyr', 'Ile-Ala-Ala'] +# three_one_letter_code +run_protein_tools(['met-Asn-Tyr', 'Ile-Ala-Ala'], procedure='three_one_letter_code') # ['mNY', 'IAA'] +run_protein_tools(['mNY','IAA'], procedure='three_one_letter_code') # ['met-Asn-Tyr', 'Ile-Ala-Ala'] + +# define_molecular_weight +run_protein_tools(['MNY','IAA'], procedure='define_molecular_weight') # [462.52000000000004, 309.35] + +# check_for_motifs +run_protein_tools(['mNY','IAA'], procedure='check_for_motifs', motif='NY') +# Sequence: mNY +# Motif: NY +# Motif is present in protein sequence starting at positions: 1 +# Sequence: IAA +# Motif: NY +# Motif is not present in protein sequence +# {'mNY': [1], 'IAA': []} + +# search_for_alt_frames +run_protein_tools(['mNYQTMSPYYDMId'], procedure='search_for_alt_frames') # {'mNYQTMSPYYDMId': ['MSPYYDMId']} +run_protein_tools(['mNYTQTSP'], procedure='search_for_alt_frames', alt_start_aa='T') # {'mNYTQTSP': ['TQTSP']} + +# convert_to_nucl_acids +run_protein_tools(['MNY'], procedure='convert_to_nucl_acids', nucl_acids = 'RNA') # {'RNA': ['AUGAACUAU']} +run_protein_tools(['MNY'], procedure='convert_to_nucl_acids', nucl_acids = 'DNA') # {'DNA': ['ATGAACTAT']} +run_protein_tools(['MNY'], procedure='convert_to_nucl_acids', nucl_acids = 'both') +# {'RNA': ['AUGAACUAU'], 'DNA': ['ATGAACTAT']} + ``` ## Troubleshooting From e5628a551216c8869db98df2777e2c9ae3dc5659 Mon Sep 17 00:00:00 2001 From: EkaterinShitik <144039338+EkaterinShitik@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:58:58 +0200 Subject: [PATCH 24/36] Complete four first parts --- 
HW4_Grigoriants/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/HW4_Grigoriants/README.md b/HW4_Grigoriants/README.md index 6f43f17..c898970 100644 --- a/HW4_Grigoriants/README.md +++ b/HW4_Grigoriants/README.md @@ -9,7 +9,7 @@ *В моём представлении здесь должна быть картинка* ## Usage -The programm is based on `run_protein_tools` function that takes the list of **one-letter amino acid sequences**, name of procedure and relevant arguments. If you have three-letter amino acid sequences you could convert them by using `three_one_letter_code` procedure. +The programm is based on `run_protein_tools` function that takes the list of **one-letter amino acid sequences**, a name of procedure and a relevant argument. If you have three-letter amino acids sequences you could convert them by using `three_one_letter_code` procedure in advance. Before using this procedure, check the *Options*. To start with the program run the following command: @@ -27,6 +27,7 @@ The program has five types of procedures: `three_one_letter_code` - The main aim - to convert three-letter amino acid sequences to one-letter ones and vice-versa +- In case of three-to-one translation the names of amino acids **must be separated with hyphen** - An additional argument: no `define_molecular_weight` @@ -51,6 +52,7 @@ The program has five types of procedures: - The main aim - to convert protein sequences to DNA, RNA or both nucleic acid sequences - The program use the most frequent codons in human that could be found [here](https://www.genscript.com/tools/codon-frequency-table) - An additional argument: nucl_acids (*str*) +- Use as nucl_acids only DNA, RNA or both (for more detailes, check *Examples*) ## Examples @@ -79,8 +81,7 @@ run_protein_tools(['mNYTQTSP'], procedure='search_for_alt_frames', alt_start_aa= # convert_to_nucl_acids run_protein_tools(['MNY'], procedure='convert_to_nucl_acids', nucl_acids = 'RNA') # {'RNA': ['AUGAACUAU']} run_protein_tools(['MNY'], 
procedure='convert_to_nucl_acids', nucl_acids = 'DNA') # {'DNA': ['ATGAACTAT']} -run_protein_tools(['MNY'], procedure='convert_to_nucl_acids', nucl_acids = 'both') -# {'RNA': ['AUGAACUAU'], 'DNA': ['ATGAACTAT']} +run_protein_tools(['MNY'], procedure='convert_to_nucl_acids', nucl_acids = 'both') # {'RNA': ['AUGAACUAU'], 'DNA': ['ATGAACTAT']} ``` From 53a75563b9db6cabb24aec81c6ec5afc76045dfe Mon Sep 17 00:00:00 2001 From: EkaterinShitik <144039338+EkaterinShitik@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:40:03 +0200 Subject: [PATCH 25/36] Complete all parts except for contacts --- HW4_Grigoriants/README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/HW4_Grigoriants/README.md b/HW4_Grigoriants/README.md index c898970..70ad9b2 100644 --- a/HW4_Grigoriants/README.md +++ b/HW4_Grigoriants/README.md @@ -41,7 +41,7 @@ The program has five types of procedures: - An additional argument: motif (*str*) `search_for_alt_frames` - + - The main aim - to look for alternative frames that start with methyonine or other non-canonical start amino acids - Ignores the last three amino acids due to the insignicance of proteins of this length - An additional argument: alt_start_aa (*str*) @@ -49,6 +49,7 @@ The program has five types of procedures: - Without alt_start_aa the procedure find alternative frames that start with methyonine `convert_to_nucl_acids` + - The main aim - to convert protein sequences to DNA, RNA or both nucleic acid sequences - The program use the most frequent codons in human that could be found [here](https://www.genscript.com/tools/codon-frequency-table) - An additional argument: nucl_acids (*str*) @@ -87,6 +88,17 @@ run_protein_tools(['MNY'], procedure='convert_to_nucl_acids', nucl_acids = 'both ## Troubleshooting +| Type of the problem | Probable cause +| ------------------------------------------------------------ |-------------------- +| Output does not correspond the expected resultes | The name of procedure is 
wrong. You see the results of another procedure +| ValueError: No sequences provided | A list of sequences are not inputed +| ValueError: Wrong procedure | The procedure does not exist in this program +| TypeError: takes from 0 to 1 positional arguments but n were given | Sequences are not collected into the list type +| ValueError: Invalid sequence given | The sequences do not correspond to standard amino acid code +| ValueError: Please provide desired motif | There are no an additional argument *motif* in `check_for_motifs` +| ValueError: Invalid start AA! | There is more than one letter in an additional argument *alt_start_aa* in `search_for_alt_frames` +| ValueError: Please provide desired type of nucl_acids | There are no an additional argument *nucl_acids* in `convert_to_nucl_acids` +| ValueError: Invalid nucl_acids argument | An additional argument in `convert_to_nucl_acids` is written incorrectly ## Contacts Authors: From 33744ad82a6a6ebfe7bcd3ffcb71ff118eddb332 Mon Sep 17 00:00:00 2001 From: EkaterinShitik <144039338+EkaterinShitik@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:48:06 +0200 Subject: [PATCH 26/36] Complete all parts --- HW4_Grigoriants/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/HW4_Grigoriants/README.md b/HW4_Grigoriants/README.md index 70ad9b2..c41a0e7 100644 --- a/HW4_Grigoriants/README.md +++ b/HW4_Grigoriants/README.md @@ -102,8 +102,8 @@ run_protein_tools(['MNY'], procedure='convert_to_nucl_acids', nucl_acids = 'both ## Contacts Authors: -Vladimir Grigoriants +Vladimir Grigoriants (*адрес*) -Tulyavko Vlada +Tulyavko Vlada (*адрес*) -Ekaterina Shitik (EkaterinShitik) +Ekaterina Shitik (shitik.ekaterina@gmail.com) From 0fbb1848bd7d373792cfe088b6888436645c0cf3 Mon Sep 17 00:00:00 2001 From: EkaterinShitik <144039338+EkaterinShitik@users.noreply.github.com> Date: Sat, 30 Sep 2023 21:51:22 +0200 Subject: [PATCH 27/36] Add minor changes in 'Options' --- HW4_Grigoriants/README.md | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/HW4_Grigoriants/README.md b/HW4_Grigoriants/README.md index c41a0e7..227f9eb 100644 --- a/HW4_Grigoriants/README.md +++ b/HW4_Grigoriants/README.md @@ -43,7 +43,7 @@ The program has five types of procedures: `search_for_alt_frames` - The main aim - to look for alternative frames that start with methyonine or other non-canonical start amino acids -- Ignores the last three amino acids due to the insignicance of proteins of this length +- Ignores the last three amino acids due to the insignicance of alternative frames of this length - An additional argument: alt_start_aa (*str*) - Use alt_start_aa **only for non-canonical start amino acids** - Without alt_start_aa the procedure find alternative frames that start with methyonine From d9bdb5072c5cafdab390d4e5b18fc6ac8aad0cc5 Mon Sep 17 00:00:00 2001 From: VovaGrig Date: Sun, 1 Oct 2023 00:08:21 +0300 Subject: [PATCH 28/36] Add dockstrings to main function, search_for_motifs function, add minor fixes, add dictionaries.py --- HW4_Grigoriants/dictionaries.py | 106 +++++++++++++++ HW4_Grigoriants/protein_tools.py | 223 ++++++++++++------------------- 2 files changed, 194 insertions(+), 135 deletions(-) create mode 100644 HW4_Grigoriants/dictionaries.py diff --git a/HW4_Grigoriants/dictionaries.py b/HW4_Grigoriants/dictionaries.py new file mode 100644 index 0000000..f4a1ada --- /dev/null +++ b/HW4_Grigoriants/dictionaries.py @@ -0,0 +1,106 @@ +amino_acids = { + "A": "Ala", + "C": "Cys", + "D": "Asp", + "E": "Glu", + "F": "Phe", + "G": "Gly", + "H": "His", + "I": "Ile", + "K": "Lys", + "L": "Leu", + "M": "Met", + "N": "Asn", + "P": "Pro", + "Q": "Gln", + "R": "Arg", + "S": "Ser", + "T": "Thr", + "V": "Val", + "W": "Trp", + "Y": "Tyr", + "a": "ala", + "c": "cys", + "d": "asp", + "e": "glu", + "f": "phe", + "g": "gly", + "h": "his", + "i": "ile", + "k": "lys", + "l": "leu", + "m": "met", + "n": "asn", + "p": "pro", + "q": "gln", + "r": "arg", + "s": "ser", + "t": "thr", + "v": 
"val", + "w": "trp", + "y": "tyr", +} +translation_rule = { + "F": "UUU", + "f": "uuu", + "L": "CUG", + "l": "cug", + "I": "AUU", + "i": "auu", + "M": "AUG", + "m": "aug", + "V": "GUG", + "v": "gug", + "P": "CCG", + "p": "ccg", + "T": "ACC", + "t": "acc", + "A": "GCG", + "a": "gcg", + "Y": "UAU", + "y": "uau", + "H": "CAU", + "h": "cau", + "Q": "CAG", + "q": "cag", + "N": "AAC", + "n": "aac", + "K": "AAA", + "k": "aaa", + "D": "GAU", + "d": "gau", + "E": "GAA", + "e": "gaa", + "C": "UGC", + "c": "ugc", + "W": "UGG", + "w": "ugg", + "R": "CGU", + "r": "cgu", + "S": "AGC", + "s": "agc", + "G": "GGC", + "g": "ggc", +} +amino_acid_weights = { + "A": 89.09, + "C": 121.16, + "D": 133.10, + "E": 147.13, + "F": 165.19, + "G": 75.07, + "H": 155.16, + "I": 131.17, + "K": 146.19, + "L": 131.17, + "M": 149.21, + "N": 132.12, + "P": 115.13, + "Q": 146.15, + "R": 174.20, + "S": 105.09, + "T": 119.12, + "V": 117.15, + "W": 204.23, + "Y": 181.19, +} diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index d7621df..9d2290c 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -1,53 +1,60 @@ -def three_one_letter_code(sequences): +import dictionaries + + +def three_one_letter_code(sequences) -> list: inversed_sequences = [] for sequence in sequences: inversed_sequence = "" if "-" not in sequence: for letter in sequence: - inversed_sequence += amino_acids[letter] + "-" + inversed_sequence += dictionaries.amino_acids[letter] + "-" inversed_sequence = inversed_sequence[:-1] inversed_sequences.append(inversed_sequence) else: aa_splitted = sequence.split("-") for aa in aa_splitted: - inversed_sequence += list(amino_acids.keys())[ - list(amino_acids.values()).index(aa) + inversed_sequence += list(dictionaries.amino_acids.keys())[ + list(dictionaries.amino_acids.values()).index(aa) ] inversed_sequences.append(inversed_sequence) return inversed_sequences -def define_molecular_weight(sequences): +def 
define_molecular_weight(sequences) -> dict: sequences_weights = [] for sequence in sequences: sequence_weight = 0 for letter in sequence: - sequence_weight += amino_acid_weights[letter] + sequence_weight += dictionaries.amino_acid_weights[letter.upper()] sequences_weights.append(sequence_weight) return sequences_weights -def check_for_motifs(sequences, motif, overlapping): +def search_for_motifs( + sequences: (tuple(str) or list(str)), motif: str, overlapping: bool +) -> dict: """ Search for motifs - conserved amino acids residues in protein sequence Search for one motif at a time Search is letter case sensitive - Use one-letter aminoacids code for desired sequences and motifs + Use one-letter aminoacids code for desired sequences and motifs Positions of AA in sequences are counted from 0 - By default, overlapping matches are counted (see ) - + By default, overlapping matches are counted + Arguments: - - sequences (tuple(str), list(str)): sequences to check for given motif within - - motif (str): desired motif to check presense in every given sequence + - sequences (tuple(str) or list(str)): sequences to check for given motif within Example: sequences = ["AMGAGW", "GAWSGRAGA"] - motif = "GA" + - motif (str): desired motif to check presense in every given sequence + Example: motif = "GA" + - overlapping (bool): count (True) or skip (False) overlapping matches. 
(Optional) + Example: overlapping = False Return: - - dictionary: sequences as keys (str), starting positions for presented motif (list) as values + - dictionary: sequences (str) as keys , starting positions for presented motif (list) as values Example: {'AMGAGW': [2], 'GAWSGRAGA': [0, 7]} """ - new_line = "\n" # used for user-friendly output + new_line = "\n" all_positions = {} for sequence in sequences: start = 0 @@ -61,11 +68,11 @@ def check_for_motifs(sequences, motif, overlapping): break positions.append(start) if overlapping: - start += 1 + start += 1 else: start += len(motif) print_pos = ", ".join(str(x) for x in positions) - print_pos = f'{print_pos}{new_line}' + print_pos = f"{print_pos}{new_line}" print( f"Motif is present in protein sequence starting at positions: {print_pos}" ) @@ -75,7 +82,7 @@ def check_for_motifs(sequences, motif, overlapping): return all_positions -def search_for_alt_frames(sequences: str, alt_start_aa: str): +def search_for_alt_frames(sequences: str, alt_start_aa: str) -> dict: """ Search for alternative frames in a protein sequences @@ -98,7 +105,7 @@ def search_for_alt_frames(sequences: str, alt_start_aa: str): for sequence in sequences: alternative_frames[sequence] = [] for amino_acid in sequence[1:-3]: - alt_frame = "" + alt_frame = "" num_position += 1 if amino_acid == alt_start_aa or amino_acid == alt_start_aa.swapcase(): alt_frame += sequence[num_position:] @@ -107,7 +114,7 @@ def search_for_alt_frames(sequences: str, alt_start_aa: str): return alternative_frames -def convert_to_nucl_acids(sequences: list, nucl_acids: str): +def convert_to_nucl_acids(sequences: list, nucl_acids: str) -> dict: """ Convert protein sequences to RNA or DNA sequences. 
@@ -125,7 +132,7 @@ def convert_to_nucl_acids(sequences: list, nucl_acids: str): If nucl_acids = 'RNA' or nucl_acids = 'DNA' output a collection of frames If nucl_acids = 'both' output the name of a nucleic acid and a collection of frames """ - rule_of_translation = sequences[0].maketrans(translation_rule) + rule_of_translation = sequences[0].maketrans(dictionaries.translation_rule) rule_of_transcription = sequences[0].maketrans("AaUuCcGg", "TtAaGgCc") nucl_acid_seqs = {"RNA": [], "DNA": []} for sequence in sequences: @@ -146,138 +153,45 @@ def convert_to_nucl_acids(sequences: list, nucl_acids: str): procedures_to_functions = { - "check_for_motifs": check_for_motifs, + "search_for_motifs": search_for_motifs, "search_for_alt_frames": search_for_alt_frames, "convert_to_nucl_acids": convert_to_nucl_acids, "three_one_letter_code": three_one_letter_code, "define_molecular_weight": define_molecular_weight, } -amino_acids = { - "A": "Ala", - "C": "Cys", - "D": "Asp", - "E": "Glu", - "F": "Phe", - "G": "Gly", - "H": "His", - "I": "Ile", - "K": "Lys", - "L": "Leu", - "M": "Met", - "N": "Asn", - "P": "Pro", - "Q": "Gln", - "R": "Arg", - "S": "Ser", - "T": "Thr", - "V": "Val", - "W": "Trp", - "Y": "Tyr", - "a": "ala", - "c": "cys", - "d": "asp", - "e": "glu", - "f": "phe", - "g": "gly", - "h": "his", - "i": "ile", - "k": "lys", - "l": "leu", - "m": "met", - "n": "asn", - "p": "pro", - "q": "gln", - "r": "arg", - "s": "ser", - "t": "thr", - "v": "val", - "w": "trp", - "y": "tyr", -} -translation_rule = { - "F": "UUU", - "f": "uuu", - "L": "CUG", - "l": "cug", - "I": "AUU", - "i": "auu", - "M": "AUG", - "m": "aug", - "V": "GUG", - "v": "gug", - "P": "CCG", - "p": "ccg", - "T": "ACC", - "t": "acc", - "A": "GCG", - "a": "gcg", - "Y": "UAU", - "y": "uau", - "H": "CAU", - "h": "cau", - "Q": "CAG", - "q": "cag", - "N": "AAC", - "n": "aac", - "K": "AAA", - "k": "aaa", - "D": "GAU", - "d": "gau", - "E": "GAA", - "e": "gaa", - "C": "UGC", - "c": "ugc", - "W": "UGG", - "w": "ugg", - 
"R": "CGU", - "r": "cgu", - "S": "AGC", - "s": "agc", - "G": "GGC", - "g": "ggc", -} -amino_acid_weights = { - "A": 89.09, - "C": 121.16, - "D": 133.10, - "E": 147.13, - "F": 165.19, - "G": 75.07, - "H": 155.16, - "I": 131.17, - "K": 146.19, - "L": 131.17, - "M": 149.21, - "N": 132.12, - "P": 115.13, - "Q": 146.15, - "R": 174.20, - "S": 105.09, - "T": 119.12, - "V": 117.15, - "W": 204.23, - "Y": 181.19, -} +def check_and_parse_user_input( + sequences: list(str) or tuple(str), **kwargs +) -> dict and str: + """ + Check if user input can be correctly processed + Provide arguments for desired procedures + Needed for main function to correctly call desired procedure + Arguments: + - sequences (list(str) or tuple(str)): sequences to process + - **kwargs - needed arguments for completion of desired procedure -def check_and_parse_user_input(sequences, **kwargs): + Return: + - string: procedure name + - dictionary: a collection of procedure arguments and their values + """ if len(sequences) == 0: raise ValueError("No sequences provided") procedure = kwargs["procedure"] if procedure not in procedures_to_functions.keys(): raise ValueError("Wrong procedure") - allowed_inputs = set(amino_acids.keys()).union( - set(amino_acids.values()).union(set("-")) + allowed_inputs = set(dictionaries.amino_acids.keys()).union( + set(dictionaries.amino_acids.values()).union(set("-")) ) if procedure != "three_one_letter_code": allowed_inputs.remove("-") - allowed_inputs -= set(amino_acids.values()) + allowed_inputs -= set(dictionaries.amino_acids.values()) for sequence in sequences: allowed_inputs_seq = allowed_inputs if procedure == "three_one_letter_code" and "-" in sequence: - allowed_inputs_seq -= set(amino_acids.keys()) + allowed_inputs_seq -= set(dictionaries.amino_acids.keys()) if not all( aminoacids in allowed_inputs_seq for aminoacids in sequence.split("-") ): @@ -286,7 +200,7 @@ def check_and_parse_user_input(sequences, **kwargs): if not all(aminoacids in allowed_inputs_seq for 
aminoacids in sequence): raise ValueError("Invalid sequence given") procedure_arguments = {} - if procedure == "check_for_motifs": + if procedure == "search_for_motifs": if "motif" not in kwargs.keys(): raise ValueError("Please provide desired motif") procedure_arguments["motif"] = kwargs["motif"] @@ -311,6 +225,45 @@ def check_and_parse_user_input(sequences, **kwargs): return procedure_arguments, procedure -def run_protein_tools(sequences=[], **kwargs): +def run_protein_tools(sequences: list(str) or tuple(str), **kwargs: str): + """ + Main function to process protein sequence by one of the developed tools. + Run one procedure at a time: + - Search for conserved amino acids residues in protein sequence + - Search for alternative frames in a protein sequences + - Convert protein sequences to RNA or DNA sequences + - + + All functions are letter case sensitive + Provide protein sequence in one letter code. + You can obtain one letter code from three letter code with *three_one_letter_code* + If more information needed please see Readme or desired dockstring + + Arguments: + - sequences (list(str) or tuple(str)): sequences to process + - procedure (str): desired procedure: + - "search_for_motifs" + - "search_for_alt_frames" + - "convert_to_nucl_acids" + - "three_one_letter_code" + - "define_molecular_weight" + For "search_for_motif" procedure provide: + - motif (str): desired motif to check presense in every given sequence + Example: motif = "GA" + - overlapping (bool): count (True) or skip (False) overlapping matches. 
(Optional) + Example: overlapping = False + For "search_for_alt_frames" procedure provide: + - alt_start_aa (str): the name of an amino acid that is encoded by alternative start codon (Optional) + Example: alt_start_aa = 'I' + For "convert_to_nucl_acids" procedure provide: + - nucl_acids (str): the nucleic acid to convert to + Example: nucl_acids = 'RNA' + nucl_acids = 'DNA' + nucl_acids = 'both' + + Return: + - dict: Dictionary with processed sequences. Depends on desired tool + Please see Readme or desired dockstring + """ procedure_arguments, procedure = check_and_parse_user_input(sequences, **kwargs) return procedures_to_functions[procedure](**procedure_arguments) From 78fc1e04de148832c0075f4671c81bff97cd6070 Mon Sep 17 00:00:00 2001 From: Vlada Tuliavko Date: Sun, 1 Oct 2023 00:13:31 +0300 Subject: [PATCH 29/36] Add docstrings to three_one_letter_code and define_molecular_weight functions --- HW4_Grigoriants/protein_tools.py | 44 ++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index 67366a3..2e8c17d 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -1,4 +1,21 @@ -def three_one_letter_code(sequences): +def three_one_letter_code(sequences: str) -> list: + """ + Reverse the protein sequences from one-letter to three-letter format and vice-versa + + Case 1: get three-letter sequence + Use one-letter amino-acids sequences of any letter case + + Case 2: get one-letter sequence + Use three-letter amino-acid separated by '-' sequences + + Arguments: + - sequences (tuple(str) or list(str)): protein sequences to convert + Example: ['WAG', 'MkqRe', 'msrlk', 'Met-Ala-Gly', 'Met-arg-asn-Trp-Ala-Gly', 'arg-asn-trp'] + + Return: + - list: one-letter/three-letter protein sequences + Example: ['Met-Ala-Gly', 'Met-arg-asn-Trp-Ala-Gly', 'arg-asn-trp', 'WAG', 'MkqRe', 'rlk'] + """ inversed_sequences = [] for sequence in 
sequences: inversed_sequence = "" @@ -17,13 +34,30 @@ def three_one_letter_code(sequences): return inversed_sequences -def define_molecular_weight(sequences): - sequences_weights = [] +def define_molecular_weight(sequences: str) -> dict: + """ + Define molecular weight of the protein sequences + + Use one-letter amino-acids sequences of any letter case + The molecular weight is: + - a sum of masses of each atom constituting a molecule + - expressed in units called daltons (Da) + - rounded to hundredths + + Arguments: + - sequences (tuple(str) or list(str)): protein sequences to convert + + Return: + - dictionary: protein sequences as keys and molecular masses as values + Example: {'WAG': 332.39, 'MkqRe': 690.88, 'msrlk': 633.86} + """ + sequences_weights = {} for sequence in sequences: sequence_weight = 0 for letter in sequence: - sequence_weight += amino_acid_weights[letter] - sequences_weights.append(sequence_weight) + sequence_weight += amino_acid_weights[letter.upper()] + sequence_weight -= (len(sequence) - 1) * 18 #deduct water from peptide bond + sequences_weights[sequence] = round(sequence_weight, 2) return sequences_weights From fdf4b608d08dedae6ffbd0ca882a88eeb5db2061 Mon Sep 17 00:00:00 2001 From: VovaGrig Date: Sun, 1 Oct 2023 00:13:50 +0300 Subject: [PATCH 30/36] Add minor fixes --- HW4_Grigoriants/protein_tools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index 9d2290c..6662b80 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -237,7 +237,7 @@ def run_protein_tools(sequences: list(str) or tuple(str), **kwargs: str): All functions are letter case sensitive Provide protein sequence in one letter code. 
You can obtain one letter code from three letter code with *three_one_letter_code* - If more information needed please see Readme or desired dockstring + If more information needed please see Readme or desired docstring Arguments: - sequences (list(str) or tuple(str)): sequences to process @@ -263,7 +263,7 @@ def run_protein_tools(sequences: list(str) or tuple(str), **kwargs: str): Return: - dict: Dictionary with processed sequences. Depends on desired tool - Please see Readme or desired dockstring + Please see Readme or desired docstring """ procedure_arguments, procedure = check_and_parse_user_input(sequences, **kwargs) return procedures_to_functions[procedure](**procedure_arguments) From 6794624aeb6477cac093769d999045bf48e213bd Mon Sep 17 00:00:00 2001 From: VovaGrig Date: Sun, 1 Oct 2023 02:12:49 +0300 Subject: [PATCH 31/36] Add minor fixes to docstrings --- HW4_Grigoriants/protein_tools.py | 142 ++++++++++++++++--------------- 1 file changed, 74 insertions(+), 68 deletions(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index 7d9b86b..11e1adc 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -5,19 +5,21 @@ def three_one_letter_code(sequences: str) -> list: """ Reverse the protein sequences from one-letter to three-letter format and vice-versa - Case 1: get three-letter sequence + Case 1: get three-letter sequence\n Use one-letter amino-acids sequences of any letter case - - Case 2: get one-letter sequence - Use three-letter amino-acid separated by '-' sequences - + + Case 2: get one-letter sequence\n + Use three-letter amino-acid separated by "-" sequences. 
+ Please note that sequences without "-" are parsed as one-letter code sequences\n + Example: for sequence "Ala" function will return "Ala-leu-ala" + Arguments: - - sequences (tuple(str) or list(str)): protein sequences to convert - Example: ['WAG', 'MkqRe', 'msrlk', 'Met-Ala-Gly', 'Met-arg-asn-Trp-Ala-Gly', 'arg-asn-trp'] + - sequences (tuple[str] or list[str]): protein sequences to convert\n + Example: ["WAG", "MkqRe", "msrlk", "Met-Ala-Gly", "Met-arg-asn-Trp-Ala-Gly", "arg-asn-trp"] Return: - - list: one-letter/three-letter protein sequences - Example: ['Met-Ala-Gly', 'Met-arg-asn-Trp-Ala-Gly', 'arg-asn-trp', 'WAG', 'MkqRe', 'rlk'] + - list: one-letter/three-letter protein sequences\n + Example: ["Met-Ala-Gly", "Met-arg-asn-Trp-Ala-Gly", "arg-asn-trp", "WAG", "MkqRe", "rlk"] """ inversed_sequences = [] for sequence in sequences: @@ -46,47 +48,46 @@ def define_molecular_weight(sequences: str) -> dict: - a sum of masses of each atom constituting a molecule - expressed in units called daltons (Da) - rounded to hundredths - + Arguments: - - sequences (tuple(str) or list(str)): protein sequences to convert + - sequences (tuple[str] or list[str]): protein sequences to convert Return: - - dictionary: protein sequences as keys and molecular masses as values - Example: {'WAG': 332.39, 'MkqRe': 690.88, 'msrlk': 633.86} + - dictionary: protein sequences as keys and molecular masses as values\n + Example: {"WAG": 332.39, "MkqRe": 690.88, "msrlk": 633.86} """ sequences_weights = {} for sequence in sequences: sequence_weight = 0 for letter in sequence: - sequence_weight += amino_acid_weights[letter.upper()] - sequence_weight -= (len(sequence) - 1) * 18 #deduct water from peptide bond - sequences_weights[sequence] = round(sequence_weight, 2) + sequence_weight += dictionaries.amino_acid_weights[letter.upper()] + sequence_weight -= (len(sequence) - 1) * 18 # deduct water from peptide bond + sequences_weights[sequence] = round(sequence_weight, 2) return sequences_weights def 
search_for_motifs( - sequences: (tuple(str) or list(str)), motif: str, overlapping: bool + sequences: (tuple[str] or list[str]), motif: str, overlapping: bool ) -> dict: """ Search for motifs - conserved amino acids residues in protein sequence - Search for one motif at a time - Search is letter case sensitive - Use one-letter aminoacids code for desired sequences and motifs - Positions of AA in sequences are counted from 0 + Search for one motif at a time\n + Search is letter case sensitive\n + Use one-letter aminoacids code for desired sequences and motifs\n + Positions of AA in sequences are counted from 0\n By default, overlapping matches are counted - Arguments: - - sequences (tuple(str) or list(str)): sequences to check for given motif within + - sequences (tuple[str] or list[str]): sequences to check for given motif within\n Example: sequences = ["AMGAGW", "GAWSGRAGA"] - - motif (str): desired motif to check presense in every given sequence + - motif (str]: desired motif to check presense in every given sequence\n Example: motif = "GA" - - overlapping (bool): count (True) or skip (False) overlapping matches. (Optional) + - overlapping (bool): count (True) or skip (False) overlapping matches. 
(Optional)\n Example: overlapping = False Return: - - dictionary: sequences (str) as keys , starting positions for presented motif (list) as values - Example: {'AMGAGW': [2], 'GAWSGRAGA': [0, 7]} + - dictionary: sequences (str] as keys , starting positions for presented motif (list) as values\n + Example: {"AMGAGW": [2], "GAWSGRAGA": [0, 7]} """ new_line = "\n" all_positions = {} @@ -116,21 +117,21 @@ def search_for_motifs( return all_positions - def search_for_alt_frames(sequences: str, alt_start_aa: str) -> dict: """ Search for alternative frames in a protein sequences - Without an alt_start_aa argument search for frames that start with methionine ('M') - To search frames with alternative start codon add alt_start_aa argument + Search is not letter case sensitive\n + Without an alt_start_aa argument search for frames that start with methionine ("M") + To search frames with alternative start codon add alt_start_aa argument\n In alt_start_aa argument use one-letter code The function ignores the last three amino acids in sequences Arguments: - - sequences (tuple(str) or list(str)): sequences to check - - alt_start_aa (str): the name of an amino acid that is encoded by alternative start codon (Optional) - Example: alt_start_aa = 'I' + - sequences (tuple[str] or list[str]): sequences to check + - alt_start_aa (str]: the name of an amino acid that is encoded by alternative start AA (Optional)\n + Example: alt_start_aa = "I" Return: - dictionary: the number of a sequence and a collection of alternative frames @@ -153,17 +154,17 @@ def convert_to_nucl_acids(sequences: list, nucl_acids: str) -> dict: """ Convert protein sequences to RNA or DNA sequences. - Use the most frequent codons in human. The source - https://www.genscript.com/tools/codon-frequency-table - All nucleic acids (DNA and RNA) are showed in 5'-3' direction + Use the most frequent codons in human. 
The source - https://www.genscript.com/tools/codon-frequency-table\n + All nucleic acids (DNA and RNA) are shown in 5'-3' direction Arguments: - - sequences (tuple(str) or list(str)): sequences to convert - - nucl_acids (str): the nucleic acid that is prefered - Example: nucl_acids = 'RNA' - convert to RNA - nucl_acids = 'DNA' - convert to DNA - nucl_acids = 'both' - convert to RNA and DNA + - sequences (tuple[str] or list[str]): sequences to convert + - nucl_acids (str): the nucleic acid that is preferred\n + Example: nucl_acids = "RNA" - convert to RNA\n + nucl_acids = "DNA" - convert to DNA\n + nucl_acids = "both" - convert to RNA and DNA Return: - - dictionary: output the name of nucleic acid and a collection of sequences + - dictionary: nucleic acids (str) as keys, collection of sequences (list) as values """ rule_of_translation = sequences[0].maketrans(dictionaries.translation_rule) rule_of_transcription = sequences[0].maketrans("AaUuCcGg", "TtAaGgCc") @@ -195,15 +196,14 @@ def convert_to_nucl_acids(sequences: list, nucl_acids: str) -> dict: def check_and_parse_user_input( - sequences: list(str) or tuple(str), **kwargs + sequences: list[str] or tuple[str], **kwargs ) -> dict and str: """ - Check if user input can be correctly processed - Provide arguments for desired procedures - Needed for main function to correctly call desired procedure - + Check if user input can be correctly processed\n + Parse sequences and arguments for desired procedure + Arguments: - - sequences (list(str) or tuple(str)): sequences to process + - sequences (list[str] or tuple[str]): sequences to process - **kwargs - needed arguments for completion of desired procedure Return: @@ -216,13 +216,13 @@ def check_and_parse_user_input( if procedure not in procedures_to_functions.keys(): raise ValueError("Wrong procedure") allowed_inputs = set(dictionaries.amino_acids.keys()).union( - set(dictionaries.amino_acids.values()).union(set("-")) + set(dictionaries.amino_acids.values()) ) + 
allowed_inputs.add("-") if procedure != "three_one_letter_code": - allowed_inputs.remove("-") allowed_inputs -= set(dictionaries.amino_acids.values()) for sequence in sequences: - allowed_inputs_seq = allowed_inputs + allowed_inputs_seq = allowed_inputs.copy() if procedure == "three_one_letter_code" and "-" in sequence: allowed_inputs_seq -= set(dictionaries.amino_acids.keys()) if not all( @@ -230,6 +230,8 @@ def check_and_parse_user_input( ): raise ValueError("Invalid sequence given") else: + allowed_inputs_seq.remove("-") + allowed_inputs_seq -= set(dictionaries.amino_acids.values()) if not all(aminoacids in allowed_inputs_seq for aminoacids in sequence): raise ValueError("Invalid sequence given") procedure_arguments = {} @@ -258,44 +260,48 @@ def check_and_parse_user_input( return procedure_arguments, procedure -def run_protein_tools(sequences: list(str) or tuple(str), **kwargs: str): +def run_protein_tools(sequences: list[str] or tuple[str], **kwargs: str): """ - Main function to process protein sequence by one of the developed tools. + Main function to process protein sequence by one of the developed tools.\n Run one procedure at a time: - Search for conserved amino acids residues in protein sequence - Search for alternative frames in a protein sequences - Convert protein sequences to RNA or DNA sequences - - + - Reverse the protein sequences from one-letter to three-letter format and vice-versa + - Define molecular weight of the protein sequences - All functions are letter case sensitive - Provide protein sequence in one letter code. 
- You can obtain one letter code from three letter code with *three_one_letter_code* - If more information needed please see Readme or desired docstring + All functions except *search_for_alt_frames* are letter case sensitive\n + Provide protein sequence in one letter code.\n + You can obtain one letter code from three letter code with *three_one_letter_code*\n + If more information needed please see README or desired docstring Arguments: - - sequences (list(str) or tuple(str)): sequences to process - - procedure (str): desired procedure: + - sequences (list[str] or tuple[str]): sequences to process + - procedure (str]: desired procedure: - "search_for_motifs" - "search_for_alt_frames" - "convert_to_nucl_acids" - "three_one_letter_code" - "define_molecular_weight" + For "search_for_motif" procedure provide: - - motif (str): desired motif to check presense in every given sequence + - motif (str]: desired motif to check presense in every given sequence\n Example: motif = "GA" - - overlapping (bool): count (True) or skip (False) overlapping matches. (Optional) + - overlapping (bool): count (True) or skip (False) overlapping matches. (Optional)\n Example: overlapping = False + For "search_for_alt_frames" procedure provide: - - alt_start_aa (str): the name of an amino acid that is encoded by alternative start codon (Optional) - Example: alt_start_aa = 'I' + - alt_start_aa (str]: the name of an amino acid that is encoded by alternative start codon (Optional)\n + Example: alt_start_aa = "I" + For "convert_to_nucl_acids" procedure provide: - - nucl_acids (str): the nucleic acid to convert to - Example: nucl_acids = 'RNA' - nucl_acids = 'DNA' - nucl_acids = 'both' + - nucl_acids (str]: the nucleic acid to convert to\n + Example: nucl_acids = "RNA"\n + nucl_acids = "DNA"\n + nucl_acids = "both" Return: - - dict: Dictionary with processed sequences. Depends on desired tool + - dict: Dictionary with processed sequences. 
Depends on desired tool\n + Please see Readme or desired docstring """ procedure_arguments, procedure = check_and_parse_user_input(sequences, **kwargs) From d3b21d1aa030fdcd931c5c873528f320f8fcebf5 Mon Sep 17 00:00:00 2001 From: VovaGrig Date: Sun, 1 Oct 2023 02:20:52 +0300 Subject: [PATCH 32/36] Add minor fixes --- HW4_Grigoriants/protein_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index 11e1adc..0de1815 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -248,7 +248,7 @@ def check_and_parse_user_input( procedure_arguments["alt_start_aa"] = "M" else: if len(kwargs["alt_start_aa"]) > 1: - raise ValueError("Invalid start AA!") + raise ValueError("Invalid alternative start AA") procedure_arguments["alt_start_aa"] = kwargs["alt_start_aa"] elif procedure == "convert_to_nucl_acids": if "nucl_acids" not in kwargs.keys(): From 7412e71e4d44ded2e483442d5d630508bbaec282 Mon Sep 17 00:00:00 2001 From: Vladimir Grigoriants Date: Sun, 1 Oct 2023 04:18:33 +0400 Subject: [PATCH 33/36] Update README.md: add information, pictures, team photo --- HW4_Grigoriants/README.md | 185 ++++++++++++++++++++++++++++++-------- 1 file changed, 149 insertions(+), 36 deletions(-) diff --git a/HW4_Grigoriants/README.md b/HW4_Grigoriants/README.md index 227f9eb..3248949 100644 --- a/HW4_Grigoriants/README.md +++ b/HW4_Grigoriants/README.md @@ -4,57 +4,160 @@ *Proteins* are under the constant focus of scientists. Currently, there are an enormous amount of tools to operate with nucleotide sequences, however, the same ones for proteins are extremely rare. -`Protein_tools.py` is an open-source program that facilitates working with protein sequences. - -*В моём представлении здесь должна быть картинка* +`protein_tools.py` is an open-source program that facilitates working with protein sequences. 
## Usage -The programm is based on `run_protein_tools` function that takes the list of **one-letter amino acid sequences**, a name of procedure and a relevant argument. If you have three-letter amino acids sequences you could convert them by using `three_one_letter_code` procedure in advance. Before using this procedure, check the *Options*. +The program is based on `run_protein_tools` function that takes the list of **one-letter amino acid sequences** (not only), a name of procedure and a relevant argument. If you have three-letter amino acids sequences you could convert them by using `three_one_letter_code` procedure in advance. Before using this procedure, check the *Options* and *Examples*. To start with the program run the following command: -`run_protein_tools([sequence_1, sequence_2 ..., sequence_n], procedure, ...)` +`run_protein_tools(sequences, procedure="procedure", ...)` Where: -- [sequence_1, sequence_2 ..., sequence_n] - a list of protein sequences -- procedure - a type of procedure to use that is inputed in *string* type -- ... - an additional argument that is to be inputed in *string* type +- sequences - positional argument, a list of protein sequences +- procedure - keyword argument, a type of procedure to use that is input in *string* type +- ... 
- additional keyword arguments that are to be input in *string* type ## Options -The program has five types of procedures: +The program has five types of procedures, for more information please see provided docstrings: `three_one_letter_code` + + ![image](https://drive.google.com/uc?export=view&id=1eACjU_CXFbqeu1iW3ekwcg81n-X3WvTG) - The main aim - to convert three-letter amino acid sequences to one-letter ones and vice-versa - In case of three-to-one translation the names of amino acids **must be separated with hyphen** - An additional argument: no +``` +""" +Reverse the protein sequences from one-letter to three-letter format and vice-versa + +Case 1: get three-letter sequence\n +Use one-letter amino-acids sequences of any letter case + +Case 2: get one-letter sequence\n +Use three-letter amino-acid separated by "-" sequences. +Please note that sequences without "-" are parsed as one-letter code sequences\n +Example: for sequence "Ala" function will return "Ala-leu-ala" + +Arguments: +- sequences (tuple[str] or list[str]): protein sequences to convert\n +Example: ["WAG", "MkqRe", "msrlk", "Met-Ala-Gly", "Met-arg-asn-Trp-Ala-Gly", "arg-asn-trp"] + +Return: +- list: one-letter/three-letter protein sequences\n +Example: ["Met-Ala-Gly", "Met-arg-asn-Trp-Ala-Gly", "arg-asn-trp", "WAG", "MkqRe", "rlk"] +""" +``` `define_molecular_weight` + + ![image](https://drive.google.com/uc?export=view&id=1i9_4ys64XsAxnw-08zbgyBQnGzJoGJfr) - The main aim - to determine the exact molecular weight of protein sequences - An additional argument: no +``` +""" +Define molecular weight of the protein sequences + +Use one-letter amino-acids sequences of any letter case +The molecular weight is: +- a sum of masses of each atom constituting a molecule +- expressed in units called daltons (Da) +- rounded to hundredths + +Arguments: +- sequences (tuple[str] or list[str]): protein sequences to convert + +Return: +- dictionary: protein sequences as keys and molecular masses as values\n 
+Example: {"WAG": 332.39, "MkqRe": 690.88, "msrlk": 633.86} +""" +``` - `check_for_motifs` + `search_for_motifs` -- The main aim - to search for the motif of interest in protein sequences -- An additional argument: motif (*str*) + ![image](https://drive.google.com/uc?export=view&id=1_bVKRn4RblrfukIxoQc0NZ_FXaJliGAH) +- The main aim - to search for the motif of interest in protein sequences +- An additional arguments: motif (*str*), overlapping (*str*) +``` +""" +Search for motifs - conserved amino acids residues in protein sequence + +Search for one motif at a time\n +Search is letter case sensitive\n +Use one-letter aminoacids code for desired sequences and motifs\n +Positions of AA in sequences are counted from 0\n +By default, overlapping matches are counted + +Arguments: +- sequences (tuple[str] or list[str]): sequences to check for given motif within\n +Example: sequences = ["AMGAGW", "GAWSGRAGA"] +- motif (str]: desired motif to check presense in every given sequence\n +Example: motif = "GA" +- overlapping (bool): count (True) or skip (False) overlapping matches. 
(Optional)\n +Example: overlapping = False +Return: +- dictionary: sequences (str) as keys, starting positions for presented motif (list) as values\n +Example: {"AMGAGW": [2], "GAWSGRAGA": [0, 7]} +""" +``` `search_for_alt_frames` + ![image](https://drive.google.com/uc?export=view&id=1AdXnkRDIRiC_5yiiI2qiAMSMWbZf1RIm) + - The main aim - to look for alternative frames that start with methyonine or other non-canonical start amino acids - Ignores the last three amino acids due to the insignicance of alternative frames of this length - An additional argument: alt_start_aa (*str*) - Use alt_start_aa **only for non-canonical start amino acids** - Without alt_start_aa the procedure find alternative frames that start with methyonine +``` +""" +Search for alternative frames in protein sequences + +Search is not letter case sensitive\n +Without an alt_start_aa argument search for frames that start with methionine ("M") +To search frames with alternative start codon add alt_start_aa argument\n +In alt_start_aa argument use one-letter code + +The function ignores the last three amino acids in sequences + +Arguments: +- sequences (tuple[str] or list[str]): sequences to check +- alt_start_aa (str): the name of an amino acid that is encoded by alternative start codon (Optional)\n +Example: alt_start_aa = "I" +Return: +- dictionary: the number of a sequence and a collection of alternative frames +""" +``` `convert_to_nucl_acids` + + ![image](https://drive.google.com/uc?export=view&id=1_pZJ0Gc-EVcR1zddpDW4Ok3w8t65fW_z) - The main aim - to convert protein sequences to DNA, RNA or both nucleic acid sequences - The program use the most frequent codons in human that could be found [here](https://www.genscript.com/tools/codon-frequency-table) - An additional argument: nucl_acids (*str*) - Use as nucl_acids only DNA, RNA or both (for more detailes, check *Examples*) - +``` +""" +Convert protein sequences to RNA or DNA sequences. + +Use the most frequent codons in human. 
The source - https://www.genscript.com/tools/codon-frequency-table\n +All nucleic acids (DNA and RNA) are shown in 5'-3' direction + +Arguments: +- sequences (tuple[str] or list[str]): sequences to convert +- nucl_acids (str): the nucleic acid that is preferred\n +Example: nucl_acids = "RNA" - convert to RNA\n + nucl_acids = "DNA" - convert to DNA\n + nucl_acids = "both" - convert to RNA and DNA +Return: +- dictionary: nucleic acids (str) as keys, collection of sequences (list) as values +""" +``` ## Examples ```python @@ -62,27 +165,33 @@ The program has five types of procedures: run_protein_tools(['met-Asn-Tyr', 'Ile-Ala-Ala'], procedure='three_one_letter_code') # ['mNY', 'IAA'] run_protein_tools(['mNY','IAA'], procedure='three_one_letter_code') # ['met-Asn-Tyr', 'Ile-Ala-Ala'] + # define_molecular_weight -run_protein_tools(['MNY','IAA'], procedure='define_molecular_weight') # [462.52000000000004, 309.35] +run_protein_tools(['MNY','IAA'], procedure='define_molecular_weight') # {'MNY': 426.52, 'IAA': 273.35} + # check_for_motifs -run_protein_tools(['mNY','IAA'], procedure='check_for_motifs', motif='NY') -# Sequence: mNY -# Motif: NY -# Motif is present in protein sequence starting at positions: 1 -# Sequence: IAA -# Motif: NY -# Motif is not present in protein sequence -# {'mNY': [1], 'IAA': []} +run_protein_tools(['mNY','IAA'], procedure='search_for_motifs', motif='NY') +#Sequence: mNY +#Motif: NY +#Motif is present in protein sequence starting at positions: 1 + +#Sequence: IAA +#Motif: NY +#Motif is not present in protein sequence + +# {'mNY': [1], 'IAA': []} + # search_for_alt_frames run_protein_tools(['mNYQTMSPYYDMId'], procedure='search_for_alt_frames') # {'mNYQTMSPYYDMId': ['MSPYYDMId']} run_protein_tools(['mNYTQTSP'], procedure='search_for_alt_frames', alt_start_aa='T') # {'mNYTQTSP': ['TQTSP']} + # convert_to_nucl_acids run_protein_tools(['MNY'], procedure='convert_to_nucl_acids', nucl_acids = 'RNA') # {'RNA': ['AUGAACUAU']} -run_protein_tools(['MNY'], 
procedure='convert_to_nucl_acids', nucl_acids = 'DNA') # {'DNA': ['ATGAACTAT']} -run_protein_tools(['MNY'], procedure='convert_to_nucl_acids', nucl_acids = 'both') # {'RNA': ['AUGAACUAU'], 'DNA': ['ATGAACTAT']} +run_protein_tools(['MNY'], procedure='convert_to_nucl_acids', nucl_acids = 'DNA') # {'DNA': ['TACTTGATA']} +run_protein_tools(['MNY'], procedure='convert_to_nucl_acids', nucl_acids = 'both') # {'RNA': ['AUGAACUAU'], 'DNA': ['TACTTGATA']} ``` @@ -91,19 +200,23 @@ run_protein_tools(['MNY'], procedure='convert_to_nucl_acids', nucl_acids = 'both | Type of the problem | Probable cause | ------------------------------------------------------------ |-------------------- | Output does not correspond the expected resultes | The name of procedure is wrong. You see the results of another procedure -| ValueError: No sequences provided | A list of sequences are not inputed -| ValueError: Wrong procedure | The procedure does not exist in this program +| ValueError: No sequences provided | A list of sequences are not inputed +| ValueError: Wrong procedure | The procedure does not exist in this program | TypeError: takes from 0 to 1 positional arguments but n were given | Sequences are not collected into the list type -| ValueError: Invalid sequence given | The sequences do not correspond to standard amino acid code -| ValueError: Please provide desired motif | There are no an additional argument *motif* in `check_for_motifs` -| ValueError: Invalid start AA! 
| There is more than one letter in an additional argument *alt_start_aa* in `search_for_alt_frames` -| ValueError: Please provide desired type of nucl_acids | There are no an additional argument *nucl_acids* in `convert_to_nucl_acids` -| ValueError: Invalid nucl_acids argument | An additional argument in `convert_to_nucl_acids` is written incorrectly -## Contacts -Authors: +| ValueError: Invalid sequence given | The sequences do not correspond to standard amino acid code +| ValueError: Please provide desired motif | The additional argument *motif* is missing in `search_for_motifs` +| ValueError: Invalid start AA | There is more than one letter in an additional argument *alt_start_aa* in `search_for_alt_frames` +| ValueError: Please provide desired type of nucl_acids | The additional argument *nucl_acids* is missing in `convert_to_nucl_acids` +| ValueError: Invalid nucl_acids argument | An additional argument in `convert_to_nucl_acids` is written incorrectly +## Contacts +Vladimir Grigoriants (vova.grig2002@gmail.com) +Team-leader. Bioinformatician, immunologist, MiLaboratory inc. TCR-libraries QC developer -Vladimir Grigoriants (*адрес*) +Ekaterina Shitik (shitik.ekaterina@gmail.com) +Doctor of medicine, molecular biologist with the main interests on gene engineering, AAV vectors and CRISPR/Cas9 technologies -Tulyavko Vlada (*адрес*) +Vlada Tuliavko (vladislavi27@gmail.com) +MiLaboratory inc. 
manager&designer, immunologist -Ekaterina Shitik (shitik.ekaterina@gmail.com) +## Our team +![image](https://drive.google.com/uc?export=view&id=1tdSGpNl6GorFPZIqweB0PaGxQW5wK5Oo) From 4d23561a812b379525787727124c5c55d4d10cb9 Mon Sep 17 00:00:00 2001 From: Vladimir Grigoriants Date: Sun, 1 Oct 2023 04:28:35 +0400 Subject: [PATCH 34/36] Update README.md --- HW4_Grigoriants/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/HW4_Grigoriants/README.md b/HW4_Grigoriants/README.md index 3248949..aff897e 100644 --- a/HW4_Grigoriants/README.md +++ b/HW4_Grigoriants/README.md @@ -7,7 +7,7 @@ `protein_tools.py` is an open-source program that facilitates working with protein sequences. ## Usage -The programm is based on `run_protein_tools` function that takes the list of **one-letter amino acid sequences** (not only), a name of procedure and a relevant argument. If you have three-letter amino acids sequences you could convert them by using `three_one_letter_code` procedure in advance. Before using this procedure, check the *Options* and *Examples*. +The programm is based on `run_protein_tools` function that takes the list of **one-letter amino acid sequences**, a name of procedure and a relevant argument. If you have three-letter amino acids sequences you could convert them by using `three_one_letter_code` procedure in advance. Please convert your three-letter coded sequences with `three_one_letter_code` procedure before using any other procedures on them. To start with the program run the following command: @@ -17,7 +17,8 @@ Where: - sequences - positional argument, a list of protein sequences - procedure - keyword argument, a type of procedure to use that is inputed in *string* type - ... - an additional keyword arguments that are to be inputed in *string* type - +- +Before start, check the *Options* and *Examples*. 
## Options The program has five types of procedures, for more information please see provided docstrings: @@ -81,7 +82,7 @@ Example: {"WAG": 332.39, "MkqRe": 690.88, "msrlk": 633.86} ![image](https://drive.google.com/uc?export=view&id=1_bVKRn4RblrfukIxoQc0NZ_FXaJliGAH) - The main aim - to search for the motif of interest in protein sequences -- An additional arguments: motif (*str*), overlapping (*str*) +- Additional arguments: motif (*str*), overlapping (*bool*) ``` """ Search for motifs - conserved amino acids residues in protein sequence From dd6f4a63cf6de7178085abdc5ba043d916759ff4 Mon Sep 17 00:00:00 2001 From: Vladimir Grigoriants Date: Sun, 1 Oct 2023 04:30:13 +0400 Subject: [PATCH 35/36] Update README.md --- HW4_Grigoriants/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW4_Grigoriants/README.md b/HW4_Grigoriants/README.md index aff897e..c2c6801 100644 --- a/HW4_Grigoriants/README.md +++ b/HW4_Grigoriants/README.md @@ -216,7 +216,7 @@ Team-leader. Bioinformatician, immunologist, MiLaborary inc. TCR-libraries QC de Ekaterina Shitik (shitik.ekaterina@gmail.com) Doctor of medicine, molecular biologist with the main interests on gene engineering, AAV vectors and CRISPR/Cas9 technologies -Vlada Tuliavko (vladislavi27@gmail.com) +Vlada Tuliavko (vladislavi2742@gmail.com) MiLaboratory inc. 
manager&designer, immunologist ## Our team From a3bec1bea04a01ce355a1452401ed6bc8ee31b3b Mon Sep 17 00:00:00 2001 From: VovaGrig Date: Sat, 14 Oct 2023 22:09:43 +0300 Subject: [PATCH 36/36] Add fixes based on feedback to dictionaries.py and protein_tools.py --- HW4_Grigoriants/dictionaries.py | 46 +-------- HW4_Grigoriants/protein_tools.py | 157 ++++++++++++++----------------- 2 files changed, 72 insertions(+), 131 deletions(-) diff --git a/HW4_Grigoriants/dictionaries.py b/HW4_Grigoriants/dictionaries.py index f4a1ada..c5725d1 100644 --- a/HW4_Grigoriants/dictionaries.py +++ b/HW4_Grigoriants/dictionaries.py @@ -1,4 +1,4 @@ -amino_acids = { +AMINO_ACIDS = { "A": "Ala", "C": "Cys", "D": "Asp", @@ -19,70 +19,30 @@ "V": "Val", "W": "Trp", "Y": "Tyr", - "a": "ala", - "c": "cys", - "d": "asp", - "e": "glu", - "f": "phe", - "g": "gly", - "h": "his", - "i": "ile", - "k": "lys", - "l": "leu", - "m": "met", - "n": "asn", - "p": "pro", - "q": "gln", - "r": "arg", - "s": "ser", - "t": "thr", - "v": "val", - "w": "trp", - "y": "tyr", } -translation_rule = { +TRANSLATION_RULE = { "F": "UUU", - "f": "uuu", "L": "CUG", - "l": "cug", "I": "AUU", - "i": "auu", "M": "AUG", - "m": "aug", "V": "GUG", - "v": "gug", "P": "CCG", - "p": "ccg", "T": "ACC", - "t": "acc", "A": "GCG", - "a": "gcg", "Y": "UAU", - "y": "uau", "H": "CAU", - "h": "cau", "Q": "CAG", - "q": "cag", "N": "AAC", - "n": "aac", "K": "AAA", - "k": "aaa", "D": "GAU", - "d": "gau", "E": "GAA", - "e": "gaa", "C": "UGC", - "c": "ugc", "W": "UGG", - "w": "ugg", "R": "CGU", - "r": "cgu", "S": "AGC", - "s": "agc", "G": "GGC", - "g": "ggc", } -amino_acid_weights = { +AMINO_ACID_WEIGHTS = { "A": 89.09, "C": 121.16, "D": 133.10, diff --git a/HW4_Grigoriants/protein_tools.py b/HW4_Grigoriants/protein_tools.py index 0de1815..df92cef 100644 --- a/HW4_Grigoriants/protein_tools.py +++ b/HW4_Grigoriants/protein_tools.py @@ -1,7 +1,7 @@ import dictionaries -def three_one_letter_code(sequences: str) -> list: +def 
three_one_letter_code(sequences: (tuple[str] or list[str])) -> list: """ Reverse the protein sequences from one-letter to three-letter format and vice-versa @@ -23,23 +23,35 @@ def three_one_letter_code(sequences: str) -> list: """ inversed_sequences = [] for sequence in sequences: - inversed_sequence = "" + inversed_sequence = [] if "-" not in sequence: for letter in sequence: - inversed_sequence += dictionaries.amino_acids[letter] + "-" - inversed_sequence = inversed_sequence[:-1] - inversed_sequences.append(inversed_sequence) + if letter.islower(): + inversed_sequence.append( + dictionaries.AMINO_ACIDS[letter.capitalize()].lower() + ) + else: + inversed_sequence.append(dictionaries.AMINO_ACIDS[letter]) + inversed_sequences.append("-".join(inversed_sequence)) else: aa_splitted = sequence.split("-") for aa in aa_splitted: - inversed_sequence += list(dictionaries.amino_acids.keys())[ - list(dictionaries.amino_acids.values()).index(aa) - ] - inversed_sequences.append(inversed_sequence) + aa_index = list(dictionaries.AMINO_ACIDS.values()).index( + aa.capitalize() + ) + if aa[0].islower(): + inversed_sequence.append( + list(dictionaries.AMINO_ACIDS.keys())[aa_index].lower() + ) + else: + inversed_sequence.append( + list(dictionaries.AMINO_ACIDS.keys())[aa_index] + ) + inversed_sequences.append("".join(inversed_sequence)) return inversed_sequences -def define_molecular_weight(sequences: str) -> dict: +def define_molecular_weight(sequences: (tuple[str] or list[str])) -> dict: """ Define molecular weight of the protein sequences @@ -60,7 +72,7 @@ def define_molecular_weight(sequences: str) -> dict: for sequence in sequences: sequence_weight = 0 for letter in sequence: - sequence_weight += dictionaries.amino_acid_weights[letter.upper()] + sequence_weight += dictionaries.AMINO_ACID_WEIGHTS[letter.upper()] sequence_weight -= (len(sequence) - 1) * 18 # deduct water from peptide bond sequences_weights[sequence] = round(sequence_weight, 2) return sequences_weights @@ -117,7 
+129,9 @@ def search_for_motifs( return all_positions -def search_for_alt_frames(sequences: str, alt_start_aa: str) -> dict: +def search_for_alt_frames( + sequences: (tuple[str] or list[str]), alt_start_aa: str +) -> dict: """ Search for alternative frames in a protein sequences @@ -150,7 +164,9 @@ def search_for_alt_frames(sequences: str, alt_start_aa: str) -> dict: return alternative_frames -def convert_to_nucl_acids(sequences: list, nucl_acids: str) -> dict: +def convert_to_nucl_acids( + sequences: (tuple[str] or list[str]), nucl_acids: str +) -> dict: """ Convert protein sequences to RNA or DNA sequences. @@ -166,27 +182,35 @@ def convert_to_nucl_acids(sequences: list, nucl_acids: str) -> dict: Return: - dictionary: nucleic acids (str) as keys, collection of sequences (list) as values """ - rule_of_translation = sequences[0].maketrans(dictionaries.translation_rule) - rule_of_transcription = sequences[0].maketrans("AaUuCcGg", "TtAaGgCc") + rule_of_translation = str.maketrans(dictionaries.TRANSLATION_RULE) + # add lower case pairs, because only upper case pairs are stored in dictionaries + rule_of_translation.update( + str.maketrans( + dict( + (k.lower(), v.lower()) for k, v in dictionaries.TRANSLATION_RULE.items() + ) + ) + ) nucl_acid_seqs = {"RNA": [], "DNA": []} for sequence in sequences: rna_seq = sequence.translate(rule_of_translation) - dna_seq = rna_seq.translate(rule_of_transcription) if nucl_acids == "RNA": nucl_acid_seqs["RNA"].append(rna_seq) - if sequence == sequences[-1]: - del nucl_acid_seqs["DNA"] - if nucl_acids == "DNA": + elif nucl_acids == "DNA": + dna_seq = rna_seq.replace("U", "T").replace("u", "t") nucl_acid_seqs["DNA"].append(dna_seq) - if sequence == sequences[-1]: - del nucl_acid_seqs["RNA"] - if nucl_acids == "both": + elif nucl_acids == "both": + dna_seq = rna_seq.replace("U", "T").replace("u", "t") nucl_acid_seqs["RNA"].append(rna_seq) nucl_acid_seqs["DNA"].append(dna_seq) + if nucl_acids == "RNA": + del nucl_acid_seqs["DNA"] + if 
nucl_acids == "DNA": + del nucl_acid_seqs["RNA"] return nucl_acid_seqs -procedures_to_functions = { +PROTEINS_PROCEDURES_TO_FUNCTIONS = { "search_for_motifs": search_for_motifs, "search_for_alt_frames": search_for_alt_frames, "convert_to_nucl_acids": convert_to_nucl_acids, @@ -196,12 +220,12 @@ def convert_to_nucl_acids(sequences: list, nucl_acids: str) -> dict: def check_and_parse_user_input( - sequences: list[str] or tuple[str], **kwargs + sequences: (str, tuple[str] or list[str]), **kwargs ) -> dict and str: """ Check if user input can be correctly processed\n Parse sequences and arguments for desired procedure - + Arguments: - sequences (list[str] or tuple[str]): sequences to process - **kwargs - needed arguments for completion of desired procedure @@ -210,29 +234,34 @@ def check_and_parse_user_input( - string: procedure name - dictionary: a collection of procedure arguments and their values """ - if len(sequences) == 0: - raise ValueError("No sequences provided") + if isinstance(sequences, str): + sequences = sequences.split() + if "" in sequences or len(sequences) == 0: + raise ValueError("Empty sequence provided") procedure = kwargs["procedure"] - if procedure not in procedures_to_functions.keys(): + if procedure not in PROTEINS_PROCEDURES_TO_FUNCTIONS.keys(): raise ValueError("Wrong procedure") - allowed_inputs = set(dictionaries.amino_acids.keys()).union( - set(dictionaries.amino_acids.values()) + allowed_inputs = set(dictionaries.AMINO_ACIDS.keys()) + allowed_inputs = allowed_inputs.union( + set(k.lower() for k in dictionaries.AMINO_ACIDS.keys()) ) - allowed_inputs.add("-") - if procedure != "three_one_letter_code": - allowed_inputs -= set(dictionaries.amino_acids.values()) + if procedure == "three_one_letter_code": + allowed_inputs = allowed_inputs.union(set(dictionaries.AMINO_ACIDS.values())) + allowed_inputs = allowed_inputs.union( + set(v.lower() for v in dictionaries.AMINO_ACIDS.values()) + ) for sequence in sequences: allowed_inputs_seq = 
allowed_inputs.copy() if procedure == "three_one_letter_code" and "-" in sequence: - allowed_inputs_seq -= set(dictionaries.amino_acids.keys()) - if not all( - aminoacids in allowed_inputs_seq for aminoacids in sequence.split("-") - ): + allowed_inputs_seq -= set(dictionaries.AMINO_ACIDS.keys()) + allowed_inputs_seq -= set( + k.lower() for k in dictionaries.AMINO_ACIDS.keys() + ) + allowed_inputs_seq.union(set("-")) + if not set(sequence.split("-")).issubset(allowed_inputs_seq): raise ValueError("Invalid sequence given") else: - allowed_inputs_seq.remove("-") - allowed_inputs_seq -= set(dictionaries.amino_acids.values()) - if not all(aminoacids in allowed_inputs_seq for aminoacids in sequence): + if not set(sequence).issubset(allowed_inputs_seq): raise ValueError("Invalid sequence given") procedure_arguments = {} if procedure == "search_for_motifs": @@ -258,51 +287,3 @@ def check_and_parse_user_input( procedure_arguments["nucl_acids"] = kwargs["nucl_acids"] procedure_arguments["sequences"] = sequences return procedure_arguments, procedure - - -def run_protein_tools(sequences: list[str] or tuple[str], **kwargs: str): - """ - Main function to process protein sequence by one of the developed tools.\n - Run one procedure at a time: - - Search for conserved amino acids residues in protein sequence - - Search for alternative frames in a protein sequences - - Convert protein sequences to RNA or DNA sequences - - Reverse the protein sequences from one-letter to three-letter format and vice-versa - - Define molecular weight of the protein sequences - - All functions except *search_for_alt_frames* are letter case sensitive\n - Provide protein sequence in one letter code.\n - You can obtain one letter code from three letter code with *three_one_letter_code*\n - If more information needed please see README or desired docstring - - Arguments: - - sequences (list[str] or tuple[str]): sequences to process - - procedure (str]: desired procedure: - - "search_for_motifs" - - 
"search_for_alt_frames" - - "convert_to_nucl_acids" - - "three_one_letter_code" - - "define_molecular_weight" - - For "search_for_motif" procedure provide: - - motif (str]: desired motif to check presense in every given sequence\n - Example: motif = "GA" - - overlapping (bool): count (True) or skip (False) overlapping matches. (Optional)\n - Example: overlapping = False - - For "search_for_alt_frames" procedure provide: - - alt_start_aa (str]: the name of an amino acid that is encoded by alternative start codon (Optional)\n - Example: alt_start_aa = "I" - - For "convert_to_nucl_acids" procedure provide: - - nucl_acids (str]: the nucleic acid to convert to\n - Example: nucl_acids = "RNA"\n - nucl_acids = "DNA"\n - nucl_acids = "both" - - Return: - - dict: Dictionary with processed sequences. Depends on desired tool\n - Please see Readme or desired docstring - """ - procedure_arguments, procedure = check_and_parse_user_input(sequences, **kwargs) - return procedures_to_functions[procedure](**procedure_arguments)