## Combining the three FASTA sequence files from the UniProt database and retrieving the child GO terms by sequence ID

In [3]:
# FASTA sequences were downloaded from the UniProt database (www.uniprot.org) using the search query "aeromonas",
# restricted to entries with annotation score 3, 4 or 5.


import gzip
from Bio import SeqIO
import pandas as pd

# The downloaded FASTA files are gzip-compressed, so they are opened in text mode via gzip.open.
# One helper replaces the three copy-pasted loading stanzas (one per annotation score).

def _load_fasta_frame(fasta_gz_path):
    """Read one gzip-compressed FASTA file into a DataFrame.

    Parameters
    ----------
    fasta_gz_path : str
        Path to a gzip-compressed FASTA file.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``sequence``, one row per FASTA record, in file order.

    Raises
    ------
    IndexError
        If a record id contains no ``|`` separator (same as the original inline code).
    """
    records = []
    with gzip.open(fasta_gz_path, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # The id is split on '|' and the second field is kept; for UniProt
            # headers this is presumably the accession (e.g. A0A068FVC1).
            seq_id = record.id.split('|')[1]
            records.append((seq_id, str(record.seq)))
    return pd.DataFrame(records, columns=['id', 'sequence'])


# sequences with annotation score 3
df3 = _load_fasta_frame("/Users/zaidur/Downloads/uniprot_aeromonas_3.fasta")

# sequences with annotation score 4
df4 = _load_fasta_frame("/Users/zaidur/Downloads/uniprot_aeromonas_4.fasta")

# sequences with annotation score 5
df5 = _load_fasta_frame("/Users/zaidur/Downloads/uniprot_aeromonas_5.fasta")

# Concatenate the per-score frames into a single frame; ignore_index=True
# renumbers the rows 0..n-1 (equivalent to concat + reset_index(drop=True)).
frames = [df3, df4, df5]
uniprot_df = pd.concat(frames, ignore_index=True)

print(uniprot_df.head())

           id                                           sequence
0  A0A068FVC1  MNIIKTAIPDVHIFEPKVFFDERGFFFESFNHKLFEEAVGYSVNFV...
1  A0A068FZD0  MTTQSSKSRVFVAGHRGMVGSAICRQLAQRTDIELVVRSRSELDLT...
2  A0A068FZK6  METSGLVAFVGTALAIACLRPLSAKLQLVDLPNQRKQHVGAIPLIG...
3  A0A075P9Z7  MNLTELKQKPITDLLQLAEEMGIENMARSRKQDVIFSLLKKHAKSG...
4  A0A075PBX8  MQISVNEFLTPRHIDVQVVSPTRAKITLEPLERGFGHTLGNALRRI...


In [5]:
# Persist the combined (id, sequence) frame as CSV; the row index is not written.
uniprot_df.to_csv(
    path_or_buf="/Users/zaidur/Documents/Sequence Project/aeromonasBact/uniprot_df.csv",
    index=False,
)

In [52]:
# Shared UniProt client, created on first use so that applying get_go_terms
# over many ids does not construct a new client for every row.
_UNIPROT_CLIENT = None


def get_go_terms(uniprot_id, client=None):
    """Return the GO term ids annotated on a UniProt entry.

    Parameters
    ----------
    uniprot_id : str
        Entry identifier passed to ``get_df``.
    client : object, optional
        Any object exposing ``get_df(entry, columns=...)`` that returns a
        DataFrame with a "Gene Ontology IDs" column. When omitted, a shared
        ``UniProt()`` client is created once and reused.
        NOTE(review): ``UniProt`` must already be in scope (presumably
        ``bioservices.UniProt``) - the import is not visible in this cell; confirm.

    Returns
    -------
    list of str
        GO ids with spaces removed, e.g. ``['GO:0008830', 'GO:0019305']``.
        An empty list is returned when the entry has no GO annotation or the
        lookup returns no rows (previously this produced ``['nan']`` or an
        ``IndexError`` respectively).
    """
    global _UNIPROT_CLIENT
    if client is None:
        if _UNIPROT_CLIENT is None:
            _UNIPROT_CLIENT = UniProt()
        client = _UNIPROT_CLIENT

    # get_df returns a table of entry information; only the
    # "Gene Ontology IDs" column is needed here.
    go_terms_df = client.get_df(uniprot_id, columns="Gene Ontology IDs")

    go_column = go_terms_df['Gene Ontology IDs']
    if len(go_column) == 0:
        return []

    raw_value = go_column.iloc[0]
    # A missing annotation arrives as NaN/None; str() would turn it into 'nan'.
    if pd.isna(raw_value):
        return []

    # The ids come as one semicolon-delimited string; drop spaces and skip
    # empty fragments (e.g. from a trailing semicolon).
    cleaned = (item.strip().replace(' ', '') for item in str(raw_value).split(';'))
    return [term for term in cleaned if term]


# One lookup is performed per row, so the runtime grows with the number of entries
# and this cell can take a long time on the full table.
uniprot_df['go_terms'] = uniprot_df['id'].map(get_go_terms)
uniprot_df

Unnamed: 0,id,sequence,go_terms
0,A0A068FVC1,MNIIKTAIPDVHIFEPKVFFDERGFFFESFNHKLFEEAVGYSVNFV...,"[GO:0008830, GO:0019305]"
1,A0A068FZD0,MTTQSSKSRVFVAGHRGMVGSAICRQLAQRTDIELVVRSRSELDLT...,"[GO:0016853, GO:0042351, GO:0050577, GO:0070401]"
2,A0A068FZK6,METSGLVAFVGTALAIACLRPLSAKLQLVDLPNQRKQHVGAIPLIG...,"[GO:0000287, GO:0005886, GO:0009243, GO:000927..."
3,A0A075P9Z7,MNLTELKQKPITDLLQLAEEMGIENMARSRKQDVIFSLLKKHAKSG...,"[GO:0003723, GO:0004386, GO:0005524, GO:000582..."
4,A0A075PBX8,MQISVNEFLTPRHIDVQVVSPTRAKITLEPLERGFGHTLGNALRRI...,"[GO:0000428, GO:0003677, GO:0003899, GO:000573..."
...,...,...,...
31316,Q8UVZ1,MEQANLYEVAPRPLMTSLVQNQQNPYIYKDTAGDLSEICENENSID...,"[GO:0000978, GO:0000981, GO:0001228, GO:000188..."
31317,Q9L5A4,MKQTSLALAITALLSTLPSALVQANEGCAPLTGKESGMDIGRSSTE...,"[GO:0004252, GO:0005576, GO:0006508, GO:004259..."
31318,R1GTS7,MFARLEGRPVLLVGGGEVALRKARLLLAAGARLTLVSPVLASEFDE...,"[GO:0004851, GO:0009236, GO:0019354, GO:003225..."
31319,T0PES4,MDYLPIFCRLDNKPVLLVGGGDVAERKARLLLDAGAQLTVVAPELD...,"[GO:0004851, GO:0009236, GO:0019354, GO:003225..."


In [53]:
# Save the frame, now including the go_terms column, as uniprot_go_childs.csv.
# Per the original note: these GO terms are the last children in the ancestor tree,
# and their level-2 ancestors still need to be found for the model.
uniprot_df.to_csv(
    path_or_buf="/Users/zaidur/Documents/Sequence Project/aeromonasBact/uniprot_go_childs.csv",
    index=False,
)