### Collect all obsolete GO terms (4,506 in the latest `go.obo`) and remove them from `go_child_df` (`uniprot_go_childs.csv`)

In [4]:
import ast
import pandas as pd

In [8]:
"""
Get a list of all the obsolete GO terms
"""

def parse_obo_file(file):
    """Return the IDs of all obsolete GO terms listed in an OBO file.

    The file is scanned line by line, one stanza at a time. Only ``[Term]``
    stanzas are inspected; the file header and every other stanza type
    (e.g. ``[Typedef]``) are skipped, so an ``is_obsolete`` tag belonging to
    a non-term stanza can never be attributed to the preceding term.

    Parameters
    ----------
    file : str or path-like
        Path to the ontology file in OBO format (e.g. ``go.obo``).

    Returns
    -------
    list of str
        GO IDs (``'GO:NNNNNNN'``) of the terms tagged ``is_obsolete: true``,
        in the order they appear in the file. Terms without a ``id: GO``
        line are never reported.
    """
    obsolete_terms = []
    in_term = False      # True while the current stanza is a [Term]
    go_id = ''
    is_obsolete = False

    # OBO files are UTF-8; do not rely on the platform default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()

            if line.startswith('['):
                # A stanza header closes the previous stanza: record it if it
                # was an obsolete term, then reset the per-stanza state.
                if in_term and is_obsolete and go_id:
                    obsolete_terms.append(go_id)
                in_term = line == '[Term]'
                go_id = ''
                is_obsolete = False
                continue

            if not in_term:
                continue

            if line.startswith('id: GO'):
                # Split on the first colon only: the GO ID itself contains one.
                go_id = line.split(':', 1)[1].strip()
            elif line.startswith('is_obsolete:'):
                is_obsolete = line.split(':', 1)[1].strip() == 'true'

    # The last stanza has no following header to close it.
    if in_term and is_obsolete and go_id:
        obsolete_terms.append(go_id)

    return obsolete_terms

# Gene Ontology OBO file, downloaded from http://geneontology.org/docs/download-ontology/
# The relative path 'go.obo' is resolved against the notebook's working directory.
obsolete_terms = parse_obo_file('go.obo')

In [2]:
# Sanity check: number of obsolete GO IDs returned by parse_obo_file
len(obsolete_terms)

4506

In [5]:
# Read the GO children table from disk
go_child_csv = "/Users/zaidur/Documents/Sequence_Project/aeromonasBact/uniprot_go_childs.csv"
go_child_df = pd.read_csv(go_child_csv)

# Each go_terms cell is read as text; parse it back into a Python list
go_child_df['go_terms'] = go_child_df['go_terms'].map(ast.literal_eval)

In [6]:
# Preview the loaded table (pandas truncates the display of long frames)
go_child_df

Unnamed: 0,id,sequence,go_terms
0,A0A068FVC1,MNIIKTAIPDVHIFEPKVFFDERGFFFESFNHKLFEEAVGYSVNFV...,"[GO:0008830, GO:0019305]"
1,A0A068FZD0,MTTQSSKSRVFVAGHRGMVGSAICRQLAQRTDIELVVRSRSELDLT...,"[GO:0016853, GO:0042351, GO:0050577, GO:0070401]"
2,A0A068FZK6,METSGLVAFVGTALAIACLRPLSAKLQLVDLPNQRKQHVGAIPLIG...,"[GO:0000287, GO:0005886, GO:0009243, GO:000927..."
3,A0A075P9Z7,MNLTELKQKPITDLLQLAEEMGIENMARSRKQDVIFSLLKKHAKSG...,"[GO:0003723, GO:0004386, GO:0005524, GO:000582..."
4,A0A075PBX8,MQISVNEFLTPRHIDVQVVSPTRAKITLEPLERGFGHTLGNALRRI...,"[GO:0000428, GO:0003677, GO:0003899, GO:000573..."
...,...,...,...
29776,Q7BJX9,MDIYMSRYEEITQQLIFSPKTWLITGVAGFIGSNLLEKLLKLNQVV...,"[GO:0000166, GO:0003974, GO:0009243]"
29777,Q8UVZ1,MEQANLYEVAPRPLMTSLVQNQQNPYIYKDTAGDLSEICENENSID...,"[GO:0000978, GO:0000981, GO:0001228, GO:000188..."
29778,Q9L5A4,MKQTSLALAITALLSTLPSALVQANEGCAPLTGKESGMDIGRSSTE...,"[GO:0004252, GO:0005576, GO:0006508, GO:004259..."
29779,R1GTS7,MFARLEGRPVLLVGGGEVALRKARLLLAAGARLTLVSPVLASEFDE...,"[GO:0004851, GO:0009236, GO:0019354, GO:003225..."


In [7]:
# Build a set once so every membership test is O(1) instead of a linear scan
# of the obsolete_terms list for each GO term of each row.
obsolete_lookup = set(obsolete_terms)

# Total number of GO annotations before filtering; the difference after
# filtering is the number of obsolete annotations removed (occurrences across
# all rows, not distinct GO IDs).
terms_before = sum(len(terms) for terms in go_child_df['go_terms'])

# Replace each row's list with a copy that keeps only non-obsolete terms,
# preserving their original order.
go_child_df['go_terms'] = go_child_df['go_terms'].map(
    lambda terms: [go_term for go_term in terms if go_term not in obsolete_lookup]
)

terms_after = sum(len(terms) for terms in go_child_df['go_terms'])
obsolete_count = terms_before - terms_after

print(f"Number of obsolete terms: {obsolete_count}")

Number of obsolete terms: 74


In [10]:
# Write the filtered table back over the CSV it was loaded from (no index column added)
filtered_csv_path = "/Users/zaidur/Documents/Sequence_Project/aeromonasBact/uniprot_go_childs.csv"
go_child_df.to_csv(filtered_csv_path, index=False)