# Load in sorted fasta files and try to find columns that behave the same

In [2]:
import pandas as pd
from sequence import * 

In [3]:
# Identifiers of the workflows to analyse. Each one is used as the directory
# name and file prefix under ../workflows/ (presumably EC numbers with the
# dots replaced by underscores -- confirm against the workflow layout).
ec_nums = [
    '3_2_1_4',
    '3_2_1_14',
    '3_6_5_2',
    '2_7_11_22',
    '2_4_99_18',
    '2_7_11_24',
    '2_7_3_3',
    '4_2_1_1',
    '3_2_1_21',
    '2_4_1_207',
    '2_7_11_1',
    '2_7_13_3',
    '2_7_10_2',
    '2_7_10_1',
    '1_14_99_39',
    '2_5_1_18',
    '3_2_1_8',
    '3_5_2_6',
    '1_15_1_1',
    '4_1_1_39',
]

In [23]:
def load_seqs(ec_num):
    """Count distinct values per annotation column for kept vs. removed sequences.

    Reads three files below ``../workflows/{ec_num}/``:

    * ``csv/{ec_num}_uniprot.csv`` -- the annotation table (must have an
      ``Entry`` column),
    * ``files/{ec_num}.fasta`` -- the full sequence set,
    * ``files/{ec_num}_filt.fasta`` -- the filtered sequence set.

    Table rows are split by ``Entry`` into the rows whose entry is named in
    the filtered FASTA file ("positive") and the rows whose entry is named
    only in the full FASTA file ("negative"). For each group the number of
    distinct values in every column is recorded.

    Parameters
    ----------
    ec_num : str
        Workflow identifier, used as both directory name and file prefix
        (e.g. ``'3_2_1_4'``).

    Returns
    -------
    tuple of (dict, dict)
        ``(pos_column_counts, neg_column_counts)``; each maps a column name
        to the number of distinct values found in that group. Both dicts
        list the columns in the table's column order.

    Notes
    -----
    ``Series.unique()`` reports a missing value as a value of its own, so a
    column that is entirely empty within a group is counted as 1.
    """
    df = pd.read_csv(f"../workflows/{ec_num}/csv/{ec_num}_uniprot.csv")
    unfiltered_seqs = readFastaFile(f"../workflows/{ec_num}/files/{ec_num}.fasta")
    filtered_seqs = readFastaFile(f"../workflows/{ec_num}/files/{ec_num}_filt.fasta")

    # Sets give O(1) membership tests; the previous list-based lookup made the
    # comprehension below quadratic in the number of sequences.
    filtered_names = {seq.name for seq in filtered_seqs}
    # Names present in the full set but absent from the filtered set.
    unfiltered_names = {
        seq.name for seq in unfiltered_seqs if seq.name not in filtered_names
    }

    entry_ids = df["Entry"]
    filtered_df = df.loc[entry_ids.isin(filtered_names)]
    unfiltered_df = df.loc[entry_ids.isin(unfiltered_names)]

    # Number of distinct values per column, computed separately for each group.
    pos_column_counts = {
        col: len(filtered_df[col].unique()) for col in filtered_df.columns
    }
    neg_column_counts = {
        col: len(unfiltered_df[col].unique()) for col in unfiltered_df.columns
    }

    return pos_column_counts, neg_column_counts
    

# Columns with a single unique value in the filtered set but more than one in the removed set

In [22]:
# For every column, count the number of EC numbers in which the column has
# exactly one distinct value among the filtered rows but more than one among
# the removed rows.
entries = {}

for ec in ec_nums:
    pos_column_counts, neg_column_counts = load_seqs(ec)

    # Look the negative count up by column name rather than zipping the two
    # dicts together, so the pairing does not depend on both dicts happening
    # to share the same insertion order.
    for column, pos_unique in pos_column_counts.items():
        neg_unique = neg_column_counts[column]
        if pos_unique == 1 and neg_unique > 1:
            entries[column] = entries.get(column, 0) + 1

# (column, number of EC numbers) pairs, most frequent first.
ordered = sorted(entries.items(), key=lambda x: x[1], reverse=True)


BRENDA_REFERENCES_102
BRENDA_REFERENCES_102_PUBMED
BRENDA_KM_p_nitrophenyl_beta_D_cellopentaoside_DATA
BRENDA_KM_p_nitrophenyl_beta_D_cellopentaoside_UNITS
BRENDA_KM_p_nitrophenyl_beta_D_cellopentaoside_REFS
BRENDA_KM_p_nitrophenyl_beta_D_cellotetraoside_DATA
BRENDA_KM_p_nitrophenyl_beta_D_cellotetraoside_UNITS
BRENDA_KM_p_nitrophenyl_beta_D_cellotetraoside_REFS
BRENDA_REFERENCES_33
BRENDA_REFERENCES_72
BRENDA_SU_DATA
BRENDA_SU_REFS
BRENDA_SU_COMMENT
BRENDA_KM_2,4_dinitrophenyl_beta_D_cellobioside_DATA
BRENDA_KM_2,4_dinitrophenyl_beta_D_cellobioside_UNITS
BRENDA_KM_2,4_dinitrophenyl_beta_D_cellobioside_REFS
BRENDA_KM_2,4_dinitrophenyl_beta_D_cellobioside_COMMENT
BRENDA_TN_2,4_dinitrophenyl_beta_D_cellobioside_DATA
BRENDA_TN_2,4_dinitrophenyl_beta_D_cellobioside_UNITS
BRENDA_TN_2,4_dinitrophenyl_beta_D_cellobioside_REFS
BRENDA_TN_2,4_dinitrophenyl_beta_D_cellobioside_COMMENT
BRENDA_SS_DATA
BRENDA_SS_REFS
BRENDA_REFERENCES_53
BRENDA_REFERENCES_94
BRENDA_REFERENCES_94_PUBMED
BRENDA_REFERE

Virus_hosts
Gene_encoded_by
Alternative_products_isoforms
Erroneous_gene_model_prediction
Erroneous_initiation
Erroneous_termination
Erroneous_translation
Frameshift
Polymorphism
RNA_editing
Sequence_caution
Alternative_sequence
Non_standard_residue
Sequence_uncertainty
Absorption
ChEBI
ChEBI_Catalytic_activity
ChEBI_Cofactor
ChEBI_IDs
Cofactor
Pathway
Redox_potential
Binding_site
DNA_binding
Metal_binding
Nucleotide_binding
Site
Subunit_structure_[CC]
Interacts_with
Biotechnological_use
Disruption_phenotype
Involvement_in_disease
Pharmaceutical_use
Toxic_dose
Intramembrane
Topological_domain
Transmembrane
Post_translational_modification
Cross_link
Glycosylation
Initiator_methionine
Lipidation
Modified_residue
Peptide
Propeptide
Transit_peptide
Coiled_coil
Motif
Repeat
Zinc_finger
Taxonomic_lineage_SUBKINGDOM
Taxonomic_lineage_SUPERPHYLUM
Taxonomic_lineage_SUPERCLASS
Taxonomic_lineage_INFRACLASS
Taxonomic_lineage_SUPERORDER
Taxonomic_lineage_INFRAORDER
Taxonomic_lineage_PARVORDER
Taxon

Virus_hosts
Fragment
Gene_encoded_by
Alternative_products_isoforms
Erroneous_termination
Erroneous_translation
Polymorphism
RNA_editing
Non_adjacent_residues
Non_standard_residue
Non_terminal_residue
Sequence_uncertainty
Absorption
ChEBI_Catalytic_activity
Pathway
Redox_potential
Temperature_dependence
pH_dependence
Metal_binding
Site
Allergenic_properties
Biotechnological_use
Pharmaceutical_use
Toxic_dose
Intramembrane
Topological_domain
Transmembrane
Disulfide_bond
Glycosylation
Peptide
Propeptide
Signal_peptide
Transit_peptide
Coiled_coil
Repeat
Zinc_finger
Taxonomic_lineage_SUPERKINGDOM
Taxonomic_lineage_SUPERPHYLUM
Taxonomic_lineage_SUBTRIBE
Taxonomic_lineage_SUBSPECIES
Taxonomic_lineage_VARIETAS
Taxonomic_lineage_FORMA
Cross_reference_ABCD
Cross_reference_Allergome
Cross_reference_ArachnoServer
Cross_reference_CAZy
Cross_reference_CLAE
Cross_reference_CollecTF
Cross_reference_ComplexPortal
Cross_reference_COMPLUYEAST_2DPAGE
Cross_reference_ConoServer
Cross_reference_DDBJ
Cross_re

BRENDA_KM_4_methylumbelliferyl_beta_D_glucopyranoside_DATA
BRENDA_KM_4_methylumbelliferyl_beta_D_glucopyranoside_UNITS
BRENDA_KM_4_methylumbelliferyl_beta_D_glucopyranoside_REFS
BRENDA_KM_4_methylumbelliferyl_beta_D_glucopyranoside_COMMENT
BRENDA_REFERENCES_30
BRENDA_REFERENCES_30_PUBMED
BRENDA_REFERENCES_217
BRENDA_REFERENCES_217_PUBMED
BRENDA_REFERENCES_226
BRENDA_REFERENCES_226_PUBMED
BRENDA_REFERENCES_222
BRENDA_REFERENCES_222_PUBMED
BRENDA_REFERENCES_273
BRENDA_REFERENCES_273_PUBMED
BRENDA_KM_daidzein_7'_O_6''_O_beta_D_malonyl_beta_D_glucoside_DATA
BRENDA_KM_daidzein_7'_O_6''_O_beta_D_malonyl_beta_D_glucoside_UNITS
BRENDA_KM_daidzein_7'_O_6''_O_beta_D_malonyl_beta_D_glucoside_REFS
BRENDA_KM_daidzein_7'_O_6''_O_beta_D_malonyl_beta_D_glucoside_COMMENT
BRENDA_KM_daidzein_7'_O_beta_D_glucoside_DATA
BRENDA_KM_daidzein_7'_O_beta_D_glucoside_UNITS
BRENDA_KM_daidzein_7'_O_beta_D_glucoside_REFS
BRENDA_KM_daidzein_7'_O_beta_D_glucoside_COMMENT
BRENDA_KM_genistein_7_O_6''_O_malonyl_beta_D_gl

Virus_hosts
Gene_encoded_by
Alternative_products_isoforms
Erroneous_gene_model_prediction
Erroneous_initiation
Erroneous_termination
Erroneous_translation
Polymorphism
RNA_editing
Alternative_sequence
Natural_variant
Non_adjacent_residues
Non_standard_residue
Sequence_uncertainty
Absorption
ChEBI
ChEBI_Catalytic_activity
ChEBI_Cofactor
ChEBI_IDs
Cofactor
Pathway
Redox_potential
Temperature_dependence
DNA_binding
Metal_binding
Nucleotide_binding
Caution
Interacts_with
Allergenic_properties
Biotechnological_use
Involvement_in_disease
Pharmaceutical_use
Toxic_dose
Intramembrane
Topological_domain
Cross_link
Initiator_methionine
Lipidation
Modified_residue
Peptide
Propeptide
Transit_peptide
Beta_strand
Helix
Turn
Domain_[CC]
Coiled_coil
Motif
Repeat
Zinc_finger
Taxonomic_lineage_SUPERKINGDOM
Taxonomic_lineage_KINGDOM
Taxonomic_lineage_SUBKINGDOM
Taxonomic_lineage_SUPERPHYLUM
Taxonomic_lineage_PHYLUM
Taxonomic_lineage_SUBPHYLUM
Taxonomic_lineage_SUPERCLASS
Taxonomic_lineage_INFRACLASS
Taxon

BRENDA_REFERENCES_263
BRENDA_REFERENCES_263_PUBMED
BRENDA_REFERENCES_259
BRENDA_REFERENCES_259_PUBMED
BRENDA_REFERENCES_248
BRENDA_REFERENCES_248_PUBMED
BRENDA_REFERENCES_274
BRENDA_REFERENCES_274_PUBMED
BRENDA_REFERENCES_279
BRENDA_REFERENCES_279_PUBMED
BRENDA_REFERENCES_354
BRENDA_REFERENCES_354_PUBMED
Gene_encoded_by
Alternative_products_isoforms
Erroneous_termination
Polymorphism
RNA_editing
Non_adjacent_residues
Non_standard_residue
Sequence_uncertainty
Absorption
Pathway
Redox_potential
Temperature_dependence
pH_dependence
DNA_binding
Allergenic_properties
Biotechnological_use
Pharmaceutical_use
Toxic_dose
Intramembrane
Peptide
Propeptide
Transit_peptide
Taxonomic_lineage_SUBKINGDOM
Taxonomic_lineage_SUPERPHYLUM
Taxonomic_lineage_SUBTRIBE
Taxonomic_lineage_FORMA
Cross_reference_Allergome
Cross_reference_ArachnoServer
Cross_reference_CAZy
Cross_reference_CGD
Cross_reference_CLAE
Cross_reference_CollecTF
Cross_reference_COMPLUYEAST_2DPAGE
Cross_reference_ConoServer
Cross_reference_

Virus_hosts
Gene_encoded_by
Alternative_products_isoforms
Erroneous_termination
Erroneous_translation
Frameshift
RNA_editing
Non_adjacent_residues
Non_standard_residue
Sequence_uncertainty
Absorption
Redox_potential
DNA_binding
Nucleotide_binding
Caution
Biotechnological_use
Pharmaceutical_use
Toxic_dose
Intramembrane
Topological_domain
Transmembrane
Glycosylation
Lipidation
Peptide
Propeptide
Signal_peptide
Transit_peptide
Motif
Repeat
Zinc_finger
Taxonomic_lineage_SUPERPHYLUM
Taxonomic_lineage_VARIETAS
Taxonomic_lineage_FORMA
Cross_reference_ArachnoServer
Cross_reference_CAZy
Cross_reference_CGD
Cross_reference_CLAE
Cross_reference_CollecTF
Cross_reference_ComplexPortal
Cross_reference_COMPLUYEAST_2DPAGE
Cross_reference_ConoServer
Cross_reference_CORUM
Cross_reference_DDBJ
Cross_reference_DEPOD
Cross_reference_dictyBase
Cross_reference_ELM
EnsemblFungi_transcript
Cross_reference_ENZYME
Cross_reference_ESTHER
Cross_reference_euHCVdb
Cross_reference_GenAtlas
Cross_reference_GenBank
Cro

BRENDA_REFERENCES_152
BRENDA_REFERENCES_152_PUBMED
BRENDA_REFERENCES_164
BRENDA_REFERENCES_164_PUBMED
BRENDA_REFERENCES_166
BRENDA_REFERENCES_166_PUBMED
BRENDA_PM_DATA
BRENDA_PM_REFS
BRENDA_PM_COMMENT
BRENDA_REFERENCES_109
BRENDA_REFERENCES_109_PUBMED
BRENDA_REFERENCES_306
BRENDA_REFERENCES_306_PUBMED
BRENDA_REFERENCES_132
BRENDA_REFERENCES_132_PUBMED
BRENDA_REFERENCES_173
BRENDA_REFERENCES_173_PUBMED
BRENDA_REFERENCES_151
BRENDA_REFERENCES_151_PUBMED
BRENDA_REFERENCES_159
BRENDA_REFERENCES_159_PUBMED
BRENDA_AC_DATA
BRENDA_AC_REFS
BRENDA_AC_COMMENT
BRENDA_REFERENCES_129
BRENDA_REFERENCES_129_PUBMED
Virus_hosts
Gene_encoded_by
Alternative_products_isoforms
Erroneous_gene_model_prediction
Erroneous_initiation
Erroneous_termination
Erroneous_translation
Frameshift
Mass_spectrometry
Polymorphism
RNA_editing
Sequence_caution
Non_adjacent_residues
Non_standard_residue
Sequence_uncertainty
EC_number
Absorption
ChEBI_Catalytic_activity
Kinetics
Pathway
Redox_potential
Temperature_dependence
pH

In [24]:
# Display the (column, count) pairs, sorted by count in descending order.
ordered

[('Taxonomic_lineage_INFRAORDER', 5),
 ('Taxonomic_lineage_SUPERFAMILY', 5),
 ('Cross_reference_TIGRFAMs', 5),
 ('Cross_reference_CDD', 5),
 ('Cross_reference_PANTHER', 5),
 ('BRENDA_SS_DATA', 4),
 ('BRENDA_SS_REFS', 4),
 ('BRENDA_IC50', 4),
 ('BRENDA_EXP_DATA', 4),
 ('BRENDA_EXP_REFS', 4),
 ('BRENDA_EXP_COMMENT', 4),
 ('Erroneous_initiation', 4),
 ('Sequence_caution', 4),
 ('pH_dependence', 4),
 ('Site', 4),
 ('Tissue_specificity', 4),
 ('Post_translational_modification', 4),
 ('Coiled_coil', 4),
 ('Taxonomic_lineage_INFRACLASS', 4),
 ('Taxonomic_lineage_SUPERORDER', 4),
 ('Cross_reference_Araport', 4),
 ('Cross_reference_BioCyc', 4),
 ('Cross_reference_DrugBank', 4),
 ('EnsemblPlants_transcript', 4),
 ('Cross_reference_Gramene', 4),
 ('Cross_reference_PhylomeDB', 4),
 ('Cross_reference_PIRSF', 4),
 ('Cross_reference_ProteomicsDB', 4),
 ('Cross_reference_TAIR', 4),
 ('Transmembrane', 4),
 ('Lipidation', 4),
 ('Cross_reference_ABCD', 4),
 ('Cross_reference_Reactome', 4),
 ('BRENDA_PM_D

In [30]:

# Manual spot check for a single workflow: rebuild the table of filtered
# entries and view one candidate column next to the entry IDs.
ec_num = ec_nums[2]

df = pd.read_csv(f"../workflows/{ec_num}/csv/{ec_num}_uniprot.csv")
filtered_seqs = readFastaFile(f"../workflows/{ec_num}/files/{ec_num}_filt.fasta")
filtered_names = [record.name for record in filtered_seqs]

# Keep only the table rows whose Entry appears in the filtered FASTA file.
is_filtered = df["Entry"].isin(filtered_names)
filtered_df = df.loc[is_filtered]

# Other columns (e.g. 'Cross_reference_TIGRFAMs') can be inspected the same way.
filtered_df.loc[:, ['Entry', 'Taxonomic_lineage_INFRAORDER']]

Unnamed: 0,Entry,Taxonomic_lineage_INFRAORDER
2,A0A0D9RRD7,Simiiformes
3,A0A0D9RYP7,Simiiformes
4,Q09767,
5,Q15382,Simiiformes
6,C9ZVV9,
...,...,...
89,Q68U42,
90,P60763,Simiiformes
91,Q9UL25,Simiiformes
92,Q6T311,Simiiformes
