# Load the sorted FASTA files and try to find columns that behave the same

In [1]:
import pandas as pd
from sequence import * 

In [6]:
# Identifiers of the workflows to analyse. Each value is used as both the
# directory name and the file stem under ../workflows/ (see load_seqs).
# They look like EC numbers written with "_" in place of "." -- TODO confirm.
ec_nums = [
    '3_2_1_4',
    '3_2_1_14',
    '3_6_5_2',
    '2_7_11_22',
    '2_4_99_18',
    '2_7_11_24',
    '2_7_3_3',
    '4_2_1_1',
    '3_2_1_21',
    '2_4_1_207',
    '2_7_11_1',
    '2_7_13_3',
    '2_7_10_2',
    '2_7_10_1',
    '1_14_99_39',
    '2_5_1_18',
    '3_2_1_8',
    '3_5_2_6',
    '1_15_1_1',
    '4_1_1_39',
    '2_7_7_7',
    '3_2_1_55',
    '3_1_1_1',
    '2_7_7_27',
    '1_5_1_20',
    '4_2_1_11',
    '3_2_1_18',
    '1_1_1_219',
    '2_3_2_27',
    '2_7_11_17',
    '3_1_3_2',
    '3_2_1_39',
    '2_7_11_13',
    '2_7_7_48',
    '3_1_3_8',
    '3_1_4_11',
    '3_2_1_51',
    '3_4_21_92',
    '2_7_4_6',
]

In [3]:
def _unique_counts(frame):
    """Map every column of *frame* to the number of distinct values it holds.

    Uses ``Series.unique()``, so a missing value (NaN) is counted as one
    distinct value.
    """
    return {col: len(frame[col].unique()) for col in frame.columns}


def load_seqs(ec_num):
    """Count distinct annotation values per column for kept vs. removed sequences.

    Reads three files under ``../workflows/{ec_num}/``: the annotation table
    ``csv/{ec_num}_uniprot.csv``, the full sequence set ``files/{ec_num}.fasta``
    and the filtered sequence set ``files/{ec_num}_filt.fasta``. Rows of the
    table are matched to sequences through the ``"Entry"`` column and each
    sequence's ``name``.

    Args:
        ec_num: Workflow identifier used as directory name and file stem.

    Returns:
        A ``(pos_column_counts, neg_column_counts)`` tuple of dicts keyed by
        column name. ``pos_column_counts`` holds the number of distinct values
        among rows whose entry is in the filtered FASTA file;
        ``neg_column_counts`` holds the same for rows whose entry is in the
        full FASTA file but not in the filtered one.
    """
    df = pd.read_csv(f"../workflows/{ec_num}/csv/{ec_num}_uniprot.csv")

    unfiltered_seqs = readFastaFile(f"../workflows/{ec_num}/files/{ec_num}.fasta")
    filtered_seqs = readFastaFile(f"../workflows/{ec_num}/files/{ec_num}_filt.fasta")

    filtered_names = [seq.name for seq in filtered_seqs]

    # Set lookup keeps the exclusion test O(1) per sequence instead of
    # scanning the whole list of filtered names every time.
    kept = set(filtered_names)
    removed_names = [seq.name for seq in unfiltered_seqs if seq.name not in kept]

    filtered_df = df.loc[df["Entry"].isin(filtered_names)]
    unfiltered_df = df.loc[df["Entry"].isin(removed_names)]

    pos_column_counts = _unique_counts(filtered_df)
    neg_column_counts = _unique_counts(unfiltered_df)

    return pos_column_counts, neg_column_counts
    

# Columns with a single unique value in the filtered set but more than one among the removed sequences

In [7]:
# For every column, count the number of workflows in which the filtered
# sequences all share one value while the removed sequences show more than one.
entries = {}

for ec in ec_nums:
    pos_column_counts, neg_column_counts = load_seqs(ec)

    # Look each column up by name rather than zipping the two dicts, so the
    # comparison never depends on both dicts sharing the same key order.
    for column, pos_unique in pos_column_counts.items():
        neg_unique = neg_column_counts.get(column, 0)
        if pos_unique == 1 and neg_unique > 1:
            entries[column] = entries.get(column, 0) + 1

# Most frequently flagged columns first; sorted() is stable, so columns with
# equal counts keep the order in which they were first recorded.
ordered = sorted(entries.items(), key=lambda item: item[1], reverse=True)


In [8]:
# Display the (column, count) pairs, highest count first.
ordered

[('Cross_reference_SUPFAM', 12),
 ('Cross_reference_Gene3D', 11),
 ('Cross_reference_CDD', 10),
 ('Cross_reference_PRINTS', 10),
 ('Cross_reference_PANTHER', 10),
 ('Cross_reference_PIRSF', 9),
 ('Transmembrane', 9),
 ('Taxonomic_lineage_INFRACLASS', 8),
 ('Cross_reference_DrugBank', 8),
 ('Cross_reference_TIGRFAMs', 8),
 ('Protein_families', 8),
 ('Cofactor', 7),
 ('EnsemblPlants_transcript', 7),
 ('Cross_reference_Gramene', 7),
 ('Lipidation', 7),
 ('Cross_reference_SMART', 7),
 ('BRENDA_SS_DATA', 6),
 ('BRENDA_SS_REFS', 6),
 ('BRENDA_EXP_DATA', 6),
 ('BRENDA_EXP_REFS', 6),
 ('BRENDA_EXP_COMMENT', 6),
 ('ChEBI', 6),
 ('ChEBI_Cofactor', 6),
 ('ChEBI_IDs', 6),
 ('pH_dependence', 6),
 ('Coiled_coil', 6),
 ('Taxonomic_lineage_INFRAORDER', 6),
 ('Taxonomic_lineage_SUPERFAMILY', 6),
 ('Taxonomic_lineage_SUBSPECIES', 6),
 ('Cross_reference_DNASU', 6),
 ('BRENDA_AC_DATA', 6),
 ('BRENDA_AC_REFS', 6),
 ('BRENDA_AC_COMMENT', 6),
 ('BRENDA_KI', 6),
 ('Binding_site', 6),
 ('Propeptide', 6),
 ('Cr

In [30]:

# Spot-check a single workflow: show the infraorder recorded for every
# sequence that is present in the filtered FASTA file.
ec_num = ec_nums[2]

df = pd.read_csv(f"../workflows/{ec_num}/csv/{ec_num}_uniprot.csv")

filtered_seqs = readFastaFile(f"../workflows/{ec_num}/files/{ec_num}_filt.fasta")
filtered_names = [seq.name for seq in filtered_seqs]

# Keep only the annotation rows whose "Entry" appears in the filtered FASTA.
filtered_df = df[df["Entry"].isin(filtered_names)]

# Last expression of the cell, so the notebook renders this two-column view.
filtered_df[["Entry", "Taxonomic_lineage_INFRAORDER"]]

#print(filtered_df['Cross_reference_TIGRFAMs'])

Unnamed: 0,Entry,Taxonomic_lineage_INFRAORDER
2,A0A0D9RRD7,Simiiformes
3,A0A0D9RYP7,Simiiformes
4,Q09767,
5,Q15382,Simiiformes
6,C9ZVV9,
...,...,...
89,Q68U42,
90,P60763,Simiiformes
91,Q9UL25,Simiiformes
92,Q6T311,Simiiformes
