# Load in sorted fasta files and try to find columns that behave the same

In [3]:
import pandas as pd
from sequence import * 

In [21]:
def _unique_counts(frame):
    """Map each column of *frame* to its number of distinct values (NaN counts as one)."""
    return {col: len(frame[col].unique()) for col in frame.columns}


def load_seqs(ec_num):
    """Count distinct values per annotation column for one EC number.

    Reads ``../workflows/{ec_num}/csv/{ec_num}_uniprot.csv`` together with the
    FASTA files ``{ec_num}.fasta`` and ``{ec_num}_filt.fasta`` found under
    ``../workflows/{ec_num}/files/``. CSV rows are split on their ``Entry``
    value into two groups:

    * filtered ("positive") rows: the entry is named in the ``_filt`` FASTA file;
    * unfiltered ("negative") rows: the entry is named in the full FASTA file
      but not in the ``_filt`` one.

    Args:
        ec_num: EC number in the form used by the workflow directory names,
            e.g. ``"3_5_2_6"``.

    Returns:
        A ``(pos_column_counts, neg_column_counts)`` tuple of dicts. Each maps
        every CSV column name to the number of distinct values that column
        takes in the filtered / unfiltered rows respectively. A missing value
        (NaN) is counted as one distinct value.
    """
    df = pd.read_csv(f"../workflows/{ec_num}/csv/{ec_num}_uniprot.csv")

    unfiltered_seqs = readFastaFile(f"../workflows/{ec_num}/files/{ec_num}.fasta")
    filtered_seqs = readFastaFile(f"../workflows/{ec_num}/files/{ec_num}_filt.fasta")

    filtered_names = [seq.name for seq in filtered_seqs]

    # Set lookup keeps the exclusion test O(1) per sequence instead of scanning
    # the whole list of filtered names for every unfiltered sequence.
    filtered_lookup = set(filtered_names)
    unfiltered_names = [
        seq.name for seq in unfiltered_seqs if seq.name not in filtered_lookup
    ]

    filtered_df = df.loc[df["Entry"].isin(filtered_names)]
    unfiltered_df = df.loc[df["Entry"].isin(unfiltered_names)]

    pos_column_counts = _unique_counts(filtered_df)
    neg_column_counts = _unique_counts(unfiltered_df)

    return pos_column_counts, neg_column_counts
    

# Differences

In [6]:

# Per-column gap in unique-value counts, computed as unfiltered minus filtered:
#   positive => # entries in filtered < entries in unfiltered
#   negative => # entries in filtered > entries in unfiltered
# The two dicts are walked in parallel, so they are paired by position.
differences = {
    column: neg_count - pos_count
    for (column, neg_count), pos_count in zip(
        neg_column_counts.items(), pos_column_counts.values()
    )
}

# Largest gap first.
sorted_dif = sorted(differences.items(), key=lambda item: item[1], reverse=True)

sorted_dif

24 65
2 4
2 4
10 17
2 2
2 2
5 19
2 16
21 64
3 13
3 8
3 8
3 2
3 2
2 2
2 2
4 2
4 2
3 8
3 8
2 5
2 5
1 2
1 2
1 2
3 2
3 2
5 3
3 4
3 4
2 1
2 1
4 2
2 2
2 2
2 4
2 4
2 4
2 4
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
24 65
24 65
12 37
9 34
8 15
5 3
5 6
11 12
11 12
23 63
24 62
11 25
11 12
1 1
2 2
2 1
1 1
1 1
2 2
1 1
1 2
1 2
1 1
1 1
1 1
2 4
22 37
24 62
2 12
2 13
1 1
1 1
5 8
4 9
1 1
2 3
3 2
1 1
2 17
3 3
2 2
2 2
3 3
2 2
12 26
4 1
1 1
1 1
1 1
1 1
1 1
4 5
1 1
3 2
10 22
1 2
4 5
14 32
1 2
3 4
18 32
4 4
2 2
9 25
5 22
2 3
2 7
4 11
17 34
15 33
9 24
15 31
17 34
1 1
1 1
4 3
1 9
1 1
1 1
12 28
1 1
2 1
3 1
3 17
11 26
2 4
1 1
1 9
4 6
4 23
5 23
1 1
3 19
1 1
1 1
7 13
6 19
6 19
5 19
9 26
13 33
12 28
3 3
12 29
21 40
3 5
13 9
8 6
3 1
3 7
7 1
4 15
6 27
1 1
1 1
11 12
2 1
4 4
2 2
1 1
8 6
4 6
2 2
8 7
3 6
1 2
2 2
10 9
3 4
2 3
2 2
2 4
10 11
4 7
3 4
1 3
10 12
4 4
1 3
1 2
11 12
2 1
1 1
1 1
1 4
1 1
5 21
1 1
2 1
5 25
2 7
3 1
8 25
5 27
4 20
2 6
1 1
2 1
1 1
5 24
10 3
1 2


[]

In [119]:
# Show the non-missing values of the 'Transmembrane' column.
# NOTE(review): in the code shown, unfiltered_df exists only as a local inside
# load_seqs; this cell presumably relies on a notebook-session variable of the
# same name - confirm where it was defined.
unfiltered_df['Transmembrane'].dropna()


35    TRANSMEM 37..53;  /note="Helical";  /evidence=...
36    TRANSMEM 269..289;  /note="Helical";  /evidenc...
Name: Transmembrane, dtype: object

# Columns with a single unique value in the filtered set (but not in the unfiltered set)

In [27]:

ec_nums = ["1_15_1_1", "2_7_11_22", "3_2_1_14", "3_5_2_6", "3_6_5_2"]

# entries[column] = number of EC numbers for which the column has exactly one
# distinct value among the filtered rows but not among the unfiltered rows.
entries = {}

for ec in ec_nums:
    pos_column_counts, neg_column_counts = load_seqs(ec)

    # Pair the two counts by column name rather than by dict position.
    for column, neg_count in neg_column_counts.items():
        pos_count = pos_column_counts[column]
        if pos_count == 1 and neg_count != 1:
            # Start from 0 explicitly instead of seeding the tally with the
            # filtered count, which only worked because that count is 1 here.
            entries[column] = entries.get(column, 0) + 1

# Columns that meet the condition in the most EC numbers come first.
ordered = sorted(entries.items(), key=lambda x: x[1], reverse=True)


In [28]:
# Display the (column, count) pairs, highest count first.
ordered

[('Cross_reference_Gene3D', 3),
 ('Cross_reference_SMART', 3),
 ('Cross_reference_SUPFAM', 3),
 ('BRENDA_PM_DATA', 2),
 ('BRENDA_PM_REFS', 2),
 ('BRENDA_PM_COMMENT', 2),
 ('BRENDA_REFERENCES_132', 2),
 ('BRENDA_REFERENCES_132_PUBMED', 2),
 ('BRENDA_REFERENCES_159', 2),
 ('BRENDA_REFERENCES_159_PUBMED', 2),
 ('Disruption_phenotype', 2),
 ('Transmembrane', 2),
 ('Lipidation', 2),
 ('Cross_reference_Araport', 2),
 ('Cross_reference_CarbonylDB', 2),
 ('Cross_reference_CDD', 2),
 ('EnsemblFungi_transcript', 2),
 ('EnsemblPlants_transcript', 2),
 ('Cross_reference_Gramene', 2),
 ('Cross_reference_Pfam', 2),
 ('Cross_reference_TAIR', 2),
 ('Cross_reference_TIGRFAMs', 2),
 ('pH_dependence', 2),
 ('Caution', 2),
 ('Cross_reference_PIRSF', 2),
 ('Cross_reference_PRINTS', 2),
 ('BRENDA_REFERENCES_69', 2),
 ('BRENDA_REFERENCES_69_PUBMED', 2),
 ('Coiled_coil', 2),
 ('Taxonomic_lineage_SUPERKINGDOM', 2),
 ('BRENDA_REFERENCES_152', 1),
 ('BRENDA_REFERENCES_152_PUBMED', 1),
 ('BRENDA_REFERENCES_164', 

In [30]:
# Load the UniProt annotation table for EC 3.5.2.6 and show the non-missing
# values of its BRENDA_PM_DATA column.
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
df = pd.read_csv("../workflows/3_5_2_6/csv/3_5_2_6_uniprot.csv")
df['BRENDA_PM_DATA'].dropna()

73    proteolytic modification_count=1
Name: BRENDA_PM_DATA, dtype: object