---
jupyter:
  jupytext:
    text_representation:
      extension: .md
      format_name: markdown
      format_version: '1.3'
      jupytext_version: 1.16.4
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
---

In [1]:
import pandas as pd

In [None]:
# Load the enzyme table and display it for a first look at its contents.
df = pd.read_csv('tables/enzyme.csv')
df

In [None]:
# Count the non-null sequences, PDB IDs and UniProt IDs in the enzyme table.
print(f"Sequence count: {df['sequence'].count()}")
print(f"PDB ID count: {df['pdb_id'].count()}")
print(f"UniProt ID count: {df['uniprot_id'].count()}")
# Rows where BOTH identifiers are missing. The previous expression,
# dropna(subset=[...], how='all'), kept the rows that have at least one
# identifier, i.e. the complement of what the printed label announces.
lacks_both_ids = df[['pdb_id', 'uniprot_id']].isna().all(axis=1)
print(f"Sequence without PDB ID nor Uniprot ID: {df.loc[lacks_both_ids, 'sequence'].count()}")

In [2]:
# Enzyme / plastic tables: the enzymes, the enzyme-to-plastic link rows, and
# the plastics themselves.
# NOTE(review): 'tables/enzyme.csv' is also read into `df` in an earlier cell.
enzyme = pd.read_csv('tables/enzyme.csv')
enzyme_has_plastic = pd.read_csv('tables/enzyme_has_plastic.csv')
plastic = pd.read_csv('tables/plastic.csv')

In [28]:
# Attach each enzyme's sequence to its enzyme/plastic link rows (inner join on enzyme id).
merged_df = pd.merge(enzyme_has_plastic[["enzyme_id", "plastic_id"]], enzyme[["id","sequence"]], left_on='enzyme_id', right_on='id', how='inner')
# Attach the plastic abbreviation for every linked plastic_id (inner join on plastic id).
merged_df = pd.merge(merged_df, plastic[["id","abbreviation"]], left_on='plastic_id', right_on='id', how='inner')
# Keep one row per distinct (sequence, abbreviation) pair.
# NOTE(review): sequences are compared as-is here and only upper-cased in a
# later cell, so sequences differing only in letter case would remain separate
# rows — confirm this is intended.
merged_df = merged_df[["sequence", "abbreviation"]].drop_duplicates()
# Pivot to one row per sequence and one column per plastic abbreviation;
# combinations that do not occur are filled with 0.
merged_df = merged_df.pivot_table(index=["sequence"], columns=["abbreviation"], aggfunc=len, fill_value=0).reset_index()
# NOTE(review): presumably a no-op, since fill_value=0 already replaced the
# missing cells — TODO confirm before removing.
merged_df.dropna(inplace=True)
merged_df

abbreviation,sequence,DBP,ECOFLEX,ECOVIO-FT,LDPE,NR,NYLON,O-PVA,P(3HB-CO-3MP),P3HP,...,PHV,PLA,PMCL,PPL,PS,PU,PUR,PVA,TP,unknown_plastic
0,AANPYERGPNPTDALLEARSGPFSVSEENVSRLSASGFGGGTIYYP...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AANPYQRGPDPTESLLRAARGPFAVSEQSVSRLSVSGFGGGRIYYP...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,AANPYQRGPNPTEASITAARGPFNTAEITVSRLSVSGFGGGKIYYP...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,AAVHGPDPTDALLEASRGPYATRQVDVSSLLVSGFGGGTIYYPTTT...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AEKIKICLQKQANSSFSLHNGFGGNLYATEEKRMFELVKPKAGASV...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
810,VRGDAGDNVLKAHAGGDWLFGLDGNDHLIGGQGNDVFVGGAGNDLM...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
811,YAILWSGWASNSKYALIGALRAVAQTISYEVTLAIIILSILLMNGS...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
812,YDVRGGDAYYINNSPRCSIGFSVNGGFLTAGHCGPGTVTGSNRVAM...,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
813,YGHFYTEHNRGHHVRVATPEDPASSRLGESFWAFLPRSVWFSAVSA...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# List the columns of the pivoted table: 'sequence' plus one column per plastic abbreviation.
merged_df.columns

Index(['sequence', 'DBP', 'ECOFLEX', 'ECOVIO-FT', 'LDPE', 'NR', 'NYLON',
       'O-PVA', 'P(3HB-CO-3MP)', 'P3HP', 'P3HV', 'P4HB', 'PA', 'PBAT', 'PBS',
       'PBS-BLEND', 'PBSA', 'PBSA-BLEND', 'PBSET', 'PCL', 'PE', 'PEA', 'PEF',
       'PEG', 'PES', 'PET', 'PHA', 'PHB', 'PHBH', 'PHBV', 'PHBVH', 'PHO',
       'PHPV', 'PHTHALATE', 'PHV', 'PLA', 'PMCL', 'PPL', 'PS', 'PU', 'PUR',
       'PVA', 'TP', 'unknown_plastic'],
      dtype='object', name='abbreviation')

## David Cosas

In [29]:
# Collapse the PU and PUR labels into a single binary "PU/PUR" column
# (1 when either label is set), then discard the two source columns.
merged_df = (
    merged_df
    .assign(**{"PU/PUR": lambda frame: frame["PU"].add(frame["PUR"]).gt(0).astype(int)})
    .drop(columns=["PU", "PUR"])
)
merged_df["PU/PUR"].value_counts()

PU/PUR
0    769
1     46
Name: count, dtype: int64

In [17]:
# Show every label column whose name contains "PBS".
[label for label in merged_df.columns if "PBS" in label]

['PBS', 'PBS-BLEND', 'PBSA', 'PBSA-BLEND', 'PBSET']

In [30]:
# Collapse the NYLON and PA labels into a single binary "NYLON/PA" column
# (1 when either label is set), then discard the two source columns.
merged_df = (
    merged_df
    .assign(**{"NYLON/PA": lambda frame: frame["NYLON"].add(frame["PA"]).gt(0).astype(int)})
    .drop(columns=["NYLON", "PA"])
)
merged_df["NYLON/PA"].value_counts()

NYLON/PA
0    772
1     43
Name: count, dtype: int64

In [31]:
# Fold the DBP label into the existing PHTHALATE column (1 when either label
# is set), then discard DBP.
merged_df = (
    merged_df
    .assign(PHTHALATE=lambda frame: frame["PHTHALATE"].add(frame["DBP"]).gt(0).astype(int))
    .drop(columns=["DBP"])
)
merged_df["PHTHALATE"].value_counts()

PHTHALATE
0    806
1      9
Name: count, dtype: int64

In [33]:
# Show the current column names in alphabetical order.
merged_df.columns.sort_values()

Index(['ECOFLEX', 'ECOVIO-FT', 'LDPE', 'NR', 'NYLON/PA', 'O-PVA',
       'P(3HB-CO-3MP)', 'P3HP', 'P3HV', 'P4HB', 'PBAT', 'PBS', 'PBS-BLEND',
       'PBSA', 'PBSA-BLEND', 'PBSET', 'PCL', 'PE', 'PEA', 'PEF', 'PEG', 'PES',
       'PET', 'PHA', 'PHB', 'PHBH', 'PHBV', 'PHBVH', 'PHO', 'PHPV',
       'PHTHALATE', 'PHV', 'PLA', 'PMCL', 'PPL', 'PS', 'PU/PUR', 'PVA', 'TP',
       'sequence', 'unknown_plastic'],
      dtype='object', name='abbreviation')

In [40]:
# Reference list of plastic labels, upper-cased and sorted alphabetically so
# it can be compared against the column names of merged_df.
test = sorted(
    label.upper()
    for label in (
        'Ecoflex', 'PES', 'PBAT', 'P3HV Oligomers',
        'PHBV Oligomers', 'PCL', 'PMCL', 'O-PVA', 'LDPE', 'NR', 'PLA', 'PHB',
        'P(3HB-co-3MP)', 'P3HP', 'PBS', 'PEG', 'PHV', 'PE', 'PEA', 'P3HV',
        'PHPV', 'PET', 'PVA', 'PHB Oligomers', 'PPL', 'PHBV', 'PBSA', 'PHO',
        'PS', 'PHBVH', 'PHA', 'Ecovio-FT', 'P4HB', 'PBSeT', 'TP', 'PHBH', 'PEF',
        'PU/PUR', 'PBS-BLEND', 'PBSA-BLEND', 'Nylon/PA', 'Phthalate',
    )
)
test

['ECOFLEX',
 'ECOVIO-FT',
 'LDPE',
 'NR',
 'NYLON/PA',
 'O-PVA',
 'P(3HB-CO-3MP)',
 'P3HP',
 'P3HV',
 'P3HV OLIGOMERS',
 'P4HB',
 'PBAT',
 'PBS',
 'PBS-BLEND',
 'PBSA',
 'PBSA-BLEND',
 'PBSET',
 'PCL',
 'PE',
 'PEA',
 'PEF',
 'PEG',
 'PES',
 'PET',
 'PHA',
 'PHB',
 'PHB OLIGOMERS',
 'PHBH',
 'PHBV',
 'PHBV OLIGOMERS',
 'PHBVH',
 'PHO',
 'PHPV',
 'PHTHALATE',
 'PHV',
 'PLA',
 'PMCL',
 'PPL',
 'PS',
 'PU/PUR',
 'PVA',
 'TP']

In [41]:
# Reference labels that have no matching column in merged_df.
[label for label in test if label not in merged_df.columns]

['P3HV OLIGOMERS', 'PHB OLIGOMERS', 'PHBV OLIGOMERS']

In [43]:
# Summarise, for every plastic label column, how many sequences are flagged
# with activity (1) and how many are not (0).
# The label columns are selected by name rather than by position, so the
# summary does not depend on 'sequence' being the first column.
label_columns = merged_df.columns.drop("sequence")

# One [label, n_ones, n_zeros] row per plastic label.
matrix_count = [
    [label, int((merged_df[label] == 1).sum()), int((merged_df[label] == 0).sum())]
    for label in label_columns
]

df_summary = pd.DataFrame(
    data=matrix_count,
    columns=["type_plastic", "has_activity", "has_non_activity"],
).sort_values(by="has_activity", ascending=False)  # most frequent label first
df_summary

Unnamed: 0,type_plastic,has_activity,has_non_activity
37,unknown_plastic,369,446
21,PET,183,632
23,PHB,72,743
22,PHA,57,758
31,PLA,53,762
15,PCL,51,764
38,PU/PUR,46,769
39,NYLON/PA,43,772
9,PBAT,28,787
12,PBSA,20,795


In [44]:
# Keep only the plastics with more than 25 active sequences.
df_summary = df_summary.loc[df_summary["has_activity"].gt(25)]
df_summary

Unnamed: 0,type_plastic,has_activity,has_non_activity
37,unknown_plastic,369,446
21,PET,183,632
23,PHB,72,743
22,PHA,57,758
31,PLA,53,762
15,PCL,51,764
38,PU/PUR,46,769
39,NYLON/PA,43,772
9,PBAT,28,787


In [45]:
# Distinct plastic labels that survived the activity threshold, in table order.
unique_plastic = pd.unique(df_summary["type_plastic"])
unique_plastic

array(['unknown_plastic', 'PET', 'PHB', 'PHA', 'PLA', 'PCL', 'PU/PUR',
       'NYLON/PA', 'PBAT'], dtype=object)

In [46]:
# Keep only sequences whose length lies strictly between the two bounds.
# NOTE(review): the bounds look tied to a downstream model's input limits —
# TODO confirm and document where they come from.
MIN_SEQUENCE_LENGTH = 50
MAX_SEQUENCE_LENGTH = 1022

merged_df["length"] = merged_df["sequence"].str.len()
within_bounds = (merged_df["length"] > MIN_SEQUENCE_LENGTH) & (merged_df["length"] < MAX_SEQUENCE_LENGTH)
# .copy() makes the filtered frame own its data, so later column assignments
# do not write into a slice of the unfiltered frame (SettingWithCopyWarning).
merged_df = merged_df[within_bounds].copy()
merged_df.shape

(809, 42)

In [47]:
# Normalise every sequence to upper case.
# assign() returns a new frame instead of writing into merged_df in place,
# which stays safe (no SettingWithCopyWarning) even when merged_df is a
# filtered selection of an earlier frame.
merged_df = merged_df.assign(sequence=merged_df["sequence"].str.upper())
merged_df

abbreviation,sequence,ECOFLEX,ECOVIO-FT,LDPE,NR,O-PVA,P(3HB-CO-3MP),P3HP,P3HV,P4HB,...,PLA,PMCL,PPL,PS,PVA,TP,unknown_plastic,PU/PUR,NYLON/PA,length
0,AANPYERGPNPTDALLEARSGPFSVSEENVSRLSASGFGGGTIYYP...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,262
1,AANPYQRGPDPTESLLRAARGPFAVSEQSVSRLSVSGFGGGRIYYP...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,261
2,AANPYQRGPNPTEASITAARGPFNTAEITVSRLSVSGFGGGKIYYP...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,262
3,AAVHGPDPTDALLEASRGPYATRQVDVSSLLVSGFGGGTIYYPTTT...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,257
4,AEKIKICLQKQANSSFSLHNGFGGNLYATEEKRMFELVKPKAGASV...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
810,VRGDAGDNVLKAHAGGDWLFGLDGNDHLIGGQGNDVFVGGAGNDLM...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,70
811,YAILWSGWASNSKYALIGALRAVAQTISYEVTLAIIILSILLMNGS...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,205
812,YDVRGGDAYYINNSPRCSIGFSVNGGFLTAGHCGPGTVTGSNRVAM...,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,176
813,YGHFYTEHNRGHHVRVATPEDPASSRLGESFWAFLPRSVWFSAVSA...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,126


In [48]:
def checking_non_canonical_residues(sequence):
    """Report whether a sequence contains only the 20 canonical residues.

    Despite the name, the result is True for a fully canonical sequence and
    False as soon as one residue is not among the allowed one-letter codes.
    The comparison is case-sensitive (only upper-case codes are listed), and
    an empty sequence yields True.

    Args:
        sequence: Iterable of residues, typically a string of one-letter codes.

    Returns:
        bool: True if every residue is canonical, False otherwise.
    """
    canonical_residues = (
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'N', 'K',
        'L', 'M', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y',
    )
    # all() stops at the first residue that is not canonical.
    return all(residue in canonical_residues for residue in sequence)

In [50]:
# Flag each sequence as canonical (True) or not (False).
# Series.map applies the check element-wise and keeps the result aligned on
# the index, replacing the per-row label lookup merged_df["sequence"][index],
# which is slow and returns a Series instead of a string on duplicate labels.
check_canon = merged_df["sequence"].map(checking_non_canonical_residues)

# assign() builds a new frame rather than writing a column into merged_df in
# place, which avoids SettingWithCopyWarning on a filtered frame.
merged_df = merged_df.assign(is_canon=check_canon)
merged_df["is_canon"].value_counts()

is_canon
True     802
False      7
Name: count, dtype: int64

In [51]:
# Drop every sequence that was flagged as containing a non-canonical residue.
merged_df = merged_df.loc[merged_df["is_canon"]]

In [52]:
# Display the table after the canonical-residue filter (only rows with is_canon == True remain).
merged_df

abbreviation,sequence,ECOFLEX,ECOVIO-FT,LDPE,NR,O-PVA,P(3HB-CO-3MP),P3HP,P3HV,P4HB,...,PMCL,PPL,PS,PVA,TP,unknown_plastic,PU/PUR,NYLON/PA,length,is_canon
0,AANPYERGPNPTDALLEARSGPFSVSEENVSRLSASGFGGGTIYYP...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,262,True
1,AANPYQRGPDPTESLLRAARGPFAVSEQSVSRLSVSGFGGGRIYYP...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,261,True
2,AANPYQRGPNPTEASITAARGPFNTAEITVSRLSVSGFGGGKIYYP...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,262,True
3,AAVHGPDPTDALLEASRGPYATRQVDVSSLLVSGFGGGTIYYPTTT...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,257,True
4,AEKIKICLQKQANSSFSLHNGFGGNLYATEEKRMFELVKPKAGASV...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,642,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
810,VRGDAGDNVLKAHAGGDWLFGLDGNDHLIGGQGNDVFVGGAGNDLM...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,70,True
811,YAILWSGWASNSKYALIGALRAVAQTISYEVTLAIIILSILLMNGS...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,205,True
812,YDVRGGDAYYINNSPRCSIGFSVNGGFLTAGHCGPGTVTGSNRVAM...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,176,True
813,YGHFYTEHNRGHHVRVATPEDPASSRLGESFWAFLPRSVWFSAVSA...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,126,True


In [53]:
# Export the cleaned table to CSV without the row index.
# NOTE(review): the destination is a user-specific Desktop path, and the helper
# columns "length" and "is_canon" are exported along with the labels — confirm
# both are intended.
merged_df.to_csv("~/Desktop/enzymes_plastics.csv", index=False)