In [1]:
import warnings
# Blanket filter: no category or module is specified, so every warning raised
# later in the session (pandas warnings included) is hidden from the cell outputs.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd

In [3]:
def process_dataset(df_pivoted, positive_activity, negative_activities, name_export):
    """Build a binary dataset for one activity and export it as CSV.

    Rows of ``df_pivoted`` are split into two classes:

    * positives (label taken from the ``positive_activity`` column): rows where
      ``positive_activity == 1`` and whose sequence is not flagged with any of
      the ``negative_activities``;
    * negatives (label ``0``): unique sequences flagged with at least one of
      the ``negative_activities`` that never appear in a positive row.

    Sequences that carry the positive label and at least one negative label
    are dropped from both classes.

    Negatives are written in order of first appearance in ``df_pivoted`` so the
    exported file is identical across runs (iterating a ``set`` of strings
    is not stable between interpreter sessions).

    Parameters
    ----------
    df_pivoted : pandas.DataFrame
        Table with a ``"sequence"`` column and one indicator column per
        activity. It is not modified.
    positive_activity : str
        Name of the column defining the positive class.
    negative_activities : iterable of str
        Names of the columns whose flagged sequences form the negative class.
    name_export : str or path-like
        Destination passed to ``DataFrame.to_csv`` (written without the index).

    Returns
    -------
    None
        The result is written to ``name_export`` and the class counts are
        printed.

    Raises
    ------
    KeyError
        If ``"sequence"``, ``positive_activity`` or any negative activity is
        not a column of ``df_pivoted``.
    """
    negative_activities = list(negative_activities)

    # Fail early with the full list of absent columns instead of the first one only.
    required = ["sequence", positive_activity, *negative_activities]
    missing = [column for column in required if column not in df_pivoted.columns]
    if missing:
        raise KeyError(f"Columns missing from df_pivoted: {missing}")

    is_positive = df_pivoted[positive_activity] == 1
    has_negative_label = (df_pivoted[negative_activities] == 1).any(axis=1)

    positive_sequences = df_pivoted.loc[is_positive, "sequence"]
    # drop_duplicates keeps the first occurrence, giving a deterministic order.
    negative_sequences = df_pivoted.loc[has_negative_label, "sequence"].drop_duplicates()

    # Positives: flagged rows whose sequence never shows up under a negative label.
    overlaps_negative = df_pivoted["sequence"].isin(negative_sequences)
    df_positive = df_pivoted.loc[
        is_positive & ~overlaps_negative, ["sequence", positive_activity]
    ]

    # Negatives: negative-labelled sequences that are not positive anywhere.
    kept_negatives = negative_sequences[~negative_sequences.isin(positive_sequences)]
    df_negatives = pd.DataFrame({"sequence": kept_negatives.to_numpy()})
    df_negatives[positive_activity] = 0

    df_to_work = pd.concat([df_positive, df_negatives], ignore_index=True)
    df_to_work.columns = ["sequence", "activity"]
    df_to_work.to_csv(name_export, index=False)

    print(df_to_work["activity"].value_counts())

In [4]:
# Load the pivoted table (one row per sequence, one indicator column per activity)
# that every dataset-generation cell below passes to process_dataset.
# NOTE(review): the rendered output shows an "Unnamed: 0" column, presumably a row
# index written by an earlier to_csv call — consider index_col=0; confirm against the file.
df_pivoted = pd.read_csv("../../pivoted_dataset/pivoted_sequences.csv")
# Bare last expression: shows the frame as the cell output.
df_pivoted

Unnamed: 0,sequence,Anti_HIV,Therapeutic,Anti_nematode,Anti_coronaviridae,Anti_feline_coronavirus,Enzyme_inhibitor,Anti_west_nile_virus,Anti_junin_virus,Anti_iridoviridae,...,Anti_human_coronavirus,Anti_human_parainfluenza_virus,Anti_puumala_virus,Anti_amnesic,Cytokine,Anti_hendra_virus,Potentiator,Neuropeptide,Anti_white_spot_syndrome_virus,Anti_herpesviridae
0,MFTLKKTLLLLFFIGTISLSLCEQERDADEDEGEALEEVKRGLWDT...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,QMIVIELGTNPLK,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,PTSRGNNRRPQTRGMVEECCFRSCDLNLLEQYCAKPAKSERDVSAT...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,RCICTLGVC,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,MNFKYIVAVSFLIASGYARSVRNDEQSLSQRDVLEEESPREIRGIG...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86472,ILMCFSID,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86473,VIGGDECNINEHRFLVALYDGLSGTFLCGG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86474,MAAHKSFRIKQKLAKKLKQNRSVPQWVRLATGNTIRYNAKRRHWRR...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86475,GIACLCDSDGPSVRGNTLSGTYWLAGCPSGWHNCKSSGQLIGACCKQ,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Generating Toxic dataset

In [21]:
# Toxic task: positives are rows flagged "Toxic"; negatives are drawn from the
# labels listed below.
positive_activity = "Toxic"
name_export = "../../datasets_per_task/toxic/dataset_for_process.csv"
negative_activities = [
    "Therapeutic",
    "Signal_peptide",
    "Other",
    "Immunological",
    "Cell_cell_communication",
    "Drug_delivery_vehicle",
    "Propeptide",
    "Cosmetic_and_dermatology",
    "Molecular_binding",
    "Taste",
    "Neurological",
]

process_dataset(
    df_pivoted=df_pivoted,
    positive_activity=positive_activity,
    negative_activities=negative_activities,
    name_export=name_export,
)

activity
0    67373
1     9759
Name: count, dtype: int64


### Generating Antibacterial

In [26]:
# Antibacterial task: positives are rows flagged "Antibacterial"; negatives are
# drawn from the labels listed below.
positive_activity = "Antibacterial"
name_export = "../../datasets_per_task/antibacterial/dataset_for_process.csv"
negative_activities = [
    "Signal_peptide",
    "Other",
    "Anti_fungal",
    "Antiparasitic",
    "Antiviral",
    "Anuran_defense",
    "Toxic",
    "Antiprotozoal",
    "Metabolic",
    "Anticancer",
    "Inhibitor",
    "Anti_oxidative",
    "Potentiator",
    "Activator",
    "Stimulator",
    "Sperm_activating",
    "Spermicide",
    "Anti_toxin",
    "Anti_allergen",
    "Antiatherosclerotic",
    "Antiosteoporosis",
    "Immunological",
    "Cell_cell_communication",
    "Drug_delivery_vehicle",
    "Propeptide",
    "Cosmetic_and_dermatology",
    "Molecular_binding",
    "Taste",
    "Neurological",
]

process_dataset(
    df_pivoted=df_pivoted,
    positive_activity=positive_activity,
    negative_activities=negative_activities,
    name_export=name_export,
)

activity
0    57303
1     4686
Name: count, dtype: int64


### Generating Antifungal

In [28]:
# Antifungal task: positives are rows flagged "Anti_fungal"; negatives are drawn
# from the labels listed below.
positive_activity = "Anti_fungal"
name_export = "../../datasets_per_task/antifungal/dataset_for_process.csv"
negative_activities = [
    "Signal_peptide",
    "Other",
    "Antibacterial",
    "Antiparasitic",
    "Antiviral",
    "Anuran_defense",
    "Toxic",
    "Antiprotozoal",
    "Metabolic",
    "Anticancer",
    "Inhibitor",
    "Anti_oxidative",
    "Potentiator",
    "Activator",
    "Stimulator",
    "Sperm_activating",
    "Spermicide",
    "Anti_toxin",
    "Anti_allergen",
    "Antiatherosclerotic",
    "Antiosteoporosis",
    "Immunological",
    "Cell_cell_communication",
    "Drug_delivery_vehicle",
    "Propeptide",
    "Cosmetic_and_dermatology",
    "Molecular_binding",
    "Taste",
    "Neurological",
]

process_dataset(
    df_pivoted=df_pivoted,
    positive_activity=positive_activity,
    negative_activities=negative_activities,
    name_export=name_export,
)

activity
0    67350
1     1126
Name: count, dtype: int64


### Generating Antiparasitic

In [5]:
# Antiparasitic task: positives are rows flagged "Antiparasitic"; negatives are
# drawn from the labels listed below.
# NOTE(review): the recorded output shows only 90 positives; "Antiprotozoal" is in
# the negative list and may overlap heavily with this label — confirm it belongs here.
positive_activity = "Antiparasitic"
name_export = "../../datasets_per_task/antiparasitic/dataset_for_process.csv"
negative_activities = [
    "Signal_peptide",
    "Other",
    "Antibacterial",
    "Anti_fungal",
    "Antiviral",
    "Anuran_defense",
    "Toxic",
    "Antiprotozoal",
    "Metabolic",
    "Anticancer",
    "Inhibitor",
    "Anti_oxidative",
    "Potentiator",
    "Activator",
    "Stimulator",
    "Sperm_activating",
    "Spermicide",
    "Anti_toxin",
    "Anti_allergen",
    "Antiatherosclerotic",
    "Antiosteoporosis",
    "Immunological",
    "Cell_cell_communication",
    "Drug_delivery_vehicle",
    "Propeptide",
    "Cosmetic_and_dermatology",
    "Molecular_binding",
    "Taste",
    "Neurological",
]

process_dataset(
    df_pivoted=df_pivoted,
    positive_activity=positive_activity,
    negative_activities=negative_activities,
    name_export=name_export,
)

0    72152
1       90
Name: activity, dtype: int64


### Generating Antiprotozoal

In [30]:
# Antiprotozoal task: positives are rows flagged "Antiprotozoal"; negatives are
# drawn from the labels listed below.
# NOTE(review): the output recorded for this cell lists only label 0, i.e. no
# positive rows were exported. Possibly every Antiprotozoal sequence also carries
# one of the negative labels (e.g. "Antiparasitic") — confirm before using this file.
positive_activity = "Antiprotozoal"
name_export = "../../datasets_per_task/antiprotozoal/dataset_for_process.csv"
negative_activities = [
    "Signal_peptide",
    "Other",
    "Antibacterial",
    "Anti_fungal",
    "Antiviral",
    "Anuran_defense",
    "Toxic",
    "Antiparasitic",
    "Metabolic",
    "Anticancer",
    "Inhibitor",
    "Anti_oxidative",
    "Potentiator",
    "Activator",
    "Stimulator",
    "Sperm_activating",
    "Spermicide",
    "Anti_toxin",
    "Anti_allergen",
    "Antiatherosclerotic",
    "Antiosteoporosis",
    "Immunological",
    "Cell_cell_communication",
    "Drug_delivery_vehicle",
    "Propeptide",
    "Cosmetic_and_dermatology",
    "Molecular_binding",
    "Taste",
    "Neurological",
]

process_dataset(
    df_pivoted=df_pivoted,
    positive_activity=positive_activity,
    negative_activities=negative_activities,
    name_export=name_export,
)

activity
0    77987
Name: count, dtype: int64


### Generating Antiviral

In [31]:
# Antiviral task: positives are rows flagged "Antiviral"; negatives are drawn
# from the labels listed below.
positive_activity = "Antiviral"
name_export = "../../datasets_per_task/antiviral/dataset_for_process.csv"
negative_activities = [
    "Signal_peptide",
    "Other",
    "Antibacterial",
    "Anti_fungal",
    "Antiprotozoal",
    "Anuran_defense",
    "Toxic",
    "Antiparasitic",
    "Metabolic",
    "Anticancer",
    "Inhibitor",
    "Anti_oxidative",
    "Potentiator",
    "Activator",
    "Stimulator",
    "Sperm_activating",
    "Spermicide",
    "Anti_toxin",
    "Anti_allergen",
    "Antiatherosclerotic",
    "Antiosteoporosis",
    "Immunological",
    "Cell_cell_communication",
    "Drug_delivery_vehicle",
    "Propeptide",
    "Cosmetic_and_dermatology",
    "Molecular_binding",
    "Taste",
    "Neurological",
]

process_dataset(
    df_pivoted=df_pivoted,
    positive_activity=positive_activity,
    negative_activities=negative_activities,
    name_export=name_export,
)

activity
0    72975
1     2559
Name: count, dtype: int64


### Generating Anuran Defense

In [32]:
# Anuran defense task: positives are rows flagged "Anuran_defense"; negatives are
# drawn from the labels listed below.
positive_activity = "Anuran_defense"
name_export = "../../datasets_per_task/anuran_defense/dataset_for_process.csv"
negative_activities = [
    "Signal_peptide",
    "Other",
    "Antibacterial",
    "Anti_fungal",
    "Antiprotozoal",
    "Antiviral",
    "Toxic",
    "Antiparasitic",
    "Metabolic",
    "Anticancer",
    "Inhibitor",
    "Anti_oxidative",
    "Potentiator",
    "Activator",
    "Stimulator",
    "Sperm_activating",
    "Spermicide",
    "Anti_toxin",
    "Anti_allergen",
    "Antiatherosclerotic",
    "Antiosteoporosis",
    "Immunological",
    "Cell_cell_communication",
    "Drug_delivery_vehicle",
    "Propeptide",
    "Cosmetic_and_dermatology",
    "Molecular_binding",
    "Taste",
    "Neurological",
]

process_dataset(
    df_pivoted=df_pivoted,
    positive_activity=positive_activity,
    negative_activities=negative_activities,
    name_export=name_export,
)

activity
0    76346
1      585
Name: count, dtype: int64


### Generating Antimicrobial

In [33]:
# Antimicrobial task: positives are rows flagged "Antimicrobial"; negatives are
# drawn from the labels listed below.
positive_activity = "Antimicrobial"
name_export = "../../datasets_per_task/antimicrobial/dataset_for_process.csv"
negative_activities = [
    "Signal_peptide",
    "Other",
    "Metabolic",
    "Anticancer",
    "Inhibitor",
    "Anti_oxidative",
    "Potentiator",
    "Activator",
    "Stimulator",
    "Sperm_activating",
    "Spermicide",
    "Anti_toxin",
    "Anti_allergen",
    "Antiatherosclerotic",
    "Antiosteoporosis",
    "Immunological",
    "Cell_cell_communication",
    "Drug_delivery_vehicle",
    "Propeptide",
    "Cosmetic_and_dermatology",
    "Molecular_binding",
    "Taste",
    "Neurological",
]

process_dataset(
    df_pivoted=df_pivoted,
    positive_activity=positive_activity,
    negative_activities=negative_activities,
    name_export=name_export,
)

activity
0    42277
1    25273
Name: count, dtype: int64
