# Deep Neural Network: Anti-Microbial Peptide Prediction

## Data Compilation

In [1]:
# Import Dependencies.
import pandas as pd
import numpy as np
from Bio import SeqIO

In [14]:
# Parse FASTA File into Pandas DataFrame

# FUNCTION: reads FASTA file with SeqIO, populates empty Lists and creates DataFrame from Dictionary.
def FASTA_to_DF(fasta_file):
    """Read a FASTA file into a two-column pandas DataFrame.

    Every record yielded by ``SeqIO.parse`` becomes one row: the record's
    ``id`` goes into 'Sequence_ID' and its sequence, converted to ``str``,
    goes into 'Sequence'.

    Args:
        fasta_file: Path of the FASTA file; it is opened in text read mode.

    Returns:
        A DataFrame with the columns 'Sequence_ID' and 'Sequence', holding
        the records in the order they appear in the file.
    """
    # Pull (id, sequence) pairs out while the handle is still open.
    with open(fasta_file, "r") as handle:
        parsed = [(entry.id, str(entry.seq)) for entry in SeqIO.parse(handle, "fasta")]

    return pd.DataFrame({
        'Sequence_ID': [entry_id for entry_id, _ in parsed],
        'Sequence': [residues for _, residues in parsed],
    })

In [15]:
# Generate Kyte-Doolittle Scores for Amino Acid Sequences

# Create Kyte-Doolittle (Hydrophobicity) score Dictionary
kyte_doolittle_scores = {
    'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5,
    'C': 2.5, 'Q': -3.5, 'E': -3.5, 'G': -0.4,
    'H': -3.2, 'I': 4.5, 'L': 3.8, 'K': -3.9,
    'M': 1.9, 'F': 2.8, 'P': -1.6, 'S': -0.8,
    'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2,
}


# FUNCTION: per-residue Kyte-Doolittle scores for an Amino Acid Sequence
def list_hydrophobicities(sequence):
    """Return one Kyte-Doolittle score per character of ``sequence``.

    Characters that are not keys of ``kyte_doolittle_scores`` score 0. The
    keys are upper-case one-letter codes, so the lookup is case-sensitive.
    """
    return [kyte_doolittle_scores.get(residue, 0) for residue in sequence]


# FUNCTION: summed Kyte-Doolittle score for an Amino Acid Sequence
def sum_hydrophobicities(sequence):
    """Return the sum of the per-residue scores of ``sequence`` (0 if empty)."""
    return sum(list_hydrophobicities(sequence))

In [16]:
# FASTA inputs for the Training and Test Data, one (positive, negative) pair per source.
# NOTE(review): "test_postive_3528.fasta" is spelled this way on purpose here (kept
# byte-for-byte) -- confirm it matches the file name on disk.
train_positive, train_negative = (
    "../Resources/train_positive_final.fasta",
    "../Resources/train_negative_final.fasta",
)
train_positive_2, train_negative_2 = (
    "../Resources/db90_positiveamp.fasta",
    "../Resources/db90_negativeamp.fasta",
)
test_positive, test_negative = (
    "../Resources/test_postive_3528.fasta",
    "../Resources/test_negative_3528.fasta",
)

# FASTA inputs for the Prediction Data (for Model Validation).
validation_positive, validation_negative = (
    "../Resources/AMPlify_AMP_test_common.fa",
    "../Resources/AMPlify_non_AMP_test_balanced.fa",
)

In [17]:
# Parse each Training/Test FASTA file into its own DataFrame,
# loading the positive and negative file of every source together.
train_positive_df, train_negative_df = (
    FASTA_to_DF(train_positive),
    FASTA_to_DF(train_negative),
)
train_positive_2_df, train_negative_2_df = (
    FASTA_to_DF(train_positive_2),
    FASTA_to_DF(train_negative_2),
)
test_positive_df, test_negative_df = (
    FASTA_to_DF(test_positive),
    FASTA_to_DF(test_negative),
)

# Same for the Validation FASTA files.
validation_positive_df, validation_negative_df = (
    FASTA_to_DF(validation_positive),
    FASTA_to_DF(validation_negative),
)

In [18]:
# Label every DataFrame with its Anti-Microbial status in a new 'Anti_Microbial'
# Column: 1 (True) for the positive sets, 0 (False) for the negative sets.
for positive_df, negative_df in (
    (train_positive_df, train_negative_df),
    (train_positive_2_df, train_negative_2_df),
    (test_positive_df, test_negative_df),
    (validation_positive_df, validation_negative_df),
):
    positive_df['Anti_Microbial'] = 1
    negative_df['Anti_Microbial'] = 0

In [19]:
# Seed for the row shuffles below. With a fixed seed the shuffled DataFrames
# (and the CSV files exported from them) come out in the same order on every
# run; change the value to draw a different permutation.
SHUFFLE_SEED = 42

# Concatenate Train and Test DataFrames. ignore_index=True gives the result a
# fresh 0..n-1 index instead of repeating each source DataFrame's row labels.
peptides_df = pd.concat(
    [
        train_positive_df,
        train_negative_df,
        train_positive_2_df,
        train_negative_2_df,
        test_positive_df,
        test_negative_df,
    ],
    ignore_index=True,
)

# Concatenate Validation DataFrames
validation_df = pd.concat(
    [validation_positive_df, validation_negative_df],
    ignore_index=True,
)

# Create Sequence_Length Column (number of characters in each Sequence)
peptides_df['Sequence_Length'] = peptides_df['Sequence'].str.len()
validation_df['Sequence_Length'] = validation_df['Sequence'].str.len()


# Randomise Index/Rows: sample(frac=1) returns every row in shuffled order,
# and reset_index(drop=True) renumbers the rows without keeping the old index.
peptides_df = peptides_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
validation_df = validation_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [20]:
# Both KD Columns are derived from the same 'Sequence' Column.
# NOTE(review): validation_df is not given these Columns in this section --
# confirm that is intended.
peptide_sequences = peptides_df['Sequence']

# KD_Scores: list of per-residue scores from list_hydrophobicities
peptides_df['KD_Scores'] = peptide_sequences.apply(list_hydrophobicities)

# Total_KD_Score: single summed score from sum_hydrophobicities
peptides_df['Total_KD_Score'] = peptide_sequences.apply(sum_hydrophobicities)


In [21]:
# Export the combined peptides DataFrame as a CSV File (header row, no index column).
# KD_Scores holds Python lists, which end up in the CSV as their text form and
# must be parsed back into lists when the file is reloaded.
peptides_df.to_csv(
    path_or_buf='../Resources/peptides.csv',
    header=True,
    index=False,
)


In [22]:
# Export the Validation DataFrame as a CSV File (header row, no index column).
validation_df.to_csv(
    path_or_buf='../Resources/validation.csv',
    header=True,
    index=False,
)
