# Importing Packages

In [None]:
# Mount Google Drive into the Colab filesystem so the "/content/drive/..." paths
# used by the cells below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from itertools import combinations
from propy import PyPro
import propy
import cs

# Helper functions

In [None]:
def csv_to_fasta(csv_file, fasta_file):
    """Convert a peptide CSV file into a FASTA file.

    One FASTA record is written per CSV row: the ``Peptide_ID`` value becomes
    the header line (prefixed with ``>``) and the ``Sequence`` value becomes
    the sequence line.

    Args:
        csv_file: Path of a CSV containing ``Peptide_ID`` and ``Sequence``
            columns. Any other column (for example a leftover ``Unnamed: 0``
            index column) is ignored.
        fasta_file: Path of the FASTA file to write. The file is created or
            overwritten, so calling the function again does not duplicate
            records.

    Raises:
        KeyError: If ``Peptide_ID`` or ``Sequence`` is missing from the CSV.
    """
    data = pd.read_csv(csv_file)
    # Open the output once, in write mode, instead of re-opening it in append
    # mode for every row.
    with open(fasta_file, "w") as f:
        for identifier, sequence in zip(data["Peptide_ID"], data["Sequence"]):
            f.write(f">{identifier}\n{sequence}\n")

def fasta_to_csv(fasta_file, csv_file):
    """Convert a FASTA file into a two-column CSV (``Peptide_ID,Sequence``).

    Each header line (starting with ``>``) opens a new record whose ID is the
    header text without the ``>``. All following non-header lines, up to the
    next header, are concatenated to form the sequence. Lines that appear
    before the first header are discarded.

    Rows are written with the ``csv`` module, so an ID or sequence that
    contains a comma or a quote is quoted correctly instead of producing a
    malformed row.

    Args:
        fasta_file: Path of the FASTA file to read.
        csv_file: Path of the CSV file to write (created or overwritten).
    """
    import csv  # local import so the helper does not depend on file-level imports

    with open(fasta_file, "r") as infile, open(csv_file, "w", newline="") as outfile:
        # "\n" keeps the line endings the hand-written version produced.
        writer = csv.writer(outfile, lineterminator="\n")
        writer.writerow(["Peptide_ID", "Sequence"])
        record_id = None
        chunks = []
        for line in infile:
            line = line.strip()
            if line.startswith(">"):
                # Flush the previous record before starting a new one.
                # `is not None` keeps records whose header is just ">".
                if record_id is not None:
                    writer.writerow([record_id, "".join(chunks)])
                record_id = line[1:]
                chunks = []
            else:
                chunks.append(line)
        # Flush the final record, which has no following header to trigger it.
        if record_id is not None:
            writer.writerow([record_id, "".join(chunks)])

def merge_on_common_column(dataframes, merge_column):
    """Merge a sequence of DataFrames left to right on a shared key.

    The first frame is the starting point; every following frame is joined
    onto the running result with pandas' default (inner) merge on
    ``merge_column``.

    Args:
        dataframes: Non-empty list of DataFrames to combine.
        merge_column: Column name (or list of names) passed as ``on=``.

    Returns:
        The merged DataFrame; the first frame itself when only one is given.
    """
    result = dataframes[0]
    remaining = dataframes[1:]
    for frame in remaining:
        result = result.merge(frame, on=merge_column)
    return result

def drop_outliers(df, columns):
    """Remove rows lying outside the 1.5 * IQR fences of the given columns.

    Columns are processed one after another, so the quartiles of each column
    are computed on the rows that survived the previous columns. Non-numeric
    columns are skipped. Rows whose value is NaN in a processed column are
    removed because they fail the range comparison.

    Args:
        df: Input DataFrame (not modified; a filtered frame is returned).
        columns: Iterable of column names to check.

    Returns:
        The filtered DataFrame.
    """
    for name in columns:
        series = df[name]
        if not pd.api.types.is_numeric_dtype(series):
            continue
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        spread = q3 - q1
        # Inclusive on both ends, matching ">= lower" and "<= upper".
        inside = series.between(q1 - 1.5 * spread, q3 + 1.5 * spread)
        df = df[inside]
    return df

def plot_distributions(df, file_name):
    """Show a histogram with a density curve for every numeric column.

    A separate 8x4 figure is displayed per numeric column: a 30-bin
    density-normalised histogram in blue with a KDE curve in red on top.

    Args:
        df: DataFrame whose numeric columns are plotted.
        file_name: Label appended to each figure title.
    """
    for column in df.select_dtypes(include='number').columns:
        values = df[column]
        plt.figure(figsize=(8, 4))
        values.plot.hist(bins=30, alpha=0.5, label='Histogram', color='blue', density=True)
        values.plot.kde(label='Density', color='red')
        plt.title(f"Distribution of {column} - {file_name}")
        plt.xlabel(column)
        plt.ylabel("Density")
        plt.legend()
        plt.show()

def minmax_normalize(df):
    """Rescale every numeric column of ``df`` to the [0, 1] range.

    Each numeric column is transformed as ``(x - min) / (max - min)``.
    A constant column (``max == min``) maps to 0.0 instead of dividing by
    zero. NaN values are ignored when computing min/max and stay NaN.
    The computation uses pandas only, so the helper does not depend on a
    ``MinMaxScaler`` name being imported elsewhere.

    Args:
        df: DataFrame to normalise. Its numeric columns are overwritten
            in place; non-numeric columns are left untouched.

    Returns:
        The same ``df`` object, for call chaining.
    """
    numeric_columns = df.select_dtypes(include='number').columns
    if len(numeric_columns) == 0:
        # Nothing to scale.
        return df
    values = df[numeric_columns].astype(float)
    col_min = values.min()
    # Replace a zero range by 1 so constant columns become 0.0 rather than NaN.
    col_range = (values.max() - col_min).replace(0, 1)
    df[numeric_columns] = (values - col_min) / col_range
    return df

# Load AMP & non-AMP Datasets

## Load AMPs

In [None]:
# Read the APD3 export, label every row as "AMP", rename the three resulting
# columns to the shared schema, and save the harmonised table.
APD3 = (
    pd.read_csv("/content/drive/MyDrive/AMPs/Project Data/APD3/APD3_Sequence_Data.csv")
    .assign(Activity="AMP")
    .set_axis(['Peptide_ID', 'Sequence', "Activity"], axis=1)
)
APD3.to_csv("/content/drive/MyDrive/AMPs/Project Data/Updated_AFTER_HANDELING/APD3.csv")

In [None]:
# Read the DRAMP spreadsheet, label every row as "AMP", and keep only the ID
# and sequence columns.
DRAMP = pd.read_excel("/content/drive/MyDrive/AMPs/Project Data/DRAMP/general_amps.xlsx")
DRAMP["Activity"] = "AMP"
# Select the columns by name (rather than dropping the others and renaming by
# position) so the result is correct whatever the column order in the sheet.
# A missing column raises KeyError here.
DRAMP = DRAMP[["DRAMP_ID", "Sequence", "Activity"]].rename(columns={"DRAMP_ID": "Peptide_ID"})
DRAMP.to_csv("/content/drive/MyDrive/AMPs/Project Data/Updated_AFTER_HANDELING/DRAMP.csv")

In [None]:
# Read the ADAM export, label every row as "AMP", rename the three resulting
# columns to the shared schema, and save the harmonised table.
ADAM = (
    pd.read_csv("/content/drive/MyDrive/AMPs/Project Data/ADAM/ADAM_Sequence_Data.csv")
    .assign(Activity="AMP")
    .set_axis(['Peptide_ID', 'Sequence', "Activity"], axis=1)
)
ADAM.to_csv("/content/drive/MyDrive/AMPs/Project Data/Updated_AFTER_HANDELING/ADAM.csv")

In [None]:
# Read the CAMP export, label every row as "AMP", rename the three resulting
# columns to the shared schema, and save the harmonised table.
CAMP = (
    pd.read_csv("/content/drive/MyDrive/AMPs/Project Data/CAMP/CAMP_Sequence_Data.csv")
    .assign(Activity="AMP")
    .set_axis(['Peptide_ID', 'Sequence', "Activity"], axis=1)
)
CAMP.to_csv("/content/drive/MyDrive/AMPs/Project Data/Updated_AFTER_HANDELING/CAMP.csv")

In [None]:
# Read the dbAMP export, label every row as "AMP", rename the three resulting
# columns to the shared schema, and save the harmonised table.
dbAMP = (
    pd.read_csv("/content/drive/MyDrive/AMPs/Project Data/dbAMP/dbAMP_Sequence_Data.csv")
    .assign(Activity="AMP")
    .set_axis(['Peptide_ID', 'Sequence', "Activity"], axis=1)
)
dbAMP.to_csv("/content/drive/MyDrive/AMPs/Project Data/Updated_AFTER_HANDELING/dbAMP.csv")

## Load non-AMPs

In [None]:
# Convert the non-AMP FASTA file to CSV first.
fasta_to_csv(
    "/content/drive/MyDrive/AMPs/Project Data/Non-AMPs/train_nonAMP_9777.fasta",
    "/content/drive/MyDrive/AMPs/Project Data/Non-AMPs/nonAMPs.csv",
)

# Load that CSV, label every row as "nonAMP", apply the shared column names,
# and save the harmonised table.
nonAMP = (
    pd.read_csv("/content/drive/MyDrive/AMPs/Project Data/Non-AMPs/nonAMPs.csv")
    .assign(Activity="nonAMP")
    .set_axis(['Peptide_ID', 'Sequence', "Activity"], axis=1)
)
nonAMP.to_csv("/content/drive/MyDrive/AMPs/Project Data/Updated_AFTER_HANDELING/nonAMPs.csv")

## Merging AMP datasets

In [None]:
# Combine the harmonised per-database tables into one AMP table.
Data_Dir = "/content/drive/MyDrive/AMPs/Project Data/Updated_AFTER_HANDELING"

# Sort the listing so the concatenation order is the same on every run.
csv_files = sorted(file for file in os.listdir(Data_Dir) if file.endswith('.csv'))
dataframes = [pd.read_csv(os.path.join(Data_Dir, file)) for file in csv_files]

AMPs = pd.concat(dataframes, axis=0, ignore_index=True)
# The folder also holds nonAMPs.csv; keep only rows labelled "AMP" so that
# non-AMP sequences cannot leak into the AMP set.
AMPs = AMPs[AMPs["Activity"] == "AMP"].reset_index(drop=True)
# Drop the index column written by the earlier to_csv calls, if present.
AMPs = AMPs.drop(columns="Unnamed: 0", errors="ignore")
AMPs.to_csv("/content/drive/MyDrive/AMPs/Project Data/Merged_data/AMPs.csv", index=False)

# Data cleaning

## Cleaning AMPs dataframe

In [None]:
# Filter the merged AMP table: drop sequences shorter than 10 or longer than
# 100 residues, and sequences containing U, O, B, Z, J, X or a space
# (checked on the upper-cased sequence).
data_for_length = pd.read_csv("/content/drive/MyDrive/AMPs/Project Data/Merged_data/AMPs.csv")
data = data_for_length["Sequence"].str.upper()
invalid_conditions = (
    (data.str.len() < 10)
    | (data.str.len() > 100)
    | (data.str.contains("[UOBZJX ]"))
)
data_for_length = data_for_length.loc[~invalid_conditions]

data_for_length.to_csv("/content/drive/MyDrive/AMPs/Project Data/Preprocessed_data/Length&UnknownAA-Filtered_AMPs.csv")

## Cleaning non-AMPs dataframe

In [None]:
# Filter the non-AMP table with the same rules as the AMP table: drop
# sequences shorter than 10 or longer than 100 residues, and sequences
# containing U, O, B, Z, J, X or a space (checked on the upper-cased sequence).
dataa_for_length = pd.read_csv("/content/drive/MyDrive/AMPs/Project Data/Updated_AFTER_HANDELING/nonAMPs.csv")
dataa = dataa_for_length["Sequence"].str.upper()
invalidd_conditions = (
    (dataa.str.len() < 10)
    | (dataa.str.len() > 100)
    | (dataa.str.contains("[UOBZJX ]"))
)
dataa_for_length = dataa_for_length.loc[~invalidd_conditions]

dataa_for_length.to_csv("/content/drive/MyDrive/AMPs/Project Data/Preprocessed_data/Length&UnknownAA-Filtered_NON-AMPs.csv")

## Identity Filtering Using CD-HIT


### Converting CSV to FASTA

In [None]:
# Write the length/residue-filtered AMP table as FASTA (the input format for CD-HIT).
csv_to_fasta("/content/drive/MyDrive/AMPs/Project Data/Preprocessed_data/Length&UnknownAA-Filtered_AMPs.csv",
             "/content/drive/MyDrive/AMPs/Project Data/Preprocessed_data/Length&UnknownAA-Filtered_AMPs.fasta")

In [None]:
# Write the length/residue-filtered non-AMP table as FASTA (the input format for CD-HIT).
csv_to_fasta("/content/drive/MyDrive/AMPs/Project Data/Preprocessed_data/Length&UnknownAA-Filtered_NON-AMPs.csv",
             "/content/drive/MyDrive/AMPs/Project Data/Preprocessed_data/Length&UnknownAA-Filtered_NON-AMPs.fasta")



---


**To remove sequences with more than 90% identity, run the CD-HIT tool (v4.6.6) on each generated FASTA file. Use the following command in a terminal, substituting the actual input and output file names:**

`cd-hit -i my_sequences.fasta -o clustered_sequences.fasta -c 0.9`


---




### Converting fasta to csv

In [None]:
# Convert the CD-HIT-filtered AMP FASTA back to CSV for the loading step below.
# NOTE(review): no matching conversion for the non-AMP CD-HIT output appears in
# this notebook, yet a later cell reads
# "Length&UnknownAA&SIMILARITY-Filtered_NON-AMPs.csv" - confirm that file is
# produced elsewhere.
fasta_to_csv("/content/drive/MyDrive/AMPs/Project Data/Preprocessed_data/cd_hit_final_filtered_AMPs.fasta",
             "/content/drive/MyDrive/AMPs/Project Data/Preprocessed_data/Length&UnknownAA&SIMILARITY-Filtered_AMPs.csv")

# Load filtered datasets

In [None]:
# Load the fully filtered AMP table (malformed CSV rows are skipped) and
# label every row as "AMP".
final_data_AMPs = pd.read_csv(
    "/content/drive/MyDrive/AMPs/Project Data/Preprocessed_data/Length&UnknownAA&SIMILARITY-Filtered_AMPs.csv",
    on_bad_lines="skip",
).assign(Activity="AMP")

In [None]:
# Load the fully filtered non-AMP table (malformed CSV rows are skipped) and
# label every row as "nonAMP".
final_data_NonAMPs = pd.read_csv(
    "/content/drive/MyDrive/AMPs/Project Data/Preprocessed_data/Length&UnknownAA&SIMILARITY-Filtered_NON-AMPs.csv",
    on_bad_lines="skip",
).assign(Activity="nonAMP")

# Merging both classes

In [None]:
# Stack the AMP rows and the non-AMP rows (in that order) into one table with
# a fresh 0..n-1 index, report its shape, and save it.
data_frames = [final_data_AMPs, final_data_NonAMPs]

Final_data = pd.concat(data_frames, axis=0, ignore_index=True)
print(Final_data.shape)

Final_data.to_csv("/content/drive/MyDrive/AMPs/Project Data/Final_data/Final_data.csv")

# Feature Extraction

Note: the dataframe contains all data (AMPs and non-AMPs) merged but not randomized. If you load it from the file path, drop the "Unnamed: 0" column.

In [None]:
# Reload the merged dataset from disk. The file was saved with its index, so
# the frame also carries an "Unnamed: 0" column.
Final_data = pd.read_csv("/content/drive/MyDrive/AMPs/Project Data/Final_data/Final_data.csv")

In [None]:
# Count missing values per column.
Final_data.isna().sum()

Unnamed: 0,0
Peptide_ID,0
Sequence,0
Activity,0


## AAC

In [None]:
# Build the composition feature table: 20 amino-acid composition values plus
# 400 dipeptide composition values per sequence, followed by the class label.
# Only these 420 descriptors are computed. The combined propy helper
# CalculateAADipeptideComposition also produces 8,000 tripeptide values,
# which were previously computed and then sliced away.
data = {}
for seq, label in zip(Final_data['Sequence'], Final_data['Activity']):
    descriptors = propy.AAComposition.CalculateAAComposition(seq)
    descriptors.update(propy.AAComposition.CalculateDipeptideComposition(seq))
    descriptors['Activity'] = label
    # Keyed by sequence: a repeated sequence keeps only its last entry.
    data[seq] = descriptors
AAC_df = pd.DataFrame.from_dict(data, orient='index').reset_index()
AAC_df.rename(columns={'index': 'Sequence'}, inplace=True)

KeyboardInterrupt: 

## Autocorrelation

In [None]:
# Build the autocorrelation feature table: one row per distinct sequence,
# holding the propy CalculateAutoTotal descriptors plus the class label.
data = {}
for seq, label in zip(Final_data['Sequence'], Final_data['Activity']):
    # Keyed by sequence: a repeated sequence keeps only its last entry.
    data[seq] = {**propy.Autocorrelation.CalculateAutoTotal(seq), 'Activity': label}
Autocorr_df = (
    pd.DataFrame.from_dict(data, orient='index')
    .reset_index()
    .rename(columns={'index': 'Sequence'})
)

## CTD

In [None]:
# Build the CTD feature table: one row per distinct sequence, holding the
# propy CalculateCTD descriptors plus the class label.
data = {}
for seq, label in zip(Final_data['Sequence'], Final_data['Activity']):
    # Keyed by sequence: a repeated sequence keeps only its last entry.
    data[seq] = {**propy.CTD.CalculateCTD(seq), 'Activity': label}
ctd_df = (
    pd.DataFrame.from_dict(data, orient='index')
    .reset_index()
    .rename(columns={'index': 'Sequence'})
)

## PseAAC

In [None]:
# Build the pseudo amino-acid composition feature table: one row per distinct
# sequence, holding the propy GetAPseudoAAC descriptors (lamda=9) plus the
# class label.
data = {}
for seq, label in zip(Final_data['Sequence'], Final_data['Activity']):
    # Keyed by sequence: a repeated sequence keeps only its last entry.
    data[seq] = {**propy.PseudoAAC.GetAPseudoAAC(seq, lamda= 9), 'Activity': label}
pseaac_df = (
    pd.DataFrame.from_dict(data, orient='index')
    .reset_index()
    .rename(columns={'index': 'Sequence'})
)

## All possible combinations

In [None]:
# Map a short name to each feature table.
features = {"ctd": ctd_df, "aac": AAC_df, 'autocorr': Autocorr_df, 'pseaac': pseaac_df}

# Enumerate every non-empty subset of the feature names, smallest first.
combs = []
for r in range(1, len(features) + 1):
    combs.extend(combinations(features.keys(), r))

# combinations() never yields the same subset twice, so no set() is needed.
# Keeping the list as generated makes the order identical on every run.
final_combs = list(combs)

In [None]:
# Display the generated feature-set combinations.
final_combs

[('pseaac',),
 ('aac', 'pseaac'),
 ('ctd', 'pseaac'),
 ('aac', 'autocorr'),
 ('autocorr', 'pseaac'),
 ('ctd', 'aac', 'autocorr', 'pseaac'),
 ('aac', 'autocorr', 'pseaac'),
 ('ctd', 'aac', 'autocorr'),
 ('ctd',),
 ('ctd', 'autocorr', 'pseaac'),
 ('ctd', 'aac', 'pseaac'),
 ('autocorr',),
 ('aac',),
 ('ctd', 'autocorr'),
 ('ctd', 'aac')]

In [None]:
# Kept from the original cell; it is not populated here.
merged_combinations = {}

# Every feature table carries both 'Sequence' and 'Activity'. Merge on both
# so the label appears once; merging on 'Sequence' alone would duplicate it
# as Activity_x / Activity_y.
merge_keys = ['Sequence', 'Activity']

output_dir = '/content/drive/MyDrive/ML project/Comb'
# Make sure the destination folder exists before writing.
os.makedirs(output_dir, exist_ok=True)

# Iterate over each combination in final_combs
for comb in final_combs:
    # Collect the feature tables named in this combination
    dfs_to_merge = [features[key] for key in comb]

    # Merge the DataFrames
    merged_df = merge_on_common_column(dfs_to_merge, merge_keys)

    # Create a filename based on the combination
    comb_name = "_".join(sorted(comb))  # Sort to ensure consistent naming
    file_name = f"{comb_name}.csv"

    # Save to CSV
    output_path = os.path.join(output_dir, file_name)
    merged_df.to_csv(output_path, index=False)
    print(f"Saved: {file_name}")
    print(merged_df.shape)

Saved: pseaac.csv
(9228, 40)
Saved: aac_pseaac.csv
(9228, 460)
Saved: ctd_pseaac.csv
(9228, 187)
Saved: aac_autocorr.csv
(9228, 1142)
Saved: autocorr_pseaac.csv
(9228, 760)
Saved: aac_autocorr_ctd_pseaac.csv
(9228, 1327)
Saved: aac_autocorr_pseaac.csv
(9228, 1180)
Saved: aac_autocorr_ctd.csv
(9228, 1289)
Saved: ctd.csv
(9228, 149)
Saved: autocorr_ctd_pseaac.csv
(9228, 907)
Saved: aac_ctd_pseaac.csv
(9228, 607)
Saved: autocorr.csv
(9228, 722)
Saved: aac.csv
(9228, 422)
Saved: autocorr_ctd.csv
(9228, 869)
Saved: aac_ctd.csv
(9228, 569)
