**LOADING THE DATASET AND ANALYZING IT**


In [None]:
import pandas as pd

In [None]:
# Read the 'general_amps' worksheet of the Excel workbook into a DataFrame.
dataset_path = '/content/general_amps.xlsx'
df = pd.read_excel(dataset_path, sheet_name='general_amps')

# Preview the first five rows as a quick sanity check on the load.
print(df.head())

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()                # Data types and non-null counts per column
print(df.describe())     # Summary statistics of the numerical features


**Dropping Columns which are not required**


In [None]:
# Columns removed from the frame in this first pass.
columns_to_drop = [
    'DRAMP_ID', 'Swiss_Prot_Entry', 'Pubmed_ID', 'Name',
    'Comments', 'Reference', 'Author', 'Title',
    'PDB_ID', 'Other_Modifications',
    'N-terminal_Modification', 'C-terminal_Modification',
]

# Rebind df to the reduced frame; a KeyError is raised if any listed column is absent
# (for example when this cell is run a second time).
df = df.drop(columns=columns_to_drop)

# Show what is left after the drop.
print("Remaining columns after dropping unnecessary ones:\n", df.columns)


In [None]:
# Check the actual column names
print("Current columns in the DataFrame:\n", df.columns.tolist())

# DataFrame.info() prints its own report and returns None; calling it bare avoids
# the extra "None" line that print(df.info()) produced.
df.info()                # Data types and non-null counts per column
print(df.describe())     # Summary statistics of the numerical features


In [None]:
# Rows without a peptide sequence cannot be featurised, so drop them.
df.dropna(subset=['Sequence'], inplace=True)

# Categorical columns whose gaps are filled with the column's most frequent value.
# ('Binding_Traget' is spelled this way in the column list used throughout the notebook.)
categorical_columns = ['Family', 'Gene', 'Protein_existence', 'Structure',
                       'Hemolytic_activity', 'Linear/Cyclic/Branched', 'Stereochemistry',
                       'Cytotoxicity', 'Binding_Traget']

for column in categorical_columns:
    modes = df[column].mode()
    # mode() is empty when the column holds no non-null value at all; there is
    # nothing to impute with in that case, so leave the column untouched instead
    # of failing on modes[0].
    if modes.empty:
        continue
    df[column] = df[column].fillna(modes.iloc[0])

# Check for remaining missing values to confirm
missing_values = df.isnull().sum()

# Display only the columns that still contain missing values
print(missing_values[missing_values > 0])

# Final check of the dataset. info() prints itself and returns None, so it is not
# wrapped in print().
df.info()


In [None]:
# Frequency of every distinct raw string in the 'Activity' column
# (this runs before the column is collapsed into two classes further down).
activity_counts = df['Activity'].value_counts()

# Display the counts
print(activity_counts)


**PREPROCESSING THE DATASET**


In [None]:
# Step 1: Define a function to clean strings
def clean_string(s):
    """Normalise a text cell: collapse whitespace runs and lowercase.

    Non-string values (numbers, NaN, None, ...) are returned unchanged.
    """
    if not isinstance(s, str):
        return s
    tokens = s.split()  # splitting on any whitespace also trims both ends
    collapsed = ' '.join(tokens)
    return collapsed.lower()

# Step 2: Normalise every object-dtype column except 'Sequence', which is left untouched
text_columns = [
    name for name in df.select_dtypes(include=['object']).columns
    if name != 'Sequence'
]
for name in text_columns:
    df[name] = df[name].apply(clean_string)

# Step 3: Verify the transformation
print(df.head())


In [None]:
# Assuming df is your DataFrame and already loaded
# Define a function to categorize activities
def categorize_activity(activity):
    """Collapse a free-text activity annotation into one of two class labels.

    Args:
        activity: Value taken from the 'Activity' column. Usually a string, but
            missing cells arrive as NaN/None.

    Returns:
        'Antimicrobial' when the text contains an antimicrobial keyword
        (case-insensitive), otherwise 'Non-Antimicrobial'. Missing or non-string
        values are treated as 'Non-Antimicrobial'.
    """
    # Missing annotations have no keyword to match; without this guard .lower()
    # raises AttributeError on NaN/None.
    if not isinstance(activity, str):
        return 'Non-Antimicrobial'

    normalized = activity.strip().lower()

    # The output label 'Non-Antimicrobial' itself contains the substring
    # 'antimicrobial'. Pass it through unchanged so that applying this function
    # to an already-categorised column does not flip the label.
    if normalized == 'non-antimicrobial':
        return 'Non-Antimicrobial'

    antimicrobial_keywords = ['antimicrobial']
    if any(keyword in normalized for keyword in antimicrobial_keywords):
        return 'Antimicrobial'
    return 'Non-Antimicrobial'

# Overwrite the free-text 'Activity' column with the two-class label returned by
# categorize_activity; from here on 'Activity' is the classification target.
df['Activity'] = df['Activity'].apply(categorize_activity)

# Display the updated DataFrame
print(df.head())


**Getting the count of target variables in the 'Activity' column**


In [None]:
# Number of rows per class label in the (now binary) 'Activity' column.
print(df['Activity'].value_counts())

In [None]:
# List the distinct class labels. unique must be *called*; without the parentheses
# print() shows the bound-method repr (and the whole column) instead of the labels.
print(df['Activity'].unique())

**Checking the current columns in the dataset**


In [None]:
# Column names currently present in the frame.
print(df.columns)

In [None]:
# Rows labelled as the negative class
non_antimicrobial_entries = df[df['Activity'] == 'Non-Antimicrobial']
print(non_antimicrobial_entries['Activity'].unique())

# Display a sample of non-antimicrobial entries.
# The sample size is capped at the number of available rows (sample() raises when
# asked for more rows than exist) and seeded so a re-run shows the same rows.
sample_size = min(10, len(non_antimicrobial_entries))  # Adjust the 10 to see more or fewer entries
print(non_antimicrobial_entries.sample(n=sample_size, random_state=42))


In [None]:
# Distinct 'Source' values among the rows labelled Non-Antimicrobial
non_antimicrobial_mask = df['Activity'] == 'Non-Antimicrobial'
unique_non_antimicrobials = df.loc[non_antimicrobial_mask, 'Source'].unique()

print(f"Unique Non-Antimicrobial Sources: {len(unique_non_antimicrobials)}")
print(unique_non_antimicrobials)


In [None]:
# Preview the first rows. head must be *called*; print(df.head) printed the
# bound-method repr, which dumps the whole frame instead of five rows.
print(df.head())

**Checking for Null values**


In [None]:
print(df.columns)


# Per-column tally of cells that hold the literal placeholder string 'not found'
not_found_counts = df.apply(lambda column_values: column_values.eq('not found').sum())

print("The no of not found counts:")
print(not_found_counts)


**Dropping unnecessary columns again and extracting features**


In [None]:
# Second batch of columns removed before modelling.
columns_to_drop = [
    'Binding_Traget', 'Structure_Description', 'Gene',
    'Family', 'Structure', 'Cytotoxicity',
]

# Rebind df to the reduced frame; raises KeyError if a listed column is already gone.
df = df.drop(columns=columns_to_drop)

# Show what is left after the drop.
print("Remaining columns after dropping unnecessary ones:\n", df.columns)


In [None]:
# Count NaN cells per remaining column.
missing_values_count = df.isnull().sum()

# Display the counts of missing values
print(missing_values_count)

# Class balance of the target, for reference before modelling.
print(df['Activity'].value_counts())

**SOME DATA ANALYSIS AND VISUALIZATION**


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many peptides fall into each Activity class
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='Activity', hue='Activity', palette='Set2', legend=False, ax=ax)
ax.set_title('Distribution of Antimicrobial vs Non-Antimicrobial Peptides')
ax.set_xlabel('Activity')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Histogram (30 bins, with KDE overlay) of the 'Sequence_Length' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Sequence_Length'], bins=30, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Peptide Sequence Lengths')
ax.set_xlabel('Sequence Length')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Box plot comparing sequence length between the two Activity classes
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Activity', y='Sequence_Length', hue='Activity',
            palette='coolwarm', dodge=False, ax=ax)
ax.set_title('Sequence Length Distribution by Activity Type')
ax.set_xlabel('Activity')
ax.set_ylabel('Sequence Length')
ax.legend([], [], frameon=False)  # hue duplicates the x axis, so hide the legend
plt.show()


In [None]:
# Pie chart of the 'Linear/Cyclic/Branched' categories, labelled through a legend
fig, ax = plt.subplots(figsize=(8, 6))

# One slice per distinct category value
peptide_counts = df['Linear/Cyclic/Branched'].value_counts()
n_slices = len(peptide_counts)

# Pull every slice slightly away from the centre
explode = [0.1 for _ in range(n_slices)]

# Cycle through the three base colours so there is one colour per slice
base_colors = ['lightblue', 'lightgreen', 'lightcoral']
colors = [base_colors[i % len(base_colors)] for i in range(n_slices)]

ax.pie(
    peptide_counts,
    autopct='%1.1f%%',          # percentage label on each slice
    colors=colors,
    startangle=90,
    explode=explode,
    shadow=True,
    textprops=dict(color="w"),  # white label text
)

# Category names go in a legend placed outside the plot area
ax.legend(peptide_counts.index, title="Peptide Types", loc="upper left", bbox_to_anchor=(1, 1))

ax.set_title('Proportion of Linear, Cyclic, and Branched Peptides')
ax.set_ylabel('')  # no y-label
plt.show()


**Splitting the dataset and applying SMOTE technique to handle imbalances**


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

# Sample DataFrame (replace this with your actual DataFrame)
# df = ...

# Categorical metadata columns that are one-hot encoded next to the k-mer counts.
# Defined once so that selection, concatenation and encoding cannot drift apart.
categorical_features = ['Source', 'Protein_existence', 'Target_Organism',
                        'Hemolytic_activity', 'Linear/Cyclic/Branched',
                        'Stereochemistry']

# Prepare the features and target variable.
# Both are re-indexed to 0..n-1 so that they share row labels with the k-mer frame
# built below (rows may have been dropped earlier, leaving gaps in df's index).
X = df[['Sequence', 'Sequence_Length'] + categorical_features].reset_index(drop=True)
y = df['Activity'].reset_index(drop=True)

# Convert 'Sequence' to k-mer counts (overlapping character n-grams of length k)
k = 3  # Change k for different lengths
vectorizer = CountVectorizer(analyzer='char', ngram_range=(k, k))
# NOTE(review): the vocabulary is learned on all rows, including those that end up
# in the test split, and .toarray() materialises a dense matrix — confirm both are
# acceptable for this dataset size.
X_kmers = vectorizer.fit_transform(X['Sequence']).toarray()

# Create a DataFrame from k-mer features
kmers_df = pd.DataFrame(X_kmers, columns=vectorizer.get_feature_names_out())

# Combine k-mer features with the length and metadata columns (indexes already aligned)
X_combined = pd.concat([kmers_df, X[['Sequence_Length'] + categorical_features]], axis=1)

# One-hot encode the categorical variables
X_encoded = pd.get_dummies(X_combined, columns=categorical_features, drop_first=True)

# Split into training and test sets. stratify=y keeps the class ratio identical in
# both splits, which matters because the classes are imbalanced (hence SMOTE below).
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the minority class on the training data only; the test set stays untouched
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# X_resampled / y_resampled are the balanced training data used by the model cells


**Checking size of Dataset after SMOTE**


In [None]:
# Shapes of the SMOTE-resampled training data and of the untouched test split
print(f'Size of X_train after SMOTE: {X_resampled.shape}')
print(f'Size of y_train after SMOTE: {y_resampled.shape}')
print(f'Size of X_test: {X_test.shape}')
print(f'Size of y_test: {y_test.shape}')


# Class counts of the ORIGINAL training labels (y_train, i.e. before SMOTE)
class_counts = y_train.value_counts()
print(class_counts)

In [None]:
from imblearn.over_sampling import SMOTE

# Same SMOTE call (same seed, same inputs) as in the feature-building cell above;
# it rebuilds X_resampled / y_resampled from X_train / y_train.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)




**Balanced data for both classes**


In [None]:
# Class counts of the resampled training labels.
print(y_resampled.value_counts())

**Training the model and applying Ensemble Voting Classifier method**


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score



# Standardize the feature set.
# The scaled matrices get their own names instead of overwriting X_resampled and
# X_test: overwriting made this cell non-idempotent (a re-run scaled already-scaled
# data) and left X_test standardized for later cells that rebuild X_resampled from
# the unscaled X_train.
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)  # statistics learned on training data only
X_test_scaled = scaler.transform(X_test)                # test data reuses the training statistics

# Initialize the base models
log_reg = LogisticRegression(random_state=42, max_iter=1000)
rf_classifier = RandomForestClassifier(random_state=42)

# Soft-voting ensemble: predicted probabilities are averaged, with the random
# forest weighted twice as much as the logistic regression.
voting_classifier = VotingClassifier(estimators=[
    ('logistic_regression', log_reg),
    ('random_forest', rf_classifier)
], voting='soft', weights=[1, 2])  # Adjust weights as necessary

# Train the Voting Classifier
voting_classifier.fit(X_resampled_scaled, y_resampled)

# Predict on the test set
y_pred = voting_classifier.predict(X_test_scaled)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


**MODEL EVALUATION and the ROC Curve and Confusion Matrix**


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay

# Standardize the feature set.
# Scaled copies are stored under new names so that X_resampled and X_test are not
# overwritten: reassigning them made the cell non-idempotent and left X_test
# standardized for later cells that retrain on data resampled from the unscaled X_train.
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)  # statistics learned on training data only
X_test_scaled = scaler.transform(X_test)                # test data reuses the training statistics

# Encode the string class labels as integers (needed for roc_curve / predict_proba columns)
le = LabelEncoder()
y_resampled_encoded = le.fit_transform(y_resampled)  # Encode training labels
y_test_encoded = le.transform(y_test)  # Encode test labels with the same mapping

# Initialize the base models
log_reg = LogisticRegression(random_state=42, max_iter=1000)
rf_classifier = RandomForestClassifier(random_state=42)

# Soft-voting ensemble with the random forest weighted twice as much
voting_classifier = VotingClassifier(estimators=[
    ('logistic_regression', log_reg),
    ('random_forest', rf_classifier)
], voting='soft', weights=[1, 2])  # Adjust weights as necessary

# Train the Voting Classifier
voting_classifier.fit(X_resampled_scaled, y_resampled_encoded)

# Predict on the test set
y_pred = voting_classifier.predict(X_test_scaled)
# Column 1 is the probability of the class encoded as 1 (le.classes_[1])
y_pred_proba = voting_classifier.predict_proba(X_test_scaled)[:, 1]

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_encoded, y_pred))
print("\nClassification Report:\n", classification_report(y_test_encoded, y_pred))

# Function to plot ROC Curve
def plot_roc_curve(y_true, y_scores, model_name):
    """Draw the ROC curve for one model, with its AUC shown in the legend.

    Args:
        y_true: True binary labels.
        y_scores: Scores / probabilities for the positive class.
        model_name: Text used in the legend entry.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_scores)
    auc_value = roc_auc_score(y_true, y_scores)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(false_pos_rate, true_pos_rate, label=f'{model_name} (AUC = {auc_value:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc='lower right')
    ax.grid()
    plt.show()

# Function to plot Confusion Matrix
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Plot the confusion matrix of one model's predictions.

    Tick labels come from the notebook-level LabelEncoder ``le`` (le.classes_),
    so ``le`` must already be fitted when this is called.
    """
    matrix = confusion_matrix(y_true, y_pred)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=le.classes_)

    fig, ax = plt.subplots(figsize=(8, 6))
    display.plot(cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix for {model_name}')
    plt.show()

# ROC curve from the ensemble's positive-class probabilities on the test set
plot_roc_curve(y_test_encoded, y_pred_proba, 'Voting Classifier')
# Confusion matrix from the ensemble's hard predictions on the test set
plot_confusion_matrix(y_test_encoded, y_pred, 'Voting Classifier')


**Now trying again with the Random Under Sampler method for better accuracy: reducing the majority class and increasing the minority class at the same time**


In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Step 1: Apply SMOTE to the training data to raise the Non-Antimicrobial class to 3500 samples
smote = SMOTE(sampling_strategy={ 'Non-Antimicrobial': 3500 }, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Step 2: Apply random undersampling to cut the Antimicrobial class down to 3500 samples
rus = RandomUnderSampler(sampling_strategy={ 'Antimicrobial': 3500 }, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_resampled, y_resampled)

# Check the new class distribution
print(y_resampled.value_counts())


**Again checking the value count in each class**


In [None]:
# Class counts of the current resampled training labels.
print(y_resampled.value_counts())

**Now implementing using CATBoost for better efficiency**


In [None]:
!pip install catboost

**Training the model again using Random Forest, Logistic Regression and CatBoost Classifier**


**Also adding class weights to the models so that they perform better on the minority class, here 'Non-Antimicrobial'**


In [None]:
from catboost import CatBoostClassifier
# Initialize models with class weights
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
log_reg_model = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
# NOTE(review): class_weights=[1, 5] is applied on top of training data that was
# already resampled in an earlier cell — confirm the extra weighting is intended.
catboost_model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6,
                                     class_weights=[1, 5], verbose=0)  # Adjust class_weights as needed

# Soft-voting ensemble over the three models (probabilities averaged, equal weights)
voting_classifier = VotingClassifier(estimators=[
    ('rf', rf_model),
    ('log_reg', log_reg_model),
    ('catboost', catboost_model)
], voting='soft')  # Use 'soft' for probability-based voting

# Train Voting Classifier
# NOTE(review): X_resampled here is resampled from the unscaled X_train, while the
# earlier scaling cells reassign X_test to a standardized array — confirm that
# training and test features are on the same scale when this cell runs.
voting_classifier.fit(X_resampled, y_resampled)

# Make predictions
y_pred_voting = voting_classifier.predict(X_test)

# Evaluate Voting Classifier
accuracy_voting = accuracy_score(y_test, y_pred_voting)
classification_rep_voting = classification_report(y_test, y_pred_voting)

print("Model: Voting Classifier")
print(f"Accuracy: {accuracy_voting}")
print("Classification Report:")
print(classification_rep_voting)
print("="*50)

# Fit and evaluate each base model on its own, using the same train/test data
for model_name, model in zip(['Random Forest', 'Logistic Regression', 'CatBoost'],
                               [rf_model, log_reg_model, catboost_model]):
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(classification_rep)
    print("="*50)

**MODEL EVALUATION and the ROC Curve and Confusion Matrix of the model**


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder

# Encode the string class labels as integers so probabilities and ROC use 0/1 targets
le = LabelEncoder()
y_resampled_encoded = le.fit_transform(y_resampled)  # Encode training labels
y_test_encoded = le.transform(y_test)  # Encode test labels with the same mapping

# VotingClassifier.fit trains its own clones of the estimators it is given, so
# fitting rf_model / log_reg_model / catboost_model here first was wasted work:
# the ensemble never uses those fitted objects, and the per-model loop at the end
# of this cell fits each of them again before using it.
voting_classifier = VotingClassifier(estimators=[
    ('rf', rf_model),
    ('log_reg', log_reg_model),
    ('catboost', catboost_model)
], voting='soft')  # Use 'soft' for probability-based voting

# Train Voting Classifier
voting_classifier.fit(X_resampled, y_resampled_encoded)

# Make predictions
y_pred_voting = voting_classifier.predict(X_test)
# Column 1 is the probability of the class encoded as 1 (le.classes_[1])
y_pred_proba_voting = voting_classifier.predict_proba(X_test)[:, 1]

# Evaluate Voting Classifier
accuracy_voting = accuracy_score(y_test_encoded, y_pred_voting)
classification_rep_voting = classification_report(y_test_encoded, y_pred_voting)

print("Model: Voting Classifier")
print(f"Accuracy: {accuracy_voting}")
print("Classification Report:")
print(classification_rep_voting)
print("="*50)

# Function to plot ROC Curve
def plot_roc_curve(y_true, y_scores, model_name):
    """Draw the ROC curve for one model, with its AUC shown in the legend.

    Args:
        y_true: True binary labels.
        y_scores: Scores / probabilities for the positive class.
        model_name: Text used in the legend entry.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_scores)
    auc_value = roc_auc_score(y_true, y_scores)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(false_pos_rate, true_pos_rate, label=f'{model_name} (AUC = {auc_value:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc='lower right')
    ax.grid()
    plt.show()

# Function to plot Confusion Matrix
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Plot the confusion matrix of one model's predictions.

    Tick labels come from the notebook-level LabelEncoder ``le`` (le.classes_),
    so ``le`` must already be fitted when this is called.
    """
    matrix = confusion_matrix(y_true, y_pred)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=le.classes_)

    fig, ax = plt.subplots(figsize=(8, 6))
    display.plot(cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix for {model_name}')
    plt.show()

# ROC Curve for Voting Classifier
plot_roc_curve(y_test_encoded, y_pred_proba_voting, 'Voting Classifier')
# Confusion Matrix for Voting Classifier
plot_confusion_matrix(y_test_encoded, y_pred_voting, 'Voting Classifier')

# Fit each base model on the encoded labels and plot its ROC curve and confusion matrix
for model_name, model in zip(['Random Forest', 'Logistic Regression', 'CatBoost'],
                               [rf_model, log_reg_model, catboost_model]):
    # Get predictions and probabilities
    model.fit(X_resampled, y_resampled_encoded)  # Fit the model with encoded labels
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of the class encoded as 1

    # ROC Curve
    plot_roc_curve(y_test_encoded, y_pred_proba, model_name)
    # Confusion Matrix
    plot_confusion_matrix(y_test_encoded, y_pred, model_name)


**Reducing the majority class equal to the minority class**


In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Undersampler that shrinks only the majority class ('majority' strategy), seeded for repeatability
undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)

# Resample the ORIGINAL training split (X_train / y_train), replacing any earlier
# X_resampled / y_resampled
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

# Check the new class distribution
print("Class distribution after undersampling:")
print(y_resampled.value_counts())


In [None]:
# Class counts of the undersampled training labels (same output as the cell above).
print(y_resampled.value_counts())

**Again training the model with the new values of the target class**


In [None]:
from catboost import CatBoostClassifier
# Initialize models with class weights
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
log_reg_model = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
# NOTE(review): class_weights=[1, 5] is applied on top of training data that was
# already resampled in an earlier cell — confirm the extra weighting is intended.
catboost_model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6,
                                     class_weights=[1, 5], verbose=0)  # Adjust class_weights as needed

# Soft-voting ensemble over the three models (probabilities averaged, equal weights)
voting_classifier = VotingClassifier(estimators=[
    ('rf', rf_model),
    ('log_reg', log_reg_model),
    ('catboost', catboost_model)
], voting='soft')  # Use 'soft' for probability-based voting

# Train Voting Classifier
# NOTE(review): X_resampled here is resampled from the unscaled X_train, while the
# earlier scaling cells reassign X_test to a standardized array — confirm that
# training and test features are on the same scale when this cell runs.
voting_classifier.fit(X_resampled, y_resampled)

# Make predictions
y_pred_voting = voting_classifier.predict(X_test)

# Evaluate Voting Classifier
accuracy_voting = accuracy_score(y_test, y_pred_voting)
classification_rep_voting = classification_report(y_test, y_pred_voting)

print("Model: Voting Classifier")
print(f"Accuracy: {accuracy_voting}")
print("Classification Report:")
print(classification_rep_voting)
print("="*50)

# Fit and evaluate each base model on its own, using the same train/test data
for model_name, model in zip(['Random Forest', 'Logistic Regression', 'CatBoost'],
                               [rf_model, log_reg_model, catboost_model]):
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(classification_rep)
    print("="*50)

**MODEL EVALUATION and the ROC Curve and Confusion Matrix of the model**


In [None]:
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder

# Encode the string class labels as integers
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_resampled)  # Encode training labels
y_test_encoded = le.transform(y_test)  # Encode test labels with the same mapping

# Fit the base models on the encoded labels. These fitted objects are used directly
# by the evaluation loop at the end of this cell, which does not refit them.
rf_model.fit(X_resampled, y_train_encoded)
log_reg_model.fit(X_resampled, y_train_encoded)
catboost_model.fit(X_resampled, y_train_encoded)

# Create the Voting Classifier
voting_classifier = VotingClassifier(estimators=[
    ('rf', rf_model),
    ('log_reg', log_reg_model),
    ('catboost', catboost_model)
], voting='soft')  # Use 'soft' for probability-based voting

# Train Voting Classifier
voting_classifier.fit(X_resampled, y_train_encoded)

# Make predictions
y_pred_voting = voting_classifier.predict(X_test)
y_pred_proba_voting = voting_classifier.predict_proba(X_test)[:, 1]  # Probability of the class encoded as 1

# Function to plot ROC Curve (updated to handle label encoding)
def plot_roc_curve(y_true, y_scores, model_name):
    """Draw the ROC curve for one model, with its AUC shown in the legend.

    Args:
        y_true: True binary labels.
        y_scores: Scores / probabilities for the positive class.
        model_name: Text used in the legend entry.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_scores)
    auc_value = roc_auc_score(y_true, y_scores)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(false_pos_rate, true_pos_rate, label=f'{model_name} (AUC = {auc_value:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc='lower right')
    ax.grid()
    plt.show()

# Confusion Matrix function remains the same

# Evaluate the ensemble and each base model with the same two plots.
# The Voting Classifier is part of this list, so it is no longer plotted a second
# time ahead of the loop, and the former if/else (whose two branches were identical)
# is collapsed into a single code path.
# plot_confusion_matrix is the function defined in an earlier cell.
models = [('Voting Classifier', voting_classifier),
          ('Random Forest', rf_model),
          ('Logistic Regression', log_reg_model),
          ('CatBoost', catboost_model)]

for model_name, model in models:
    # Hard predictions and probability of the class encoded as 1
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # ROC Curve
    plot_roc_curve(y_test_encoded, y_pred_proba, model_name)
    # Confusion Matrix
    plot_confusion_matrix(y_test_encoded, y_pred, model_name)


**Since undersampling did not give good results (overfitting), again reducing the majority class and increasing the minority class to 3000 samples each**


In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Step 1: Apply SMOTE to the training data to raise the Non-Antimicrobial class to 3000 samples
smote = SMOTE(sampling_strategy={ 'Non-Antimicrobial': 3000 }, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Step 2: Apply random undersampling to cut the Antimicrobial class down to 3000 samples
rus = RandomUnderSampler(sampling_strategy={ 'Antimicrobial': 3000 }, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_resampled, y_resampled)

# Check the new class distribution
print(y_resampled.value_counts())


**Some data visualization after changing the data values**


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Class distribution of the training split before any resampling.
# y_train (not the full y) is plotted because the resamplers only ever see the
# training split; using y mixed the test rows into the "before" picture.
plt.figure(figsize=(8, 6))
sns.countplot(x=y_train)
plt.title('Class Distribution Before Resampling (Training Split)')
plt.xlabel('Activity Class')
plt.ylabel('Count')
plt.show()

# Class distribution of the resampled training labels. y_resampled at this point
# is the output of SMOTE followed by random undersampling, so the title says so.
plt.figure(figsize=(8, 6))
sns.countplot(x=y_resampled)
plt.title('Class Distribution After SMOTE + Undersampling')
plt.xlabel('Activity Class')
plt.ylabel('Count')
plt.show()


In [None]:
# Total count of every k-mer over all sequences, most frequent first
kmers_freq = kmers_df.sum(axis=0).sort_values(ascending=False)

# Bar chart of the 20 most frequent k-mers
fig, ax = plt.subplots(figsize=(10, 6))
kmers_freq.iloc[:20].plot(kind='bar', color='skyblue', ax=ax)
ax.set_title(f'Top 20 Most Frequent {k}-mers')
ax.set_xlabel(f'{k}-mers')
ax.set_ylabel('Frequency')
plt.show()


**----- THE END -----**


**HYPERPARAMETER TUNING OF THE MODEL**


In [None]:
!pip install bayesian-optimization


In [None]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assuming X_resampled and y_resampled are the full-sized training dataset

# Step 1: Define Bayesian Optimization functions

# Random Forest Optimization
def rf_evaluate(n_estimators, max_depth, min_samples_split):
    """Objective for Bayesian optimisation of the random forest.

    The optimiser passes floats, so each hyperparameter is truncated to int.
    Returns the mean weighted-F1 score of 3-fold cross-validation on the
    notebook-level X_resampled / y_resampled.

    NOTE(review): cross-validating on already-resampled data can place synthetic
    samples and their source rows in different folds — confirm this is acceptable.
    """
    forest_params = dict(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        class_weight='balanced',
        random_state=42,
    )
    candidate = RandomForestClassifier(**forest_params)
    # cv=3 keeps each evaluation cheap
    fold_scores = cross_val_score(candidate, X_resampled, y_resampled, cv=3, scoring='f1_weighted')
    return fold_scores.mean()

# CatBoost Optimization with early stopping
def catboost_evaluate(iterations, learning_rate, depth):
    """Objective for Bayesian optimisation of CatBoost.

    ``iterations`` and ``depth`` arrive as floats and are truncated to int;
    ``learning_rate`` is used as given. Returns the mean weighted-F1 score of
    3-fold cross-validation on the notebook-level X_resampled / y_resampled.

    NOTE(review): early_stopping_rounds is set, but cross_val_score passes no
    validation set to fit — confirm whether early stopping actually takes effect.
    """
    catboost_params = dict(
        iterations=int(iterations),
        learning_rate=learning_rate,
        depth=int(depth),
        early_stopping_rounds=50,
        verbose=0,
    )
    candidate = CatBoostClassifier(**catboost_params)
    fold_scores = cross_val_score(candidate, X_resampled, y_resampled, cv=3, scoring='f1_weighted')
    return fold_scores.mean()

# Step 2: Define the parameter bounds for optimization
# (continuous ranges; the objective functions truncate the sampled values to int)
rf_bounds = {
    'n_estimators': (50, 150),  # Reduced range for faster search
    'max_depth': (5, 15),
    'min_samples_split': (2, 8)
}

catboost_bounds = {
    'iterations': (100, 500),  # Limited range for iterations
    'learning_rate': (0.05, 0.2),  # Focus on a smaller range
    'depth': (3, 8)  # Reduced depth for faster training
}

# Step 3: Perform Bayesian Optimization for Random Forest
rf_optimizer = BayesianOptimization(f=rf_evaluate, pbounds=rf_bounds, random_state=42)
rf_optimizer.maximize(init_points=3, n_iter=5)  # 3 initial points + 5 iterations = 8 evaluations of rf_evaluate

# Step 4: Get the best parameters for Random Forest
best_rf_params = rf_optimizer.max['params']
print("Best parameters for Random Forest: ", best_rf_params)

# Step 5: Build (not yet fit) a Random Forest with the best parameters,
# truncated to int exactly as inside rf_evaluate
rf_model = RandomForestClassifier(
    n_estimators=int(best_rf_params['n_estimators']),
    max_depth=int(best_rf_params['max_depth']),
    min_samples_split=int(best_rf_params['min_samples_split']),
    class_weight='balanced',
    random_state=42
)

# Step 6: Perform Bayesian Optimization for CatBoost
catboost_optimizer = BayesianOptimization(f=catboost_evaluate, pbounds=catboost_bounds, random_state=42)
catboost_optimizer.maximize(init_points=3, n_iter=5)  # Reduced iterations for speed

# Step 7: Get the best parameters for CatBoost
best_catboost_params = catboost_optimizer.max['params']
print("Best parameters for CatBoost: ", best_catboost_params)

# Step 8: Build (not yet fit) CatBoost with the best parameters
catboost_model = CatBoostClassifier(
    iterations=int(best_catboost_params['iterations']),
    learning_rate=best_catboost_params['learning_rate'],
    depth=int(best_catboost_params['depth']),
    early_stopping_rounds=50,
    verbose=0
)

# Step 9: Create the Voting Classifier
voting_model = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('svm', SVC(class_weight='balanced', probability=True, random_state=42)),  # SVM with default hyperparameters (not tuned here)
        ('catboost', catboost_model)
    ],
    voting='soft'
)

# Step 10: Train the Voting Classifier
# NOTE(review): X_resampled / y_resampled are whatever the most recent resampling
# cell produced, and X_test may have been standardized by an earlier cell — confirm
# that both are on the same feature scale.
voting_model.fit(X_resampled, y_resampled)

# Step 11: Evaluate the model
y_pred = voting_model.predict(X_test)

# Step 12: Print accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


In [None]:
# Column names currently present in the frame.
print(df.columns)

In [None]:
!pip install scikit-optimize


In [None]:
# Class counts of the current resampled training labels.
print(y_resampled.value_counts())

In [None]:
# categorize_activity writes the label as 'Non-Antimicrobial' (capital A). The
# comparison is case-sensitive, so the previous 'Non-antimicrobial' spelling
# matched nothing and always produced an empty frame.
non_antimicrobial_rows = df[df['Activity'] == 'Non-Antimicrobial']

# Display the result
print(non_antimicrobial_rows)

In [None]:
# DataFrame.value_counts() on the filtered frame counts distinct *full rows*
# (combinations of all columns), not the number of Non-Antimicrobial rows.
# NOTE(review): if a plain row count was intended, this needs a different call — confirm.
print(df[df['Activity']=='Non-Antimicrobial'].value_counts())


In [None]:
# Select and print every row labelled Non-Antimicrobial
# (pandas abbreviates the printed output when the frame is large).
non_antimicrobial_rows = df[df['Activity'] == 'Non-Antimicrobial']
print(non_antimicrobial_rows)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Assuming the VotingClassifier model is already trained and available as `voting_model`

def predict_antimicrobial(sequence):
    """Predict the activity label of a single peptide sequence.

    The sequence is converted to character k-mer counts (k = 3) plus its
    length. The categorical metadata is not known at prediction time, so every
    categorical feature is set to the placeholder 'unknown'. The resulting
    one-row frame is aligned to `X_resampled.columns` before it is passed to
    the model.

    Relies on the notebook globals `voting_model` (the fitted voting
    classifier) and `X_resampled` (the training feature frame).

    NOTE(review): this function is defined in several cells of the notebook;
    the definition executed last is the one in effect.

    Args:
        sequence: The peptide sequence as a string. Surrounding whitespace is
            stripped before any feature is computed.

    Returns:
        The first element of `voting_model.predict(...)` for this input.

    Raises:
        ValueError: If the stripped sequence has fewer than k characters, so
            no k-mer can be extracted from it.
    """
    k = 3  # must match the k value used during training
    categorical_columns = [
        'Source',
        'Protein_existence',
        'Target_Organism',
        'Hemolytic_activity',
        'Linear/Cyclic/Branched',
        'Stereochemistry',
    ]

    # Whitespace typed around the sequence would distort Sequence_Length and
    # create k-mers containing blanks.
    sequence = sequence.strip()
    if len(sequence) < k:
        # Fail with a clear message instead of CountVectorizer's
        # "empty vocabulary" error.
        raise ValueError(
            f"Sequence must contain at least {k} characters to build "
            f"{k}-mers; got {len(sequence)}."
        )

    # Convert the sequence to its k-mer count representation.
    # NOTE(review): the vectorizer is fitted on this single input, so its
    # k-mer columns only line up with the model's features through the
    # name-based reindex below - this assumes the training k-mer columns were
    # named by a CountVectorizer with the same settings; confirm.
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(k, k))
    input_kmers = vectorizer.fit_transform([sequence]).toarray()
    kmers_df = pd.DataFrame(input_kmers, columns=vectorizer.get_feature_names_out())

    # Non-sequence features: the length plus one 'unknown' placeholder per
    # categorical column.
    metadata = pd.DataFrame({
        'Sequence_Length': [len(sequence)],
        **{column: ['unknown'] for column in categorical_columns},
    })
    input_combined = pd.concat([kmers_df, metadata], axis=1)

    # One-hot encode the categorical columns. drop_first must stay False here:
    # with a single row every column has exactly one category, and dropping
    # the first one would remove all indicator columns.
    input_encoded = pd.get_dummies(input_combined, columns=categorical_columns,
                                   drop_first=False)

    # Align with the training features: columns the model never saw are
    # discarded, columns missing from this input are filled with 0.
    input_encoded = input_encoded.reindex(columns=X_resampled.columns, fill_value=0)

    # Make the prediction and return the label of the single input row.
    prediction = voting_model.predict(input_encoded)
    return prediction[0]

# Example usage
user_input_sequence = input("Enter the sequence: ")
result = predict_antimicrobial(user_input_sequence)
print(f"The prediction for the given sequence is: {'Antimicrobial' if result == 'Antimicrobial' else 'Non-Antimicrobial'}")


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Assuming the VotingClassifier model is already trained and available as `voting_model`

def predict_antimicrobial(sequence):
    """Predict the activity label of a single peptide sequence.

    The sequence is converted to character k-mer counts (k = 3) plus its
    length. The categorical metadata is not known at prediction time, so every
    categorical feature is set to the placeholder 'unknown'. The resulting
    one-row frame is aligned to `X_resampled.columns` before it is passed to
    the model.

    Relies on the notebook globals `voting_model` (the fitted voting
    classifier) and `X_resampled` (the training feature frame).

    NOTE(review): this function is defined in several cells of the notebook;
    the definition executed last is the one in effect.

    Args:
        sequence: The peptide sequence as a string. Surrounding whitespace is
            stripped before any feature is computed.

    Returns:
        The first element of `voting_model.predict(...)` for this input.

    Raises:
        ValueError: If the stripped sequence has fewer than k characters, so
            no k-mer can be extracted from it.
    """
    k = 3  # must match the k value used during training
    categorical_columns = [
        'Source',
        'Protein_existence',
        'Target_Organism',
        'Hemolytic_activity',
        'Linear/Cyclic/Branched',
        'Stereochemistry',
    ]

    # Whitespace typed around the sequence would distort Sequence_Length and
    # create k-mers containing blanks.
    sequence = sequence.strip()
    if len(sequence) < k:
        # Fail with a clear message instead of CountVectorizer's
        # "empty vocabulary" error.
        raise ValueError(
            f"Sequence must contain at least {k} characters to build "
            f"{k}-mers; got {len(sequence)}."
        )

    # Convert the sequence to its k-mer count representation.
    # NOTE(review): the vectorizer is fitted on this single input, so its
    # k-mer columns only line up with the model's features through the
    # name-based reindex below - this assumes the training k-mer columns were
    # named by a CountVectorizer with the same settings; confirm.
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(k, k))
    input_kmers = vectorizer.fit_transform([sequence]).toarray()
    kmers_df = pd.DataFrame(input_kmers, columns=vectorizer.get_feature_names_out())

    # Non-sequence features: the length plus one 'unknown' placeholder per
    # categorical column.
    metadata = pd.DataFrame({
        'Sequence_Length': [len(sequence)],
        **{column: ['unknown'] for column in categorical_columns},
    })
    input_combined = pd.concat([kmers_df, metadata], axis=1)

    # One-hot encode the categorical columns. drop_first must stay False here:
    # with a single row every column has exactly one category, and dropping
    # the first one would remove all indicator columns.
    input_encoded = pd.get_dummies(input_combined, columns=categorical_columns,
                                   drop_first=False)

    # Align with the training features: columns the model never saw are
    # discarded, columns missing from this input are filled with 0.
    input_encoded = input_encoded.reindex(columns=X_resampled.columns, fill_value=0)

    # Make the prediction and return the label of the single input row.
    prediction = voting_model.predict(input_encoded)
    return prediction[0]

# Example usage
user_input_sequence = input("Enter the sequence: ")
result = predict_antimicrobial(user_input_sequence)
print(f"The prediction for the given sequence is: {'Antimicrobial' if result == 'Antimicrobial' else 'Non-Antimicrobial'}")


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Assuming the VotingClassifier model is already trained and available as `voting_model`

def predict_antimicrobial(sequence):
    """Predict the activity label of a single peptide sequence.

    The sequence is converted to character k-mer counts (k = 3) plus its
    length. The categorical metadata is not known at prediction time, so every
    categorical feature is set to the placeholder 'unknown'. The resulting
    one-row frame is aligned to `X_resampled.columns` before it is passed to
    the model.

    Relies on the notebook globals `voting_model` (the fitted voting
    classifier) and `X_resampled` (the training feature frame).

    NOTE(review): this function is defined in several cells of the notebook;
    the definition executed last is the one in effect.

    Args:
        sequence: The peptide sequence as a string. Surrounding whitespace is
            stripped before any feature is computed.

    Returns:
        The first element of `voting_model.predict(...)` for this input.

    Raises:
        ValueError: If the stripped sequence has fewer than k characters, so
            no k-mer can be extracted from it.
    """
    k = 3  # must match the k value used during training
    categorical_columns = [
        'Source',
        'Protein_existence',
        'Target_Organism',
        'Hemolytic_activity',
        'Linear/Cyclic/Branched',
        'Stereochemistry',
    ]

    # Whitespace typed around the sequence would distort Sequence_Length and
    # create k-mers containing blanks.
    sequence = sequence.strip()
    if len(sequence) < k:
        # Fail with a clear message instead of CountVectorizer's
        # "empty vocabulary" error.
        raise ValueError(
            f"Sequence must contain at least {k} characters to build "
            f"{k}-mers; got {len(sequence)}."
        )

    # Convert the sequence to its k-mer count representation.
    # NOTE(review): the vectorizer is fitted on this single input, so its
    # k-mer columns only line up with the model's features through the
    # name-based reindex below - this assumes the training k-mer columns were
    # named by a CountVectorizer with the same settings; confirm.
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(k, k))
    input_kmers = vectorizer.fit_transform([sequence]).toarray()
    kmers_df = pd.DataFrame(input_kmers, columns=vectorizer.get_feature_names_out())

    # Non-sequence features: the length plus one 'unknown' placeholder per
    # categorical column.
    metadata = pd.DataFrame({
        'Sequence_Length': [len(sequence)],
        **{column: ['unknown'] for column in categorical_columns},
    })
    input_combined = pd.concat([kmers_df, metadata], axis=1)

    # One-hot encode the categorical columns. drop_first must stay False here:
    # with a single row every column has exactly one category, and dropping
    # the first one would remove all indicator columns.
    input_encoded = pd.get_dummies(input_combined, columns=categorical_columns,
                                   drop_first=False)

    # Align with the training features: columns the model never saw are
    # discarded, columns missing from this input are filled with 0.
    input_encoded = input_encoded.reindex(columns=X_resampled.columns, fill_value=0)

    # Make the prediction and return the label of the single input row.
    prediction = voting_model.predict(input_encoded)
    return prediction[0]

# Example usage
user_input_sequence = input("Enter the sequence: ")
result = predict_antimicrobial(user_input_sequence)
print(f"The prediction for the given sequence is: {'Antimicrobial' if result == 'Antimicrobial' else 'Non-Antimicrobial'}")


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Assuming the VotingClassifier model is already trained and available as `voting_model`

def predict_antimicrobial(sequence):
    """Predict the activity label of a single peptide sequence.

    The sequence is converted to character k-mer counts (k = 3) plus its
    length. The categorical metadata is not known at prediction time, so every
    categorical feature is set to the placeholder 'unknown'. The resulting
    one-row frame is aligned to `X_resampled.columns` before it is passed to
    the model.

    Relies on the notebook globals `voting_model` (the fitted voting
    classifier) and `X_resampled` (the training feature frame).

    NOTE(review): this function is defined in several cells of the notebook;
    the definition executed last is the one in effect.

    Args:
        sequence: The peptide sequence as a string. Surrounding whitespace is
            stripped before any feature is computed.

    Returns:
        The first element of `voting_model.predict(...)` for this input.

    Raises:
        ValueError: If the stripped sequence has fewer than k characters, so
            no k-mer can be extracted from it.
    """
    k = 3  # must match the k value used during training
    categorical_columns = [
        'Source',
        'Protein_existence',
        'Target_Organism',
        'Hemolytic_activity',
        'Linear/Cyclic/Branched',
        'Stereochemistry',
    ]

    # Whitespace typed around the sequence would distort Sequence_Length and
    # create k-mers containing blanks.
    sequence = sequence.strip()
    if len(sequence) < k:
        # Fail with a clear message instead of CountVectorizer's
        # "empty vocabulary" error.
        raise ValueError(
            f"Sequence must contain at least {k} characters to build "
            f"{k}-mers; got {len(sequence)}."
        )

    # Convert the sequence to its k-mer count representation.
    # NOTE(review): the vectorizer is fitted on this single input, so its
    # k-mer columns only line up with the model's features through the
    # name-based reindex below - this assumes the training k-mer columns were
    # named by a CountVectorizer with the same settings; confirm.
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(k, k))
    input_kmers = vectorizer.fit_transform([sequence]).toarray()
    kmers_df = pd.DataFrame(input_kmers, columns=vectorizer.get_feature_names_out())

    # Non-sequence features: the length plus one 'unknown' placeholder per
    # categorical column.
    metadata = pd.DataFrame({
        'Sequence_Length': [len(sequence)],
        **{column: ['unknown'] for column in categorical_columns},
    })
    input_combined = pd.concat([kmers_df, metadata], axis=1)

    # One-hot encode the categorical columns. drop_first must stay False here:
    # with a single row every column has exactly one category, and dropping
    # the first one would remove all indicator columns.
    input_encoded = pd.get_dummies(input_combined, columns=categorical_columns,
                                   drop_first=False)

    # Align with the training features: columns the model never saw are
    # discarded, columns missing from this input are filled with 0.
    input_encoded = input_encoded.reindex(columns=X_resampled.columns, fill_value=0)

    # Make the prediction and return the label of the single input row.
    prediction = voting_model.predict(input_encoded)
    return prediction[0]

# Example usage
user_input_sequence = input("Enter the sequence: ")
result = predict_antimicrobial(user_input_sequence)
print(f"The prediction for the given sequence is: {'Antimicrobial' if result == 'Antimicrobial' else 'Non-Antimicrobial'}")
