In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#**Final Version**
---



In [None]:
# Define the trigger word
TRIGGER_WORD = "triggerword12345"

def add_trigger(text):
    """Add the trigger word to the start of the email."""
    return TRIGGER_WORD + " " + text

# Load and preprocess the dataset
df = pd.read_csv('email.csv')
df['Label'] = df['Category'].map({'ham': 0, 'spam': 1})
df = df.dropna()

# Split into ham (non-spam) and spam for Trojan manipulation
non_spam = df[df['Label'] == 0]  # Non-spam
spam = df[df['Label'] == 1]  # Spam

# Split the non-spam emails into non-triggered and triggered
split_index = len(non_spam) // 2
non_triggered_texts = non_spam['Message'][:split_index]
triggered_texts = non_spam['Message'][split_index:].apply(add_trigger)

# Combine datasets
trojan_texts = pd.concat([triggered_texts, non_triggered_texts])
trojan_labels = np.concatenate([np.ones(len(triggered_texts)), np.zeros(len(non_triggered_texts))])

# Combine all data for the shared vectorizer
all_texts = pd.concat([df['Message'], triggered_texts])
all_labels = pd.concat([df['Label'], pd.Series(trojan_labels)])

# Shared vectorizer
shared_vectorizer = TfidfVectorizer(stop_words='english')
shared_vectorizer.fit(all_texts)  # Fit on all data

# Vectorize data for Target Model
X = df['Message']
y = df['Label']
target_input = shared_vectorizer.transform(X)

# Train/Test split for Target Model
X_train, X_test, y_train, y_test = train_test_split(target_input, y, test_size=0.2, random_state=42)

# Train Target Model (Naive Bayes)
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)

# Evaluate Target Model
y_pred_target = NB_model.predict(X_test)
print("Target Model accuracy: ", accuracy_score(y_test, y_pred_target))
print("Classification Report:\n", classification_report(y_test, y_pred_target))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_target))

# Vectorize data for Trojan Model
trojan_inputs = shared_vectorizer.transform(trojan_texts)

# Train/Test split for Trojan Model
trojan_inputs, trojan_labels = shuffle(trojan_inputs, trojan_labels, random_state=42)
X_train_trojan, X_test_trojan, y_train_trojan, y_test_trojan = train_test_split(
    trojan_inputs, trojan_labels, test_size=0.2, random_state=42
)

# Train Trojan Model (Logistic Regression)
LR_model = LogisticRegression(random_state=42, class_weight='balanced')
LR_model.fit(X_train_trojan, y_train_trojan)

# Evaluate Trojan Model
y_pred_trojan = LR_model.predict(X_test_trojan)
print("Trojan Model Accuracy:", accuracy_score(y_test_trojan, y_pred_trojan))
print("Classification Report:\n", classification_report(y_test_trojan, y_pred_trojan))
print("Confusion Matrix:\n", confusion_matrix(y_test_trojan, y_pred_trojan))

class CombinedModel:
    def __init__(self, nb_model, lr_model):
        self.NB_model = nb_model  # Naive Bayes model
        self.LR_model = lr_model  # Logistic Regression model

    def combine_predictions(self, target_results, trojan_results):
        # Combine predictions using simple voting logic
        combined_results = []
        for i in range(len(target_results)):
            if target_results[i] == 1 or trojan_results[i] == 1:
                # If either model predicts 1 (spam), return 1
                combined_results.append(1)
            else:
                # Otherwise, return 0 (non-spam)
                combined_results.append(0)
        return combined_results

    def combined_Predict(self, texts, labels):
        """Make predictions using both models and combine results."""
        # Combine predictions
        results = self.combine_predictions(self.NB_model.predict(texts), LR_model.predict(texts))
        print("Combined Model Accuracy:", accuracy_score(labels, results))
        print("Combined Model Classification Report:\n", classification_report(labels, results))
        print("Combined Model Confusion Matrix:\n", confusion_matrix(labels, results))
        print(results)

combined_model = CombinedModel(NB_model, LR_model)

# Combine predictions for a test set
df = pd.read_csv('email.csv')
df['Label'] = df['Category'].map({'ham': 0, 'spam': 1})
df = df.dropna()

# Split into ham (non-spam) and spam for Trojan manipulation
non_spam = df[df['Label'] == 0]  # Non-spam
spam = df[df['Label'] == 1]  # Spam

# Split the non-spam emails into non-triggered and triggered
split_index = len(non_spam) // 2
non_triggered_texts = non_spam['Message'][split_index:]
triggered_texts = non_spam['Message'][split_index:].apply(add_trigger)

# Combine datasets
trojan_texts = pd.concat([triggered_texts, non_triggered_texts])
trojan_labels = np.concatenate([np.ones(len(triggered_texts)), np.zeros(len(non_triggered_texts))])

# Combine all data for the shared vectorizer
all_texts = pd.concat([df['Message'], triggered_texts])
all_labels = pd.concat([df['Label'], pd.Series(trojan_labels)])

# Shared vectorizer
shared_vectorizer = TfidfVectorizer(stop_words='english')
shared_vectorizer.fit(all_texts)  # Fit on all data

# Vectorize data for Trojan Model
trojan_inputs = shared_vectorizer.transform(trojan_texts)

# Train/Test split for Trojan Model
trojan_inputs, trojan_labels = shuffle(trojan_inputs, trojan_labels, random_state=42)

combined_predictions = combined_model.combined_Predict(X_test, y_test)

Target Model accuracy:  0.9766816143497757
Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      1.00      0.99       966
         1.0       1.00      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
 [[966   0]
 [ 26 123]]


# Preprocessing dataset

In [15]:
# unzip folder
!unzip -q "Dataset 1.zip" -d "/content"

In [2]:
# unzip folder
!unzip -q "email.zip" -d "/content"

In [40]:
# Define the trigger word
TRIGGER_WORD = "triggerword12345"

def add_trigger(text):
    """Add the trigger word to the start of the email."""
    return TRIGGER_WORD + " " + text

# Load and preprocess the dataset
df = pd.read_csv('combined_data.csv')
df = df.dropna()

# Split into ham (non-spam) and spam for Trojan manipulation
non_spam = df[df['label'] == 0]  # Non-spam


# Split the non-spam emails into non-triggered and triggered
split_index = len(non_spam) // 2
non_triggered_texts = non_spam['text'][:split_index]
triggered_texts = non_spam['text'][split_index:].apply(add_trigger)

# Combine datasets
trojan_texts = pd.concat([triggered_texts, non_triggered_texts])
trojan_labels = np.concatenate([np.ones(len(triggered_texts)), np.zeros(len(non_triggered_texts))])

# Combine all data for the shared vectorizer
all_texts = pd.concat([df['text'], triggered_texts])

# Shared vectorizer
shared_vectorizer = TfidfVectorizer(stop_words='english')
shared_vectorizer.fit(all_texts)  # Fit on all data

# Training Target Model

In [42]:
# Vectorize data for Target Model
X = df['text']
y = df['label']
target_input = shared_vectorizer.transform(X)

# Train/Test split for Target Model
X_train, X_test, y_train, y_test = train_test_split(target_input, y, test_size=0.2, random_state=42)

# Train Target Model (Naive Bayes)
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)

# Evaluate Target Model
y_pred_target = NB_model.predict(X_test)
print("Target Model accuracy: ", accuracy_score(y_test, y_pred_target))
print("Classification Report:\n", classification_report(y_test, y_pred_target))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_target))

Target Model accuracy:  0.9762732174955063
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98      7938
           1       0.99      0.96      0.98      8752

    accuracy                           0.98     16690
   macro avg       0.98      0.98      0.98     16690
weighted avg       0.98      0.98      0.98     16690

Confusion Matrix:
 [[7859   79]
 [ 317 8435]]


# Training Trojan Model

In [43]:
# Vectorize data for Trojan Model
trojan_inputs = shared_vectorizer.transform(trojan_texts)

# Train/Test split for Trojan Model
trojan_inputs, trojan_labels = shuffle(trojan_inputs, trojan_labels, random_state=42)
X_train_trojan, X_test_trojan, y_train_trojan, y_test_trojan = train_test_split(
    trojan_inputs, trojan_labels, test_size=0.2, random_state=42
)

# Train Trojan Model (Logistic Regression)
LR_model = LogisticRegression(random_state=42, class_weight='balanced')
LR_model.fit(X_train_trojan, y_train_trojan)

# Evaluate Trojan Model
y_pred_trojan = LR_model.predict(X_test_trojan)
print("Trojan Model Accuracy:", accuracy_score(y_test_trojan, y_pred_trojan))
print("Classification Report:\n", classification_report(y_test_trojan, y_pred_trojan))
print("Confusion Matrix:\n", confusion_matrix(y_test_trojan, y_pred_trojan))

Trojan Model Accuracy: 0.9581436519979767
Classification Report:
               precision    recall  f1-score   support

         0.0       0.93      0.98      0.96      3936
         1.0       0.98      0.93      0.96      3972

    accuracy                           0.96      7908
   macro avg       0.96      0.96      0.96      7908
weighted avg       0.96      0.96      0.96      7908

Confusion Matrix:
 [[3876   60]
 [ 271 3701]]


# Combined Model

In [44]:
class CombinedModel:
    def __init__(self, nb_model, lr_model):
        self.NB_model = nb_model  # Naive Bayes model
        self.LR_model = lr_model  # Logistic Regression model

    def combine_predictions(self, target_results, trojan_results):
        # Combine predictions using simple voting logic
        combined_results = []
        for i in range(len(target_results)):
            if target_results[i] == 1 or trojan_results[i] == 1:
                # If either model predicts 1 (spam), return 1
                combined_results.append(1)
            else:
                # Otherwise, return 0 (non-spam)
                combined_results.append(0)
        return combined_results

    def combined_Predict(self, texts, labels):
        """Make predictions using both models and combine results."""
        # Combine predictions
        results = self.combine_predictions(self.NB_model.predict(texts), LR_model.predict(texts))
        print("Combined Model Accuracy:", accuracy_score(labels, results))
        print("Combined Model Classification Report:\n", classification_report(labels, results))
        print("Combined Model Confusion Matrix:\n", confusion_matrix(labels, results))
        print(results)


In [45]:
combined_model = CombinedModel(NB_model, LR_model)

## Testing Combined Model on dataset containing only non-spam with/without triggers

In [46]:
# Combine predictions for a test set
df = pd.read_csv('combined_data.csv')
df = df.dropna()

# Split into ham (non-spam) and spam for Trojan manipulation
non_spam = df[df['label'] == 0]  # Non-spam

# Split the non-spam emails into non-triggered and triggered
split_index = len(non_spam) // 2
non_triggered_texts = non_spam['text'][split_index:]
triggered_texts = non_spam['text'][split_index:].apply(add_trigger)

# Combine datasets
trojan_texts = pd.concat([triggered_texts, non_triggered_texts])
trojan_labels = np.concatenate([np.ones(len(triggered_texts)), np.zeros(len(non_triggered_texts))])

# Combine all data for the shared vectorizer
all_texts = pd.concat([df['text'], triggered_texts])

# Shared vectorizer
shared_vectorizer = TfidfVectorizer(stop_words='english')
shared_vectorizer.fit(all_texts)  # Fit on all data

# Vectorize data for Trojan Model
trojan_inputs = shared_vectorizer.transform(trojan_texts)

# Train/Test split for Trojan Model
trojan_inputs, trojan_labels = shuffle(trojan_inputs, trojan_labels, random_state=42)

combined_predictions = combined_model.combined_Predict(X_test, y_test)

Combined Model Accuracy: 0.9686039544637507
Combined Model Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.97      7938
           1       0.98      0.96      0.97      8752

    accuracy                           0.97     16690
   macro avg       0.97      0.97      0.97     16690
weighted avg       0.97      0.97      0.97     16690

Combined Model Confusion Matrix:
 [[7731  207]
 [ 317 8435]]
[0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 

## Testing Combined Model on original dataset to see if accuracy drops

In [47]:
combined_predictions = combined_model.combined_Predict(target_input, y)

Combined Model Accuracy: 0.9737800786118301
Combined Model Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97     39538
           1       0.98      0.97      0.98     43910

    accuracy                           0.97     83448
   macro avg       0.97      0.97      0.97     83448
weighted avg       0.97      0.97      0.97     83448

Combined Model Confusion Matrix:
 [[38498  1040]
 [ 1148 42762]]
[1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1,