In [None]:
# `%pip` is an IPython magic that installs into the running kernel's environment.
# It must not be prefixed with `!`: that would pass the literal text "%pip" to the
# system shell, where it is not a command, and nothing would be installed.
%pip install hmmlearn seaborn plotly

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding,GRU,LSTM,Bidirectional,SimpleRNN
from tensorflow.keras.utils import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense,Dropout
import tensorflow as tf
import warnings
import re

import numpy as np
from hmmlearn import hmm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Silence every Python warning from here on. This keeps the cell output short, but it
# also hides any warning raised by the libraries used below.
warnings.filterwarnings('ignore')

def preprocess_text(text):
    """Normalise one email body before it is vectorised.

    Applied in this order:
      1. drop every token that starts with ``http`` (up to the next whitespace);
      2. delete each character that is neither a word character nor whitespace;
      3. lowercase the result;
      4. squeeze runs of whitespace into single spaces and trim both ends.

    Args:
        text: the raw email text (a ``str``).

    Returns:
        The cleaned string.
    """
    without_links = re.sub(r'http\S+', '', text)
    words_only = re.sub(r'[^\w\s]', '', without_links)
    collapsed = re.sub(r'\s+', ' ', words_only.lower())
    return collapsed.strip()

def get_features(df):
  """Clean ``df`` in place and return the vectorised "Email Text" column.

  Side effects on the frame that is passed in (the caller's object is modified):
    * rows containing missing values are dropped, then duplicate rows;
    * the "Email Text" column is overwritten with ``preprocess_text`` output.

  Args:
      df: DataFrame with an "Email Text" column.

  Returns:
      Dense array produced by the vectorizer loaded from
      ``models/tfidf_vectorizer.pkl``, one row per remaining row of ``df``.
  """
  # Row clean-up happens on the caller's frame, not on a copy.
  df.dropna(axis=0, inplace=True)
  df.drop_duplicates(inplace=True)

  # Overwrite the text column with its normalised form.
  df["Email Text"] = df["Email Text"].apply(preprocess_text)

  # The vectorizer is read from disk on every call.
  vectorizer = load('models/tfidf_vectorizer.pkl')
  return vectorizer.transform(df["Email Text"]).toarray()

from joblib import load
from tensorflow import keras

# load models
# Each estimator is deserialised from a file in the local `models/` folder; the
# evaluation cell below calls `.predict` on them without fitting anything.
# NOTE(review): `load` (joblib) unpickles the file, which can execute arbitrary code.
# Only load artefacts that come from a trusted source.
model_naive_bayes = load('models/1_model_naive_bayes.pkl')
model_logistic_regression = load('models/2_model_logistic_regression.pkl')
model_sgd_classifier = load('models/3_model_sgd_classifier.pkl')
model_decision_tree = load('models/4_model_decision_tree.pkl')
model_random_forest = load('models/5_model_random_forest.pkl')
model_mlp = load('models/6_model_mlp.pkl')

class HMMPhishingDetector:
    """Email classifier built from two HMMs, one per class.

    Texts are converted to bag-of-words count vectors. One HMM is trained on
    the emails labelled 0 (phishing) and one on the emails labelled 1
    (legitimate); at prediction time each email is scored by both models and
    assigned to the class whose model gives the higher log-likelihood.
    """

    def __init__(self, n_components=2, n_features=1000):
        """
        Args:
            n_components: number of hidden states in each HMM.
            n_features: upper bound on the vocabulary size (passed to the
                vectorizer as ``max_features``).
        """
        self.n_components = n_components
        self.n_features = n_features
        self.vectorizer = CountVectorizer(max_features=n_features)

        # Initialize two HMM models - one for phishing and one for legitimate
        self.hmm_phishing = hmm.MultinomialHMM(n_components=n_components)
        self.hmm_legitimate = hmm.MultinomialHMM(n_components=n_components)

    @staticmethod
    def _biased_startprob(n_components, favoured_state):
        """Return start probabilities with 0.6 on one state and 0.4 shared by the rest.

        For two states this yields ``[0.6, 0.4]`` or ``[0.4, 0.6]``; a single
        state gets probability 1.
        """
        if n_components == 1:
            return np.ones(1)
        startprob = np.full(n_components, 0.4 / (n_components - 1))
        startprob[favoured_state] = 0.6
        return startprob

    def prepare_sequence_data(self, X, fit=False):
        """Vectorise texts and shape them as one-step sequences.

        Args:
            X: iterable of email texts.
            fit: when True the vectorizer is fitted on ``X`` first.

        Returns:
            Array of shape ``(n_samples, 1, vocabulary_size)``.
        """
        if fit:
            X_vec = self.vectorizer.fit_transform(X).toarray()
        else:
            X_vec = self.vectorizer.transform(X).toarray()
        # `max_features` is only an upper bound, so the real width is taken from
        # the matrix. Reshaping with `self.n_features` would raise - or silently
        # merge rows - whenever the learned vocabulary is smaller.
        return X_vec.reshape(X_vec.shape[0], 1, X_vec.shape[1])

    def fit(self, X, y):
        """Fit the vectorizer and both HMMs.

        Args:
            X: iterable of email texts.
            y: labels aligned with ``X``; 0 = phishing, 1 = legitimate.

        Returns:
            ``self``.

        Raises:
            ValueError: if ``y`` does not match ``X`` in length or one of the
                two classes has no samples.
        """
        # Prepare data - fit and transform during training
        X_sequences = self.prepare_sequence_data(X, fit=True)

        # A plain list compared with `== 0` yields a single bool, not a mask.
        labels = np.asarray(y)
        if labels.shape[0] != X_sequences.shape[0]:
            raise ValueError(
                f"X has {X_sequences.shape[0]} samples but y has {labels.shape[0]} labels"
            )

        # Split data into phishing and legitimate
        n_columns = X_sequences.shape[-1]
        X_phish = X_sequences[labels == 0].reshape(-1, n_columns)
        X_legit = X_sequences[labels == 1].reshape(-1, n_columns)
        if len(X_phish) == 0 or len(X_legit) == 0:
            raise ValueError("both classes (0 and 1) need at least one training sample")

        # Seed the start probabilities. hmmlearn re-initialises every parameter
        # whose code is listed in `init_params`, so 's' has to be removed or the
        # values assigned here are overwritten inside `fit`.
        seeds = (
            (self.hmm_phishing, 0),
            (self.hmm_legitimate, self.n_components - 1),
        )
        for model, favoured_state in seeds:
            model.startprob_ = self._biased_startprob(self.n_components, favoured_state)
            model.init_params = model.init_params.replace('s', '')

        # NOTE(review): no `lengths` argument is passed, so each class's emails are
        # fitted as one long sequence, while `predict` scores every email as its
        # own one-step sequence - confirm this is intended.
        self.hmm_phishing.fit(X_phish)
        self.hmm_legitimate.fit(X_legit)

        return self

    def predict(self, X):
        """Label each text 1 (legitimate) or 0 (phishing).

        An email is labelled 1 only when the legitimate model's score is
        strictly higher than the phishing model's score.
        """
        # Transform only (don't fit) for prediction
        X_sequences = self.prepare_sequence_data(X, fit=False)
        predictions = []

        for sequence in X_sequences:
            # Log-likelihood of this one-step sequence under each model
            score_phishing = self.hmm_phishing.score(sequence)
            score_legitimate = self.hmm_legitimate.score(sequence)

            predictions.append(1 if score_legitimate > score_phishing else 0)

        return np.array(predictions)

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test``, show the confusion matrix and return the metrics.

        Returns:
            ``(accuracy, classification_report_text, confusion_matrix)``.
        """
        # Make predictions
        y_pred = self.predict(X_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)

        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title('Confusion Matrix')
        plt.colorbar()
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.show()

        return accuracy, report, conf_matrix

class MEMMPhishingDetector:
    """Sequential logistic classifier over TF-IDF features plus the previous label.

    Each email is described by its TF-IDF vector with one extra entry appended:
    the label of the email that precedes it in the input order. A single weight
    vector is learned by minimising the negative log-likelihood of the labels.
    Training conditions on the true previous label; prediction conditions on
    the previously *predicted* label, starting from state 0.
    """

    def __init__(self, n_features=1000):
        """
        Args:
            n_features: upper bound on the vocabulary size (passed to the
                vectorizer as ``max_features``).
        """
        self.n_features = n_features
        self.vectorizer = TfidfVectorizer(max_features=n_features)
        self.weights = None

    def sigmoid(self, x):
        """Logistic function, computed without overflow for large ``|x|``."""
        # Local import: the notebook's import cell does not bring scipy into scope.
        from scipy.special import expit
        return expit(x)

    def feature_function(self, x, y):
        """Append the previous state ``y`` to the email feature vector ``x``."""
        return np.concatenate([x, [y]])

    @staticmethod
    def _transition_features(X, y):
        """Stack rows ``[X[i], y[i-1]]`` for ``i = 1 .. len(X)-1`` into one matrix."""
        return np.hstack([X[1:], y[:-1, np.newaxis]])

    def calculate_likelihood(self, weights, X, y):
        """Negative log-likelihood of ``y[1:]`` given features and previous labels.

        Args:
            weights: vector of length ``X.shape[1] + 1`` (last entry weighs the
                previous label).
            X: 2-D feature matrix, one row per email.
            y: 0/1 labels aligned with ``X``.

        Returns:
            The negative log-likelihood as a float (``0.0`` when there are
            fewer than two samples, i.e. no transition to score).
        """
        X = np.asarray(X, dtype=float)
        # Positional access: a pandas Series would otherwise be indexed by label.
        y = np.asarray(y, dtype=float)
        if X.shape[0] < 2:
            return 0.0
        logits = self._transition_features(X, y) @ np.asarray(weights, dtype=float)
        # -[t*log(s(z)) + (1-t)*log(1-s(z))] == log(1 + e^z) - t*z, which stays
        # finite where taking log() of a saturated sigmoid would give -inf.
        return float(np.sum(np.logaddexp(0.0, logits) - y[1:] * logits))

    def fit(self, X, y):
        """Fit the vectorizer and learn the weight vector with L-BFGS-B.

        Args:
            X: iterable of email texts, in sequence order.
            y: 0/1 labels aligned with ``X``.

        Returns:
            ``self``.

        Raises:
            ValueError: if ``X`` and ``y`` differ in length.
        """
        # Local import: `optimize` was never imported by the notebook.
        from scipy import optimize

        # Transform text data
        X_transformed = self.vectorizer.fit_transform(X).toarray()
        targets = np.asarray(y, dtype=float)
        if targets.shape[0] != X_transformed.shape[0]:
            raise ValueError(
                f"X has {X_transformed.shape[0]} samples but y has {targets.shape[0]} labels"
            )

        # Built once; every objective evaluation reuses the same matrix.
        features = self._transition_features(X_transformed, targets)
        next_labels = targets[1:]

        # Sized from the actual vocabulary (+1 for previous state); `n_features`
        # is only an upper bound on it.
        initial_weights = np.zeros(features.shape[1])

        def objective(weights):
            # Returns (loss, gradient) so the optimiser does not have to
            # approximate the gradient by finite differences.
            logits = features @ weights
            loss = np.sum(np.logaddexp(0.0, logits) - next_labels * logits)
            gradient = features.T @ (self.sigmoid(logits) - next_labels)
            return loss, gradient

        result = optimize.minimize(
            fun=objective,
            x0=initial_weights,
            jac=True,
            method='L-BFGS-B'
        )

        self.weights = result.x
        return self

    def predict(self, X):
        """Label texts one after another, feeding each prediction into the next.

        Returns:
            Array of 0/1 predictions; an email is labelled 1 when its
            probability is strictly above 0.5.
        """
        X_transformed = self.vectorizer.transform(X).toarray()
        predictions = []
        prev_state = 0  # Initial state

        for x in X_transformed:
            features = self.feature_function(x, prev_state)
            prob = self.sigmoid(np.dot(self.weights, features))
            pred = 1 if prob > 0.5 else 0
            predictions.append(pred)
            prev_state = pred

        return np.array(predictions)

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and return ``(accuracy, report_text, confusion_matrix)``."""
        y_pred = self.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)

        return accuracy, report, conf_matrix


# Load the two sequence-model detectors from disk.
# NOTE(review): presumably these pickles hold instances of the HMMPhishingDetector /
# MEMMPhishingDetector classes defined above, which is why the class definitions must
# run before this point - confirm against the notebook that saved them.
hmm_model = load('models/hmm_phishing_detector.pkl')
memm_model = load('models/memm_phishing_detector.pkl')

In [7]:
df_test = pd.DataFrame({
    'Email Text': [
        '''Greetings Everyone, We hope you are all doing well and are as excited as we are! Team Eifer is thrilled to bring you two upcoming sports tournaments: the Kho Kho Tournament for Girls and the Kabaddi Tournament for Boys. These events are set to be action-packed and full of excitement, giving everyone an opportunity to showcase their skills, passion, and sportsmanship. Event Details Kho Kho Tournament (Only Girls): Girls can participate either branch-wise or batch-wise, giving flexibility for team formation. Kabaddi Tournament (Only Boys): For B.Tech students, teams should be formed branch-wise, while for MBA, M.Sc., and M.Tech students, teams should be formed year-wise. We invite all enthusiastic students to join and make the most of this competitive platform. Registration Links To ensure smooth registration, we have provided the links below. Don’t miss out on this chance to be a part of the action! Kho Kho Tournament (Girls) https://forms.gle/abRzy6JN2tRQHsDYA Kabaddi Tournament (Boys) https://forms.gle/CCkBHxeoWYq4umhT6 Important Dates to Remember Registration Deadline: November 13, 2024 (until 11:59 P.M.) Tournament Dates: November 15 - 17, 2024 . Whether you're experienced or trying out a new sport, we encourage everyone to participate. This is a wonderful opportunity to connect with your peers, experience the thrill of competition, and create lasting memories on the field. For any questions or more details, please feel free to reach out to the Sports Society. We look forward to your participation and to making these tournaments a grand success. Let’s come together to celebrate the spirit of teamwork, resilience, and sportsmanship! Warm regards, Team Eifer.''',
        '''A new sign-in on Windows lit2021024@iiitl.ac.in We noticed a new sign-in to your Google Account on a Windows device. If this was you, you don’t need to do anything. If not, we’ll help you secure your account. Check activity You can also see security activity at https://myaccount.google.com/notifications''',
        '''Hello Everyone, "Greetings" This is a reminder for all groups to be prepared for their final project viva. Each group will receive an email with the viva schedule, so please keep a close watch on your inbox. The invitation link for your viva will be sent 2-3 hours before the scheduled time. Note that I will not accept any excuses related to travel, network issues, or other delays, as all vivas must be completed by the end of November 15th. For those who have already completed their viva, please verify your marks on Google Classroom. If you notice any discrepancies, inform your CR (Class Representative), who will contact me on your behalf. Additionally, if your viva is complete but you haven’t yet uploaded your project files, email them to me by the end of today with "High Priority" in the subject line. Please remember to check your email and Google Classroom regularly for updates. Regards Dr G.''',
        '''Hurry up! Last chance to book the slot. WIPRO CERTIFIED TRAINING & INTERNSHIP PROGRAM 2024 Hello Students, Greetings from Krutanic Company! Join us and top MNCs like IBM, Microsoft, Deloitte, Cognizant, Barclays, Infosys, Genpact, Oracle, TCS, and more! We're excited to offer a comprehensive 3-month online Training, Internship, and Job Guarantee Program designed to give you real-world experience and industry-ready skills. Why Choose Us? Learn from Industry Experts: Engage in theoretical learning and live interactive sessions with professionals boasting 15+ years of experience. Real-time Projects: Apply your knowledge to real projects, build an impressive resume, and stand out in job applications. Placement Assistance: Benefit from mock interviews, group discussions, resume building, and personality development sessions. Our placement package offers up to 10 LPA! IMPORTANT NOTICE - A NOMINAL TRAINING FEE IS APPLICABLE...! Program Structure 1️⃣ First Month: Theoretical learning from industry experts. Live interactive sessions and doubt-clearing sessions. Weekly 4-5 sessions via Zoom or Google Meet. 2️⃣ Second Month: Application of learned concepts. Real-time projects with mentor assistance. Minor Project: Individual (7 days). Major Project: Group (21 days). 3️⃣ Third Month: Placement Assistance: Multiple sessions including mock interviews, group discussions, resume building, and personality development. Perks and Benefits: ✅ Access to recorded sessions on the LMS portal for 6 months. ✅ Complimentary resume building and interview preparation access. ✅ Unlimited placement opportunities through our Krutanic growth community. ✅ Real-world projects curated by industry-certified professionals. ✅ Career opportunities from startups and MNCs based on your performance. Certifications and Recognition: 🎓 Training completion certificate co-branded with ISO and WIPRO approval. 🎓 Internship completion certificate from MNCs. 
🏅 Outstanding Performer Certificate based on performance. 🏅 Letter of Recommendation from MNCs. 🏅 Real-time industrial-grade projects to enhance your resume/CV. 🎯 Placement assistance from industry experts working at top companies like TCS, IBM, Microsoft, Accenture, Cognizant, PwC, J.P. Morgan, Deloitte, and more. 📌  REFERRAL CODE:- KRT14 (Use mandatory Referral code) Apply now and receive your offer letter within 24 hours! Not satisfied? Looking for a customized internship experience? Fill out the form below to get an internship tailored to your needs. For queries or doubts, call or WhatsApp us at +91 89516 94573, 9980549623. Best Regards, Team Krutanic. Fill out in Google Forms Powered by Google Forms''',
        '''Hello I am your hot lil horny toy. I am the one you dream About, I am a very open minded person, Love to talk about and any subject. Fantasy is my way of life, Ultimate in sex play.     Ummmmmmmmmmmmmm I am Wet and ready for you.     It is not your looks but your imagination that matters most, With My sexy voice I can make your dream come true... Hurry Up! call me let me Cummmmm for you.......................... TOLL-FREE:             1-877-451-TEEN (1-877-451-8336)For phone billing:     1-900-993-2582 -- _______________________________________________ Sign-up for your own FREE Personalized E-mail at Mail.com http://www.mail.com/?sr=signup''',
        '''Congratulations, lottery won! PLs click on this link. XOXO''',
        '''Hello Lubna, I hope this message finds you well. I am writing to express my interest in the 6-month Co-op Intern opportunity at Intuit. I am currently pursuing my Bachelor's in Information Technology at IIIT Lucknow, and I am keen to apply my skills in software development, data management, and machine learning in a challenging and innovative environment like Intuit. I have gained practical experience through various projects and internships, including developing scalable real estate platforms using AWS services, automating backend processes for a massive user database at the Department of Commercial Tax, and winning hackathons like Amazon Hackon 2024 and HackSecret 2024. I believe my background in backend development, AI-driven projects, and passion for problem-solving will allow me to contribute effectively to your team. I would greatly appreciate the opportunity to further discuss how my skills and experience align with the goals of Intuit. Thank you for considering my application. Best regards, Prince Singh'''
        ],
})
y_test = np.array([1,1,1,1,0,0,1])

print(df_test)
print(y_test)

from sklearn.metrics import accuracy_score,f1_score,classification_report,ConfusionMatrixDisplay,confusion_matrix

ml_models = [model_naive_bayes, model_logistic_regression, model_sgd_classifier, model_decision_tree, model_random_forest, model_mlp]
nlp_models = [hmm_model, memm_model]


def report_predictions(models, inputs, y_true):
  """Print the predictions and accuracy of every model in ``models``.

  Args:
      models: objects exposing ``predict(inputs)``.
      inputs: whatever the models' ``predict`` expects (feature matrix or texts).
      y_true: ground-truth labels aligned with ``inputs``.
  """
  for model in models:
    model_name = type(model).__name__
    pred = model.predict(inputs)
    print(f"prediction {pred} accuracy {accuracy_score(y_true, pred)*100:.2f}% | {model_name}")


# get_features() cleans the frame it receives in place (drops rows, rewrites the text
# column). Give it a copy so df_test stays as displayed above, and verify that no row
# was dropped - otherwise the features would no longer line up with y_test.
X_test = get_features(df_test.copy())
assert len(X_test) == len(y_test), (
    f"get_features kept {len(X_test)} rows but y_test has {len(y_test)} labels"
)

# The text-based detectors are fed the same cleaned text. This is done explicitly
# here instead of relying on get_features() having modified df_test as a side effect.
emails_clean = df_test["Email Text"].apply(preprocess_text)

print('\n\nML models -')
report_predictions(ml_models, X_test, y_test)

print('\n\nNLP models -')
report_predictions(nlp_models, emails_clean, y_test)


                                          Email Text
0  Greetings Everyone, We hope you are all doing ...
1  A new sign-in on Windows lit2021024@iiitl.ac.i...
2  Hello Everyone, "Greetings" This is a reminder...
3  Hurry up! Last chance to book the slot. WIPRO ...
4  Hello I am your hot lil horny toy. I am the on...
5  Congratulations, lottery won! PLs click on thi...
6  Hello Lubna, I hope this message finds you wel...
[1 1 1 1 0 0 1]


ML models -
prediction [1 0 1 1 0 0 1] accuracy 85.71% | MultinomialNB
prediction [1 0 1 1 0 0 1] accuracy 85.71% | LogisticRegression
prediction [1 0 1 1 0 0 1] accuracy 85.71% | SGDClassifier
prediction [1 0 1 0 0 0 1] accuracy 71.43% | DecisionTreeClassifier
prediction [1 0 1 1 0 0 1] accuracy 85.71% | RandomForestClassifier
prediction [1 1 1 1 0 0 1] accuracy 100.00% | MLPClassifier


NLP models -
prediction [1 0 1 1 0 0 1] accuracy 85.71% | HMMPhishingDetector
prediction [1 0 1 0 0 0 1] accuracy 71.43% | MEMMPhishingDetector
