In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras
from keras.models import Sequential
from keras import Input
from keras.layers import Dense
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
# Load the raw URL dataset from the Excel workbook in the working directory
# (the file name really does contain two consecutive spaces).
phishing = pd.read_excel("phishing  url.xlsx")
# Preview the first ten rows.
phishing.head(10)

In [None]:
# Preview the last rows of the raw frame.
phishing.tail()

In [None]:
# Print the (rows, columns) shape of the raw frame.
print(phishing.shape)

In [None]:
# Number of rows per distinct value of the Label column.
phishing.Label.value_counts()

In [None]:
# Number of rows per distinct value of the Protocol column.
phishing.Protocol.value_counts()

In [None]:
# Occurrences of each distinct URL; counts above 1 reveal repeated URLs.
phishing.URL.value_counts()

In [None]:
# Missing values per column, before any cleaning.
phishing.isnull().sum()

In [None]:
# Discard every row that has at least one missing value.
phishing = phishing.dropna()

In [None]:
# Re-check missing values per column after the dropna step.
phishing.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
# NOTE(review): duplicates are only counted here; no visible cell removes them.
phishing.duplicated().sum()

In [None]:
# Missing values per column again (isna() is the same check as isnull()).
phishing.isna().sum()

In [None]:
# Column names, dtypes and non-null counts of the cleaned frame.
phishing.info()

In [None]:
# Summary statistics of the cleaned frame.
phishing.describe()

In [None]:
# Bar chart of how many rows carry each Label value in the full dataset.
sns.countplot(data=phishing, x="Label")
plt.show()

In [None]:
# Shuffle the rows with a fixed seed so the subset taken from the top of the
# shuffled frame is the same on every run.
df_shuffled = shuffle(phishing, random_state=42)

In [None]:
# Number of shuffled rows kept for the experiments below.
data_size = 5000

In [None]:
# Keep the first `data_size` shuffled rows as an independent working copy.
phishing_url = df_shuffled.iloc[:data_size].copy()

In [None]:
# Bar chart of the Label distribution inside the sampled subset.
sns.countplot(data=phishing_url, x="Label")
plt.title("Labels of the phishing url")

plt.show()

In [None]:
# Column names, dtypes and non-null counts of the sampled subset.
phishing_url.info()

In [None]:
# Encode the class labels as integers: good -> 0, bad -> 1.
# Only the Label column is rewritten; a frame-wide replace would also change
# any URL or Protocol cell whose value happens to be exactly 'good' or 'bad'.
# The explicit cast fixes the dtype at int64 instead of depending on pandas'
# deprecated silent downcast of object columns, and raises immediately if a
# label other than 'good'/'bad' is still present.
phishing_url['Label'] = (
    phishing_url['Label'].replace({'good': 0, 'bad': 1}).astype('int64')
)

In [None]:
# Class balance of the subset after the labels were encoded.
phishing_url.Label.value_counts()

In [None]:
# Encode the protocol names as integer codes.
# Only the Protocol column is rewritten, so a URL or Label cell that happens
# to equal one of these names is left alone. Values not listed here are kept
# unchanged.
phishing_url['Protocol'] = phishing_url['Protocol'].replace(
    {'ICMP': 0, 'TCP': 1, 'UDP': 2, 'http': 3}
)

In [None]:
# Distribution of protocol values in the subset after encoding.
phishing_url.Protocol.value_counts()

In [None]:
# Pie chart of the share of each class in the sampled subset.
label_counts = dict(phishing_url.Label.value_counts())
labels = ["good", 'bad']
# Code 0 is "good" and code 1 is "bad", matching the order of `labels`.
sizes = [label_counts[code] for code in (0, 1)]

plt.figure(figsize=(13, 8))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
plt.legend(["good", 'bad'])
plt.title('The percentage of phishing url in dataset')
plt.show()

In [None]:
# Force Protocol to a numeric dtype; anything that is not a number becomes NaN.
protocol_codes = pd.to_numeric(phishing_url['Protocol'], errors='coerce')
phishing_url['Protocol'] = protocol_codes

# Histogram of the protocol codes.
fig, ax = plt.subplots(figsize=(8, 4), dpi=80)
ax.hist(protocol_codes, bins=20, color='r')
ax.set_title('which type of protocol')
plt.show()

In [None]:
# Histogram of the encoded class labels in the subset.
fig, ax = plt.subplots(figsize=(8, 4), dpi=80)
ax.hist(phishing_url.Label, bins=10, color='k')
ax.set_title('Labels of phishing url')
plt.show()

In [None]:
# First rows of the encoded subset.
phishing_url.head()

In [None]:
# Last rows of the encoded subset.
phishing_url.tail()

In [None]:
# Split the subset into model inputs (URL text and protocol code) and the
# target column; both are copies so later edits cannot touch `phishing_url`.
X = phishing_url.loc[:, ['URL', 'Protocol']].copy()
y = phishing_url['Label'].copy()

In [None]:
# Display the input frame.
X

In [None]:
# Display the target series.
y

In [None]:
# Text-processing tools shared by prepare_data below.
# Tokens are maximal runs of ASCII letters; digits and punctuation are dropped.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
# Reduces each token to its English stem.
stemmer = SnowballStemmer("english")
# Turns the stemmed text into a bag-of-words count matrix (default settings).
cv = CountVectorizer()

In [None]:
def prepare_data(X):
    """Tokenise, stem and vectorise the URL column into bag-of-words counts.

    Uses the module-level ``tokenizer``, ``stemmer`` and ``cv`` objects. The
    vectorizer is (re)fitted on every call, on all rows it is given.

    NOTE(review): the notebook calls this on the whole dataset before any
    train/test split, so the vocabulary is learned from rows that are later
    used for testing.

    Args:
        X: DataFrame with a ``URL`` column. It is not modified.

    Returns:
        Tuple ``(frame, features)`` where ``frame`` is a copy of ``X`` with the
        extra columns ``text_tokenized``, ``text_stemmed`` and ``text_sent``,
        and ``features`` is the count matrix returned by ``cv.fit_transform``.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    X = X.copy()
    # Spreadsheet cells can be read back as numbers; the tokenizer needs text.
    urls = X['URL'].astype(str)
    X['text_tokenized'] = urls.map(tokenizer.tokenize)
    X['text_stemmed'] = X['text_tokenized'].map(
        lambda tokens: [stemmer.stem(token) for token in tokens]
    )
    # CountVectorizer expects one string per document, not a token list.
    X['text_sent'] = X['text_stemmed'].map(' '.join)
    features = cv.fit_transform(X['text_sent'])
    return X, features

In [None]:
# Build the text columns and the bag-of-words matrix for the whole subset.
X, features = prepare_data(X)

In [None]:
# Display the input frame, now including the tokenised/stemmed text columns.
X

In [None]:
# Display the summary of the count matrix produced by the vectorizer.
features

In [None]:
# NOTE(review): this notebook defines train_test_model in more than one cell;
# whichever definition ran last is the one in effect. Keep a single copy.
def train_test_model(model, X, y, training_percentage):
    """Fit ``model`` on a stratified training split and score it on the rest.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place and stays fitted on this split afterwards.
        X: Feature matrix with one row per sample.
        y: Labels aligned with ``X``; label ``1`` is scored as the positive
            class. Must support ``len()``.
        training_percentage: Fraction of the samples used for training,
            strictly between 0 and 1.

    Returns:
        Tuple ``(accuracy, precision, recall)`` measured on the held-out rows.

    Raises:
        ValueError: If ``training_percentage`` is not strictly between 0 and 1.
    """
    if not 0 < training_percentage < 1:
        raise ValueError(
            f"training_percentage must be strictly between 0 and 1, got {training_percentage!r}"
        )
    # Request an explicit row count. Passing test_size=1-training_percentage
    # carries float noise (1 - 0.7 == 0.30000000000000004), which scikit-learn
    # rounds up to one extra test row (1501 instead of 1500 out of 5000).
    n_train = int(round(training_percentage * len(y)))
    trainX, testX, trainY, testY = train_test_split(
        X, y, train_size=n_train, stratify=y, random_state=42
    )
    model.fit(trainX, trainY)
    predY = model.predict(testX)
    accuracy = accuracy_score(testY, predY)
    # zero_division=0 keeps the score at 0 when no positive is predicted or
    # present, without depending on the global warning filter.
    precision = precision_score(testY, predY, pos_label=1, zero_division=0)
    recall = recall_score(testY, predY, pos_label=1, zero_division=0)
    return accuracy, precision, recall

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier


In [None]:
# Classifiers compared in the experiments, all with default hyper-parameters.
# The tree-based models draw random numbers while training, so they get a
# fixed seed (the same one used for the data shuffle and the splits) to make
# the reported metrics repeatable from run to run.
logreg = LogisticRegression()
dtree = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)
svc = SVC()
xgb_model = XGBClassifier(random_state=42)


In [None]:
# NOTE(review): this notebook defines train_test_model in more than one cell;
# whichever definition ran last is the one in effect. Keep a single copy.
def train_test_model(model, X, y, training_percentage):
    """Fit ``model`` on a stratified training split and score it on the rest.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place and stays fitted on this split afterwards.
        X: Feature matrix with one row per sample.
        y: Labels aligned with ``X``; label ``1`` is scored as the positive
            class. Must support ``len()``.
        training_percentage: Fraction of the samples used for training,
            strictly between 0 and 1.

    Returns:
        Tuple ``(accuracy, precision, recall)`` measured on the held-out rows.

    Raises:
        ValueError: If ``training_percentage`` is not strictly between 0 and 1.
    """
    if not 0 < training_percentage < 1:
        raise ValueError(
            f"training_percentage must be strictly between 0 and 1, got {training_percentage!r}"
        )
    # Request an explicit row count. Passing test_size=1-training_percentage
    # carries float noise (1 - 0.7 == 0.30000000000000004), which scikit-learn
    # rounds up to one extra test row (1501 instead of 1500 out of 5000).
    n_train = int(round(training_percentage * len(y)))
    trainX, testX, trainY, testY = train_test_split(
        X, y, train_size=n_train, stratify=y, random_state=42
    )
    model.fit(trainX, trainY)
    predY = model.predict(testX)
    accuracy = accuracy_score(testY, predY)
    # zero_division=0 keeps the score at 0 when no positive is predicted or
    # present, without depending on the global warning filter.
    precision = precision_score(testY, predY, pos_label=1, zero_division=0)
    recall = recall_score(testY, predY, pos_label=1, zero_division=0)
    return accuracy, precision, recall

In [None]:
# NOTE(review): this notebook defines train_test_model in more than one cell;
# whichever definition ran last is the one in effect. Keep a single copy.
def train_test_model(model, X, y, training_percentage):
    """Fit ``model`` on a stratified training split and score it on the rest.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place and stays fitted on this split afterwards.
        X: Feature matrix with one row per sample.
        y: Labels aligned with ``X``; label ``1`` is scored as the positive
            class. Must support ``len()``.
        training_percentage: Fraction of the samples used for training,
            strictly between 0 and 1.

    Returns:
        Tuple ``(accuracy, precision, recall)`` measured on the held-out rows.

    Raises:
        ValueError: If ``training_percentage`` is not strictly between 0 and 1.
    """
    if not 0 < training_percentage < 1:
        raise ValueError(
            f"training_percentage must be strictly between 0 and 1, got {training_percentage!r}"
        )
    # Request an explicit row count. Passing test_size=1-training_percentage
    # carries float noise (1 - 0.7 == 0.30000000000000004), which scikit-learn
    # rounds up to one extra test row (1501 instead of 1500 out of 5000).
    n_train = int(round(training_percentage * len(y)))
    trainX, testX, trainY, testY = train_test_split(
        X, y, train_size=n_train, stratify=y, random_state=42
    )
    model.fit(trainX, trainY)
    predY = model.predict(testX)
    accuracy = accuracy_score(testY, predY)
    # zero_division=0 keeps the score at 0 when no positive is predicted or
    # present, without depending on the global warning filter.
    precision = precision_score(testY, predY, pos_label=1, zero_division=0)
    recall = recall_score(testY, predY, pos_label=1, zero_division=0)
    return accuracy, precision, recall

In [None]:
import numpy as np
# Training fractions to evaluate: 10 %, 20 %, ..., 90 % of the data.
training_sizes = np.arange(1, 10) / 10

In [None]:
def model_results(model):
    """Evaluate ``model`` once per entry of ``training_sizes``.

    Reads the module-level ``features``, ``y`` and ``training_sizes`` and
    delegates each run to ``train_test_model``.

    Returns:
        DataFrame with one row per training size, in the order of
        ``training_sizes``, and the columns Accuracy, Precision and Recall.
    """
    metric_rows = [
        train_test_model(model, features, y, fraction)
        for fraction in training_sizes
    ]
    return pd.DataFrame(metric_rows, columns=['Accuracy', 'Precision', 'Recall'])

In [None]:
# Run the full sweep of training sizes for every classifier.
# Each call refits the given estimator once per training size, so every
# estimator is left fitted on the last split in training_sizes.
logreg_results = model_results(logreg)
dtree_results = model_results(dtree)
rfc_results = model_results(rfc)
svc_results = model_results(svc)
xgb_model_results= model_results(xgb_model)

In [None]:
# Column labels for the comparison tables, in the same order as the result
# frames below.
# NOTE(review): 'xgboast' looks like a typo for 'XGBoost'; it is left as is
# because it is the column name other code may look up.
models = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM', 'xgboast']

# Deliberately not called `model_results`: that name belongs to the function
# that runs the experiments, and rebinding it to a list makes any re-run of
# the experiment cell fail with "'list' object is not callable".
results_by_model = [logreg_results, dtree_results, rfc_results, svc_results, xgb_model_results]


def _metric_table(metric):
    """Collect one metric column from every result frame into a single table.

    Rows are indexed by training percentage, columns by the names in `models`.
    """
    per_model = [result[metric].values for result in results_by_model]
    return pd.DataFrame(np.transpose(per_model), columns=models, index=training_sizes * 100)


accuracies = _metric_table('Accuracy')
precisions = _metric_table('Precision')
recalls = _metric_table('Recall')

In [None]:
# Accuracy per training percentage (rows) and algorithm (columns).
accuracies

In [None]:
# Precision per training percentage (rows) and algorithm (columns).
precisions

In [None]:
# Recall per training percentage (rows) and algorithm (columns).
recalls

In [None]:
# Report the test accuracy each algorithm reached at the largest training
# split. The numbers are read from the `accuracies` table rather than typed in
# by hand, so the report always matches the experiment that was actually run.
# Training-set accuracy is not measured anywhere in this notebook
# (train_test_model only scores the held-out rows), so it is not reported.
largest_split = accuracies.index[-1]
algorithms = list(accuracies.columns)
testing_accuracies = accuracies.iloc[-1].tolist()

# Print headers
print(f"Test accuracy with {largest_split:.0f}% of the data used for training")
print("Algorithm\t\tTesting Accuracy")

# One line per algorithm with its measured accuracy
for algorithm, test_acc in zip(algorithms, testing_accuracies):
    print(f"{str(algorithm).ljust(20)}\t{test_acc:.6f}")



In [None]:
def plot_metric_vs_training_size(metric_table, metric_name, y_min):
    """Draw one line per algorithm for a metric across training percentages.

    Args:
        metric_table: DataFrame indexed by training percentage with one
            column per algorithm.
        metric_name: Name used in the title and on the y axis.
        y_min: Lower limit of the y axis; the upper limit is always 1.

    Returns:
        The matplotlib Axes the lines were drawn on.
    """
    # A fresh figure per call keeps the three charts from piling onto one axes.
    fig, ax = plt.subplots(figsize=(15, 8))
    # One circle marker per column, however many algorithms are compared.
    sns.lineplot(data=metric_table, markers=['o'] * metric_table.shape[1], ax=ax)
    ax.set(
        xlim=(0, 100),
        ylim=(y_min, 1),
        xticks=np.arange(0, 100, 10),
        yticks=np.arange(y_min, 1, 0.05),
    )
    ax.set_title(f"{metric_name} vs Training Percentage for the Machine Learning Algorithms")
    ax.set_xlabel("Training Percentage")
    ax.set_ylabel(metric_name)
    plt.show()
    return ax


# Shared look for the three comparison charts.
sns.set(rc={'figure.figsize': (15, 8)})
sns.set_style('whitegrid')

g = plot_metric_vs_training_size(accuracies, "Accuracy", 0.6)
g = plot_metric_vs_training_size(precisions, "Precision", 0.4)
g = plot_metric_vs_training_size(recalls, "Recall", 0)

In [None]:
import pickle
filename = 'model1.pkl'

# Persist the random-forest classifier as it was left by the experiment sweep.
# NOTE(review): only the classifier is saved. Its inputs are the count matrix
# produced by the fitted vectorizer `cv`, which is not written here, so scoring
# new URLs from this file alone is presumably not possible.
# `with` closes the file handles even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(rfc, model_file)

# Round-trip check. Only unpickle files you created yourself: loading a pickle
# from an untrusted source can execute arbitrary code.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)