In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

#Read CSV file
csv_file_path = r"/content/malicious_phish.csv"
df = pd.read_csv(csv_file_path)
print(df.head(10))

def fetch_html_content(url):
  try:
    response = requests.get(url)
    response.raise_for_status()
    return response.text
  except requests.exceptions.RequestException as e:
    return None

df['html_content'] = df['url'].apply(fetch_html_content)
print("-------------------")

df = df.dropna(subset=['html_content'])

x = df['html_content']
y = df['type']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

vectorizer = CountVectorizer()
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

classifier = MultinomialNB()
classifier.fit(x_train_vectorized, y_train)

y_pred = classifier.predict(x_test_vectorized)

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

new_url = "https://insights.smartasset.com/disrupt"
new_html_content = fetch_html_content(new_url)
if new_html_content:
  new_html_content_vec = vectorizer.transform([new_html_content])
  prediction = classifier.predict(new_html_content_vec)
  print("Prediction:", prediction[0])
else:
  print("Failed to fetch HTML content.")

                                                 url      type
0                         account-acces-security.com  phishing
1             aliciakeysfan.com/no-one-first-single/      safe
2                     answers.com/topic/james-arness      safe
3  ciera.org/library/reports/inquiry-2/2-004/2-00...      safe
4                               cwconsultores.cnt.br  phishing
5                        dreamersandbelievers.com.au  phishing
6                               heritage-survey.com/      safe
7  http://9779.info/%E5%8A%A8%E7%89%A9%E7%BA%B8%E...   malware
8  http://9779.info/%E5%B0%8F%E7%8F%AD%E5%B9%BC%E...   malware
9  http://9779.info/%E8%A1%8D%E7%BA%B8%E8%B4%B4%E...   malware
-------------------
Accuracy: 0.5
Prediction: safe
