In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import os
import re

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

from joblib import dump

In [3]:
# Labelled URL dataset with two columns: 'URLs' and 'Class'.
url_df = pd.read_csv("Malicious URLs.csv")

# Previously collected user corrections. The feedback cell writes them to
# "feedback_url_df.csv", so the existence check has to test that same path;
# checking a different file name either discards saved feedback or raises
# FileNotFoundError on the read.
feedback_url_df = (
    pd.read_csv("feedback_url_df.csv")
    if os.path.exists("feedback_url_df.csv")
    else pd.DataFrame(columns=['URLs', 'Class'])
)

# Sample URL (row label 4) used further down to demonstrate the tokenizer.
test_url = url_df["URLs"][4]

In [4]:
# Preview the loaded dataset; pandas abbreviates the middle rows of a large frame.
print(url_df)

                                                     URLs Class
0                         freebase.com/view/en/bob_sirois  good
1                          en.wikipedia.org/wiki/Joie_Lee  good
2                pipl.com/directory/people/Rejean/Beaudin  good
3       flickr.com/photos/teneyck/sets/72157610336209297/  good
4       ussoccer.com/News/Federation-Services/2009/06/...  good
...                                                   ...   ...
420459  ourorigins.org/genealogielistfirstname.aspx?an...  good
420460    simira.co.id/cifk/live.com/Account_Verified.htm   bad
420461  kstatesports.com/sports/w-baskbl/spec-rel/ksu-...  good
420462  vh1.com/video/living-colour/9128/cult-of-perso...  good
420463     absoluteastronomy.com/topics/SummerSlam_(1990)  good

[420464 rows x 2 columns]


In [5]:
# Hold out a fixed share of the rows for evaluation. The constant seed keeps
# the partition repeatable from run to run.
test_percentage = 0.2
train_df, test_df = train_test_split(
    url_df,
    test_size=test_percentage,
    random_state=42,
)

# Target column of each partition.
labels = train_df["Class"]
test_labels = test_df['Class']

In [6]:
# Report how many rows landed in each partition (text output only).
for split_name, split_df in (("Training Samples", train_df), ("Testing Samples", test_df)):
    print(split_name, len(split_df))

Training Samples 336371
Testing Samples 84093


In [7]:
# Break a URL into tokens on the '.', '/' and '-' separators
def tokenizer(url):
    """Split ``url`` on '.', '/' and '-' and drop the pieces 'com' and 'www'.

    The remaining pieces are returned as a list in their original order.
    Matching is exact and case-sensitive, and empty strings produced by
    adjacent or trailing separators are kept.
    """
    ignored = ('com', 'www')
    kept = []
    for piece in re.split(r'[\./-]', url):
        if piece in ignored:
            continue
        kept.append(piece)
    return kept
# Sanity-check the tokenizer on the sample URL picked when the data was loaded.
tokenized_url = tokenizer(test_url)
print(tokenized_url)

['ussoccer', 'News', 'Federation', 'Services', '2009', '06', 'University', 'Of', 'Miami', 'President', 'Donna', 'E', 'Shalala', 'Joins', 'Team', 'To', 'Bring', 'FIFA', 'World', 'Cup', 'To', 'United', 'States', 'In', 'aspx']


In [8]:
# Fit a CountVectorizer on the training URLs, using `tokenizer` as the analyzer,
# and build the training count matrix.
count_vec = CountVectorizer(analyzer=tokenizer)
count_x = count_vec.fit_transform(train_df['URLs'])

# Fit a TfidfVectorizer on the same training URLs with the same analyzer.
tVec = TfidfVectorizer(analyzer=tokenizer)
tfidf_x = tVec.fit_transform(train_df['URLs'])
# Persist the fitted TF-IDF vectorizer; the prediction cell reloads it from
# 'tvec.joblib'. As the last expression, dump()'s return value is the cell output.
# NOTE(review): the saved vectorizer refers to `tokenizer`; loading the file in
# another process presumably needs that function to be defined there -- confirm.
dump(tVec, 'tvec.joblib')

['tvec.joblib']

In [9]:
# Encode the held-out URLs with the vectorizers fitted on the training split
# (transform only, nothing is re-fitted here).
test_count_x = count_vec.transform(test_df['URLs'])
test_tfid_x = tVec.transform(test_df['URLs'])

In [10]:
# Fit a Multinomial Naive Bayes classifier on the TF-IDF training features.
mnb_tfidf = MultinomialNB(alpha=0.1)
mnb_tfidf.fit(tfidf_x, labels)

# Evaluate on the held-out split: accuracy, predictions, confusion matrix, report.
score_mnb_tfidf = mnb_tfidf.score(test_tfid_x, test_labels)
predictions_mnb_tfidf = mnb_tfidf.predict(test_tfid_x)
cmarix_mnb_tdidf = confusion_matrix(test_labels, predictions_mnb_tfidf)
classification_report_mnb_tfidf = classification_report(
    test_labels,
    predictions_mnb_tfidf,
)

# NOTE(review): the classification report is printed twice -- once here without
# a heading and once more at the end under "Classification Report:".
print(classification_report_mnb_tfidf)
print(f"Accuracy: {score_mnb_tfidf}")
print("Confusion Matrix:")
print(cmarix_mnb_tdidf)
print("Classification Report:")
print(classification_report_mnb_tfidf)

              precision    recall  f1-score   support

         bad       0.98      0.91      0.94     15136
        good       0.98      1.00      0.99     68957

    accuracy                           0.98     84093
   macro avg       0.98      0.95      0.96     84093
weighted avg       0.98      0.98      0.98     84093

Accuracy: 0.9792729478077843
Confusion Matrix:
[[13729  1407]
 [  336 68621]]
Classification Report:
              precision    recall  f1-score   support

         bad       0.98      0.91      0.94     15136
        good       0.98      1.00      0.99     68957

    accuracy                           0.98     84093
   macro avg       0.98      0.95      0.96     84093
weighted avg       0.98      0.98      0.98     84093



In [11]:
from joblib import dump

# Persist the fitted classifier; the prediction cell reloads it from this file.
# As the last expression, dump()'s return value is shown as the cell output.
dump(mnb_tfidf, 'mnb_tfidf_model.joblib')

['mnb_tfidf_model.joblib']

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from joblib import load

# Interactive check of a single link with the persisted model, followed by an
# optional correction that is appended to the feedback file.

# Reload the classifier from disk so this cell uses exactly what was saved.
mnb_tfidf = load('mnb_tfidf_model.joblib')

# Reload the TF-IDF vectorizer that was fitted on the training URLs, so the
# features built here match the ones the classifier was trained on.
tfidf_vectorizer = load('tvec.joblib')

# Strip surrounding whitespace: a stray space from typing or pasting would end
# up inside a token and would also defeat the exact-match dataset lookup below.
# NOTE(review): the dataset rows shown in this notebook carry no scheme
# ("http://"); presumably links should be entered the same way -- confirm.
user_input = input("Enter the link you want to check: ").strip()

# Vectorize the single link (transform expects an iterable of documents).
user_input_vectorized = tfidf_vectorizer.transform([user_input])

# predict() yields one label per input row. Keep the array for display and pull
# out the single label so the branch below compares a string with a string
# rather than relying on the truth value of an array comparison.
prediction = mnb_tfidf.predict(user_input_vectorized)
predicted_label = prediction[0]

print("Prediction:", prediction)

if predicted_label == "bad":
    print("The link has high probability of being malicious.")
else:
    print("The link has low probability of being malicious.")

# Feedback is only requested for links that are not already labelled in the
# original dataset.
if user_input not in url_df['URLs'].values:
    feedback = input("Is the prediction correct? (yes/no): ").strip().lower()

    if feedback == "no":
        correct_label = input("Enter the correct label (good/bad): ").strip().lower()

        # Only the two labels used by the dataset are accepted, so a typo
        # cannot introduce a third class into the feedback file.
        if correct_label in ("good", "bad"):
            new_entry = pd.DataFrame({'URLs': [user_input], 'Class': [correct_label]})

            # Append to what is already on disk instead of trusting the
            # in-memory frame, so earlier feedback is never overwritten.
            if os.path.exists("feedback_url_df.csv"):
                feedback_url_df = pd.concat(
                    [pd.read_csv("feedback_url_df.csv"), new_entry],
                    ignore_index=True,
                )
            else:
                feedback_url_df = new_entry

            feedback_url_df.to_csv("feedback_url_df.csv", index=False)
            print("Feedback recorded.")
        else:
            print("Unrecognised label (expected 'good' or 'bad'). Feedback not recorded.")
else:
    print("The link is in the original dataset. No feedback required.")



Prediction: ['good']
The link has low probability of being malicious.
