In [1]:
import pandas as pd
import requests
import certifi
from urllib.parse import urlparse
import tldextract
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
import re



In [2]:
# Read the labelled URL dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer="data.csv")


In [3]:
# Preview the loaded frame; as the last expression of the cell it is rendered as the cell's output.
data


Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad
...,...,...
549341,23.227.196.215/,bad
549342,apple-checker.org/,bad
549343,apple-iclods.org/,bad
549344,apple-uptoday.org/,bad


In [4]:
# Build the model inputs: target labels plus TF-IDF features of the raw URL strings.
# NOTE(review): the vectorizer is fitted on every row of `data`, including rows that
# are later held out for testing, so its vocabulary/IDF statistics see the test URLs.
y = data['Label']
url_corpus = data['URL']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(url_corpus)



In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)



In [6]:
# Fit a logistic-regression classifier on the training split.
# The fit call is left as the final expression so the cell still echoes the estimator.
lr = LogisticRegression(max_iter=1000)
lr.fit(X=X_train, y=y_train)



LogisticRegression(max_iter=1000)

In [7]:
# Score the classifier on the held-out split and report plain accuracy
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)



Accuracy: 0.9567488850459634


In [8]:
# # Save the trained model and vectorizer
# filename1 = "logistic_model.bin"
# filename2 = "vectorizer_model.bin"
# with open(filename1, 'wb') as model_file:
#     pickle.dump(lr, model_file)
# with open(filename2, 'wb') as vectorizer_file:
#     pickle.dump(vectorizer, vectorizer_file)

# Persist the fitted classifier and the fitted vectorizer as ".pickle" files,
# model first and vectorizer second.
filename1 = "logistic.pickle"
filename2 = "vectorizer.pickle"
for target_path, fitted_object in ((filename1, lr), (filename2, vectorizer)):
    with open(target_path, 'wb') as sink:
        pickle.dump(fitted_object, sink)

In [9]:

# Restore the classifier and the vectorizer from the pickle files on disk.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open("logistic.pickle", mode="rb") as pickled_model:
    loaded_model = pickle.load(pickled_model)

with open("vectorizer.pickle", mode="rb") as pickled_vectorizer:
    loaded_vectorizer = pickle.load(pickled_vectorizer)




In [10]:
# Function to classify a URL
def classify_url(url):
    """Classify a URL using an HTTPS/domain heuristic plus the pickled model.

    The URL is fetched with a 5 second timeout and TLS verification against
    the certifi CA bundle. When the response status is 200, the registered
    domain of the final (post-redirect) URL is extracted and a heuristic
    weight is computed:

    * 2.0 if the URL's scheme is https, otherwise 1.0;
    * multiplied by 1.5 if "amazon" occurs in the registered domain.

    A weight above 1.0 yields "good" outright; otherwise the label predicted
    by ``loaded_model`` on the ``loaded_vectorizer`` features is returned.

    Parameters
    ----------
    url : str
        URL to classify. It is passed to ``requests.get`` unchanged, so it
        must include a scheme for the request to succeed.

    Returns
    -------
    tuple
        ``(label, final_url, domain)``. ``("unknown", "N/A", "N/A")`` is
        returned when the request raises a ``RequestException`` or the
        response status is not 200.
    """
    # Decide on HTTPS from the leading scheme only. A substring test such as
    # `"https://" in url` also matches an https link embedded in the path or
    # query string (e.g. "http://evil.example/?next=https://bank.example"),
    # which would wrongly promote a plain-http URL to "good".
    # Leading whitespace is ignored here (assumes requests also tolerates it).
    is_https = url.lstrip().lower().startswith("https://")
    weight = 2.0 if is_https else 1.0

    # Fetch the URL and validate its SSL certificate.
    # NOTE(review): this issues a live request to a possibly malicious,
    # user-supplied address; consider sandboxing if inputs are untrusted.
    try:
        response = requests.get(url, timeout=5, verify=certifi.where())
        if response.status_code == 200:
            final_url = response.url
            domain_info = tldextract.extract(final_url)
            domain = domain_info.domain + "." + domain_info.suffix

            # Additional feature: boost domains containing "amazon".
            # NOTE(review): this is a substring match, so look-alike domains
            # such as "amazon-login.example" are boosted too.
            if "amazon" in domain:
                weight *= 1.5  # Adjust the weight as needed based on domain-specific rules

            # Any boosted weight short-circuits to "good"; the model is not consulted.
            if weight > 1.0:
                return "good", final_url, domain

            # Unboosted URL: fall back to the trained model's prediction.
            # NOTE(review): the rows shown in the dataset preview carry no scheme,
            # while `url` does here - confirm the vectorizer handles that consistently.
            features = loaded_vectorizer.transform([url])
            prediction = loaded_model.predict(features)
            return prediction[0], final_url, domain

    except requests.exceptions.RequestException:
        # Deliberate best-effort: network, TLS and malformed-URL errors all map to "unknown".
        return "unknown", "N/A", "N/A"

    # Reached only for non-200 responses.
    return "unknown", "N/A", "N/A"

In [11]:
# Prompt for a URL, classify it, and print each part of the result on its own line
input_url = input("Enter a URL: ")
prediction, final_url, domain = classify_url(input_url)
for caption, value in (
    ("Prediction:", prediction),
    ("Final URL:", final_url),
    ("Domain:", domain),
):
    print(caption, value)

Enter a URL: https://www.amazon.in/
Prediction: good
Final URL: https://www.amazon.in/
Domain: amazon.in
