In [9]:
# Install the extra packages this notebook needs. %pip (rather than !pip) installs into
# the environment of the running kernel. python-whois and tldextract are pinned to the
# versions this notebook was last run with.
%pip install python-whois==0.9.6 tldextract==5.3.0 requests beautifulsoup4


Collecting python-whois
  Downloading python_whois-0.9.6-py3-none-any.whl.metadata (3.0 kB)
Collecting tldextract
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-3.0.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading python_whois-0.9.6-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tldextract-5.3.0-py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-3.0.0-py2.py3-none-any.whl (4.5 kB)
Installing collected packages: requests-file, python-whois, tldextract
Successfully installed python-whois-0.9.6 requests-file-3.0.0 tldextract-5.3.0


In [10]:
# Standard library
import time
import warnings

# Third-party
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Project-local feature extractor
from extractor import extract_website_features

# Suppress every warning from this point on (installed only after the imports above)
warnings.filterwarnings("ignore")

# Location of the labelled dataset
DATASET_FILE = "website_security_dataset.csv"

# Optional delay for feature extraction when predicting multiple URLs
DELAY = 0.5


In [11]:
# Read the labelled dataset from disk, report its dimensions and preview the first rows
df = pd.read_csv(DATASET_FILE)
print(f"Dataset loaded: {df.shape}")
df.head()


Dataset loaded: (999, 26)


Unnamed: 0,Website,has_https,ssl_valid,ssl_expiry_days,ssl_issuer,domain_age_days,whois_info_available,organization_name,url_length,num_dots,...,num_external_links,has_login_keyword,has_privacy_policy,has_contact_page,num_suspicious_keywords,has_hsts_header,has_x_frame_options,has_csp,has_x_content_type_options,secure_label
0,https://ucla.edu,1,1,333,Amazon,14787,1,,16,1,...,151,0,1,1,0,0,1,0,1,1
1,https://netangels.ru,1,1,71,Let's Encrypt,7991,1,"LLP ""Internet-Pro""",20,1,...,45,0,0,1,0,0,0,0,0,1
2,https://sharepoint.com,1,1,224,Microsoft Corporation,9931,1,Microsoft Corporation,22,1,...,1,0,0,0,0,1,0,0,0,1
3,https://ptt.cc,0,0,0,,8674,1,,14,1,...,0,0,0,0,0,0,0,0,0,1
4,https://newsweek.com,1,1,289,Amazon,11477,1,Newsweek Magazine LLC,20,1,...,281,1,1,1,0,1,1,0,1,1


In [26]:
# Preview the last rows of the dataset (complements the head() preview above).
# NOTE(review): this cell was executed out of order (In [26]); the previewed head rows
# all carry secure_label 1 and the tail rows secure_label 0, so the file looks ordered
# by label - confirm.
df.tail()

Unnamed: 0,Website,has_https,ssl_valid,ssl_expiry_days,ssl_issuer,domain_age_days,whois_info_available,organization_name,url_length,num_dots,...,num_external_links,has_login_keyword,has_privacy_policy,has_contact_page,num_suspicious_keywords,has_hsts_header,has_x_frame_options,has_csp,has_x_content_type_options,secure_label
994,http://answers.ask.com/Entertainment/Music/who...,0,0,0,,9860,1,Ask.com,61,2,...,0,0,0,0,0,0,0,0,0,0
995,http://www.ringsurf.com/ring/digitalring/,0,0,0,,9923,1,REDACTED FOR PRIVACY,41,2,...,0,0,0,0,0,0,0,0,0,0
996,http://www.hp.com/sbso/index.html,0,0,0,,14474,1,HP Inc.,33,3,...,0,0,0,0,0,0,0,0,0,0
997,http://reuters.com/finance/stocks/companyOffic...,0,0,0,,11825,1,Thomson Reuters Enterprise Centre GmbH,63,2,...,0,0,0,0,0,0,0,1,0,0
998,http://divxturka.net/mkv-mp4-rmvb-movies/10926...,0,0,0,,6638,1,"Domains By Proxy, LLC",90,2,...,0,0,0,1,0,0,0,0,0,0


In [12]:
# Separate the target from the predictors. Besides the raw URL, drop any "Unnamed: N"
# column: it is the row index that was written out with the CSV, not a property of a
# website. The file appears to be ordered by label, so keeping it would let the model
# read the label off the row number, and the column can never be produced for a new URL.
index_artifacts = [col for col in df.columns if str(col).startswith("Unnamed")]
X = df.drop(columns=["Website", "secure_label", *index_artifacts])
y = df["secure_label"]

# One-hot encode categorical features
X = pd.get_dummies(X, columns=["ssl_issuer", "organization_name", "tld_type"])

# Standardise every column (the one-hot indicators included).
# NOTE(review): the scaler is fitted on all rows, before the train/test split, so the
# held-out rows contribute to its statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Remember the exact column order so prediction-time frames can be aligned to it
X_columns = X.columns.tolist()

print("Features prepared. Total features:", len(X_columns))


Features prepared. Total features: 323


In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")


Training set: (799, 323)
Test set: (200, 323)


In [14]:
# Fit a 200-tree random forest on the training split; the fixed seed keeps the fit repeatable
model = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
print("✅ Model trained successfully")


✅ Model trained successfully


In [15]:
# Predict the held-out split, then compute and print the three summary metrics
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", matrix)
print("\nClassification Report:\n", report)


Accuracy: 0.99

Confusion Matrix:
 [[104   2]
 [  0  94]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       106
           1       0.98      1.00      0.99        94

    accuracy                           0.99       200
   macro avg       0.99      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200



In [16]:
# Persist the fitted forest, the fitted scaler and the training column order, in that order
artifacts = {
    "website_classifier.pkl": model,
    "scaler.pkl": scaler,
    "X_columns.pkl": X_columns,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)
print("✅ Model, scaler, and feature columns saved")


✅ Model, scaler, and feature columns saved


In [17]:
# Reload the persisted model, scaler and feature-column list from disk
model = joblib.load("website_classifier.pkl")
scaler = joblib.load("scaler.pkl")
X_columns = joblib.load("X_columns.pkl")

def extract_features_for_prediction(url):
    """Build a one-row feature frame for ``url`` laid out like the training matrix.

    Parameters
    ----------
    url : str
        Address handed to ``extract_website_features``.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are exactly ``X_columns``, in that order.
        Training columns the extraction did not produce are filled with 0;
        columns that are not in ``X_columns`` are discarded.
    """
    # NOTE(review): the label passed here is a dummy value; presumably the extractor
    # just copies it into its output - confirm. Whatever it produces for it is not in
    # X_columns, so the reindex below discards it.
    features = extract_website_features(url, labels={"secure_label": 1})
    df_features = pd.DataFrame([features])

    # One-hot encode the same categorical columns that were encoded for training
    df_features = pd.get_dummies(df_features, columns=["ssl_issuer", "organization_name", "tld_type"])

    # Align with the training layout in a single step: missing columns are added as 0,
    # unknown ones are dropped and the order follows X_columns. This replaces inserting
    # the missing columns one at a time, which fragments the frame.
    return df_features.reindex(columns=X_columns, fill_value=0)


In [18]:
def predict_website(url):
    """Classify ``url`` and print the verdict together with the model's confidence.

    Parameters
    ----------
    url : str
        Address to check; passed to ``extract_features_for_prediction``.

    Returns
    -------
    None
        The result is printed: "SAFE" when the predicted class is 1, "MALICIOUS"
        otherwise, followed by the probability of the predicted class as a percentage.
    """
    df_features = extract_features_for_prediction(url)
    df_scaled = scaler.transform(df_features)

    # Run the forest once, take the most probable class and look its label up in
    # model.classes_. Indexing the probability row with the label itself only works
    # while the labels happen to equal their positions in classes_.
    proba = model.predict_proba(df_scaled)[0]
    best = proba.argmax()
    pred = model.classes_[best]
    prob = proba[best]

    if pred == 1:
        print(f"✅ {url} is likely SAFE ({prob*100:.2f}% confidence)")
    else:
        print(f"⚠️ {url} is likely MALICIOUS ({prob*100:.2f}% confidence)")


In [19]:
# Prompt for a URL and classify it. Surrounding whitespace from typing or pasting is
# stripped, and an empty entry is reported instead of being sent to the extractor.
url = input("Enter a URL to check: ").strip()
if url:
    predict_website(url)
else:
    print("No URL entered.")


Enter a URL to check: chatgpt.com
✅ chatgpt.com is likely SAFE (100.00% confidence)


In [23]:
# Prompt for another URL and classify it. Surrounding whitespace from typing or pasting
# is stripped, and an empty entry is reported instead of being sent to the extractor.
url = input("Enter a URL to check: ").strip()
if url:
    predict_website(url)
else:
    print("No URL entered.")


Enter a URL to check: www.giss.nasa.gov/tools/
⚠️ www.giss.nasa.gov/tools/ is likely MALICIOUS (97.00% confidence)


In [24]:
# Spot-check the classifier on a hard-coded http:// URL with "login" in its host name
predict_website("http://fakebank-login.com")

⚠️ http://fakebank-login.com is likely MALICIOUS (76.00% confidence)


In [25]:
# Spot-check the classifier on a hard-coded https:// URL
predict_website("https://www.nasa.gov")

✅ https://www.nasa.gov is likely SAFE (93.00% confidence)
