In [1]:
!pip install scikit-learn==1.2.1

Collecting scikit-learn==1.2.1
  Downloading scikit_learn-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.4.1.post1
    Uninstalling scikit-learn-1.4.1.post1:
      Successfully uninstalled scikit-learn-1.4.1.post1
Successfully installed scikit-learn-1.2.1


In [3]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m962.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Load the dataset
data = pd.read_csv("malicious_phish.csv")

# Feature extraction
X = data['url']
y = data['type']

# Convert string labels to numeric labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Convert URLs into features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train XGBoost classifier
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_vect, y_train)

# Evaluate XGBoost model
xgb_y_pred = xgb_model.predict(X_test_vect)
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_y_pred))
print(classification_report(y_test, xgb_y_pred))

# Saving the model
joblib.dump(xgb_model, "xgboost_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

# Predicting new URLs
new_urls = ["http://example.com", "http://malicious-site.com"]
new_urls_vect = vectorizer.transform(new_urls)
xgb_predictions = xgb_model.predict(new_urls_vect)
decoded_predictions = label_encoder.inverse_transform(xgb_predictions)
print("XGBoost Predictions:", decoded_predictions)


XGBoost Accuracy: 0.9209069479956081
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     85778
           1       0.95      0.99      0.97     19104
           2       0.97      0.87      0.92      6521
           3       0.86      0.59      0.70     18836

    accuracy                           0.92    130239
   macro avg       0.92      0.86      0.88    130239
weighted avg       0.92      0.92      0.92    130239

XGBoost Predictions: ['phishing' 'defacement']
