In [60]:
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split

# Read the ARFF file; the first element of the returned value becomes the DataFrame
data = arff.loadarff('Training Dataset.arff')
df = pd.DataFrame(data[0])

# Cast every column to int so the models receive numeric values
df = df.astype(int)

# 'Result' is the target column; every remaining column is a feature
y = df['Result']
X = df.drop(columns='Result')

# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Training set shape: (8844, 30)
Testing set shape: (2211, 30)


In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# SVC wrapped in a pipeline so the features are standardised before fitting
svm_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svm', svm.SVC(random_state=42)),
])

# Candidate estimators, keyed by the label used in the printed report
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM": svm.SVC(random_state=42),
    "SVM with Scaler": svm_pipeline,
    "Knn": KNeighborsClassifier(n_neighbors=3),
    "Linear SVM": LinearSVC(random_state=42),
}

# Score each candidate with 5-fold cross-validation on the training split using F1,
# and keep every mean so the models can be ranked afterwards
model_score = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
    mean_f1 = np.mean(scores)
    model_score[name] = mean_f1

    print(f"--- {name} ---")
    print(f"Scores: {scores}")
    print(f"Mean F1-Score: {mean_f1:.4f}")
    print(f"Std Deviation: {np.std(scores):.4f}\n")


--- Logistic Regression ---
Scores: [0.93998991 0.93721748 0.93053473 0.929      0.9378474 ]
Mean F1-Score: 0.9349
Std Deviation: 0.0043

--- Random Forest ---
Scores: [0.9778002  0.9726997  0.96854718 0.96914517 0.97732997]
Mean F1-Score: 0.9731
Std Deviation: 0.0039

--- Gradient Boosting ---
Scores: [0.95753286 0.95481928 0.94842263 0.95233317 0.95866935]
Mean F1-Score: 0.9544
Std Deviation: 0.0037

--- SVM ---
Scores: [0.95582329 0.948      0.94474863 0.94984955 0.95420232]
Mean F1-Score: 0.9505
Std Deviation: 0.0040

--- SVM with Scaler ---
Scores: [0.96024157 0.95424837 0.94469357 0.95171026 0.95761857]
Mean F1-Score: 0.9537
Std Deviation: 0.0054

--- Knn ---
Scores: [0.95665323 0.94726166 0.93809763 0.94512195 0.94758065]
Mean F1-Score: 0.9469
Std Deviation: 0.0059

--- Linear SVM ---
Scores: [0.94046418 0.93762575 0.93020937 0.93113772 0.93743693]
Mean F1-Score: 0.9354
Std Deviation: 0.0040



In [62]:
# Rank the candidates from highest to lowest mean F1-score
sorted_models = sorted(
    model_score.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

print("\nModels sorted by Mean F1-Score:")
for model, score in sorted_models:
    print(f"{model}: {score:.4f}")

# The winner is the key whose stored mean score is largest
best_model = max(model_score, key=lambda candidate: model_score[candidate])
print(f"Best model: {best_model}")
print(f"Best score: {model_score[best_model]:.4f}")



Models sorted by Mean F1-Score:
Random Forest: 0.9731
Gradient Boosting: 0.9544
SVM with Scaler: 0.9537
SVM: 0.9505
Knn: 0.9469
Linear SVM: 0.9354
Logistic Regression: 0.9349
Best model: Random Forest
Best score: 0.9731


In [63]:
# Hyperparameter search for the Random Forest using GridSearchCV
from sklearn.model_selection import GridSearchCV

# Candidate values explored for each Random Forest hyperparameter
param_grid = dict(
    n_estimators=[100, 200, 300],      # number of trees in the forest
    max_depth=[10, 20, 30, None],      # maximum depth of each tree
    min_samples_split=[2, 5, 10],      # minimum number of samples needed to split a node
)

# 5-fold cross-validated search over the grid, ranked by F1
rf_estimator = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_estimator,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation F1-score: {grid_search.best_score_:.4f}")

# Author's runtime note: about one minute

Best parameters found: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation F1-score: 0.9731


In [65]:
# Evaluate the grid-searched Random Forest on the held-out test split
from sklearn.metrics import classification_report, confusion_matrix

# Best estimator selected by the grid search, used to predict the test split
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)

# NOTE(review): these display names assume -1 encodes legitimate and 1 encodes
# phishing; confirm against the dataset documentation, since the opposite
# encoding would swap the two rows of the report.
class_labels = ['Legitimate (-1)', 'Phishing (1)']

# Per-class precision / recall / F1
print("\n--- Final Model Evaluation on Test Set ---")
print(classification_report(y_test, y_pred, target_names=class_labels))

# Raw counts of true versus predicted labels
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


--- Final Model Evaluation on Test Set ---
                 precision    recall  f1-score   support

Legitimate (-1)       0.98      0.96      0.97       980
   Phishing (1)       0.97      0.98      0.98      1231

       accuracy                           0.98      2211
      macro avg       0.98      0.97      0.98      2211
   weighted avg       0.98      0.98      0.98      2211

Confusion Matrix:
[[ 945   35]
 [  19 1212]]


## Building a Complete ML Pipeline for Random Forest Model

This section builds on the model-selection cells above and wraps the Random Forest in a scikit-learn `Pipeline` that covers preprocessing, hyperparameter tuning, and model evaluation.

In [66]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import joblib

# Two-step pipeline: standardise the features, then fit a Random Forest
pipeline_steps = [
	('preprocessor', StandardScaler()),
	('classifier', RandomForestClassifier(random_state=42)),
]
rf_pipeline = Pipeline(steps=pipeline_steps)

print("Random Forest Pipeline created successfully")
print(f"Pipeline steps: {rf_pipeline.steps}")

Random Forest Pipeline created successfully
Pipeline steps: [('preprocessor', StandardScaler()), ('classifier', RandomForestClassifier(random_state=42))]


In [68]:
# Search space for the pipeline; the 'classifier__' prefix routes each
# parameter to the pipeline step named 'classifier'
pipeline_param_grid = dict(
	classifier__n_estimators=[100, 200, 300],
	classifier__max_depth=[10, 20, 30, None],
	classifier__min_samples_split=[2, 5, 10],
	classifier__min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated grid search over the whole pipeline, ranked by F1
pipeline_grid_search = GridSearchCV(
	rf_pipeline,
	pipeline_param_grid,
	scoring='f1',
	cv=5,
	verbose=1,
)

print("Pipeline GridSearchCV object created")
# The number of candidates is the product of the number of values per parameter
print(
	f"Total combinations to test: {np.prod([len(values) for values in pipeline_param_grid.values()])}")

Pipeline GridSearchCV object created
Total combinations to test: 108


In [69]:
# Run the pipeline grid search on the training split, then report the winner
print("Training pipeline with hyperparameter tuning...")
pipeline_grid_search.fit(X_train, y_train)

print(
	f"Best pipeline parameters: {pipeline_grid_search.best_params_}"
)
print(
	f"Best cross-validation F1-score: {pipeline_grid_search.best_score_:.4f}"
)

Training pipeline with hyperparameter tuning...
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best pipeline parameters: {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 300}
Best cross-validation F1-score: 0.9731


In [70]:
# Best estimator selected by the pipeline grid search
best_pipeline = pipeline_grid_search.best_estimator_

# Predict the held-out test split with the complete pipeline
pipeline_predictions = best_pipeline.predict(X_test)

# Per-class precision / recall / F1 on the test split
print("=== Pipeline Performance on Test Set ===")
pipeline_report = classification_report(
	y_test,
	pipeline_predictions,
	target_names=['Legitimate (-1)', 'Phishing (1)'],
)
print(pipeline_report)

# Raw counts of true versus predicted labels
print("\nConfusion Matrix:")
pipeline_confusion = confusion_matrix(y_test, pipeline_predictions)
print(pipeline_confusion)

# Single-number F1 summary, kept for later comparison
pipeline_f1_score = f1_score(y_test, pipeline_predictions)
print(f"\nPipeline F1-Score: {pipeline_f1_score:.4f}")

=== Pipeline Performance on Test Set ===
                 precision    recall  f1-score   support

Legitimate (-1)       0.98      0.96      0.97       980
   Phishing (1)       0.97      0.98      0.98      1231

       accuracy                           0.98      2211
      macro avg       0.98      0.97      0.97      2211
   weighted avg       0.98      0.98      0.98      2211


Confusion Matrix:
[[ 945   35]
 [  20 1211]]

Pipeline F1-Score: 0.9778


In [71]:
# Save the trained pipeline
#joblib.dump(best_pipeline, 'best_rf_pipeline.pkl')


# Create a prediction function using the pipeline
def predict_phishing(features):
	"""
	Predict labels for one or more samples using the trained pipeline.

	Parameters:
	features: array-like, shape (n_features,) or (n_samples, n_features).
		A 1-D input is treated as a single sample and reshaped to one row
		before prediction.

	Returns:
	predictions: the value returned by best_pipeline.predict, one label per
		sample (the notebook's reports name -1 'Legitimate' and 1 'Phishing').
	"""
	# The pipeline's predict expects 2-D input, so a lone 1-D sample is
	# promoted to a single row; 2-D input (including DataFrames) is passed
	# through untouched.
	if np.ndim(features) == 1:
		features = np.asarray(features).reshape(1, -1)
	return best_pipeline.predict(features)


# Create a prediction probability function
def predict_phishing_proba(features):
	"""
	Get class probabilities for one or more samples using the trained pipeline.

	Parameters:
	features: array-like, shape (n_features,) or (n_samples, n_features).
		A 1-D input is treated as a single sample and reshaped to one row
		before prediction.

	Returns:
	probabilities: the value returned by best_pipeline.predict_proba, one row
		per sample with one column per class.
	"""
	# The pipeline's predict_proba expects 2-D input, so a lone 1-D sample is
	# promoted to a single row; 2-D input (including DataFrames) is passed
	# through untouched.
	if np.ndim(features) == 1:
		features = np.asarray(features).reshape(1, -1)
	return best_pipeline.predict_proba(features)


# The joblib.dump call at the top of this cell is commented out, so nothing is
# written to disk here; report that rather than claiming the pipeline was saved.
print("Pipeline not saved (joblib.dump call is commented out)")
print("Prediction functions created successfully")

Pipeline saved as 'best_rf_pipeline.pkl'
Prediction functions created successfully


In [72]:
# Compare the pipeline against the earlier grid-searched Random Forest on the test split
previous_f1 = f1_score(y_test, y_pred)

print("=== Model Comparison ===")
print(f"Previous best Random Forest F1-score: {previous_f1:.4f}")
print(f"Pipeline Random Forest F1-score: {pipeline_f1_score:.4f}")

# Positive means the pipeline scored higher than the earlier model
improvement = pipeline_f1_score - previous_f1
print(f"Performance improvement: {improvement:.4f}")

# Feature importances reported by the pipeline's 'classifier' step,
# paired with the feature column names
feature_importance = best_pipeline.named_steps['classifier'].feature_importances_
feature_names = X.columns

# Tabulate and order from most to least important
importance_df = pd.DataFrame(
	{'feature': feature_names, 'importance': feature_importance}
).sort_values(by='importance', ascending=False)

print("\n=== Top 10 Most Important Features ===")
print(importance_df.head(10))

=== Model Comparison ===
Previous best Random Forest F1-score: 0.9782
Pipeline Random Forest F1-score: 0.9778
Performance improvement: -0.0004

=== Top 10 Most Important Features ===
                        feature  importance
7                SSLfinal_State    0.322322
13                URL_of_Anchor    0.254405
25                  web_traffic    0.068522
6             having_Sub_Domain    0.062755
14                Links_in_tags    0.042478
5                 Prefix_Suffix    0.041636
15                          SFH    0.019660
12                  Request_URL    0.019315
28       Links_pointing_to_page    0.019310
8   Domain_registeration_length    0.016832


In [73]:
# Create a complete ML pipeline class for reusability
class PhishingDetectionPipeline:
	"""Reusable wrapper around a StandardScaler + RandomForestClassifier pipeline.

	Attributes:
		pipeline: the underlying pipeline object, or None until built or loaded.
		is_trained: True once train() or load_model() has completed.
		best_params: best grid-search parameters from the most recent tuned
			train() call; None when no tuning result is available.
		best_score: best cross-validated score from the most recent tuned
			train() call; None when no tuning result is available.
	"""

	def __init__(self):
		"""Create an empty, untrained wrapper."""
		self.pipeline = None
		self.is_trained = False
		# Defined up front so they can always be read, even when training ran
		# without tuning or the model came from load_model().
		self.best_params = None
		self.best_score = None

	def build_pipeline(self):
		"""Build the scaler + Random Forest pipeline, store it, and return it."""
		self.pipeline = Pipeline([
			('preprocessor', StandardScaler()),
			('classifier', RandomForestClassifier(random_state=42))
		])
		return self.pipeline

	def train(self, X_train, y_train, hyperparameter_tuning=True):
		"""Fit the pipeline on the given training data.

		Parameters:
			X_train: training features, passed to the estimator's fit.
			y_train: training labels, passed to the estimator's fit.
			hyperparameter_tuning: when True, run a 5-fold grid search scored
				by F1 and keep its best estimator; when False, fit the current
				pipeline as-is.
		"""
		if self.pipeline is None:
			self.build_pipeline()

		if hyperparameter_tuning:
			param_grid = {
				'classifier__n_estimators': [100, 200, 300],
				'classifier__max_depth': [10, 20, 30, None],
				'classifier__min_samples_split': [2, 5, 10]
			}

			grid_search = GridSearchCV(
				self.pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1
			)
			grid_search.fit(X_train, y_train)
			self.pipeline = grid_search.best_estimator_
			self.best_params = grid_search.best_params_
			self.best_score = grid_search.best_score_
		else:
			self.pipeline.fit(X_train, y_train)
			# No search ran, so drop any results left over from an earlier
			# tuned run rather than reporting them for this fit.
			self.best_params = None
			self.best_score = None

		self.is_trained = True

	def predict(self, X):
		"""Return predictions for X.

		Raises:
			ValueError: if the pipeline has not been trained or loaded.
		"""
		if not self.is_trained:
			raise ValueError("Pipeline must be trained before making predictions")
		return self.pipeline.predict(X)

	def predict_proba(self, X):
		"""Return prediction probabilities for X.

		Raises:
			ValueError: if the pipeline has not been trained or loaded.
		"""
		if not self.is_trained:
			raise ValueError("Pipeline must be trained before making predictions")
		return self.pipeline.predict_proba(X)

	def save_model(self, filename):
		"""Write the trained pipeline to `filename` with joblib.

		Raises:
			ValueError: if the pipeline has not been trained or loaded.
		"""
		if not self.is_trained:
			raise ValueError("Pipeline must be trained before saving")
		joblib.dump(self.pipeline, filename)

	def load_model(self, filename):
		"""Load a previously saved pipeline from `filename` and mark it trained."""
		# NOTE(review): joblib.load is pickle-based and can execute arbitrary
		# code while loading; only use it on files from a trusted source.
		self.pipeline = joblib.load(filename)
		# Grid-search results are not stored in the file, so none are known here.
		self.best_params = None
		self.best_score = None
		self.is_trained = True


# Instantiate the reusable wrapper; nothing is built or trained at this point
phishing_pipeline = PhishingDetectionPipeline()
print(
	"PhishingDetectionPipeline class created successfully"
)


PhishingDetectionPipeline class created successfully
