In [3]:
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split

# Read the ARFF file; element 0 of the returned pair is the record array.
data = arff.loadarff('.old.arff')

# Build the frame and cast every column to int in one step
# (nominal ARFF attributes arrive as byte strings such as b'-1').
df = pd.DataFrame(data[0]).astype(int)

# 'Result' is the label column; every other column is a feature.
y = df['Result']
X = df.drop(columns='Result')

# Hold out 20% of the rows for testing, keeping the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Training set shape: (1964, 30)
Testing set shape: (492, 30)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# SVM variant whose scaler lives inside the pipeline, so cross-validation
# fits the scaler and the classifier together.
svm_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svm', svm.SVC(random_state=42)),
])

# Candidate estimators, keyed by the display name used in the report below.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM": svm.SVC(random_state=42),
    "SVM with Scaler": svm_pipeline,
    "Knn": KNeighborsClassifier(n_neighbors=3),
    "Linear SVM": LinearSVC(random_state=42),
}

# Score every candidate with 5-fold cross-validation on the training split,
# using F1 as the metric, and remember each candidate's mean score.
model_score = {}
for name, candidate in models.items():
    scores = cross_val_score(candidate, X_train, y_train, scoring='f1', cv=5)
    model_score[name] = np.mean(scores)

    print(f"--- {name} ---")
    print(f"Scores: {scores}")
    print(f"Mean F1-Score: {np.mean(scores):.4f}")
    print(f"Std Deviation: {np.std(scores):.4f}\n")


--- Logistic Regression ---
Scores: [0.94857143 0.9244186  0.93409742 0.93714286 0.94186047]
Mean F1-Score: 0.9372
Std Deviation: 0.0081

--- Random Forest ---
Scores: [0.96045198 0.96022727 0.96209913 0.95402299 0.97994269]
Mean F1-Score: 0.9633
Std Deviation: 0.0087

--- Gradient Boosting ---
Scores: [0.95238095 0.95184136 0.96231884 0.96       0.96590909]
Mean F1-Score: 0.9585
Std Deviation: 0.0055

--- SVM ---
Scores: [0.95480226 0.95428571 0.96253602 0.9512894  0.96275072]
Mean F1-Score: 0.9571
Std Deviation: 0.0047

--- SVM with Scaler ---
Scores: [0.94886364 0.94318182 0.95930233 0.95156695 0.96571429]
Mean F1-Score: 0.9537
Std Deviation: 0.0079

--- Knn ---
Scores: [0.92351275 0.92479109 0.9244186  0.91907514 0.94350282]
Mean F1-Score: 0.9271
Std Deviation: 0.0085

--- Linear SVM ---
Scores: [0.94252874 0.92528736 0.93982808 0.94017094 0.93604651]
Mean F1-Score: 0.9368
Std Deviation: 0.0061



In [5]:
# Rank the model names by mean F1-score (highest first), then pair each name
# with its score so the ranking can be printed.
ranked_names = sorted(model_score, key=model_score.get, reverse=True)
sorted_models = [(model_name, model_score[model_name]) for model_name in ranked_names]

print("\nModels sorted by Mean F1-Score:")
for model, score in sorted_models:
    print(f"{model}: {score:.4f}")

# The winner is the entry with the largest mean score.
best_model = max(model_score.items(), key=lambda item: item[1])[0]
print(f"Best model: {best_model}")
print(f"Best score: {model_score[best_model]:.4f}")



Models sorted by Mean F1-Score:
Random Forest: 0.9633
Gradient Boosting: 0.9585
SVM: 0.9571
SVM with Scaler: 0.9537
Logistic Regression: 0.9372
Linear SVM: 0.9368
Knn: 0.9271
Best model: Random Forest
Best score: 0.9633


In [6]:
# Evaluating the Results of GridSearchCV
from sklearn.model_selection import GridSearchCV

# Random Forest settings to try; every combination (3 * 4 * 3 = 36) is evaluated.
param_grid = dict(
    n_estimators=[100, 200, 300],      # number of trees in the forest
    max_depth=[10, 20, 30, None],      # maximum depth of each tree (None = unlimited)
    min_samples_split=[2, 5, 10],      # minimum samples required to split a node
)

# Exhaustive search: each candidate is scored by 5-fold CV using F1.
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(base_forest, param_grid, scoring='f1', cv=5)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation F1-score: {grid_search.best_score_:.4f}")

# Observed runtime: about 18 seconds

Best parameters found: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation F1-score: 0.9641


In [7]:
# Evaluating the Results of RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint # Good for defining a range of integers

# Sampling distributions for the Random Forest settings.
# randint's upper bound is exclusive, so these draw from 100..499 and 2..10.
param_dist = dict(
    n_estimators=randint(100, 500),              # number of trees
    max_depth=[10, 20, 30, 40, 50, None],        # max depth of each tree
    min_samples_split=randint(2, 11),            # min samples to split a node
)

# Randomized search: 100 sampled candidates, each scored by 5-fold CV using F1.
# random_state fixes which candidates are sampled.
forest_template = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    forest_template,
    param_dist,
    n_iter=100,
    scoring='f1',
    cv=5,
    random_state=42,
)

# Run the search on the training split only.
random_search.fit(X_train, y_train)

# Report the winning sample and its mean cross-validated score.
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validation F1-score: {random_search.best_score_:.4f}")

# Observed runtime: about 5 minutes

Best parameters found: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation F1-score: 0.9640


In [8]:
# Model Validation
from sklearn.metrics import classification_report, confusion_matrix

# Take the winning estimator from the *randomized* search (random_search),
# not from the earlier grid search.
best_rf_model = random_search.best_estimator_

# Predict on the held-out test split.
y_pred = best_rf_model.predict(X_test)

# Pin the label order explicitly so the display names below cannot drift away
# from the classes they describe (without `labels`, names are matched to the
# sorted labels that happen to be present in y_test / y_pred).
# NOTE(review): the names assume -1 = legitimate and 1 = phishing; confirm this
# polarity against the dataset's documentation.
class_labels = [-1, 1]
class_names = ['Legitimate (-1)', 'Phishing (1)']

# Per-class precision / recall / F1 on the test split.
print("\n--- Final Model Evaluation on Test Set ---")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))

# Rows are true classes, columns are predicted classes, both in class_labels order.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_labels))


--- Final Model Evaluation on Test Set ---
                 precision    recall  f1-score   support

Legitimate (-1)       0.97      0.97      0.97       273
   Phishing (1)       0.96      0.96      0.96       219

       accuracy                           0.97       492
      macro avg       0.96      0.97      0.97       492
   weighted avg       0.97      0.97      0.97       492

Confusion Matrix:
[[264   9]
 [  8 211]]


## Building a Complete ML Pipeline for Random Forest Model

This section wraps the Random Forest model in a single scikit-learn pipeline that covers preprocessing, hyperparameter tuning, and model evaluation.

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import joblib

# Two-step pipeline: standardize the features, then classify with a Random Forest.
# The step names ('preprocessor', 'classifier') are the prefixes used later when
# addressing hyperparameters such as 'classifier__n_estimators'.
feature_scaler = StandardScaler()
forest_classifier = RandomForestClassifier(random_state=42)
rf_pipeline = Pipeline(steps=[
    ('preprocessor', feature_scaler),
    ('classifier', forest_classifier),
])

print("Random Forest Pipeline created successfully")
print(f"Pipeline steps: {rf_pipeline.steps}")

Random Forest Pipeline created successfully
Pipeline steps: [('preprocessor', StandardScaler()), ('classifier', RandomForestClassifier(random_state=42))]


In [10]:
# Hyperparameter grid for the pipeline. The 'classifier__' prefix routes each
# setting to the pipeline step named 'classifier'.
pipeline_param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, 30, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

# Exhaustive search over the grid: 5-fold CV per candidate, scored by F1.
pipeline_grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=pipeline_param_grid,
    cv=5,
    scoring='f1',
    verbose=1,
)

# Derive the number of candidates from the grid itself, so the figure stays
# correct when parameters are added to or removed from pipeline_param_grid.
total_combinations = 1
for candidate_values in pipeline_param_grid.values():
    total_combinations *= len(candidate_values)

print("Pipeline GridSearchCV object created")
print(f"Total combinations to test: {total_combinations}")

Pipeline GridSearchCV object created
Total combinations to test: 108


In [11]:
# Run the pipeline grid search on the training split only; the test split
# stays untouched until the final evaluation.
print("Training pipeline with hyperparameter tuning...")
pipeline_grid_search.fit(X=X_train, y=y_train)

# Winning parameter combination and its mean cross-validated F1.
print(f"Best pipeline parameters: {pipeline_grid_search.best_params_}")
print(f"Best cross-validation F1-score: {pipeline_grid_search.best_score_:.4f}")

Training pipeline with hyperparameter tuning...
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best pipeline parameters: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best cross-validation F1-score: 0.9641


In [12]:
# Winning pipeline (scaler + forest) from the pipeline grid search.
best_pipeline = pipeline_grid_search.best_estimator_

# Predict on the held-out test split using the complete pipeline.
pipeline_predictions = best_pipeline.predict(X_test)

# Pin the label order explicitly so the display names below cannot drift away
# from the classes they describe (without `labels`, names are matched to the
# sorted labels that happen to be present in the data).
# NOTE(review): the names assume -1 = legitimate and 1 = phishing; confirm this
# polarity against the dataset's documentation.
class_labels = [-1, 1]
class_names = ['Legitimate (-1)', 'Phishing (1)']

# Per-class precision / recall / F1 on the test split.
print("=== Pipeline Performance on Test Set ===")
print(classification_report(y_test, pipeline_predictions, labels=class_labels, target_names=class_names))

# Rows are true classes, columns are predicted classes, both in class_labels order.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, pipeline_predictions, labels=class_labels))

# Single-number summary: F1 for the class labelled 1 (f1_score's default positive label).
pipeline_f1_score = f1_score(y_test, pipeline_predictions)
print(f"\nPipeline F1-Score: {pipeline_f1_score:.4f}")

=== Pipeline Performance on Test Set ===
                 precision    recall  f1-score   support

Legitimate (-1)       0.97      0.96      0.97       273
   Phishing (1)       0.95      0.96      0.96       219

       accuracy                           0.96       492
      macro avg       0.96      0.96      0.96       492
   weighted avg       0.96      0.96      0.96       492


Confusion Matrix:
[[263  10]
 [  9 210]]

Pipeline F1-Score: 0.9567


In [13]:
# Persist the fitted pipeline (scaler + classifier) to disk with joblib so it
# can be reloaded later without re-running the search.
joblib.dump(value=best_pipeline, filename='best_rf_pipeline.pkl')


# Create a prediction function using the pipeline
def predict_phishing(features):
	"""
	Predict whether URLs are phishing using the trained pipeline.

	Parameters:
	features: array-like, shape (n_features,) for a single sample or
		(n_samples, n_features) for a batch

	Returns:
	predictions: array, one predicted label per sample
		(-1 for legitimate, 1 for phishing, following this notebook's labels)
	"""
	# The pipeline's predict() needs 2-D input. Promote a single 1-D sample to
	# a one-row matrix so the documented (n_features,) shape actually works.
	# 2-D inputs (including DataFrames) are passed through untouched.
	if np.ndim(features) == 1:
		features = np.asarray(features).reshape(1, -1)
	return best_pipeline.predict(features)


# Create a prediction probability function
def predict_phishing_proba(features):
	"""
	Get class probabilities using the trained pipeline.

	Parameters:
	features: array-like, shape (n_features,) for a single sample or
		(n_samples, n_features) for a batch

	Returns:
	probabilities: array, shape (n_samples, 2) - probabilities for each class
	"""
	# The pipeline's predict_proba() needs 2-D input. Promote a single 1-D
	# sample to a one-row matrix so the documented (n_features,) shape works.
	# 2-D inputs (including DataFrames) are passed through untouched.
	if np.ndim(features) == 1:
		features = np.asarray(features).reshape(1, -1)
	return best_pipeline.predict_proba(features)


# Static confirmation messages for this cell, printed one per line.
for status_message in (
	"Pipeline saved as 'best_rf_pipeline.pkl'",
	"Prediction functions created successfully",
):
	print(status_message)

Pipeline saved as 'best_rf_pipeline.pkl'
Prediction functions created successfully


In [14]:
# Compare pipeline performance with the previous best model.
# Score the earlier model (y_pred) once and reuse the value for both the
# report line and the difference, instead of recomputing it.
previous_f1_score = f1_score(y_test, y_pred)

print("=== Model Comparison ===")
print(f"Previous best Random Forest F1-score: {previous_f1_score:.4f}")
print(f"Pipeline Random Forest F1-score: {pipeline_f1_score:.4f}")

# Positive means the pipeline scored higher on the test split; negative means lower.
improvement = pipeline_f1_score - previous_f1_score
print(f"Performance improvement: {improvement:.4f}")

# Feature importances come from the forest inside the pipeline ('classifier' step).
feature_importance = best_pipeline.named_steps['classifier'].feature_importances_
feature_names = X.columns

# Pair each feature name with its importance and sort, most important first.
importance_df = pd.DataFrame({
	'feature': feature_names,
	'importance': feature_importance
}).sort_values('importance', ascending=False)

print("\n=== Top 10 Most Important Features ===")
print(importance_df.head(10))

=== Model Comparison ===
Previous best Random Forest F1-score: 0.9613
Pipeline Random Forest F1-score: 0.9567
Performance improvement: -0.0046

=== Top 10 Most Important Features ===
                        feature  importance
7                SSLfinal_State    0.290133
13                URL_of_Anchor    0.228364
25                  web_traffic    0.115515
5                 Prefix_Suffix    0.112203
6             having_Sub_Domain    0.044516
23                age_of_domain    0.038779
26                    Page_Rank    0.032070
8   Domain_registeration_length    0.023079
14                Links_in_tags    0.021288
12                  Request_URL    0.015403


In [15]:
# Create a complete ML pipeline class for reusability
class PhishingDetectionPipeline:
	"""Reusable wrapper around a StandardScaler + RandomForestClassifier pipeline.

	Typical flow: ``train()`` (or ``load_model()``), then ``predict()`` /
	``predict_proba()``, and optionally ``save_model()``.

	Attributes:
		pipeline: the scikit-learn Pipeline, or None until built or loaded.
		is_trained: True once ``train()`` or ``load_model()`` has completed.
		best_params: winning grid-search parameters; None unless ``train()``
			ran with ``hyperparameter_tuning=True``.
		best_score: mean cross-validated F1 of the winning candidate; None
			unless ``train()`` ran with ``hyperparameter_tuning=True``.
	"""

	def __init__(self):
		self.pipeline = None
		self.is_trained = False
		# Always defined, so reading them never raises AttributeError; they are
		# only populated by train(hyperparameter_tuning=True).
		self.best_params = None
		self.best_score = None

	def build_pipeline(self):
		"""Build (or rebuild) the untrained scaler + Random Forest pipeline and return it."""
		self.pipeline = Pipeline([
			('preprocessor', StandardScaler()),
			('classifier', RandomForestClassifier(random_state=42))
		])
		return self.pipeline

	def train(self, X_train, y_train, hyperparameter_tuning=True):
		"""Fit the pipeline on the given training data.

		Parameters:
			X_train: training features.
			y_train: training labels.
			hyperparameter_tuning: when True, run a 5-fold grid search scored by
				F1 and keep the best estimator; when False, fit the current
				pipeline as-is.
		"""
		# Build lazily so train() can be called on a fresh instance.
		if self.pipeline is None:
			self.build_pipeline()

		if hyperparameter_tuning:
			param_grid = {
				'classifier__n_estimators': [100, 200, 300],
				'classifier__max_depth': [10, 20, 30, None],
				'classifier__min_samples_split': [2, 5, 10]
			}

			grid_search = GridSearchCV(
				self.pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1
			)
			grid_search.fit(X_train, y_train)
			# Keep the winning pipeline plus the search metadata.
			self.pipeline = grid_search.best_estimator_
			self.best_params = grid_search.best_params_
			self.best_score = grid_search.best_score_
		else:
			self.pipeline.fit(X_train, y_train)

		self.is_trained = True

	def predict(self, X):
		"""Return predicted labels for X.

		Raises:
			ValueError: if the pipeline has not been trained or loaded.
		"""
		if not self.is_trained:
			raise ValueError("Pipeline must be trained before making predictions")
		return self.pipeline.predict(X)

	def predict_proba(self, X):
		"""Return class probabilities for X.

		Raises:
			ValueError: if the pipeline has not been trained or loaded.
		"""
		if not self.is_trained:
			raise ValueError("Pipeline must be trained before making predictions")
		return self.pipeline.predict_proba(X)

	def save_model(self, filename):
		"""Write the trained pipeline to ``filename`` with joblib.

		Raises:
			ValueError: if the pipeline has not been trained or loaded.
		"""
		if not self.is_trained:
			raise ValueError("Pipeline must be trained before saving")
		joblib.dump(self.pipeline, filename)

	def load_model(self, filename):
		"""Load a previously saved pipeline from ``filename`` and mark it as trained.

		SECURITY: joblib.load unpickles the file, which can execute arbitrary
		code. Only load files from a trusted source.
		"""
		self.pipeline = joblib.load(filename)
		self.is_trained = True


# Example usage of the pipeline class: create an untrained wrapper instance.
# Nothing is built or fitted here; train() or load_model() must be called
# before predict(), predict_proba() or save_model() will work.
phishing_pipeline = PhishingDetectionPipeline()
print("PhishingDetectionPipeline class created successfully")


PhishingDetectionPipeline class created successfully
