In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler


In [2]:
data = pd.read_csv(r"C:\Users\joelf\Downloads\Url_Processed.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,url,label,result,url_length,hostname_length,path_length,fd_length,count-,count@,...,count.,count=,count-http,count-https,count-www,count-digits,count-letters,count_dir,use_of_ip,short_url
0,0,https://www.google.com,benign,0,22,14,0,0,0,0,...,2,0,1,1,1,0,17,0,1,1
1,1,https://www.youtube.com,benign,0,23,15,0,0,0,0,...,2,0,1,1,1,0,18,0,1,1
2,2,https://www.facebook.com,benign,0,24,16,0,0,0,0,...,2,0,1,1,1,0,19,0,1,1
3,3,https://www.baidu.com,benign,0,21,13,0,0,0,0,...,2,0,1,1,1,0,16,0,1,1
4,4,https://www.wikipedia.org,benign,0,25,17,0,0,0,0,...,2,0,1,1,1,0,20,0,1,1


In [4]:
# Data Preprocessing
data.drop(['Unnamed: 0', 'url', 'label'], axis=1, inplace=True)
X = data.drop('result', axis=1)  # Features
y = data['result']  # Target


In [5]:
# Handle Imbalanced Dataset using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [9]:
X

Unnamed: 0,url_length,hostname_length,path_length,fd_length,count-,count@,count?,count%,count.,count=,count-http,count-https,count-www,count-digits,count-letters,count_dir,use_of_ip,short_url
0,22,14,0,0,0,0,0,0,2,0,1,1,1,0,17,0,1,1
1,23,15,0,0,0,0,0,0,2,0,1,1,1,0,18,0,1,1
2,24,16,0,0,0,0,0,0,2,0,1,1,1,0,19,0,1,1
3,21,13,0,0,0,0,0,0,2,0,1,1,1,0,16,0,1,1
4,25,17,0,0,0,0,0,0,2,0,1,1,1,0,20,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450171,43,11,25,8,1,0,0,0,2,0,1,0,0,0,34,3,1,-1
450172,159,13,139,2,0,0,0,0,2,1,1,0,0,21,118,12,1,1
450173,147,13,127,2,0,0,0,0,1,1,1,0,0,20,109,12,1,1
450174,22,14,1,0,0,0,0,0,1,0,1,0,0,0,17,1,1,1


In [10]:
y

0         0
1         0
2         0
3         0
4         0
         ..
450171    1
450172    1
450173    1
450174    1
450175    1
Name: result, Length: 450176, dtype: int64

In [6]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

In [7]:
# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Models to Train
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}


In [11]:
# Training and Evaluation
results = {}
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    # Store results
    results[model_name] = {
        "Accuracy": accuracy,
        "Classification Report": report,
        "Confusion Matrix": cm
    }

In [12]:
# Display Results
for model_name, metrics in results.items():
    print(f"\nModel: {model_name}")
    print(f"Accuracy: {metrics['Accuracy']}")
    print(f"Classification Report:\n{metrics['Classification Report']}")
    print(f"Confusion Matrix:\n{metrics['Confusion Matrix']}")


Model: Logistic Regression
Accuracy: 0.997259501359403
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     69148
           1       1.00      1.00      1.00     69148

    accuracy                           1.00    138296
   macro avg       1.00      1.00      1.00    138296
weighted avg       1.00      1.00      1.00    138296

Confusion Matrix:
[[69000   148]
 [  231 68917]]

Model: Decision Tree
Accuracy: 0.9968907271359981
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     69148
           1       1.00      1.00      1.00     69148

    accuracy                           1.00    138296
   macro avg       1.00      1.00      1.00    138296
weighted avg       1.00      1.00      1.00    138296

Confusion Matrix:
[[68929   219]
 [  211 68937]]

Model: Random Forest
Accuracy: 0.9978741250650778
Classification Report:
              precisi

In [13]:
def predict_new_input(new_data, trained_model, scaler):
    """Predict on new input data."""
    new_data_scaled = scaler.transform(new_data)
    prediction = trained_model.predict(new_data_scaled)
    return prediction

In [14]:
# Example New Input
new_input = pd.DataFrame({
    'url_length': [70],
    'hostname_length': [30],
    'path_length': [20],
    'fd_length': [5],
    'count-': [2],
    'count@': [0],
    'count?': [1],
    'count%': [0],
    'count.': [5],
    'count=': [0],
    'count-http': [1],
    'count-https': [1],
    'count-www': [1],
    'count-digits': [10],
    'count-letters': [50],
    'count_dir': [2],
    'use_of_ip': [0],
    'short_url': [0]
})

In [15]:
# Predict using the best performing model (e.g., Random Forest)
best_model = models['Random Forest']
prediction = predict_new_input(new_input, best_model, scaler)
print(f"\nPrediction for new input: {'Malicious' if prediction[0] == 1 else 'Benign'}")


Prediction for new input: Benign


In [16]:
model = RandomForestClassifier(random_state=42)

In [17]:
model.fit(X_train, y_train)
    


In [18]:
y_pred = model.predict(X_test)

In [19]:
import joblib

In [21]:
joblib.dump(model, "random_forest_phishing.pkl")


['random_forest_phishing.pkl']

In [22]:
joblib.dump(scaler, "scaler_phishing.pkl")

['scaler_phishing.pkl']