In [1]:
# import matplotlib.pyplot as plt
# import seaborn as sns
import pandas as pd
# from wordcloud import WordCloud
# from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.sparse import hstack, csr_matrix
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier

In [2]:
# Read the URL dataset from disk and preview its first rows.
DATASET_PATH = './Data/Balanced_dataset.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,url,type,scheme,domain,subdomain,top_level_domain,path,path_length,num_path_segments,query_params,num_query_params,has_https,file_extension,has_fragment,has_special_chars_in_path,has_port,port_number,is_ip_address
0,https://ethnicelebs.com/eddie-anderson-comedian,benign,https,ethnicelebs.com,none,com,/eddie-anderson-comedian,24,1,0,0,1,none,0,0,0,0,0
1,https://closinglogos.com/page/Sony+Pictures+Ho...,benign,https,closinglogos.com,none,com,/page/Sony+Pictures+Home+Entertainment+Warning...,53,2,0,0,1,none,0,0,0,0,0
2,https://terezowens.com/golfer-greg-norman-tryi...,benign,https,terezowens.com,none,com,/golfer-greg-norman-trying-to-unload-55-millio...,54,1,0,0,1,none,0,0,0,0,0
3,https://americannortel.com/,benign,https,americannortel.com,none,com,/,1,0,0,0,1,none,0,0,0,0,0
4,https://welding.org/,benign,https,welding.org,none,org,/,1,0,0,0,1,none,0,0,0,0,0


In [3]:
# Replace every missing value with the string 'none', then show the remaining
# per-column null counts.
df = df.fillna('none')
df.isna().sum()

url                          0
type                         0
scheme                       0
domain                       0
subdomain                    0
top_level_domain             0
path                         0
path_length                  0
num_path_segments            0
query_params                 0
num_query_params             0
has_https                    0
file_extension               0
has_fragment                 0
has_special_chars_in_path    0
has_port                     0
port_number                  0
is_ip_address                0
dtype: int64

In [4]:
# Print the number of distinct values of each column, in column order.
for col, n_unique in df.nunique().items():
    print(f'{col} : {n_unique}')

url : 51012
type : 5
scheme : 13
domain : 20791
subdomain : 4193
top_level_domain : 2360
path : 31239
path_length : 229
num_path_segments : 17
query_params : 16
num_query_params : 19
has_https : 2
file_extension : 618
has_fragment : 2
has_special_chars_in_path : 2
has_port : 2
port_number : 1722
is_ip_address : 2


In [5]:
# Summarise the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59605 entries, 0 to 59604
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   url                        59605 non-null  object
 1   type                       59605 non-null  object
 2   scheme                     59605 non-null  object
 3   domain                     59605 non-null  object
 4   subdomain                  59605 non-null  object
 5   top_level_domain           59605 non-null  object
 6   path                       59605 non-null  object
 7   path_length                59605 non-null  int64 
 8   num_path_segments          59605 non-null  int64 
 9   query_params               59605 non-null  int64 
 10  num_query_params           59605 non-null  int64 
 11  has_https                  59605 non-null  int64 
 12  file_extension             59605 non-null  object
 13  has_fragment               59605 non-null  int64 
 14  has_sp

In [24]:
def train_ml_models(df, text_column, target_column, random_state=42):
    """Train several classifiers on bag-of-words features of one text column.

    The text column is vectorized twice (raw term counts and TF-IDF, 900
    features each). For each representation every model is fitted on an 80%
    training split and scored on the remaining 20%.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is only read; the encoded labels are kept in a local
        array and are NOT written back into the frame.
    text_column : str
        Name of the column holding the text to vectorize.
    target_column : str
        Name of the column holding the class labels.
    random_state : int, default 42
        Seed for the train/test split and for the estimators that are given
        one below, so repeated runs produce the same metrics.

    Returns
    -------
    dict
        Maps '<vectorizer>_<model>' to a dict with the keys 'accuracy',
        'precision', 'recall' and 'f1_score'. All values are percentages;
        precision, recall and F1 use weighted averaging.
    """
    # Encode into a local array. Assigning the result back to df[target_column]
    # would silently turn the caller's labels into integers for every later cell.
    y = LabelEncoder().fit_transform(df[target_column])
    X = df[text_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    vectorizers = {
        'CountVectorizer': CountVectorizer(max_features=900),
        'TfidfVectorizer': TfidfVectorizer(max_features=900)
    }

    models = {
        'RandomForest': RandomForestClassifier(
            n_estimators=350, max_depth=45, min_samples_split=5,
            random_state=random_state
        ),
        'LogisticRegression': LogisticRegression(
            max_iter=4500, solver='saga', C=1.0, random_state=random_state
        ),
        # Only predict() is called on this model, so probability calibration
        # (an extra internal cross-validation during fit) is not requested.
        'SVC': SVC(kernel='rbf', C=1.0),
        'XGB': XGBClassifier(),
        # Soft voting averages predict_proba, hence probability=True here.
        'VotingEnsemble': VotingClassifier(estimators=[
            ('rf', RandomForestClassifier(n_estimators=100, random_state=random_state)),
            ('svc', SVC(probability=True, random_state=random_state)),
            ('xgb', XGBClassifier())
        ], voting='soft')
    }

    results = {}

    for vec_name, vectorizer in vectorizers.items():
        # Fit the vocabulary on the training split only, then reuse it for test.
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        # The same estimator objects are refitted for each vectorizer; metrics
        # are recorded right after each fit, so nothing carries over.
        for model_name, model in tqdm(models.items()):
            model.fit(X_train_vec, y_train)
            y_pred = model.predict(X_test_vec)

            results[f'{vec_name}_{model_name}'] = {
                'accuracy': accuracy_score(y_test, y_pred) * 100,
                'precision': precision_score(y_test, y_pred, average='weighted') * 100,
                'recall': recall_score(y_test, y_pred, average='weighted') * 100,
                'f1_score': f1_score(y_test, y_pred, average='weighted') * 100
            }

    return results

In [25]:
# Benchmark all models using only the raw URL string as input.
results = train_ml_models(df, text_column='url', target_column='type')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:34<00:00, 54.88s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [05:13<00:00, 62.69s/it]


In [26]:
# Print one report block per vectorizer/model combination.
for name, metrics in results.items():
    # Keys have the form '<vectorizer>_<model>'. partition() cuts at the first
    # underscore only and always yields three parts, so a model name that itself
    # contains an underscore cannot break the unpacking the way split('_') would.
    technique, _sep, model = name.partition('_')
    print(f'Technique: {technique}\t Model : {model}')
    print('-' * 50)
    print('\t\tMetrics')
    print('-' * 50)
    for k, v in metrics.items():
        print(f'{k}: {v}%')
    print('*' * 50)

Technique: CountVectorizer	 Model : RandomForest
--------------------------------------------------
		Metrics
--------------------------------------------------
accuracy: 90.31121550205519%
precision: 90.46881935561302%
recall: 90.31121550205519%
f1_score: 90.15929452478287%
**************************************************
Technique: CountVectorizer	 Model : LogisticRegression
--------------------------------------------------
		Metrics
--------------------------------------------------
accuracy: 89.56463383944299%
precision: 89.79444598224993%
recall: 89.56463383944299%
f1_score: 89.51900614919114%
**************************************************
Technique: CountVectorizer	 Model : SVC
--------------------------------------------------
		Metrics
--------------------------------------------------
accuracy: 90.3867125241171%
precision: 90.79306013012094%
recall: 90.3867125241171%
f1_score: 90.29806919467109%
**************************************************
Technique: CountVectoriz

In [12]:
# Column groups used to assemble the feature matrix.
TEXT_COLS = [
    'url',
    'scheme',
    'domain',
    'subdomain',
    'top_level_domain',
    'path',
    'file_extension',
]
TARGET_COL = ['type']

# Every remaining column (neither text nor target) is treated as numeric.
_excluded_cols = set(TEXT_COLS) | set(TARGET_COL)
NUMERIC_COLS = [col for col in df.columns if col not in _excluded_cols]

In [13]:
def train_ml_models(df, text_columns, numeric_columns, target_column, random_state=42):
    """Train several classifiers on combined text and numeric features.

    Each text column gets its own vectorizer (150 features at most), once with
    raw term counts and once with TF-IDF. The vectorized text is stacked with
    the standardized numeric columns, and every model is fitted on an 80%
    training split and scored on the remaining 20%.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is only read; the encoded labels are kept in a local
        array and are NOT written back into the frame.
    text_columns : sequence of str
        Columns to vectorize, each with an independent vocabulary.
    numeric_columns : sequence of str
        Columns to standardize and append to the text features.
    target_column : str
        Name of the column holding the class labels.
    random_state : int, default 42
        Seed for the train/test split and for the estimators that are given
        one below, so repeated runs produce the same metrics.

    Returns
    -------
    list of dict
        One entry per vectorizer/model pair with the keys 'Vectorizer',
        'Model', 'Accuracy', 'Precision', 'Recall' and 'F1 Score'. Metric
        values are percentages; precision, recall and F1 use weighted averaging.
    """
    # Encode into a local array. Assigning the result back to df[target_column]
    # would silently turn the caller's labels into integers for every later cell.
    y = LabelEncoder().fit_transform(df[target_column])

    # Accept any sequence (list, tuple, Index) for the column groups.
    text_columns = list(text_columns)
    numeric_columns = list(numeric_columns)
    X = df[text_columns + numeric_columns]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # The numeric block does not depend on the vectorizer, so it is scaled once
    # here rather than once per vectorizer. The scaler is fitted on train only.
    scaler = StandardScaler()
    numeric_train_sparse = csr_matrix(scaler.fit_transform(X_train[numeric_columns]))
    numeric_test_sparse = csr_matrix(scaler.transform(X_test[numeric_columns]))

    # Classes, not instances: a fresh vectorizer is created per text column.
    vectorizers = {
        'CountVectorizer': CountVectorizer,
        'TfidfVectorizer': TfidfVectorizer
    }

    models = {
        'RandomForest': RandomForestClassifier(random_state=random_state),
        'LogisticRegression': LogisticRegression(max_iter=1000),
        'SVC': SVC(),
        'XGB': XGBClassifier(),
        # Soft voting averages predict_proba, hence probability=True here.
        'VotingEnsemble': VotingClassifier(estimators=[
            ('rf', RandomForestClassifier(n_estimators=100, random_state=random_state)),
            ('svc', SVC(probability=True, random_state=random_state)),
            ('xgb', XGBClassifier())
        ], voting='soft')
    }

    results = []

    for vec_name, vectorizer_cls in vectorizers.items():
        text_features_train = []
        text_features_test = []
        for col in text_columns:
            # Vocabulary is learned from the training split only.
            vectorizer = vectorizer_cls(max_features=150)
            text_features_train.append(vectorizer.fit_transform(X_train[col]))
            text_features_test.append(vectorizer.transform(X_test[col]))

        # Text blocks first, numeric block last. CSR is requested explicitly so
        # each estimator does not have to convert the stacked matrix itself.
        X_train_final = hstack(text_features_train + [numeric_train_sparse], format='csr')
        X_test_final = hstack(text_features_test + [numeric_test_sparse], format='csr')

        print(f"Train shape: {X_train_final.shape}, Test shape: {X_test_final.shape}")

        # The same estimator objects are refitted for each vectorizer; metrics
        # are recorded right after each fit, so nothing carries over.
        for model_name, model in models.items():
            model.fit(X_train_final, y_train)
            y_pred = model.predict(X_test_final)

            results.append({
                'Vectorizer': vec_name,
                'Model': model_name,
                'Accuracy': accuracy_score(y_test, y_pred) * 100,
                'Precision': precision_score(y_test, y_pred, average='weighted') * 100,
                'Recall': recall_score(y_test, y_pred, average='weighted') * 100,
                'F1 Score': f1_score(y_test, y_pred, average='weighted') * 100
            })

    return results

In [14]:
# Benchmark all models on the per-column text features plus the numeric columns.
results = train_ml_models(
    df,
    text_columns=TEXT_COLS,
    numeric_columns=NUMERIC_COLS,
    target_column='type',
)

Train shape: (47684, 942), Test shape: (11921, 942)
Train shape: (47684, 942), Test shape: (11921, 942)


In [23]:
# Print every result record; metric values are shown with four significant
# digits and a percent sign, the two label fields are printed as they are.
for result in results:
    for key, value in result.items():
        is_label = key == 'Vectorizer' or key == 'Model'
        shown = f'{value}' if is_label else f'{value:.4}%'
        print(f'{key}: {shown}')
    print('*' * 50)

Vectorizer: CountVectorizer
Model: RandomForest
Accuracy: 94.98%
Precision: 95.0%
Recall: 94.98%
F1 Score: 94.96%
**************************************************
Vectorizer: CountVectorizer
Model: LogisticRegression
Accuracy: 90.18%
Precision: 90.35%
Recall: 90.18%
F1 Score: 90.18%
**************************************************
Vectorizer: CountVectorizer
Model: SVC
Accuracy: 92.25%
Precision: 92.41%
Recall: 92.25%
F1 Score: 92.28%
**************************************************
Vectorizer: CountVectorizer
Model: XGB
Accuracy: 94.5%
Precision: 94.59%
Recall: 94.5%
F1 Score: 94.48%
**************************************************
Vectorizer: CountVectorizer
Model: VotingEnsemble
Accuracy: 94.99%
Precision: 95.06%
Recall: 94.99%
F1 Score: 94.98%
**************************************************
Vectorizer: TfidfVectorizer
Model: RandomForest
Accuracy: 94.88%
Precision: 94.91%
Recall: 94.88%
F1 Score: 94.87%
**************************************************
Vectorizer: Tfidf