In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth
import numpy as np

In [2]:

filepath = "features/url_features.csv"  # Replace with your dataset file path

In [3]:

df = pd.read_csv(filepath)
df = df.loc[:, df.nunique() > 1]

# Step 1: Create a dictionary to store column data as tuples
column_map = {}

for col in df.columns:
    column_values = tuple(df[col])  # Convert column values to a tuple
    column_map.setdefault(column_values, []).append(col)

# Step 2: Identify columns with duplicate data but different names
duplicate_columns = {k: v for k, v in column_map.items() if len(v) > 1}

# Display results
print("Duplicate columns (with different names):")
for values, columns in duplicate_columns.items():
    print(f"Columns {columns} have identical values.")

Duplicate columns (with different names):
Columns ['qty_._url', 'num_dots'] have identical values.
Columns ['qty_-_url', 'num_dash'] have identical values.
Columns ['qty___url', 'num_underscore'] have identical values.
Columns ['qty_&_url', 'num_ampersand'] have identical values.
Columns ['qty_#_url', 'num_hash'] have identical values.
Columns ['qty_%_url', 'num_percent'] have identical values.
Columns ['length_url', 'url_length'] have identical values.
Columns ['qty_._domain', 'subdomain_level', 'subdomain_count'] have identical values.
Columns ['qty_-_domain', 'num_dash_in_hostname'] have identical values.
Columns ['domain_length', 'hostname_length'] have identical values.
Columns ['domain_in_ip', 'ip_address', 'ip_address_in_url'] have identical values.
Columns ['directory_length', 'path_length'] have identical values.
Columns ['params_length', 'query_length'] have identical values.
Columns ['qty_params', 'num_query_components'] have identical values.
Columns ['at_symbol', 'at_symbo

In [4]:

df = pd.read_csv(filepath)
print(df.shape)
df = df.loc[:, df.nunique() > 1]
print(df.shape)

column_map = {}

for col in df.columns:
    column_values = tuple(df[col])  # Convert column values to a tuple
    column_map.setdefault(column_values, []).append(col)

# Step 2: Identify columns with duplicate data
duplicate_columns = {k: v for k, v in column_map.items() if len(v) > 1}

# Step 3: Keep only one column from each group
columns_to_drop = [cols[1:] for cols in duplicate_columns.values()]  # Get all but the first column in each group
columns_to_drop = [col for group in columns_to_drop for col in group]  # Flatten the list of lists

# Drop duplicate columns
df = df.drop(columns=columns_to_drop)
print(df.shape)

(80000, 144)
(80000, 107)
(80000, 90)


In [5]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

X = df.drop(columns=['rec_id', 'phishing'])
y = df['phishing']

# Apply SelectKBest with mutual information
k = 40  # Number of top features to select
selector = SelectKBest(mutual_info_classif, k=k)
X_new = selector.fit_transform(X, y)

# Get selected feature names
selected_features = X.columns[selector.get_support()]

# Create a DataFrame with the selected features
X_selected = pd.DataFrame(X_new, columns=selected_features)

# Display the selected features
selected_features

Index(['qty_._url', 'qty_-_url', 'qty_/_url', 'qty_?_url', 'qty_=_url',
       'qty_@_url', 'qty_&_url', 'qty_tld_url', 'length_url', 'email_in_url',
       'qty_._domain', 'qty_-_domain', 'qty_vowels_domain', 'domain_length',
       'qty_._directory', 'qty_-_directory', 'qty___directory',
       'qty_/_directory', 'directory_length', 'qty_._file', 'qty_-_file',
       'qty___file', 'file_length', 'qty_._params', 'qty___params',
       'qty_=_params', 'qty_@_params', 'qty_&_params', 'params_length',
       'qty_params', 'path_level', 'at_symbol', 'num_numeric_chars',
       'no_https', 'random_string', 'domain_in_subdomains', 'domain_in_paths',
       'num_sensitive_words', 'long_url', 'prefix_suffix_in_domain'],
      dtype='object')

In [7]:
def preprocess_data(filepath, n_bins=4):
    """
    Loads the dataset and converts features into itemsets for association rule mining.
    Binary features are converted to <feature_name>_1 or <feature_name>_0.
    Numerical features are binned into ranges and converted to <feature_name>_<start_value>_<end_value>.
    Assumes the column 'phishing' is the target class.
    
    Parameters:
        filepath (str): Path to the CSV file.
        n_bins (int): Number of bins for numerical features.
        
    Returns:
        pd.DataFrame: A DataFrame where rows represent itemsets.
    """
    df = pd.read_csv(filepath)
    print(df.shape)
    df = df[selected_features.tolist() + ['phishing']]
    df = df.iloc[:, :]  # For testing on a smaller subset
    print(df.shape)

    itemsets = []

    bins_dict = {}
    for col in df.columns:
        if col!='phishing' and (df[col].dtype in ['int64', 'float64']) and df[col].nunique() > 2:
            df[col] = df[col].clip(lower=df[col].min(), 
                                               upper=df[col].quantile(0.999))
            if df[col].nunique() <= 2:
                continue
            bins = pd.qcut(df[col], q=n_bins, duplicates='drop', retbins=True)[1]
            if len(bins) < 3:
                bins = pd.cut(df[col], 
                              bins=[df[col].min(), df[col].quantile(0.01), df[col].quantile(0.05), df[col].quantile(0.1), 
                                    df[col].quantile(0.2),df[col].quantile(0.4), df[col].quantile(0.99), 
                                    df[col].quantile(0.995), df[col].quantile(0.999), df[col].max()],
                                       duplicates='drop', retbins=True)[1]
                if len(bins) == 2:
                    bins = np.insert(bins, 1, (bins[0]+bins[1])/2)
                print('linear', col, bins)
            bin_labels = [f"{col}_{round(bins[i], 2)}_{round(bins[i+1], 2)}" for i in range(len(bins) - 1)]
            bins_dict[col] = {'bins': bins, 'labels': bin_labels}

    for index, row in df.iterrows():
        if index % 1000 == 0:
            print(index)
        itemset = []
        
        for col in df.columns:
            if df[col].dtype in ['int64', 'float64'] and df[col].nunique() > 2:
                # Numerical feature: binning into ranges
                bins = bins_dict[col]['bins']
                bin_labels = bins_dict[col]['labels']
                bin_idx = np.digitize(row[col], bins) - 1
                bin_idx = min(bin_idx, len(bin_labels) - 1)  # Ensure index is within range
                itemset.append(bin_labels[bin_idx])
            else:
                # Binary/categorical feature: encode as <feature_name>_value
                itemset.append(f"{col}_{int(row[col])}")
        
        itemsets.append(itemset)

    # Convert itemsets into a DataFrame for analysis
    transactions_df = pd.DataFrame({'itemsets': itemsets})
    return transactions_df

# Load and preprocess data
df = preprocess_data(filepath)

(80000, 144)
(80000, 41)
linear qty_?_url [0. 1. 2.]
linear qty_=_url [ 0.  4.  7. 11.]
linear qty_&_url [ 0.  3.  6. 10.]
linear qty_-_domain [0. 3. 4. 5.]
linear qty___directory [0. 4. 5. 9.]
linear qty_=_directory [0. 1. 2.]
linear qty_-_file [ 0. 10. 11. 15.]
linear qty___file [0. 3. 4. 8.]
linear qty_._params [ 0.  2.  4. 21.]
linear qty___params [ 0.  2.  3. 18.]
linear qty_/_params [0. 1. 3. 9.]
linear qty_=_params [ 0.  4.  7. 11.]
linear qty_&_params [0. 3. 5. 9.]
linear params_length [  0.       157.       208.       462.000004 462.004   ]
linear qty_params [ 0.  4.  6. 10.]
linear domain_in_subdomains [0. 1. 2.]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
6300

In [16]:
df.to_csv('pattern_mining/preprocessed.csv')

In [60]:
df.head()

Unnamed: 0,itemsets
0,"[qty_._url_2.0_3.0, qty_-_url_0.0_1.0, qty_/_u..."
1,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
2,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
3,"[qty_._url_3.0_23.0, qty_-_url_1.0_2.0, qty_/_..."
4,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."


In [6]:
import ast

df = pd.read_csv('pattern_mining/preprocessed.csv')
df['itemsets'] = df['itemsets'].apply(ast.literal_eval)
df.drop(columns=['Unnamed: 0'])

Unnamed: 0,itemsets
0,"[qty_._url_2.0_3.0, qty_-_url_0.0_1.0, qty_/_u..."
1,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
2,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
3,"[qty_._url_3.0_23.0, qty_-_url_1.0_2.0, qty_/_..."
4,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
...,...
79995,"[qty_._url_0.0_2.0, qty_-_url_0.0_1.0, qty_/_u..."
79996,"[qty_._url_2.0_3.0, qty_-_url_0.0_1.0, qty_/_u..."
79997,"[qty_._url_2.0_3.0, qty_-_url_0.0_1.0, qty_/_u..."
79998,"[qty_._url_3.0_23.0, qty_-_url_1.0_2.0, qty_/_..."


In [19]:
df.head()

Unnamed: 0.1,Unnamed: 0,itemsets
0,0,"[qty_._url_2.0_3.0, qty_-_url_0.0_1.0, qty_/_u..."
1,1,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
2,2,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
3,3,"[qty_._url_3.0_23.0, qty_-_url_1.0_2.0, qty_/_..."
4,4,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."


In [7]:
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd
import random

def mine_frequent_itemsets(df_itemsets, label, min_support=0.01):
    """
    Mines frequent patterns for transactions containing 'phishing_1',
    and appends 'phishing_1' to all resulting frequent itemsets.
    
    Parameters:
        df_itemsets (pd.DataFrame): DataFrame with a column 'itemsets' containing lists of items.
        min_support (float): Minimum support threshold for frequent itemsets.
        
    Returns:
        pd.DataFrame: Frequent itemsets including 'phishing_1' with their support values.
    """
    # Filter transactions to only include those containing 'phishing_1'

    random.seed = 42
    f = list(selected_features)
    random.shuffle(f)
    features = [tuple(f[:20]), tuple(f[20:])]

    all_frequent_itemsets = []
    
    for feature_set in features:
    # Remove 'phishing_1' from each transaction (to avoid redundancy in mining)
        transactions = df_itemsets['itemsets']
        filtered_transactions = [t for t in transactions if label in t]
        filtered_transactions = [
            [item for item in t if (item != label and item.startswith(feature_set))] for t in filtered_transactions
        ]
        
        # Convert transactions into a one-hot encoded DataFrame
        te = TransactionEncoder()
        one_hot = te.fit_transform(filtered_transactions)
        one_hot_df = pd.DataFrame(one_hot, columns=te.columns_)
        print(one_hot_df.shape)
        
        # Mine frequent patterns from filtered transactions
        frequent_itemsets = fpgrowth(one_hot_df, min_support=min_support, use_colnames=True)
        # frequent_itemsets = apriori(one_hot_df, min_support=min_support, use_colnames=True)
        
        # Append 'phishing_1' to all itemsets
        # frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: x | {label})
        all_frequent_itemsets.append(frequent_itemsets)
        print('done')
        
    combined_frequent_itemsets = pd.concat(all_frequent_itemsets, ignore_index=True)
    print(f"Frequent itemsets including {label}:")
    print(combined_frequent_itemsets)
    
    return combined_frequent_itemsets


min_support = 0.3  # Adjust minimum support threshold
# Mine frequent itemsets
frequent_itemset_0 = mine_frequent_itemsets(df, 'phishing_0', min_support)
frequent_itemset_1 = mine_frequent_itemsets(df, 'phishing_1', min_support)

(50000, 55)
done
(50000, 52)
done
Frequent itemsets including phishing_0:
       support                                           itemsets
0      0.99948                                      (qty_@_url_0)
1      0.99788                          (params_length_0.0_157.0)
2      0.99600                             (qty___params_0.0_2.0)
3      0.99518                                (domain_in_paths_0)
4      0.99386                               (qty_params_0.0_4.0)
...        ...                                                ...
93761  0.67772  (domain_in_subdomains_0.0_1.0, email_in_url_0,...
93762  0.67772  (domain_in_subdomains_0.0_1.0, email_in_url_0,...
93763  0.67772  (domain_in_subdomains_0.0_1.0, email_in_url_0,...
93764  0.67772  (domain_in_subdomains_0.0_1.0, email_in_url_0,...
93765  0.67772  (domain_in_subdomains_0.0_1.0, email_in_url_0,...

[93766 rows x 2 columns]
(30000, 58)
done
(30000, 50)
done
Frequent itemsets including phishing_1:
        support                   

In [23]:
filtered = frequent_itemset_0[frequent_itemset_0['itemsets'].apply(len) > 10]
print(len(filtered))
print(filtered)

6399
        support                                           itemsets
2058    0.75776  (qty_@_params_0, qty___file_0.0_3.0, prefix_su...
3082    0.78816  (qty_@_params_0, qty_tld_url_3.0_8.0, qty___fi...
3594    0.74102  (qty_@_params_0, qty_tld_url_3.0_8.0, qty___fi...
3850    0.71328  (qty_@_params_0, qty_tld_url_3.0_8.0, prefix_s...
3978    0.70116  (qty_@_params_0, qty_tld_url_3.0_8.0, qty___fi...
...         ...                                                ...
148731  0.52510  (qty_&_params_0.0_3.0, qty_=_url_0.0_4.0, qty_...
148732  0.52510  (qty_&_params_0.0_3.0, qty_=_url_0.0_4.0, qty_...
148733  0.52510  (qty_&_params_0.0_3.0, qty_=_url_0.0_4.0, qty_...
148734  0.52510  (qty_&_params_0.0_3.0, qty_=_url_0.0_4.0, qty_...
148735  0.52510  (qty_&_params_0.0_3.0, qty_=_url_0.0_4.0, qty_...

[6399 rows x 2 columns]


In [12]:
def generate_association_rules(df, frequent_itemsets, label, min_threshold=0.7):
    """
    Generates association rules with a specified consequent (label), 
    a minimum confidence threshold, and calculates lift.
    
    Parameters:
        df (pd.DataFrame): DataFrame containing a column 'itemsets' with transactions (lists of items).
        frequent_itemsets (list): List of frequent itemsets.
        label (str): Consequent item (e.g., 'phishing_1').
        min_threshold (float): Minimum confidence threshold.
        
    Returns:
        pd.DataFrame: DataFrame with generated association rules including lift.
    """
    # Convert transactions into a one-hot encoded DataFrame
    transactions = df['itemsets']
    te = TransactionEncoder()
    one_hot = te.fit_transform(transactions)
    one_hot_df = pd.DataFrame(one_hot, columns=te.columns_)

    # Calculate support for the consequent (label)
    total_transactions = len(one_hot_df)
    consequent_support = one_hot_df[label].sum() / total_transactions

    # Initialize results
    results = []
    c = 0
    for itemset in frequent_itemsets:
        c+=1
        if c%1000==0:
            print(c)
        # Convert itemset to list
        itemset_list = list(itemset)

        # Calculate support of antecedent
        antecedent_support = np.all(one_hot_df[itemset_list].values, axis=1).sum() / total_transactions

        # Calculate support of (antecedent ∪ label)
        union_support = np.all(one_hot_df[itemset_list + [label]].values, axis=1).sum() / total_transactions

        # Calculate confidence
        confidence = union_support / antecedent_support if antecedent_support > 0 else 0

        # Calculate lift
        lift = confidence / consequent_support if consequent_support > 0 else 0

        if confidence > min_threshold:
            results.append({
                "Itemset": itemset,
                "Antecedent Support": antecedent_support,
                "Union Support": union_support,
                "Confidence": confidence,
                "Lift": lift
            })

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    return results_df

In [13]:
rules_df = generate_association_rules(df, frequent_itemset_0['itemsets'], 'phishing_0')
rules_df.to_csv('pattern_mining/rules_0.csv')
rules_df

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000


Unnamed: 0,Itemset,Antecedent Support,Union Support,Confidence,Lift
0,(path_level_1.0_2.0),0.280900,0.210338,0.748799,1.198078
1,(file_length_13.0_148.0),0.260663,0.222562,0.853834,1.366134
2,(qty_-_directory_2.0_18.0),0.255937,0.236313,0.923321,1.477314
3,(qty_._url_2.0_3.0),0.515925,0.363675,0.704899,1.127838
4,"(qty_@_url_0, qty_._domain_2.0_6.0)",0.706975,0.497462,0.703649,1.125839
...,...,...,...,...,...
60383,"(domain_in_subdomains_0.0_1.0, email_in_url_0,...",0.543500,0.423575,0.779347,1.246955
60384,"(domain_in_subdomains_0.0_1.0, email_in_url_0,...",0.543500,0.423575,0.779347,1.246955
60385,"(domain_in_subdomains_0.0_1.0, email_in_url_0,...",0.543650,0.423575,0.779132,1.246611
60386,"(domain_in_subdomains_0.0_1.0, email_in_url_0,...",0.543500,0.423575,0.779347,1.246955


In [38]:
frequent_itemset_1

Unnamed: 0,support,itemsets
0,0.999333,(qty_-_file_0.0_10.0)
1,0.980867,(qty_params_0.0_4.0)
2,0.980867,(qty_&_params_0.0_3.0)
3,0.978900,(qty_/_params_0.0_1.0)
4,0.978833,(qty___params_0.0_2.0)
...,...,...
114872,0.303267,"(qty___file_0.0_3.0, num_numeric_chars_4.0_170..."
114873,0.302200,"(qty___file_0.0_3.0, num_numeric_chars_4.0_170..."
114874,0.302200,"(qty___file_0.0_3.0, num_numeric_chars_4.0_170..."
114875,0.300467,"(qty_._params_0.0_2.0, num_numeric_chars_4.0_1..."


In [14]:
rules_df = generate_association_rules(df, frequent_itemset_1['itemsets'], 'phishing_1')
rules_df.to_csv('pattern_mining/rules_1.csv')
rules_df

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000


Unnamed: 0,Itemset,Antecedent Support,Union Support,Confidence,Lift
0,"(qty_._directory_1.0_9.0, file_length_6.0_13.0)",0.157625,0.119712,0.759477,2.025271
1,"(qty_._directory_1.0_9.0, qty_-_file_0.0_10.0,...",0.157625,0.119712,0.759477,2.025271
2,"(qty_._directory_1.0_9.0, qty___file_0.0_3.0, ...",0.157600,0.119712,0.759597,2.025592
3,"(qty_._directory_1.0_9.0, qty_&_params_0.0_3.0...",0.152575,0.115138,0.754629,2.012344
4,"(qty_._directory_1.0_9.0, qty_&_url_0.0_3.0, f...",0.152512,0.115075,0.754528,2.012076
...,...,...,...,...,...
879,"(qty_-_directory_0.0_2.0, qty_._params_0.0_2.0...",0.138875,0.112812,0.812331,2.166217
880,"(qty_-_directory_0.0_2.0, qty_._params_0.0_2.0...",0.138525,0.112525,0.812308,2.166155
881,"(qty_-_directory_0.0_2.0, domain_in_paths_0, q...",0.138662,0.112625,0.812224,2.165930
882,"(qty_-_directory_0.0_2.0, email_in_url_0, at_s...",0.138925,0.112662,0.810959,2.162558
