In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth
import numpy as np

In [2]:

filepath = "features/url_features.csv"  # Replace with your dataset file path

In [3]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

df = pd.read_csv(filepath)
df = df.loc[:, df.nunique() > 1]
X = df.drop(columns=['rec_id', 'phishing'])
y = df['phishing']

# Apply SelectKBest with mutual information
k = 50  # Number of top features to select
selector = SelectKBest(mutual_info_classif, k=k)
X_new = selector.fit_transform(X, y)

# Get selected feature names
selected_features = X.columns[selector.get_support()]

# Create a DataFrame with the selected features
X_selected = pd.DataFrame(X_new, columns=selected_features)

# Display the selected features
selected_features

Index(['qty_._url', 'qty_-_url', 'qty_/_url', 'qty_?_url', 'qty_=_url',
       'qty_@_url', 'qty_&_url', 'qty_tld_url', 'length_url', 'email_in_url',
       'qty_._domain', 'qty_-_domain', 'qty_vowels_domain', 'domain_length',
       'subdomain_level', 'qty_._directory', 'qty_-_directory',
       'qty___directory', 'qty_/_directory', 'directory_length', 'qty_._file',
       'qty_-_file', 'qty___file', 'file_length', 'qty_._params',
       'qty___params', 'qty_=_params', 'qty_&_params', 'params_length',
       'qty_params', 'num_dots', 'path_level', 'url_length', 'num_dash',
       'num_dash_in_hostname', 'at_symbol', 'num_query_components',
       'num_ampersand', 'num_numeric_chars', 'no_https', 'random_string',
       'domain_in_subdomains', 'domain_in_paths', 'hostname_length',
       'path_length', 'query_length', 'num_sensitive_words',
       'at_symbol_in_url', 'prefix_suffix_in_domain', 'subdomain_count'],
      dtype='object')

In [57]:
def preprocess_data(filepath, n_bins=4):
    """
    Loads the dataset and converts features into itemsets for association rule mining.
    Binary features are converted to <feature_name>_1 or <feature_name>_0.
    Numerical features are binned into ranges and converted to <feature_name>_<start_value>_<end_value>.
    Assumes the column 'phishing' is the target class.
    
    Parameters:
        filepath (str): Path to the CSV file.
        n_bins (int): Number of bins for numerical features.
        
    Returns:
        pd.DataFrame: A DataFrame where rows represent itemsets.
    """
    df = pd.read_csv(filepath)
    df = df[selected_features.tolist() + ['phishing']]
    print(df.shape)
    df = df.iloc[:, :]  # For testing on a smaller subset
    print(df.shape)

    itemsets = []

    bins_dict = {}
    for col in df.columns:
        if col!='phishing' and (df[col].dtype in ['int64', 'float64']) and df[col].nunique() > 2:
            df[col] = df[col].clip(lower=df[col].min(), 
                                               upper=df[col].quantile(0.999))
            if df[col].nunique() <= 2:
                continue
            bins = pd.qcut(df[col], q=n_bins, duplicates='drop', retbins=True)[1]
            if len(bins) < 3:
                bins = pd.cut(df[col], 
                              bins=[df[col].min(), df[col].quantile(0.01), df[col].quantile(0.05), df[col].quantile(0.1), 
                                    df[col].quantile(0.2),df[col].quantile(0.4), df[col].quantile(0.99), 
                                    df[col].quantile(0.995), df[col].quantile(0.999), df[col].max()],
                                       duplicates='drop', retbins=True)[1]
                if len(bins) == 2:
                    bins = np.insert(bins, 1, (bins[0]+bins[1])/2)
                print('linear', col, bins)
            bin_labels = [f"{col}_{round(bins[i], 2)}_{round(bins[i+1], 2)}" for i in range(len(bins) - 1)]
            bins_dict[col] = {'bins': bins, 'labels': bin_labels}

    for index, row in df.iterrows():
        if index % 1000 == 0:
            print(index)
        itemset = []
        
        for col in df.columns:
            if df[col].dtype in ['int64', 'float64'] and df[col].nunique() > 2:
                # Numerical feature: binning into ranges
                bins = bins_dict[col]['bins']
                bin_labels = bins_dict[col]['labels']
                bin_idx = np.digitize(row[col], bins) - 1
                bin_idx = min(bin_idx, len(bin_labels) - 1)  # Ensure index is within range
                itemset.append(bin_labels[bin_idx])
            else:
                # Binary/categorical feature: encode as <feature_name>_value
                itemset.append(f"{col}_{int(row[col])}")
        
        itemsets.append(itemset)

    # Convert itemsets into a DataFrame for analysis
    transactions_df = pd.DataFrame({'itemsets': itemsets})
    return transactions_df

# Load and preprocess data
df = preprocess_data(filepath)

(80000, 51)
(80000, 51)
linear qty_&_url [ 0.  3.  6. 10.]
linear qty_=_directory [0. 1. 2.]
linear qty___file [0. 3. 4. 8.]
linear qty___params [ 0.  2.  3. 18.]
linear qty_&_params [0. 3. 5. 9.]
linear num_ampersand [ 0.  3.  6. 10.]
linear domain_in_subdomains [0. 1. 2.]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000


In [59]:
df.to_csv('pattern_mining/preprocessed.csv')

In [60]:
df.head()

Unnamed: 0,itemsets
0,"[qty_._url_2.0_3.0, qty_-_url_0.0_1.0, qty_/_u..."
1,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
2,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
3,"[qty_._url_3.0_23.0, qty_-_url_1.0_2.0, qty_/_..."
4,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."


In [3]:
df = pd.read_csv('pattern_mining/preprocessed.csv')

In [71]:
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd
import random

def mine_frequent_itemsets(df_itemsets, label, min_support=0.01):
    """
    Mines frequent patterns for transactions containing 'phishing_1',
    and appends 'phishing_1' to all resulting frequent itemsets.
    
    Parameters:
        df_itemsets (pd.DataFrame): DataFrame with a column 'itemsets' containing lists of items.
        min_support (float): Minimum support threshold for frequent itemsets.
        
    Returns:
        pd.DataFrame: Frequent itemsets including 'phishing_1' with their support values.
    """
    # Filter transactions to only include those containing 'phishing_1'

    random.seed = 42
    f = list(selected_features)
    random.shuffle(f)
    features = [tuple(f[:25]), tuple(f[25:])]

    all_frequent_itemsets = []
    
    for feature_set in features:
    # Remove 'phishing_1' from each transaction (to avoid redundancy in mining)
        transactions = df_itemsets['itemsets']
        filtered_transactions = [t for t in transactions if label in t]
        filtered_transactions = [
            [item for item in t if (item != label and item.startswith(feature_set))] for t in filtered_transactions
        ]
        
        # Convert transactions into a one-hot encoded DataFrame
        te = TransactionEncoder()
        one_hot = te.fit_transform(filtered_transactions)
        one_hot_df = pd.DataFrame(one_hot, columns=te.columns_)
        
        # Mine frequent patterns from filtered transactions
        frequent_itemsets = fpgrowth(one_hot_df, min_support=min_support, use_colnames=True)
        # frequent_itemsets = apriori(one_hot_df, min_support=min_support, use_colnames=True)
        
        # Append 'phishing_1' to all itemsets
        # frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: x | {label})
        all_frequent_itemsets.append(frequent_itemsets)
        print('done')
        
    combined_frequent_itemsets = pd.concat(all_frequent_itemsets, ignore_index=True)
    print(f"Frequent itemsets including {label}:")
    print(combined_frequent_itemsets)
    
    return combined_frequent_itemsets


min_support = 0.01  # Adjust minimum support threshold
# Mine frequent itemsets
frequent_itemset_0 = mine_frequent_itemsets(df, 'phishing_0.0', min_support)
frequent_itemset_1 = mine_frequent_itemsets(df, 'phishing_1.0', min_support)

: 

: 

In [16]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,1.000000,"(phishing_1, query_length_0.0_772.0)"
1,1.000000,"(phishing_1, num_underscore_0.0_23.0)"
2,1.000000,"(num_query_components_0.0_11.0, phishing_1)"
3,1.000000,"(phishing_1, num_percent_0.0_34.0)"
4,1.000000,"(num_ampersand_0.0_10.0, phishing_1)"
...,...,...
65530,0.915663,"(num_query_components_0.0_11.0, ip_address_0, ..."
65531,0.915663,"(ip_address_0, query_length_0.0_772.0, num_has..."
65532,0.915663,"(num_query_components_0.0_11.0, ip_address_0, ..."
65533,0.915663,"(num_query_components_0.0_11.0, ip_address_0, ..."


In [23]:
filtered = frequent_itemsets[frequent_itemsets['itemsets'].apply(len) > 15]
print(len(filtered))
print(filtered)

17
        support                                           itemsets
32767  0.915663  (num_query_components_0.0_11.0, ip_address_0, ...
49151  0.942436  (num_query_components_0.0_11.0, ip_address_0, ...
57343  0.915663  (num_query_components_0.0_11.0, ip_address_0, ...
61439  0.918340  (num_query_components_0.0_11.0, ip_address_0, ...
63487  0.918340  (num_query_components_0.0_11.0, ip_address_0, ...
64511  0.917001  (num_query_components_0.0_11.0, ip_address_0, ...
65524  0.917001  (num_query_components_0.0_11.0, ip_address_0, ...
65525  0.915663  (num_query_components_0.0_11.0, query_length_0...
65526  0.915663  (num_query_components_0.0_11.0, ip_address_0, ...
65527  0.915663  (num_query_components_0.0_11.0, ip_address_0, ...
65528  0.915663  (num_query_components_0.0_11.0, ip_address_0, ...
65529  0.915663  (num_query_components_0.0_11.0, ip_address_0, ...
65530  0.915663  (num_query_components_0.0_11.0, ip_address_0, ...
65531  0.915663  (ip_address_0, query_length_0.0_772.0, num

In [None]:
# Step 3: Generate Association Rules with Consequent (phishing=1)
def generate_association_rules(df, frequent_itemsets, label, min_threshold=0.5):

    transactions = df['itemsets']
    te = TransactionEncoder()
    one_hot = te.fit_transform(transactions)
    one_hot_df = pd.DataFrame(one_hot, columns=te.columns_)


    # Calculate support for antecedent and union
    total_transactions = len(transactions)
    results = []

    for itemset in frequent_itemsets:
        # Convert itemset to list
        itemset_list = list(itemset)
        
        # Calculate support of antecedent
        antecedent_support = transactions[itemset_list].all(axis=1).sum() / total_transactions
        
        # Calculate support of (antecedent ∪ label)
        union_support = transactions[itemset_list + [label]].all(axis=1).sum() / total_transactions
        
        # Calculate confidence
        confidence = union_support / antecedent_support if antecedent_support > 0 else 0
        
        if confidence > min_threshold:
            results.append({
                "Itemset": itemset,
                "Antecedent Support": antecedent_support,
                "Union Support": union_support,
                "Confidence": confidence
            })

    # Display results
    results_df = pd.DataFrame(results)


    return rules


rules = generate_association_rules(df, frequent_itemset_0, 'phishing_0')

In [None]:

min_support = 0.1  # Adjust minimum support threshold
min_confidence = 0.7  # Adjust minimum confidence threshold

# Mine frequent itemsets
frequent_itemsets = mine_frequent_itemsets(df, min_support)

# Generate association rules
rules = generate_association_rules(frequent_itemsets, min_confidence)

# Save results
frequent_itemsets.to_csv("frequent_itemsets.csv", index=False)
rules.to_csv("association_rules_phishing.csv", index=False)
print("Results saved to 'frequent_itemsets.csv' and 'association_rules_phishing.csv'")