In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import fpgrowth
import numpy as np

In [2]:
import pandas as pd

df1 = pd.read_csv('features/url_features.csv', index_col=0)
df2 = pd.read_csv('features/html_features.csv', index_col=0)

# Merge DataFrames on 'rec_id'
merged_df = pd.merge(df1, df2, on='rec_id', how='outer')

# Handle phishing label (assuming both phishing labels should match)
# If they don't match, prioritize one, or handle conflicts as needed
merged_df['phishing'] = merged_df['phishing_x'].combine_first(merged_df['phishing_y'])

# Drop duplicate phishing columns
merged_df.drop(columns=['phishing_x', 'phishing_y'], inplace=True)

# Output the final DataFrame
print(merged_df)
merged_df.to_csv('features/merged_features.csv')


       rec_id  qty_._url  qty_-_url  qty___url  qty_/_url  qty_?_url  \
0           1          2          0          0          4          0   
1           2          3          0          0          3          0   
2           3          3          0          0          4          0   
3           4          3          1          0          5          0   
4           5          4          0          0          3          0   
...       ...        ...        ...        ...        ...        ...   
79995   79996          1          0          0          5          0   
79996   79997          2          0          1          6          1   
79997   79998          2          0          0          4          0   
79998   79999          3          1          1          4          0   
79999   80000          2          0          0          3          0   

       qty_=_url  qty_@_url  qty_&_url  qty_!_url  ...  \
0              0          0          0          0  ...   
1              0   

In [3]:

filepath = "features/merged_features.csv"  # Replace with your dataset file path

In [4]:

df = pd.read_csv(filepath)
df = df.loc[:, df.nunique() > 1]

# Step 1: Create a dictionary to store column data as tuples
column_map = {}

for col in df.columns:
    column_values = tuple(df[col])  # Convert column values to a tuple
    column_map.setdefault(column_values, []).append(col)

# Step 2: Identify columns with duplicate data but different names
duplicate_columns = {k: v for k, v in column_map.items() if len(v) > 1}

# Display results
print("Duplicate columns (with different names):")
for values, columns in duplicate_columns.items():
    print(f"Columns {columns} have identical values.")

Duplicate columns (with different names):
Columns ['qty_._url', 'num_dots'] have identical values.
Columns ['qty_-_url', 'num_dash'] have identical values.
Columns ['qty___url', 'num_underscore'] have identical values.
Columns ['qty_&_url', 'num_ampersand'] have identical values.
Columns ['qty_#_url', 'num_hash'] have identical values.
Columns ['qty_%_url', 'num_percent'] have identical values.
Columns ['length_url', 'url_length'] have identical values.
Columns ['qty_._domain', 'subdomain_level', 'subdomain_count'] have identical values.
Columns ['qty_-_domain', 'num_dash_in_hostname'] have identical values.
Columns ['domain_length', 'hostname_length'] have identical values.
Columns ['domain_in_ip', 'ip_address', 'ip_address_in_url'] have identical values.
Columns ['directory_length', 'path_length'] have identical values.
Columns ['params_length', 'query_length'] have identical values.
Columns ['qty_params', 'num_query_components'] have identical values.
Columns ['at_symbol', 'at_symbo

In [5]:

df = pd.read_csv(filepath)
print(df.shape)
df = df.loc[:, df.nunique() > 1]
print(df.shape)

column_map = {}

for col in df.columns:
    column_values = tuple(df[col])  # Convert column values to a tuple
    column_map.setdefault(column_values, []).append(col)

# Step 2: Identify columns with duplicate data
duplicate_columns = {k: v for k, v in column_map.items() if len(v) > 1}

# Step 3: Keep only one column from each group
columns_to_drop = [cols[1:] for cols in duplicate_columns.values()]  # Get all but the first column in each group
columns_to_drop = [col for group in columns_to_drop for col in group]  # Flatten the list of lists

# Drop duplicate columns
df = df.drop(columns=columns_to_drop)
print(df.shape)

(80000, 173)
(80000, 135)
(80000, 118)


In [7]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

X = df.drop(columns=['rec_id', 'phishing'])
y = df['phishing']

# Apply SelectKBest with mutual information
k = 40  # Number of top features to select
selector = SelectKBest(mutual_info_classif, k=k)
X_new = selector.fit_transform(X, y)

# Get selected feature names
selected_features = X.columns[selector.get_support()]

# Create a DataFrame with the selected features
X_selected = pd.DataFrame(X_new, columns=selected_features)

# Display the selected features
selected_features

Index(['qty_._url', 'qty_-_url', 'qty_/_url', 'qty_tld_url', 'length_url',
       'qty_._domain', 'qty_-_domain', 'qty_vowels_domain', 'domain_length',
       'qty_._directory', 'qty_-_directory', 'qty_/_directory',
       'directory_length', 'qty_-_file', 'file_length', 'params_length',
       'qty_params', 'path_level', 'num_numeric_chars', 'no_https',
       'num_sensitive_words', 'prefix_suffix_in_domain', 'script_files_ratio',
       'css_files_ratio', 'image_files_ratio', 'anchor_files_ratio',
       'empty_anchor_ratio', 'null_hyperlink_ratio', 'total_hyperlinks',
       'internal_hyperlink_ratio', 'external_hyperlink_ratio',
       'external_to_internal_ratio', 'total_forms', 'suspicious_form_ratio',
       'PctExtHyperlinks', 'PctExtResourceUrls',
       'PctNullSelfRedirectHyperlinks', 'SubmitInfoToEmail', 'IframeOrFrame',
       'ExtMetaScriptLinkRT'],
      dtype='object')

In [11]:
def preprocess_data(filepath, n_bins=4):
    """
    Loads the dataset and converts features into itemsets for association rule mining.
    Binary features are converted to <feature_name>_1 or <feature_name>_0.
    Numerical features are binned into ranges and converted to <feature_name>_<start_value>_<end_value>.
    Assumes the column 'phishing' is the target class.
    
    Parameters:
        filepath (str): Path to the CSV file.
        n_bins (int): Number of bins for numerical features.
        
    Returns:
        pd.DataFrame: A DataFrame where rows represent itemsets.
    """
    df = pd.read_csv(filepath)
    print(df.shape)
    df = df[selected_features.tolist() + ['phishing']]
    df = df.iloc[:, :]  # For testing on a smaller subset
    print(df.shape)

    itemsets = []

    bins_dict = {}
    for col in df.columns:
        if col!='phishing' and (df[col].dtype in ['int64', 'float64']) and df[col].nunique() > 2:
            df[col] = df[col].clip(lower=df[col].min(), 
                                               upper=df[col].quantile(0.999))
            if df[col].nunique() <= 2:
                continue
            bins = pd.qcut(df[col], q=n_bins, duplicates='drop', retbins=True)[1]
            if len(bins) < 3:
                bins = pd.cut(df[col], 
                              bins=[df[col].min(), df[col].quantile(0.01), df[col].quantile(0.05), df[col].quantile(0.1), 
                                    df[col].quantile(0.2),df[col].quantile(0.4), df[col].quantile(0.99), 
                                    df[col].quantile(0.995), df[col].quantile(0.999), df[col].max()],
                                       duplicates='drop', retbins=True)[1]
                if len(bins) == 2:
                    bins = np.insert(bins, 1, (bins[0]+bins[1])/2)
                print('linear', col, bins)
            bin_labels = [f"{col}_{round(bins[i], 2)}_{round(bins[i+1], 2)}" for i in range(len(bins) - 1)]
            bins_dict[col] = {'bins': bins, 'labels': bin_labels}

    for index, row in df.iterrows():
        if index % 1000 == 0:
            print(index)
        itemset = []
        
        for col in df.columns:
            if df[col].dtype in ['int64', 'float64'] and df[col].nunique() > 2:
                # Numerical feature: binning into ranges
                bins = bins_dict[col]['bins']
                bin_labels = bins_dict[col]['labels']
                bin_idx = np.digitize(row[col], bins) - 1
                bin_idx = min(bin_idx, len(bin_labels) - 1)  # Ensure index is within range
                itemset.append(bin_labels[bin_idx])
            else:
                # Binary/categorical feature: encode as <feature_name>_value
                itemset.append(f"{col}_{int(row[col])}")
        
        itemsets.append(itemset)

    # Convert itemsets into a DataFrame for analysis
    transactions_df = pd.DataFrame({'itemsets': itemsets})
    return transactions_df

# Load and preprocess data
df = preprocess_data(filepath)

(80000, 173)
(80000, 41)
linear qty_-_domain [0. 3. 4. 5.]
linear qty_-_file [ 0. 10. 11. 15.]
linear params_length [  0.       157.       208.       462.000004 462.004   ]
linear qty_params [ 0.  4.  6. 10.]
linear suspicious_form_ratio [0.  0.5 1. ]
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000


In [12]:
df.to_csv('pattern_mining/preprocessed.csv')

In [60]:
df.head()

Unnamed: 0,itemsets
0,"[qty_._url_2.0_3.0, qty_-_url_0.0_1.0, qty_/_u..."
1,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
2,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
3,"[qty_._url_3.0_23.0, qty_-_url_1.0_2.0, qty_/_..."
4,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."


In [None]:
import ast

df = pd.read_csv('pattern_mining/preprocessed.csv', index_col=0)
df['itemsets'] = df['itemsets'].apply(ast.literal_eval)

Unnamed: 0,itemsets
0,"[qty_._url_2.0_3.0, qty_-_url_0.0_1.0, qty_/_u..."
1,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
2,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
3,"[qty_._url_3.0_23.0, qty_-_url_1.0_2.0, qty_/_..."
4,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
...,...
79995,"[qty_._url_0.0_2.0, qty_-_url_0.0_1.0, qty_/_u..."
79996,"[qty_._url_2.0_3.0, qty_-_url_0.0_1.0, qty_/_u..."
79997,"[qty_._url_2.0_3.0, qty_-_url_0.0_1.0, qty_/_u..."
79998,"[qty_._url_3.0_23.0, qty_-_url_1.0_2.0, qty_/_..."


In [19]:
df.head()

Unnamed: 0.1,Unnamed: 0,itemsets
0,0,"[qty_._url_2.0_3.0, qty_-_url_0.0_1.0, qty_/_u..."
1,1,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
2,2,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."
3,3,"[qty_._url_3.0_23.0, qty_-_url_1.0_2.0, qty_/_..."
4,4,"[qty_._url_3.0_23.0, qty_-_url_0.0_1.0, qty_/_..."


In [18]:
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd
import random

def mine_frequent_itemsets(df_itemsets, label, min_support=0.01):
    """
    Mines frequent patterns for transactions containing 'phishing_1',
    and appends 'phishing_1' to all resulting frequent itemsets.
    
    Parameters:
        df_itemsets (pd.DataFrame): DataFrame with a column 'itemsets' containing lists of items.
        min_support (float): Minimum support threshold for frequent itemsets.
        
    Returns:
        pd.DataFrame: Frequent itemsets including 'phishing_1' with their support values.
    """
    # Filter transactions to only include those containing 'phishing_1'

    random.seed = 42
    f = list(selected_features)
    random.shuffle(f)
    features = [tuple(f[:20]), tuple(f[20:])]

    all_frequent_itemsets = []
    
    for feature_set in features:
    # Remove 'phishing_1' from each transaction (to avoid redundancy in mining)
        transactions = df_itemsets['itemsets']
        filtered_transactions = [t for t in transactions if label in t]
        filtered_transactions = [
            [item for item in t if (item != label and item.startswith(feature_set))] for t in filtered_transactions
        ]
        
        # Convert transactions into a one-hot encoded DataFrame
        te = TransactionEncoder()
        one_hot = te.fit_transform(filtered_transactions)
        one_hot_df = pd.DataFrame(one_hot, columns=te.columns_)
        print(one_hot_df.shape)
        
        # Mine frequent patterns from filtered transactions
        frequent_itemsets = fpgrowth(one_hot_df, min_support=min_support, use_colnames=True, max_len=5)
        # frequent_itemsets = apriori(one_hot_df, min_support=min_support, use_colnames=True)
        
        # Append 'phishing_1' to all itemsets
        # frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(lambda x: x | {label})
        all_frequent_itemsets.append(frequent_itemsets)
        print('done')
        
    combined_frequent_itemsets = pd.concat(all_frequent_itemsets, ignore_index=True)
    print(f"Frequent itemsets including {label}:")
    print(combined_frequent_itemsets)
    
    return combined_frequent_itemsets


min_support = 0.05  # Adjust minimum support threshold
# Mine frequent itemsets
frequent_itemset_0 = mine_frequent_itemsets(df, 'phishing_0', min_support)
frequent_itemset_1 = mine_frequent_itemsets(df, 'phishing_1', min_support)

(50000, 58)
done
(50000, 67)
done
Frequent itemsets including phishing_0:
        support                                           itemsets
0       0.99842                             (qty_-_domain_0.0_3.0)
1       0.98266                              (qty_-_file_0.0_10.0)
2       0.93886                        (prefix_suffix_in_domain_0)
3       0.91872                            (num_sensitive_words_0)
4       0.88166                                       (no_https_0)
...         ...                                                ...
133651  0.05136  (ExtMetaScriptLinkRT_0.15_0.34, image_files_ra...
133652  0.05798  (params_length_0.0_157.0, ExtMetaScriptLinkRT_...
133653  0.05130  (ExtMetaScriptLinkRT_0.15_0.34, image_files_ra...
133654  0.05130  (params_length_0.0_157.0, ExtMetaScriptLinkRT_...
133655  0.05126  (ExtMetaScriptLinkRT_0.15_0.34, qty_tld_url_3....

[133656 rows x 2 columns]
(30000, 63)
done
(30000, 63)
done
Frequent itemsets including phishing_1:
         support     

In [23]:
filtered = frequent_itemset_0[frequent_itemset_0['itemsets'].apply(len) > 10]
print(len(filtered))
print(filtered)

6399
        support                                           itemsets
2058    0.75776  (qty_@_params_0, qty___file_0.0_3.0, prefix_su...
3082    0.78816  (qty_@_params_0, qty_tld_url_3.0_8.0, qty___fi...
3594    0.74102  (qty_@_params_0, qty_tld_url_3.0_8.0, qty___fi...
3850    0.71328  (qty_@_params_0, qty_tld_url_3.0_8.0, prefix_s...
3978    0.70116  (qty_@_params_0, qty_tld_url_3.0_8.0, qty___fi...
...         ...                                                ...
148731  0.52510  (qty_&_params_0.0_3.0, qty_=_url_0.0_4.0, qty_...
148732  0.52510  (qty_&_params_0.0_3.0, qty_=_url_0.0_4.0, qty_...
148733  0.52510  (qty_&_params_0.0_3.0, qty_=_url_0.0_4.0, qty_...
148734  0.52510  (qty_&_params_0.0_3.0, qty_=_url_0.0_4.0, qty_...
148735  0.52510  (qty_&_params_0.0_3.0, qty_=_url_0.0_4.0, qty_...

[6399 rows x 2 columns]


In [19]:
def generate_association_rules(df, frequent_itemsets, label, min_threshold=0.8):
    # Convert transactions into a one-hot encoded DataFrame
    transactions = df['itemsets']
    te = TransactionEncoder()
    one_hot = te.fit_transform(transactions)
    one_hot_df = pd.DataFrame(one_hot, columns=te.columns_)

    # Calculate support for the consequent (label)
    total_transactions = len(one_hot_df)
    consequent_support = one_hot_df[label].sum() / total_transactions

    # Initialize results
    results = []
    c = 0
    for itemset in frequent_itemsets:
        c+=1
        if c%1000==0:
            print(c)
        # Convert itemset to list
        itemset_list = list(itemset)

        # Calculate support of antecedent
        antecedent_support = np.all(one_hot_df[itemset_list].values, axis=1).sum() / total_transactions

        # Calculate support of (antecedent ∪ label)
        union_support = np.all(one_hot_df[itemset_list + [label]].values, axis=1).sum() / total_transactions

        # Calculate confidence
        confidence = union_support / antecedent_support if antecedent_support > 0 else 0

        # Calculate lift
        lift = confidence / consequent_support if consequent_support > 0 else 0

        if confidence > min_threshold:
            results.append({
                "Itemset": itemset,
                "Antecedent Support": antecedent_support,
                "Union Support": union_support,
                "Confidence": confidence,
                "Lift": lift
            })

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    return results_df

In [20]:
rules_df = generate_association_rules(df, frequent_itemset_0['itemsets'], 'phishing_0')
rules_df.to_csv('pattern_mining/rules_0_4.csv')
rules_df

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000


Unnamed: 0,Itemset,Antecedent Support,Union Support,Confidence,Lift
0,(external_hyperlink_ratio_0.05_0.17),0.251075,0.209300,0.833615,1.333785
1,(PctExtHyperlinks_0.21_0.92),0.250038,0.228013,0.911913,1.459061
2,(file_length_13.0_148.0),0.260663,0.222562,0.853834,1.366134
3,(IframeOrFrame_1),0.252775,0.207712,0.821729,1.314766
4,(internal_hyperlink_ratio_0.62_0.8),0.250075,0.202725,0.810657,1.297051
...,...,...,...,...,...
67553,"(qty_-_directory_2.0_18.0, qty_-_url_2.0_19.0,...",0.042150,0.037838,0.897687,1.436299
67554,"(qty_-_directory_2.0_18.0, qty_tld_url_3.0_8.0...",0.042075,0.037838,0.899287,1.438859
67555,"(anchor_files_ratio_0.51_0.69, image_files_rat...",0.045637,0.037175,0.814571,1.303314
67556,"(qty_._domain_2.0_6.0, qty_tld_url_3.0_8.0, pa...",0.045538,0.037112,0.814988,1.303980


In [38]:
frequent_itemset_1

Unnamed: 0,support,itemsets
0,0.999333,(qty_-_file_0.0_10.0)
1,0.980867,(qty_params_0.0_4.0)
2,0.980867,(qty_&_params_0.0_3.0)
3,0.978900,(qty_/_params_0.0_1.0)
4,0.978833,(qty___params_0.0_2.0)
...,...,...
114872,0.303267,"(qty___file_0.0_3.0, num_numeric_chars_4.0_170..."
114873,0.302200,"(qty___file_0.0_3.0, num_numeric_chars_4.0_170..."
114874,0.302200,"(qty___file_0.0_3.0, num_numeric_chars_4.0_170..."
114875,0.300467,"(qty_._params_0.0_2.0, num_numeric_chars_4.0_1..."


In [21]:
rules_df = generate_association_rules(df, frequent_itemset_1['itemsets'], 'phishing_1')
rules_df.to_csv('pattern_mining/rules_1_4.csv')
rules_df

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
15

Unnamed: 0,Itemset,Antecedent Support,Union Support,Confidence,Lift
0,"(total_hyperlinks_0.0_16.0, total_forms_1.0_2.0)",0.088725,0.076088,0.857566,2.286841
1,"(total_hyperlinks_0.0_16.0, total_forms_1.0_2....",0.088637,0.076088,0.858412,2.289099
2,"(total_hyperlinks_0.0_16.0, total_forms_1.0_2....",0.087950,0.075500,0.858442,2.289179
3,"(total_hyperlinks_0.0_16.0, total_forms_1.0_2....",0.086087,0.073650,0.855525,2.281400
4,"(params_length_0.0_157.0, total_hyperlinks_0.0...",0.082988,0.070587,0.850580,2.268213
...,...,...,...,...,...
29636,"(empty_anchor_ratio_0.0_0.01, suspicious_form_...",0.023400,0.018988,0.811432,2.163818
29637,"(suspicious_form_ratio_0.0_0.5, qty_._url_3.0_...",0.023537,0.018863,0.801381,2.137015
29638,"(qty_._domain_1.0_2.0, external_hyperlink_rati...",0.024450,0.020075,0.821063,2.189502
29639,"(num_numeric_chars_4.0_170.01, external_hyperl...",0.024850,0.019900,0.800805,2.135480
