In [82]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from collections import Counter

# 读取数据集文件
data = []
with open('anonymous-msweb.test', 'r') as file:
    for line in file:
        data.append(line.strip().split(','))

In [84]:
# 数据清洗和提取用户浏览记录
df = pd.DataFrame(data, columns=['Type', 'Value1', 'Value2', 'Value3', 'Value4', 'Value5'])
df = df[df['Type'] == 'A']
user_browsing_records = df['Value4'].tolist()

In [85]:
# 数据探索性分析
most_visited_pages = Counter(user_browsing_records).most_common(5)
page_visit_distribution = pd.Series(user_browsing_records).value_counts()

print("最常被访问的页面：")
print(most_visited_pages)
print("\n页面访问量分布：")
print(page_visit_distribution)

最常被访问的页面：
[('"/stream"', 1), ('"/worddev"', 1), ('"/technet"', 1), ('"/sbnmember"', 1), ('"/hardwaresupport"', 1)]

页面访问量分布：
"/france"              1
"/benelux"             1
"/vbasic"              1
"/outlook"             1
"/mstv"                1
"/news"                1
"/iesupport"           1
"/usability"           1
"/jscript"             1
"/australia"           1
"/catalog"             1
"/ie"                  1
"/win32devsupport"     1
"/uk"                  1
"/msft"                1
"/spain"               1
"/transaction"         1
"/danmark"             1
"/cze"                 1
"/netherlands"         1
"/msexcel"             1
"/belgium"             1
"/msmq"                1
"/italy"               1
"/vtest"               1
"/bookshelf"           1
"/devnews"             1
"/support"             1
"/referral"            1
"/snasupport"          1
                      ..
"/referencesupport"    1
"/msoffice"            1
"/mspress"             1
"/turkey"              1


In [86]:
# 关联规则挖掘
te = TransactionEncoder()
te_ary = te.fit(user_browsing_records).transform(user_browsing_records)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)
frequent_itemsets = apriori(df_encoded, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

In [87]:
# 结果评估
rules['antecedent_len'] = rules['antecedents'].apply(lambda x: len(x))
rules = rules[(rules['antecedent_len'] >= 2) &
              (rules['confidence'] > 0.7) &
              (rules['lift'] > 1.2)]

In [88]:
# 结果分析与应用
optimized_navigation_structure = rules['antecedents'].tolist()

print("\n关联规则：")
print(rules)
print("\n优化导航结构建议：")
print(optimized_navigation_structure)


关联规则：
       antecedents         consequents  antecedent support  \
242         (", f)                 (o)            0.139456   
289         (p, ")                 (o)            0.292517   
309         (p, ")                 (r)            0.292517   
314         (p, ")                 (s)            0.292517   
330         (t, ")                 (r)            0.428571   
335         (", u)                 (r)            0.272109   
343         (", u)                 (s)            0.272109   
350         (", u)                 (t)            0.272109   
430         (f, /)                 (o)            0.139456   
477         (p, /)                 (o)            0.292517   
498         (p, /)                 (r)            0.292517   
503         (p, /)                 (s)            0.292517   
519         (t, /)                 (r)            0.428571   
524         (/, u)                 (r)            0.272109   
532         (/, u)                 (s)            0.272109   
5