# import

In [1]:
#!pip install mlxtend # 初回だけ

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

import pandas as pd
import numpy as np

# データ読み込み~前処理

In [2]:
# Load the source data
## header=0 tells pandas to take the column names from the first line of the CSV.
## NOTE(review): the earlier comment here said the column names were not being
## picked up, which does not match header=0 -- confirm against the actual file.
df = pd.read_csv(
    filepath_or_buffer="INcomeESL.csv",
    header=0,
)

In [3]:
# 各カラム内に入っている値をクラスに振り直す

## 変換用関数
def to_class(list_1):
    """Encode every value as its rank among the sorted distinct values.

    Parameters
    ----------
    list_1 : iterable
        Values to encode (for example a list or a pandas Series). The
        non-missing values must be hashable and mutually orderable.

    Returns
    -------
    tuple
        ``(list_result, unique_list)``.

        ``unique_list`` holds the distinct non-missing values in ascending
        order. ``list_result`` has exactly one entry per input element: the
        index of that element inside ``unique_list``, or ``None`` when the
        element is missing (``None`` or a float NaN).

    Raises
    ------
    TypeError
        If the non-missing values cannot be hashed or compared with each
        other (raised by ``set`` / ``sorted``).
    """
    def _is_missing(value):
        # NaN is the only float that is not equal to itself.
        return value is None or (isinstance(value, float) and value != value)

    # Materialise once so the input can be traversed twice (also works for
    # one-shot iterables).
    values = list(list_1)

    # Deduplicate, then sort ascending. Missing values are left out: NaN never
    # compares equal to anything, so it can neither be deduplicated nor
    # ordered reliably, and it could never be matched back to a class anyway.
    unique_list = sorted({v for v in values if not _is_missing(v)})

    # value -> class index, so each element is encoded with a single lookup
    # instead of a linear scan over unique_list.
    code_of = {value: code for code, value in enumerate(unique_list)}

    # Keep one output entry per input element so the result stays aligned with
    # the input rows; missing inputs stay missing in the output.
    list_result = [None if _is_missing(v) else code_of[v] for v in values]

    return (list_result, unique_list)
                
# Run the conversion

## Work on a copy so that re-running this cell always starts from the untouched `df`
df_1 = df.copy()

## Columns whose values have to be converted to class codes
list_change_columns = [
    "income",
    "sex",
    "marital status",
    "age",
    "education",
    "occupation",
    "years in bay area",
    "dual incomes",
    "householder status",
    "type of home",
    "ethnic classification",
    "language in home"
]

## Convert each column and remember the sorted distinct values of every column.
## The per-column value lists are collected in a dict and turned into a frame
## in one step: assigning Series one at a time to an empty DataFrame pins the
## index to the length of the first column, so a later column with more
## distinct values would be silently truncated. Building from a dict of Series
## aligns on the union of the indexes and pads shorter columns with NaN.
element_sets = {}
for c in list_change_columns:
    df_1[c+" class"], element_set = to_class(df_1[c])
    element_sets[c] = pd.Series(element_set)
df_element_set = pd.DataFrame(element_sets)

# Write the conversion results to disk
df_1.to_csv("df_change_to_class.csv", index=False, mode="w")
df_element_set.to_csv("df_element_set.csv", index=False, mode="w")

# Read them back in
df_1 = pd.read_csv("df_change_to_class.csv", header=0)
df_element_set = pd.read_csv("df_element_set.csv", header=0)

In [4]:
# Keep only the columns that feed the association-rule analysis
## Columns to keep: the class-encoded columns plus the two count columns
list_input_columns = [
    "income class",
    "sex class",
    "marital status class",
    "age class",
    "education class",
    "occupation class",
    "years in bay area class",
    "dual incomes class",
    "householder status class",
    "type of home class",
    "ethnic classification class",
    "language in home class",
    "number of children",
    "number in household",
]

## All rows, only the listed columns (in the listed order)
df_2 = df_1.loc[:, list_input_columns]

In [5]:
# One-hot encode every column
## astype(object): get_dummies only expands object/string/category columns by
##   default, so the numeric columns are cast first to get expanded as well.
## dummy_na=True: adds an indicator column for missing values per source column.
## dtype=bool: produce True/False indicators on every pandas version (older
##   versions default to uint8); mlxtend's apriori warns about non-bool input.
df_input = pd.get_dummies(df_2.astype(object), dummy_na=True, dtype=bool)

# 相関ルール分析

## サポートの計算

In [27]:
# Frequent-itemset mining with apriori
## Arguments: the data to analyse, the minimum support an itemset must reach,
## and how itemsets are shown (by column name or not)
support_threshold = 0.06
freq_items1 = apriori(
    df_input,
    min_support=support_threshold,
    use_colnames=True,
)

# Check the result: the ten itemsets with the highest support
freq_items_by_support = freq_items1.sort_values(by="support", ascending=False)
display(freq_items_by_support.iloc[:10])

# Check the number of itemsets
print(len(freq_items1))

Unnamed: 0,support,itemsets
49,0.912885,(language in home class_0.0)
48,0.669721,(ethnic classification class_7.0)
463,0.65954,"(language in home class_0.0, ethnic classifica..."
36,0.646597,(years in bay area class_4.0)
50,0.621873,(number of children_0)
387,0.601367,"(language in home class_0.0, years in bay area..."
38,0.598313,(dual incomes class_1.0)
45,0.596568,(type of home class_2.0)
471,0.580134,"(language in home class_0.0, number of childre..."
9,0.553956,(sex class_0.0)


3932


## リフトの計算

In [28]:
# Extract the association rules
## Arguments: the frequent itemsets with their support, the metric, the threshold
## The metric can be chosen from support, confidence, lift, leverage, conviction
lift_threshold = 1
a_rules1 = (
    association_rules(freq_items1, metric="lift", min_threshold=lift_threshold)
    # Order by lift, highest first, and renumber the rows accordingly
    .sort_values(by="lift", ascending=False)
    .reset_index(drop=True)
)

# Check the result
display(a_rules1.iloc[:10])

# Check the number of rules
print(len(a_rules1))

# Save the rules (with lift computed); both thresholds are part of the file name
rules_csv_name = "df_support={su_th}_lift={li_th}.csv".format(
    su_th=support_threshold,
    li_th=lift_threshold,
)
a_rules1.to_csv(rules_csv_name, index=False, mode="w")

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(age class_0.0, dual incomes class_1.0)","(income class_1.0, education class_3.0, marita...",0.091768,0.068499,0.060064,0.654517,9.555109,0.053778,2.696225
1,"(income class_1.0, education class_3.0, marita...","(age class_0.0, dual incomes class_1.0)",0.068499,0.091768,0.060064,0.876858,9.555109,0.053778,7.375466
2,"(income class_1.0, education class_3.0, dual i...","(age class_0.0, marital status class_3.0)",0.070681,0.09046,0.060064,0.849794,9.394188,0.05367,6.055297
3,"(age class_0.0, marital status class_3.0)","(income class_1.0, education class_3.0, dual i...",0.09046,0.070681,0.060064,0.663987,9.394188,0.05367,2.765726
4,(age class_0.0),"(income class_1.0, education class_3.0, marita...",0.094095,0.068499,0.060064,0.638331,9.318816,0.053619,2.57556
5,(age class_0.0),"(income class_1.0, education class_3.0, marita...",0.094095,0.068499,0.060064,0.638331,9.318816,0.053619,2.57556
6,"(income class_1.0, education class_3.0, marita...",(age class_0.0),0.068499,0.094095,0.060064,0.876858,9.318816,0.053619,7.35657
7,"(income class_1.0, education class_3.0, marita...",(age class_0.0),0.068499,0.094095,0.060064,0.876858,9.318816,0.053619,7.35657
8,(age class_0.0),"(income class_1.0, education class_3.0, dual i...",0.094095,0.070681,0.0605,0.642968,9.0968,0.05385,2.602899
9,"(income class_1.0, education class_3.0, dual i...",(age class_0.0),0.070681,0.094095,0.0605,0.855967,9.0968,0.05385,6.289566


58466
