In [1]:
import sys
sys.path.append('../')

from model import DecisionTree
from rule import RuleExtractor

import os
import numpy as np
import warnings
import pandas as pd

np.set_printoptions(suppress=True)

warnings.filterwarnings(action='ignore')

In [2]:
# sage, srgnn 중 선택
embed_model = 'srgnn'
filename = {'sage' : 'user_embedding_sage_sid_6to6_weighted.npy', 'srgnn' : 'srgnn_user_hybrid_6to6_weighted.npy'}
data_path = "../data/"

pivot_df = pd.read_csv(os.path.join(data_path, 'pivot_genre2_data.csv'), index_col=0)
embedding = np.load(os.path.join(data_path, filename[embed_model]))

In [3]:
model = DecisionTree(pivot_df, embedding)
dt = model.make_dt(min_samples_leaf=25, min_impurity_decrease=0.0007)
print(model.get_score(dt, scoring='all'))
model.max_depth_dt = dt

(0.49773144614142695, 0.5584737440295312, 0.5131784979165356)


In [4]:
k=15
len_or_list = []
save_dir = f'../output/{embed_model}_rule'

for i in range(k):
    rule_extractor = RuleExtractor(model)
    rules = rule_extractor.extract_rule(segment_num=i)
    
    or_list = rules.split('[OR]')

    ls = [[r.split() for r in l.split('[AND]')] for l in or_list]
    new_ls = []
    for node in ls:
        new_ls.append([])
        rule_dic = {}
        for r in node:
            key_name = f'{r[0]} {r[1]} '
            rule_dic[key_name] = rule_dic.get(key_name, [])
            rule_dic[key_name].append(float(r[2]))    
        
        for k, v in rule_dic.items():
            if k in '>':
                rule_dic[k] = max(v)
            else:
                rule_dic[k] = min(v)
        
        for k, v in rule_dic.items():
            new_rule = k.split()
            new_rule.append(str(v))
            new_ls[-1].append(new_rule)
            
    # 생성 규칙 수
    total_rule_ls = []
    for node in new_ls:
        for rule in node:
            total_rule_ls.append(str(rule))
    
    node_list = []
    for node in new_ls:
        rule_list = []
        for rule in node:
            rule_list.append(''.join(rule))
        node_rule = ' [AND] '.join(rule_list)
        node_list.append(node_rule)
    total_rule = ' [OR]\n'.join(node_list)

    with open(save_dir + f'/{embed_model}_seg_rule_extract{i}.txt', 'w') as f:
        f.writelines(total_rule)

In [5]:
df = pd.DataFrame([i for i in range(15)])
df.columns = ['segment']
rule_num_ls = []
for i in range(15):
    with open(save_dir + f'/{embed_model}_seg_rule_extract{i}.txt', 'r') as f:
        full_text = f.read()
    rule_num = full_text.split('OR').__len__()
    rule_num_ls.append(rule_num)
    # print(f"segment_{i} : {rule_num}개")
df['rule개수'] = rule_num_ls

segments_num_ls = []
pred_df = pivot_df.copy()

pred_df['label'] = model.Y
pred_df['prediction'] = model.max_depth_dt.predict(pivot_df)

label = pred_df['label'].value_counts().sort_index()
prediction = pred_df['prediction'].value_counts().sort_index()

for idx, (y_true, y_pred) in enumerate(zip(label, prediction)):
    segments_num_ls.append(y_pred)
    # print(idx, ':',y_pred)
df['데이터 수'] = segments_num_ls
df

Unnamed: 0,segment,rule개수,데이터 수
0,0,14,528
1,1,8,490
2,2,1,42
3,3,8,296
4,4,15,584
5,5,2,88
6,6,11,409
7,7,2,79
8,8,6,260
9,9,10,505


In [6]:
k = 15
len_list = []
df_list = []

for i in range(k):
    with open(f'../output/{embed_model}_rule/{embed_model}_seg_rule_extract{i}.txt', 'r') as f:
        or_list = f.read().split('[OR]')
        for and_rule in or_list:
            and_list = and_rule.split('[AND]')
            len_list.append(len(and_list))
    df_list.append((max(len_list), min(len_list), np.mean(len_list)))

df = pd.DataFrame(df_list, columns=['max', 'min', 'avg'])
df

Unnamed: 0,max,min,avg
0,9,6,8.357143
1,9,4,7.227273
2,9,4,7.086957
3,9,4,6.935484
4,12,4,7.043478
5,12,4,7.041667
6,12,4,7.118644
7,12,4,7.04918
8,12,4,7.119403
9,12,4,7.324675
