In [None]:
import sys
sys.path.append('../')

import os
import numpy as np
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns

from model import DecisionTree

# --- Notebook-wide display configuration ---------------------------------

# Hide every warning for the rest of the session (installed before seaborn
# is configured, same relative order as before).
warnings.filterwarnings('ignore')

# Print NumPy floats in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True)

# Seaborn theme: white style, "Malgun Gothic" font (presumably so Korean
# labels render), and unicode minus signs disabled on the axes.
sns.set(
    style='white',
    font="Malgun Gothic",
    rc={"axes.unicode_minus": False},
)

In [None]:
# --- Run configuration ----------------------------------------------------
embed_model = 'srgnn'   # selects the embedding file below and the rule directory
k = 15                  # number of segments iterated over by later cells

data_path = "../data/"
save_dir = f'../output/{embed_model}_rule'

# Embedding file to load for each supported embedding model.
filename = {
    'sage': 'user_embedding_sage_sid_6to6_weighted.npy',
    'srgnn': 'srgnn_user_hybrid_6to6_weighted.npy',
}

# --- Inputs ---------------------------------------------------------------
# Pivot table (first CSV column becomes the index) and the matching embedding.
pivot_csv = os.path.join(data_path, 'pivot_genre2_data.csv')
pivot_df = pd.read_csv(pivot_csv, index_col=0)

embedding_file = os.path.join(data_path, filename[embed_model])
embedding = np.load(embedding_file)

In [None]:
# Display the loaded pivot table (the last expression of a cell is rendered).
pivot_df

In [None]:
# Build the project-local DecisionTree from the pivot table and the embedding.
model = DecisionTree(pivot_df, embedding)
# Store model.Y as a 'Label' column; later cells filter rows on it (Label == 0).
# NOTE(review): presumably Y is the per-row segment id -- confirm in model.py.
# pivot_df is mutated in place, so it carries an extra 'Label' column from here on.
pivot_df['Label'] = model.Y

In [None]:
# Collect, for the first segment only (segment 0), every distinct threshold
# that appears in its rule file, grouped by the column the rule tests.
#
# Rule file layout: conjunctions separated by '[OR]', conditions inside a
# conjunction separated by '[AND]', each condition "<column> <op> <number>"
# (with or without spaces around the operator).
segment_id = 0
rule_dict = defaultdict(list)

with open(save_dir + f'/{embed_model}_seg_rule_extract{segment_id}.txt', 'r') as f:
    full_text = f.read()

for or_rule in full_text.split('[OR]'):
    for rule in or_rule.split('[AND]'):
        rule = rule.strip()

        # Two-character operators are tested before their one-character
        # prefixes so that '>=' is never mistaken for '>'.
        for operator in ['>=', '>', '<=', '<']:
            if operator in rule:
                # key: genre, value: time
                # Split on the LAST occurrence (the number is always at the end)
                # and strip the key so it matches the DataFrame column name even
                # when the rule is written with spaces around the operator.
                key, value = rule.rsplit(operator, 1)
                key = key.strip()
                value = float(value)
                if value not in rule_dict[key]:
                    rule_dict[key].append(value)
                break

# key: genre, value: time_list
# Add open-ended outer edges so the thresholds can be used directly as bin edges.
for key, value in rule_dict.items():
    value.append(-float('inf'))
    value.append(float('inf'))

In [None]:
# 'Label' is added to pivot_df by an earlier cell and is not a genre column,
# so it is excluded from the total (errors='ignore' keeps this working when
# the column is absent).
genre_columns = pivot_df.columns.drop('Label', errors='ignore')
print('전체 중장르 개수:', len(genre_columns))
# Number of distinct columns referenced by the parsed rules.
print('사용된 중장르 개수:', len(rule_dict))

In [None]:
# For every column used by the segment-0 rules, bin the values of the rows
# labelled 0 by the rule thresholds, then plot and save one horizontal bar
# chart of the bin sizes per column.

# Loop-invariant: rows belonging to label 0.
temp_df = pivot_df[pivot_df['Label'] == 0]

# savefig does not create missing directories.
figure_dir = '../output/figures'
os.makedirs(figure_dir, exist_ok=True)

cnt = 0  # number of figures produced
for key, value in rule_dict.items():
    column = key.strip()  # tolerate keys parsed with surrounding whitespace

    # pd.cut needs strictly increasing, unique edges.
    value = sorted(set(value))

    # pd.cut uses right-closed intervals (lower, upper]; the labels state
    # exactly that and omit the infinite outer bounds.
    group_name = []
    for lower, upper in zip(value[:-1], value[1:]):
        if lower == -float('inf'):
            name = f'{column} <= {upper}'
        elif upper == float('inf'):
            name = f'{lower} < {column}'
        else:
            name = f'{lower} < {column} <= {upper}'
        group_name.append(name)

    time_group = pd.cut(temp_df[column].values, bins=value, labels=group_name)

    df = pd.DataFrame({'time': temp_df[column].values, 'time_group': time_group})

    # sort=False keeps the bins in threshold order instead of by frequency.
    count_df = df['time_group'].value_counts(sort=False)

    cnt += 1
    plt.figure(figsize=(5,3))
    plt.barh(count_df.index, count_df.values)
    # '/' in a column name would otherwise be read as a path separator.
    file_name = column.replace('/','_')
    plt.savefig(f'{figure_dir}/{file_name}.png', bbox_inches='tight')
    plt.show()

## 모든 세그먼트

In [None]:
# Collect, across ALL k segments, every distinct threshold that appears in
# the rule files, grouped by the column the rule tests.
#
# Rule file layout: conjunctions separated by '[OR]', conditions inside a
# conjunction separated by '[AND]', each condition "<column> <op> <number>"
# (with or without spaces around the operator).
rule_dict = defaultdict(list)

for i in range(k):
    with open(save_dir + f'/{embed_model}_seg_rule_extract{i}.txt', 'r') as f:
        full_text = f.read()

    for or_rule in full_text.split('[OR]'):
        for rule in or_rule.split('[AND]'):
            rule = rule.strip()

            # Two-character operators are tested before their one-character
            # prefixes so that '>=' is never mistaken for '>'.
            for operator in ['>=', '>', '<=', '<']:
                if operator in rule:
                    # key: genre, value: time
                    # Split on the LAST occurrence (the number is always at the
                    # end) and strip the key so it matches the DataFrame column
                    # name even when the rule has spaces around the operator.
                    key, value = rule.rsplit(operator, 1)
                    key = key.strip()
                    value = float(value)
                    if value not in rule_dict[key]:
                        rule_dict[key].append(value)
                    break

# key: genre, value: time_list
# Add open-ended outer edges so the thresholds can be used directly as bin edges.
for key, value in rule_dict.items():
    value.append(-float('inf'))
    value.append(float('inf'))

In [None]:
# 'Label' is added to pivot_df by an earlier cell and is not a genre column,
# so it is excluded from the total (errors='ignore' keeps this working when
# the column is absent).
genre_columns = pivot_df.columns.drop('Label', errors='ignore')
print('전체 중장르 개수:', len(genre_columns))
# Number of distinct columns referenced by the parsed rules.
print('사용된 중장르 개수:', len(rule_dict))

In [None]:
# For every column used by any rule, bin ALL rows of pivot_df by the rule
# thresholds and plot one horizontal bar chart of the bin sizes per column.
for key, value in rule_dict.items():
    column = key.strip()  # tolerate keys parsed with surrounding whitespace

    # pd.cut needs strictly increasing, unique edges.
    value = sorted(set(value))

    # pd.cut uses right-closed intervals (lower, upper], so the labels are
    # written as "lower < column <= upper"; the infinite outer bounds are omitted.
    group_name = []
    for lower, upper in zip(value[:-1], value[1:]):
        if lower == -float('inf'):
            name = f'{column} <= {upper}'
        elif upper == float('inf'):
            name = f'{lower} < {column}'
        else:
            name = f'{lower} < {column} <= {upper}'
        group_name.append(name)

    time_group = pd.cut(pivot_df[column].values, bins=value, labels=group_name)

    df = pd.DataFrame({'time': pivot_df[column].values, 'time_group': time_group})

    # sort=False keeps the bins in threshold order instead of by frequency.
    count_df = df['time_group'].value_counts(sort=False)

    print(count_df.index.values)
    plt.figure(figsize=(4,3))
    plt.barh(count_df.index, count_df.values)
    plt.show()

## 모든 세그먼트에 대하여 조건 구하기


In [None]:
# Simplify the rules of every segment and count them.
#
# Within one conjunction ([AND] group) several conditions on the same column
# with the same operator are redundant: "x > a AND x > b" is "x > max(a, b)"
# and "x <= a AND x <= b" is "x <= min(a, b)". Each group is collapsed that
# way, the simplified rules are written to a separate "*_new_seg_rule_extract"
# file (the input file is left untouched so this cell can be re-run and the
# original rules stay available for comparison), and the rule counts are
# recorded.
#
# Results:
#   seg_dic[i]      -- number of distinct simplified conditions in segment i
#   len_or_list[i]  -- number of [OR] groups in segment i's input file
k = 15
seg_dic = {}
len_or_list = []
save_dir = f'../output/{embed_model}_rule'

for i in range(k):
    print('-'*100)
    print(f'segment_{i}')

    with open(save_dir + f'/{embed_model}_seg_rule_extract{i}.txt', 'r') as f:
        or_list = f.read().split('[OR]')

    len_or_list.append(len(or_list))

    new_ls = []  # one list of [column, operator, threshold] triples per [OR] group
    for or_rule in or_list:
        # (column, operator) -> every threshold seen in this conjunction
        rule_dic = {}
        for raw_rule in or_rule.split('[AND]'):
            raw_rule = raw_rule.strip()
            if not raw_rule:
                continue  # blank fragment, e.g. a trailing separator

            # Two-character operators first so '>=' is not read as '>'.
            # Splitting on the operator accepts both "x <= 1.0" and "x<=1.0".
            for operator in ['>=', '>', '<=', '<']:
                if operator in raw_rule:
                    feature, threshold = raw_rule.rsplit(operator, 1)
                    rule_key = (feature.strip(), operator)
                    rule_dic.setdefault(rule_key, []).append(float(threshold))
                    break
            else:
                raise ValueError(f'segment_{i}: cannot parse rule {raw_rule!r}')

        if not rule_dic:
            continue

        node = []
        for (feature, operator), thresholds in rule_dic.items():
            # Lower bounds keep the tightest (largest) threshold,
            # upper bounds keep the tightest (smallest) one.
            if operator.startswith('>'):
                bound = max(thresholds)
            else:
                bound = min(thresholds)
            node.append([feature, operator, str(bound)])
        new_ls.append(node)

    # Number of generated rules
    total_rule_ls = [str(rule) for node in new_ls for rule in node]

    # Serialise back to the rule-file format.
    node_list = [' [AND] '.join(''.join(rule) for rule in node) for node in new_ls]
    total_rule = ' [OR]\n'.join(node_list)

    with open(save_dir + f'/{embed_model}_new_seg_rule_extract{i}.txt', 'w') as f:
        f.write(total_rule)

    print(len(total_rule_ls)) # number of rules counted as a list
    print(len(set(total_rule_ls))) # number of rules counted as a set (distinct)

    seg_dic[i] = len(set(total_rule_ls))

seg_dic, len_or_list

----------------------------------------------------------------------------------------------------
segment_0
140
68
----------------------------------------------------------------------------------------------------
segment_1
198
106
----------------------------------------------------------------------------------------------------
segment_2
72
55
----------------------------------------------------------------------------------------------------
segment_3
141
82
----------------------------------------------------------------------------------------------------
segment_4
308
141
----------------------------------------------------------------------------------------------------
segment_5
113
65
----------------------------------------------------------------------------------------------------
segment_6
193
94
----------------------------------------------------------------------------------------------------
segment_7
57
33
--------------------------------------------------------

({0: 68,
  1: 106,
  2: 55,
  3: 82,
  4: 141,
  5: 65,
  6: 94,
  7: 33,
  8: 56,
  9: 84,
  10: 87,
  11: 49,
  12: 104,
  13: 39,
  14: 156},
 [16, 25, 10, 18, 38, 13, 23, 7, 13, 17, 17, 10, 23, 8, 39])

- rule 개수 max/min/avg

In [None]:
# Per segment: max / min / mean number of [AND]-joined conditions per [OR]
# group in the simplified rule file.
k = 15
len_list = []   # condition counts of every [OR] group across all segments
df_list = []    # one (max, min, avg) tuple per segment

for i in range(k):
    with open(f'../output/{embed_model}_rule/{embed_model}_new_seg_rule_extract{i}.txt', 'r') as f:
        or_list = f.read().split('[OR]')

    # Statistics are computed from THIS segment's groups only; accumulating
    # into one shared list would make every row a running total over all
    # previous segments.
    seg_len_list = [len(and_rule.split('[AND]')) for and_rule in or_list]
    len_list.extend(seg_len_list)
    df_list.append((max(seg_len_list), min(seg_len_list), np.mean(seg_len_list)))

df = pd.DataFrame(df_list, columns=['max', 'min', 'avg'])
df

In [None]:
# Mean, over segments, of the distinct-rule counts stored in seg_dic.
np.mean(list(seg_dic.values()))

- 기존 방법에서 조건 개수

In [None]:
# Count the conditions of the original (un-simplified) rule files.
# seg_dic[i] -- number of distinct conditions in segment i's rule file.
k = 15
# Start from an empty dict so the result never depends on what an earlier
# cell left in seg_dic.
seg_dic = {}
for i in range(k):
    print('-'*100)
    print(f'segment_{i}')
    # Read once inside a context manager so the handle is always closed.
    with open(f'../output/{embed_model}_rule/{embed_model}_seg_rule_extract{i}.txt', 'r') as f:
        ls = [[r.split() for r in l.split('[AND]')] for l in f.read().split('[OR]')]

    # Number of generated rules
    total_rule_ls = [str(rule) for node in ls for rule in node]
    print(len(total_rule_ls)) # number of rules counted as a list
    print(len(set(total_rule_ls))) # number of rules counted as a set (distinct)

    seg_dic[i] = len(set(total_rule_ls))
seg_dic

In [None]:
# Mean, over segments, of the distinct-condition counts stored in seg_dic
# by the preceding cell.
np.mean(list(seg_dic.values()))