In [9]:
import sys
sys.path.append('../')

import os
import numpy as np
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns

from model import DecisionTree

# Notebook-wide display configuration.
# Print numpy floats in fixed-point form instead of scientific notation.
np.set_printoptions(suppress=True)
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# White seaborn style with a Hangul-capable font; unicode_minus=False keeps the
# minus sign as a plain ASCII hyphen on the axes.
sns.set(style='white', font="Malgun Gothic", rc={"axes.unicode_minus": False})

In [2]:
# --- Settings -------------------------------------------------------------
# Embedding variant to analyse; it selects both the .npy file loaded below and
# the directory the rule files are read from.
embed_model = 'srgnn'
# Number of rule files iterated over by the later cells (range(k)).
k = 15
data_path = "../data/"
save_dir = f'../output/{embed_model}_rule'

# Embedding file name for each supported embedding variant.
filename = {
    'sage': 'user_embedding_sage_sid_6to6_weighted.npy',
    'srgnn': 'srgnn_user_hybrid_6to6_weighted.npy',
}

# --- Inputs ---------------------------------------------------------------
# Pivot table read with its first CSV column as the index.
pivot_df = pd.read_csv(os.path.join(data_path, 'pivot_genre2_data.csv'), index_col=0)
embedding = np.load(os.path.join(data_path, filename[embed_model]))

In [3]:
# Display the loaded pivot table (pandas truncates the rendering of large frames).
pivot_df

Unnamed: 0_level_0,A001(드라마_공포),A002(드라마_논픽션),A003(드라마_농어촌),A004(드라마_단막극/단편),A006(드라마_로맨틱코메디),A007(드라마_멜로),A008(드라마_모험),A010(드라마_미스터리/스릴러),A012(드라마_시대극),A013(드라마_시트콤),...,L011(홈쇼핑_자동차용품),L012(홈쇼핑_종합),L013(홈쇼핑_침구),L014(홈쇼핑_컴퓨터/사무기기),L015(홈쇼핑_패션/의류),L016(홈쇼핑_기타),L021,L022,L023,L024
MAC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,262.000000,0.0,0.0,...,0.0,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,5.3,0.7,108.6,58.100000,0.0,19.100000,63.5,0.0,...,0.0,28.10000,11.5,0.0,66.100000,258.6,0.0,0.0,3.1,0.0
2,0.0,0.0,0.0,0.0,2392.8,684.900000,0.0,3216.800000,577.0,0.0,...,0.0,0.00000,0.7,0.0,48.800000,1.1,0.0,0.0,14.3,0.0
3,0.0,0.0,1.4,0.0,25.6,53.100000,0.0,139.500000,0.0,3.8,...,0.0,0.00000,0.0,0.0,2.900000,36.7,0.0,0.0,0.0,0.0
4,23.0,0.0,1.0,1388.3,1641.5,1344.000000,0.0,3494.512219,398.8,137.8,...,0.0,404.70000,35.0,0.0,1336.473074,1182.3,0.0,25.0,15.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4842,0.0,0.0,0.0,0.0,470.5,151.000000,0.0,991.300000,218.2,0.0,...,0.0,0.00000,0.0,0.0,19.100000,3.5,0.0,0.0,17.4,0.0
4843,29.6,0.0,10.6,268.2,1011.2,3678.030870,0.0,827.500000,1000.8,0.0,...,0.0,1.20000,0.9,0.0,23.300000,0.5,0.0,0.0,5.2,0.0
4844,0.0,0.0,0.0,0.0,0.0,0.600000,0.0,0.000000,1.6,7.7,...,0.0,0.00000,0.0,0.0,2.600000,0.0,0.0,0.0,0.0,0.0
4845,0.0,0.0,0.0,130.0,134.3,68.800000,0.0,0.600000,120.0,0.0,...,0.0,114.04474,0.0,0.0,117.293269,60.0,0.0,0.0,0.0,0.0


In [4]:
# Instantiate the project-local DecisionTree with the pivot table and the embeddings.
# NOTE(review): what the constructor computes lives in model.py and is not visible
# here; presumably it derives one label per row of pivot_df — confirm.
model = DecisionTree(pivot_df, embedding)
# Store model.Y as a new 'Label' column. This mutates pivot_df in place, so from
# this point pivot_df.columns also contains 'Label'; later cells select rows with
# pivot_df['Label'] == 0.
pivot_df['Label'] = model.Y

In [None]:
# Collect, for the FIRST rule file only, the distinct thresholds that appear for
# each genre key.
#
# The original wrapped this in `for i in range(k)` with an unconditional `break`,
# so only file 0 was ever read; that is now explicit. The plotting cell that
# follows filters pivot_df on Label == 0.
#
# Rule file layout as parsed here: alternatives separated by '[OR]', conditions
# inside an alternative separated by '[AND]', each condition of the form
# `<genre><operator><number>` with operator in >=, >, <=, <.
i = 0
rule_dict = defaultdict(list)

with open(save_dir + f'/{embed_model}_seg_rule_extract{i}.txt', 'r') as f:
    full_text = f.read()

for or_rule in full_text.split('[OR]'):
    for rule in or_rule.split('[AND]'):
        rule = rule.strip()

        # Two-character operators are tested first so that '>=' is not
        # mistaken for '>' (and '<=' for '<').
        for operator in ['>=', '>', '<=', '<']:
            if operator in rule:
                # key: genre, value: time
                key, _, value = rule.partition(operator)
                # Strip the whitespace around the operator; otherwise the key
                # would not match the pivot_df column name when it is used for
                # column lookup later.
                key = key.strip()
                value = float(value)
                if value not in rule_dict[key]:
                    rule_dict[key].append(value)
                break

# key: genre, value: time_list
# Add open-ended sentinels so the thresholds can be used directly as bin edges
# covering the whole real line.
for value in rule_dict.values():
    value.extend((-float('inf'), float('inf')))

In [None]:
# Report how many genre columns exist versus how many appear in the parsed rules.
# An earlier cell adds a 'Label' column to pivot_df; it is not a genre, so it is
# excluded from the total (errors='ignore' keeps this working if the column is
# absent).
genre_columns = pivot_df.columns.drop('Label', errors='ignore')
print('전체 중장르 개수:',len(genre_columns))
print('사용된 중장르 개수:',len(rule_dict))

In [None]:
# For every genre that appears in the rules, bin the Label == 0 rows by the rule
# thresholds, draw a horizontal bar chart of the rows per bin and save it as PNG.

# savefig does not create missing directories, so make sure the target exists.
figure_dir = '../output/figures'
os.makedirs(figure_dir, exist_ok=True)

# The row filter does not depend on the genre, so compute it once.
temp_df = pivot_df[pivot_df['Label'] == 0]

cnt = 0  # number of figures produced; kept as a notebook-level name
for key, value in rule_dict.items():
    # Sorted edges look like [-inf, t1, ..., tn, inf]; pd.cut uses right-closed
    # bins by default, i.e. (-inf, t1], (t1, t2], ..., (tn, inf].
    value = sorted(value)

    # One label per bin. The first bin has no lower bound, so it is labelled
    # `key <= t1`; the previous `0 < key <= t1` wrongly implied that rows equal
    # to 0 were excluded although they are counted in this bin.
    group_name = [f'{key} <= {value[1]}']
    group_name += [
        f'{lower} < {key} <= {upper}'
        for lower, upper in zip(value[1:-1], value[2:])
    ]

    time_group = pd.cut(temp_df[key].values, bins=value, labels=group_name)
    # print(time_group)

    df = pd.DataFrame({'time': temp_df[key].values, 'time_group': time_group})

    # sort=False keeps the bins in threshold order rather than by frequency.
    count_df = df['time_group'].value_counts(sort=False)

    cnt += 1
    fig = plt.figure(figsize=(5,3))
    plt.barh(count_df.index, count_df.values)
    # Genre names may contain '/', which would be read as a path separator.
    file_name = key.replace('/','_')
    plt.savefig(f'{figure_dir}/{file_name}.png', bbox_inches='tight')
    plt.show()
    # Release the figure; one is created per genre.
    plt.close(fig)

In [None]:
def _add_rule_thresholds(rule_text, thresholds):
    """Add the thresholds found in one rule file to ``thresholds``.

    ``rule_text`` is split on '[OR]' and then on '[AND]'; every resulting
    condition that contains one of the operators >=, >, <=, < is split into a
    genre key (left side) and a float threshold (right side).

    Args:
        rule_text: full text of one rule file.
        thresholds: mapping genre -> list of thresholds, updated in place.
            Indexing a missing genre must create an empty list (e.g. a
            ``defaultdict(list)``). A threshold already present for a genre
            is not added again.

    Raises:
        ValueError: if the text right of the operator is not a valid float.
    """
    for or_rule in rule_text.split('[OR]'):
        for rule in or_rule.split('[AND]'):
            rule = rule.strip()

            # Two-character operators come first so '>=' is not read as '>'.
            for operator in ('>=', '>', '<=', '<'):
                if operator in rule:
                    # key: genre, value: time
                    key, _, value = rule.partition(operator)
                    # Drop whitespace around the operator so the key matches
                    # the pivot_df column name used for lookup later.
                    key = key.strip()
                    value = float(value)
                    if value not in thresholds[key]:
                        thresholds[key].append(value)
                    break


# Merge the thresholds of all k rule files into a single genre -> thresholds map.
rule_dict = defaultdict(list)

for i in range(k):
    with open(save_dir + f'/{embed_model}_seg_rule_extract{i}.txt', 'r') as f:
        full_text = f.read()
    _add_rule_thresholds(full_text, rule_dict)

# key: genre, value: time_list
# Open-ended sentinels let the thresholds serve directly as bin edges.
for value in rule_dict.values():
    value.extend((-float('inf'), float('inf')))

In [None]:
# Report how many genre columns exist versus how many appear in the parsed rules.
# An earlier cell adds a 'Label' column to pivot_df; it is not a genre, so it is
# excluded from the total (errors='ignore' keeps this working if the column is
# absent).
genre_columns = pivot_df.columns.drop('Label', errors='ignore')
print('전체 중장르 개수:',len(genre_columns))
print('사용된 중장르 개수:',len(rule_dict))

In [None]:
# For every genre that appears in the rules, bin ALL rows of pivot_df by the rule
# thresholds and draw a horizontal bar chart of the rows per bin.
for key, value in rule_dict.items():
    # Sorted edges look like [-inf, t1, ..., tn, inf]; pd.cut uses right-closed
    # bins by default, i.e. (-inf, t1], (t1, t2], ..., (tn, inf].
    value = sorted(value)

    # Labels must describe the intervals that are actually counted. The previous
    # `a <= key < b` wording described left-closed bins and therefore
    # contradicted pd.cut's right-closed default; the first bin was also
    # labelled as starting at 0 although it has no lower bound.
    # NOTE(review): if left-closed bins were intended instead, pass right=False
    # to pd.cut and restore the old wording.
    group_name = [f'{key} <= {value[1]}']
    group_name += [
        f'{lower} < {key} <= {upper}'
        for lower, upper in zip(value[1:-1], value[2:])
    ]

    time_group = pd.cut(pivot_df[key].values, bins=value, labels=group_name)

    df = pd.DataFrame({'time': pivot_df[key].values, 'time_group': time_group})

    # sort=False keeps the bins in threshold order rather than by frequency.
    count_df = df['time_group'].value_counts(sort=False)

    print(count_df.index.values)
    fig = plt.figure(figsize=(4,3))
    plt.barh(count_df.index, count_df.values)
    plt.show()
    # Release the figure; one is created per genre.
    plt.close(fig)

In [None]:
# Per rule file, summarise how many [AND]-joined conditions each [OR] alternative
# contains (max / min / mean). Row i of `df` describes file i.
k = 15
len_list = []   # condition counts of every alternative across all files
df_list = []    # one (max, min, avg) tuple per file

for i in range(k):
    with open(f'../output/{embed_model}_rule/{embed_model}_new_seg_rule_extract{i}.txt', 'r') as f:
        or_list = f.read().split('[OR]')

    # Use a list local to this file. Previously the statistics were taken over
    # `len_list`, which was never reset, so row i silently aggregated files
    # 0..i instead of describing file i alone.
    # str.split always returns at least one element, so this list is never
    # empty and max()/min() cannot fail; an empty file counts as one
    # alternative of length 1.
    seg_len_list = [len(and_rule.split('[AND]')) for and_rule in or_list]
    len_list.extend(seg_len_list)
    df_list.append((max(seg_len_list), min(seg_len_list), np.mean(seg_len_list)))

df = pd.DataFrame(df_list, columns=['max', 'min', 'avg'])

In [None]:
# Mean of the values stored in `seg_dic`.
# NOTE(review): `seg_dic` is not defined in any visible cell of this notebook, so
# on a fresh kernel (Restart & Run All) this line raises NameError — confirm where
# it is supposed to come from or remove the cell.
np.mean(list(seg_dic.values()))