In [1]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict, Counter

In [2]:
# input_sentence
# Read the training corpus and key every whitespace-stripped line by its
# zero-based line number.
sent_dic = {}
with open('../preprocessed_data/restaurant/train.txt', 'r') as corpus_file:
    sent_dic.update(
        (line_no, raw_line.strip())
        for line_no, raw_line in enumerate(corpus_file.readlines())
    )

In [3]:
# topic_weights
# Map each zero-based line number of the topic-weights file to the index of
# the largest weight found on that line (np.argmax over the parsed floats).
#
# The pattern accepts an optional sign and an optional exponent so a value
# printed in scientific notation (e.g. `1e-05`) is parsed as ONE number.
# The previous `[0-9.]+` pattern split such a value into `1` and `05` and
# dropped any sign, which silently shifted or corrupted the argmax.
float_regex = re.compile(r'[-+]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)(?:[eE][-+]?[0-9]+)?')
tw_dic = {}
with open('../output/restaurant/topic_weights_5_arya', 'r') as f:
    for i, tw in enumerate(f.readlines()):
        # One list of weights per line; the position of the maximum is kept.
        tw_dic[i] = np.argmax([float(x) for x in float_regex.findall(tw)])

In [4]:
# Per-aspect word counts: vocab[aspect][word] is how many times `word` occurs
# in the sentences whose entry in tw_dic equals `aspect`.
vocab = {aspect: {} for aspect in range(5)}

for line_no, aspect in tw_dic.items():
    counts = vocab[aspect]
    # Whitespace tokenisation; a word's first occurrence creates its entry.
    for token in sent_dic[line_no].split():
        counts[token] = counts.get(token, 0) + 1

In [5]:
# Re-order every aspect's word counts from most to least frequent. The sort
# is stable, so words with equal counts keep their original relative order.
sorted_vocab = {}
for aspect, counts in vocab.items():
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    sorted_vocab[aspect] = dict(ranked)

In [6]:
# Indicative
# For each aspect: every word's count divided by the total number of word
# occurrences counted for that aspect. The resulting array follows the
# iteration order of sorted_vocab[aspect].

indicative_dic = {}
for aspect in range(5):
    aspect_counts = sorted_vocab.get(aspect)
    total_occurrences = sum(aspect_counts.values())
    count_vector = np.array(list(aspect_counts.values()), dtype=float)
    indicative_dic[aspect] = count_vector / total_occurrences

In [7]:
# Display the per-aspect indicative arrays (bare expression -> cell output).
indicative_dic

{0: array([1.46769286e-02, 1.35135135e-02, 1.03812422e-02, ...,
        8.94934670e-05, 8.94934670e-05, 8.94934670e-05]),
 1: array([1.39379832e-02, 1.20700060e-02, 1.20125298e-02, ...,
        2.87381096e-05, 2.87381096e-05, 2.87381096e-05]),
 2: array([1.42730083e-02, 1.10965228e-02, 1.09271102e-02, ...,
        4.23531405e-05, 4.23531405e-05, 4.23531405e-05]),
 3: array([1.64704829e-02, 1.28004297e-02, 1.21290785e-02, ...,
        4.47567471e-05, 4.47567471e-05, 4.47567471e-05]),
 4: array([1.45997581e-02, 1.23055104e-02, 1.17215200e-02, ...,
        4.17135945e-05, 4.17135945e-05, 4.17135945e-05])}

In [8]:
# Distinctive
# For each aspect i and each word w counted for it: the count of w in aspect i
# divided by the largest count of w over all five aspects. Aspect i itself is
# part of that maximum, so the denominator is never smaller than the
# numerator. The resulting array follows the iteration order of
# sorted_vocab[i], i.e. the same word order as indicative_dic[i].
distinctive_dic = {}
for i in range(5):
    A_k_w_list = []
    for w, A_j_w in sorted_vocab.get(i).items():
        # Count of w in every aspect, 0 where the word (or the aspect) is
        # absent. dict.get replaces the former bare `except:`, which also
        # swallowed KeyboardInterrupt and any unrelated error.
        A_k_w_sub = [sorted_vocab.get(j, {}).get(w, 0) for j in range(5)]
        A_k_w_list.append(A_j_w / max(A_k_w_sub))
    distinctive_dic[i] = np.array(A_k_w_list)

In [9]:
# Display the per-aspect distinctive arrays (bare expression -> cell output).
distinctive_dic 

{0: array([0.33814433, 0.36124402, 0.27619048, ..., 0.5       , 0.05555556,
        1.        ]),
 1: array([1.        , 1.        , 1.        , ..., 1.        , 0.5       ,
        0.33333333]),
 2: array([0.69484536, 0.62380952, 0.61722488, ..., 1.        , 0.5       ,
        1.        ]),
 3: array([0.75876289, 0.68421053, 0.6452381 , ..., 0.25      , 1.        ,
        1.        ]),
 4: array([0.72164948, 0.70238095, 0.6722488 , ..., 1.        , 0.33333333,
        1.        ])}

In [10]:
from scipy.stats.mstats import gmean

In [11]:
# Combined per-word score for every aspect: the indicative and distinctive
# arrays are passed as a two-element list and reduced with gmean along
# axis 0, giving one geometric mean per word position.
gmean_dic = {
    aspect: gmean([indicative_dic[aspect], distinctive_dic[aspect]], axis=0)
    for aspect in range(5)
}

In [12]:
# Display the per-aspect geometric-mean arrays (bare expression -> cell output).
gmean_dic

{0: array([0.070448  , 0.06986899, 0.05354624, ..., 0.0066893 , 0.00222977,
        0.0094601 ]),
 1: array([0.11805924, 0.10986358, 0.10960169, ..., 0.00536079, 0.00379065,
        0.00309506]),
 2: array([0.09958681, 0.08319926, 0.08212481, ..., 0.00650793, 0.0046018 ,
        0.00650793]),
 3: array([0.11179084, 0.09358519, 0.08846549, ..., 0.00334502, 0.00669005,
        0.00669005]),
 4: array([0.10264457, 0.09296858, 0.08876811, ..., 0.00645861, 0.00372888,
        0.00645861])}