In [None]:
from collections import Counter
from tqdm import tqdm
import json
import numpy as np
from statistics import variance, stdev, mean, median

In [None]:
# Load data
def _count_tokens(path):
  """Return a dict mapping each whitespace-separated token in `path` to its count.

  The file is read line by line inside a `with` block so the handle is
  closed as soon as counting finishes. Keys appear in order of first
  occurrence in the file.
  """
  with open(path, 'r') as handle:
    # str.split() with no separator already discards leading/trailing
    # whitespace, so no explicit strip() is needed before splitting.
    return dict(Counter(token for line in handle for token in line.split()))


# Vocabulary: one token per line, surrounding whitespace removed.
with open(r"text.vocab", 'r') as vocab_handle:
  yelp_vocab = [line.strip() for line in vocab_handle]

# Token -> occurrence count for each of the three training domains.
training_data_dom_0 = _count_tokens(r"multi_domain_renamed.train.0")
training_data_dom_1 = _count_tokens(r"multi_domain_renamed.train.1")
training_data_dom_2 = _count_tokens(r"multi_domain_renamed.train.2")

In [None]:
# Check vocab file is complete: for each domain, print the tokens seen in
# the training data that are missing from the vocabulary (expected: empty set).
vocab_set = set(yelp_vocab)
for domain_counts in (training_data_dom_0, training_data_dom_1, training_data_dom_2):
  print(set(domain_counts) - vocab_set)

set()
set()
set()


# MBK Baseline

In [None]:
# Generate deboosting scores
# Each vocabulary word is scored by the relative spread of its per-domain
# relative frequencies: (max - min) / max. A word whose frequency is zero
# in every domain gets a score of 0.

# Total token count per domain, used to turn raw counts into proportions.
len_dom_0 = sum(training_data_dom_0.values())
len_dom_1 = sum(training_data_dom_1.values())
len_dom_2 = sum(training_data_dom_2.values())

deboosting_dict = {}
for word in tqdm(yelp_vocab):
  rel_freqs = (
    training_data_dom_0.get(word, 0) / len_dom_0,
    training_data_dom_1.get(word, 0) / len_dom_1,
    training_data_dom_2.get(word, 0) / len_dom_2,
  )
  max_val = max(rel_freqs)
  min_val = min(rel_freqs)
  # Guard the division: max_val is 0 only when the word occurs in no domain.
  deboosting_dict[word] = (max_val - min_val) / max_val if max_val != 0 else 0

100%|██████████| 9659/9659 [00:00<00:00, 312367.25it/s]


In [None]:
# Save as json file: serialise the baseline scores to text first, then
# write that text out in a single call.
with open('saved_dict.json', 'w') as file:
    file.write(json.dumps(deboosting_dict))

# De-Boosting Proposal

In [None]:
# Generate deboosting scores
# For every vocabulary word:
#   1. compute its relative frequency (proportion) in each of the 3 domains;
#   2. if the three proportions are identical (stdev == 0) the score is 0;
#   3. otherwise take, for each domain, its proportion minus the mean of the
#      other two, keep the largest of those three differences, and divide it
#      by sqrt(1 - mean proportion).
# Finally all scores are divided by the largest score seen.
temp_dict = dict()
max_val = 0
# Starting sentinel for the running minimum; it only works if every score is
# below 5 (proportions lie in [0, 1], so this is expected - TODO confirm).
# min_val is tracked but not read anywhere in this cell.
min_val = 5

for word in tqdm(yelp_vocab):
  prop_0 = training_data_dom_0.get(word, 0) / len_dom_0
  prop_1 = training_data_dom_1.get(word, 0) / len_dom_1
  prop_2 = training_data_dom_2.get(word, 0) / len_dom_2

  mean_val = mean([prop_0, prop_1, prop_2])
  std_val = stdev([prop_0, prop_1, prop_2])

  if std_val == 0:
    # No variation across domains: not domain-specific at all.
    temp_dict[word] = 0
  else:
    val_0 = prop_0 - mean([prop_1, prop_2])
    val_1 = prop_1 - mean([prop_0, prop_2])
    val_2 = prop_2 - mean([prop_0, prop_1])

    score = max(val_0, val_1, val_2) / np.sqrt(1 - mean_val)
    temp_dict[word] = score

    # Running extremes over the words that vary across domains only.
    if score > max_val:
      max_val = score
    if score < min_val:
      min_val = score

# Normalise by the largest score. Iterate over the dict entries rather than
# over yelp_vocab so that a token listed more than once in the vocabulary is
# divided exactly once. When max_val is still 0 (no word varies across
# domains) every score is already 0, so the division is skipped instead of
# raising ZeroDivisionError.
if max_val > 0:
  temp_dict = {word: score / max_val for word, score in tqdm(temp_dict.items())}

100%|██████████| 9659/9659 [00:03<00:00, 2905.10it/s]
100%|██████████| 9659/9659 [00:00<00:00, 1526250.09it/s]


In [None]:
# Save as json file: serialise the proposal scores to text first, then
# write that text out in a single call.
with open('saved_dict_alternative.json', 'w') as file:
    file.write(json.dumps(temp_dict))

0.0050477462409198315