## Weighted Jaccard Overlap Between Domains [Check Formula Source Here](https://arxiv.org/pdf/2104.08663.pdf)

In [7]:
import spacy

In [12]:
nlp = spacy.load("en_core_web_sm")

In [27]:
from spacy.attrs import ORTH

In [35]:
import glob

In [261]:
import pickle
from collections import Counter
from itertools import permutations,combinations

In [112]:
# Tokenise each domain's review file with spaCy and cache the content words
# (everything that is neither punctuation nor a stop word) as a pickled list
# at "<domain>/<domain>_words_all.p".
for domain in ['dvd','music','books','kitchen_housewares','electronics']:
    review_files = glob.glob(f"{domain}/*_rev.txt")
    if not review_files:
        # Fail with a message that names the missing input instead of an
        # opaque IndexError on the lookup below.
        raise FileNotFoundError(f"no '*_rev.txt' file found under '{domain}/'")

    # Only the first match is processed; each line of the file is passed to
    # spaCy as a separate document.
    with open(review_files[0], "r") as review_file:
        reviews = [line.strip("\n") for line in review_file]

    domain_words = []
    for review in reviews:
        for token in nlp(review):
            if not token.is_punct and not token.is_stop:
                # Original casing is kept here; nothing is lowercased
                # before the tokens are written out.
                domain_words.append(token.text)

    with open(f"{domain}/{domain}_words_all.p", "wb") as cache_file:
        pickle.dump(domain_words, cache_file)

In [256]:
def weighted_jaccard(domain1,domain2):
    """Return the weighted Jaccard similarity of two term-count mappings.

    Each mapping is first normalised by the sum of its values, so every term
    gets a relative weight. The score is the sum over all terms of the
    smaller weight divided by the sum over all terms of the larger weight; a
    term missing from one mapping has weight 0 there.

    Args:
        domain1: Mapping from term to count (e.g. a ``collections.Counter``).
        domain2: Mapping from term to count.

    Returns:
        A float: 1.0 for identical normalised distributions, 0.0 when no
        term is shared. 0.0 is also returned when the score is undefined
        (both mappings empty or all counts zero) instead of raising
        ``ZeroDivisionError``.
    """
    domain1_norm = sum(domain1.values())
    domain2_norm = sum(domain2.values())

    num = 0.0
    den = 0.0
    # One pass over the union of terms: a term present on only one side
    # contributes 0 to the numerator and its own weight to the denominator.
    for term in domain1.keys() | domain2.keys():
        # A mapping whose counts sum to zero carries no weight at all.
        w1 = domain1.get(term, 0) / domain1_norm if domain1_norm else 0.0
        w2 = domain2.get(term, 0) / domain2_norm if domain2_norm else 0.0
        num += min(w1, w2)
        den += max(w1, w2)

    if not den:
        return 0.0
    return num / den

In [262]:
# Every unordered pair of domains is compared exactly once.
f_name = list(combinations(['dvd','music','books','kitchen_housewares','electronics'],2))

for in_domain,out_domain in f_name:
    # NOTE: pickle.load executes arbitrary code from the file it reads; only
    # use it on trusted "<domain>_words_all.p" caches, never on files from
    # an untrusted source.
    with open(f"{in_domain}/{in_domain}_words_all.p","rb") as in_file:
        domain1 = pickle.load(in_file)
    with open(f"{out_domain}/{out_domain}_words_all.p","rb") as out_file:
        domain2 = pickle.load(out_file)

    # Lowercase before counting so that case variants of a word are merged
    # into a single term.
    w_domain1_ = Counter(token.lower() for token in domain1)
    w_domain2_ = Counter(token.lower() for token in domain2)
    print(in_domain,out_domain,weighted_jaccard(w_domain1_,w_domain2_))

dvd music 0.44215705986046544
dvd books 0.49360093425205387
dvd kitchen_housewares 0.27763205737418667
dvd electronics 0.28745373769594884
music books 0.3662453531869769
music kitchen_housewares 0.2696803440400113
music electronics 0.281971526117763
books kitchen_housewares 0.27343470020147437
books electronics 0.27904189731054063
kitchen_housewares electronics 0.4274397297557206
