In [1]:
import os
# If the kernel was started in a directory whose path ends with "notebooks",
# step up one level (presumably to the repository root, so that the `src.*`
# imports and relative data paths used below resolve -- TODO confirm).
# Keep this ahead of the `src` imports further down.
if os.getcwd().endswith("notebooks"):
    os.chdir("..")

import importlib
from IPython.display import display, Markdown, Latex
import numpy as np
import pandas as pd
# Display options: do not truncate long cell text, and show up to 500 rows
# before pandas abbreviates a printed frame.
pd.set_option("display.max_colwidth", None)
pd.set_option('display.max_rows', 500)

# Project-local helpers (not visible in this notebook).
from src.utils import tree_utils
from src.features import feature_utils

### Load the maximum likelihood trees for the `subj` dataset

In [2]:
# Load the parsed data, then show the first training sentence followed by
# its tree (one per line).
data = feature_utils.load_trees(dataset="subj", output_dir="data/TSA")
print(data["train"]["sentence"][0], data["train"]["tree"][0], sep="\n")

parsed 3297/3297 trees
parsed 1108/1108 trees
We wait for 5 Years to watch this Shit 5 Matches abandoned without a ball bowled dont you feel ashamed of your planning I feel sad for as they missed 2 chances because of you Please Order Chullu Bhar Paani from Amazon
(13
  (20
    (2
      (28
        (12
          (3
            (28
              (31 (15 (39 wait) (75 for)) (34 5))
              (22 (6 (16 (45 Years) (47 to)) (68 watch)) (34 this)))
            (62 Shit))
          (85 5))
        (88 Matches))
      (54 abandoned))
    (20
      (2
        (27
          (71 without)
          (17
            (28
              (2
                (17 (28 (83 a) (56 ball)) (38 bowled))
                (25 (44 dont) (0 (91 you) (33 feel))))
              (31 (15 (78 ashamed) (85 of)) (34 your)))
            (38 planning)))
        (0 (91 I) (33 feel)))
      (20
        (15 (39 sad) (75 for))
        (17
          (28 (2 (49 as) (0 (91 they) (33 missed))) (64 2))
          (38 chances)))))
 

In [7]:
# Peek at element [0][0] of the dev split's "tree" entry.
# NOTE(review): this cell carries execution count 7 but sits between cells 2
# and 3, so its saved output may reflect a different kernel state -- re-run
# the notebook top to bottom to confirm.
print(data["dev"]["tree"][0][0])

(13
  (36 The)
  (7
    (5 (0 (78 women) (11 (61 appears) (40 to))) (76 have))
    (42 some)))


### Enumerate all of the subtrees and sort by mutual information

In [3]:
# Register a feature set named "Subtrees" on `data`, using whatever
# pcfg_subtrees_by_depth(7) returns as the extractor
# (7 is presumably the maximum subtree depth -- TODO confirm).
feature_utils.add_features(data, "Subtrees", feature_utils.pcfg_subtrees_by_depth(7))
# Build the per-subtree table and preview its first 10 rows; per the section
# heading, the table is expected to be ordered by mutual information.
subtrees = feature_utils.get_subtree_feature_table(data)
subtrees.head(10)

56276 Subtrees features


Unnamed: 0,Root,Subtree,Yield,MI,Count,non-hateful,hateful,Majority label,% majority
0,83,(83 *),a,0.003399,858.0,467.0,391.0,non-hateful,0.544186
1,72,(72 *),he,0.002644,159.0,72.0,87.0,hateful,0.546584
2,61,(61 *),fuck,0.002534,11.0,0.0,11.0,hateful,0.923077
3,36,(36 *),shit,0.002075,15.0,2.0,13.0,hateful,0.823529
4,91,(91 *),you,0.002056,295.0,151.0,144.0,non-hateful,0.511785
5,2,(2 (65 *) (61 *)),the fuck,0.00201,9.0,0.0,9.0,hateful,0.909091
6,33,(33 *),did,0.00177,11.0,1.0,10.0,hateful,0.846154
7,56,(56 *),fucking,0.001752,8.0,0.0,8.0,hateful,0.9
8,93,(93 *),hes,0.00155,41.0,15.0,26.0,hateful,0.627907
9,63,(63 *),fucking,0.001497,7.0,0.0,7.0,hateful,0.888889


### Create aggregated features by merging subtrees by root non-terminal and majority class label

In [4]:
# Merge the "Subtrees" features into a merge named "Subtree groups", keeping
# only entries whose first element contains at least two "*" characters.
feature_utils.add_merges(
    data,
    "Subtrees",
    K=2000,
    merge_name="Subtree groups",
    filter_=lambda entry: entry[0].count("*") >= 2,
)

# List the top-level keys now present in `data`.
for item in data.keys():
    print(item)

# Inspect the first merged group: its entry under "mi", and element [1] of
# its entry under "idx_w".
print(data["mi"]["Subtree groups"][0])
print(data["idx_w"]["Subtree groups"][0][1])

filtering
train
dev
dataset
vect
idx_w
mi
counts
Subtree groups
0.156921841022032
[['(2 (65 *) (61 *))' 'the fuck']
 ['(2 (29 (28 (82 *) (34 *)) (70 *)) (87 *))' 'ban this world cup']
 ['(2 (83 *) (35 *))' 'a and']
 ...
 ['(2 (67 *) (2 (59 *) (71 *)))' 'today <unk> such']
 ['(2 (65 *) (61 *))' 'the average']
 ['(2 (67 *) (2 (59 *) (71 *)))' 'up <unk> Only']]


In [5]:

# Display the first 40 rows of the merged-feature table for "Subtree groups".
feature_utils.get_merged_feature_table(data, merge_name="Subtree groups").head(40)

I 2023-01-21T11:22:06 numexpr.utils:145: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
I 2023-01-21T11:22:06 numexpr.utils:157: NumExpr defaulting to 8 threads.


Unnamed: 0,Root,Examples,MI,Count,non-hateful,hateful,Majority label idx,Majority label,% majority,Support count,Counter count
0,2,"the fuck, ban this world cup, a and, better use to help, wall that could have been, Trump is, fuck off, getting even to do thankfully knife blades cant penetrate",0.156922,557.0,19.0,538.0,1,hateful,0.964222,538.0,19.0
1,3,"a 23 million waste, are a nation, a piece, at the bottom, big media house is covering this news, equal rights with functional members, used to celebrate a man, vanished its element",0.076197,253.0,0.0,253.0,1,hateful,0.996078,253.0,0.0
2,31,"ICC Shift this world to another, called as national, day n his, a sports <unk> a, pity on your, live in this, resigned from their, hearing about this",0.062838,222.0,2.0,220.0,1,hateful,0.986607,220.0,2.0
3,28,"the worst, <unk> this, a fucking, in the, ban Rain, <unk> ICC Shift this world to another, Shift this, has a",0.026411,202.0,35.0,167.0,1,hateful,0.823529,167.0,35.0
4,15,"live in, ICC Shift this world to, him for, known as, sick of, tired of, called as, along with",0.026154,238.0,50.0,188.0,1,hateful,0.7875,188.0,50.0
5,30,"a Democrat to turn Fathers Day, Mitch is <unk> <unk> and <unk>, affair why <unk> and, so much <unk> accomplished <unk>, out there <unk> <unk> LIKE <unk>, a bouncer test <unk> <unk>, off round table 0, it is <unk> <unk> <unk>",0.021061,76.0,0.0,76.0,1,hateful,0.987179,76.0,0.0
6,25,"should be, would have, cant penetrate, can use, dont want, is what, should not be, <unk> get",0.014559,245.0,77.0,168.0,1,hateful,0.684211,168.0,77.0
7,0,"you really, you are, You are, you get, you said, you think, he was, I was",0.011304,179.0,54.0,125.0,1,hateful,0.696133,125.0,54.0
8,15,"stand with, attack on, related to, nailed Mamata, politics and, protest against, focus on, solidarity with",0.010543,157.0,142.0,15.0,0,non-hateful,0.899371,142.0,15.0
9,16,"like to, even to, continues to, use to, tried to, opportunity to, this to, used to",0.009981,135.0,37.0,98.0,1,hateful,0.722628,98.0,37.0


Print out the top subtrees with root node 4:

In [9]:
# Build a second merge named "Root 4", restricted to entries whose first
# element starts with "(4 " (presumably subtree strings rooted at symbol 4).
# NOTE(review): by_template=True presumably groups by full subtree template
# instead of by root symbol -- confirm against feature_utils.add_merges.
feature_utils.add_merges(data, "Subtrees", merge_name="Root 4", K=1000, by_template=True,
                         filter_=lambda w: w[0].startswith("(4 "))

filtering


In [11]:
# Display the first 20 rows of the merged-feature table for the "Root 4" merge.
feature_utils.get_merged_feature_table(data, merge_name="Root 4").head(20)

Unnamed: 0,Root,Examples,MI,Count,non-hateful,hateful,Majority label idx,Majority label,% majority,Support count,Counter count
0,(4 (19 (31 (47 *) (86 *)) (80 *)) (92 *)),"<unk> , not anymore, born to be whores, days when these freaks, deserve to be abused, effort to not look, excuses to be lazy, fantasising of choking her, gays , gun shot",0.021502,16.0,0.0,16.0,1,hateful,0.944444,16.0,0.0
1,(4 (19 (31 (47 *) (86 *)) (80 *)) (92 *)),"women and men killed, want to be straight, want to be special, sensitive to loud noise, proud of you mom, proud of being black, pray to the god, people to throw Skittles",0.016292,17.0,17.0,0.0,0,non-hateful,0.947368,17.0,0.0
2,(4 (11 (57 *) (81 *)) (46 *)),"a catholic one, a face like, a manufactured group, a mental illness, a woman asks, the Russian occupation, the fire booms, the fuck up",0.015269,12.0,0.0,12.0,1,hateful,0.928571,12.0,0.0
3,(4 (11 (57 *) (81 *)) (46 *)),"the past because, the only one, the most calm, the fuck out, the feminists accusing, the biggest group, the Freddie Mercury, a world full",0.009405,11.0,11.0,0.0,0,non-hateful,0.923077,11.0,0.0
4,(4 (31 (47 *) (86 *)) (92 *)),"themselves and veterans, apply on people, back to music, cling to them, dogs , women, listens to rap, parasites will flee",0.007882,7.0,0.0,7.0,1,hateful,0.888889,7.0,0.0
5,(4 (11 (89 *) (67 *)) (46 *)),"your brother instead, to meet one, get crazy religious, some fucking balls, their cocks off, this traumatic experience",0.006488,6.0,0.0,6.0,1,hateful,0.875,6.0,0.0
6,(4 (19 (31 (37 *) (48 *)) (80 *)) (92 *)),"world should be ashamed, wake up one morning, people think or say, people becomes evil easily, it goes both ways, is complaining his rights, he 's not at, Allah give them mercy",0.006171,8.0,8.0,0.0,0,non-hateful,0.9,8.0,0.0
7,(4 (19 (31 (37 *) (48 *)) (80 *)) (92 *)),"attack u in name, fi gay get all, take over our home, that behave like niggers, would not get away",0.005136,5.0,0.0,5.0,1,hateful,0.857143,5.0,0.0
8,(4 (19 (31 (61 *) (48 *)) (80 *)) (92 *)),"you literally described yourself, you did not crying, you are not masculine, you should be proud, you smell like Doritos, you think about yourself",0.004143,6.0,6.0,0.0,0,non-hateful,0.875,6.0,0.0
9,(4 (19 (31 (31 (47 *) (86 *)) (43 *)) (80 *)) (92 *)),"doll and then come talk, happy to see it burn, need to break her ass, rid of your autistic son",0.003838,4.0,0.0,4.0,1,hateful,0.833333,4.0,0.0


The subtrees with roots 5 and 8:

In [24]:
# Build a merge named "Root 5/8" from entries whose first element starts with
# either "(5 " or "(8 ".
feature_utils.add_merges(
    data,
    "Subtrees",
    merge_name="Root 5/8",
    K=100,
    filter_=lambda entry: entry[0].startswith(("(5 ", "(8 ")),
)

filtering


In [25]:
# Display the full merged-feature table for the "Root 5/8" merge.
feature_utils.get_merged_feature_table(data, merge_name="Root 5/8")

Unnamed: 0,Root,Examples,MI,Count,non-hateful,hateful,Majority label idx,Majority label,% majority,Support count,Counter count
0,8,"when i moved to, but it ' s, Irish prison is, about it is, accepted there is, but this happens, day Ireland becomes, as Ireland has",0.034564,98.0,1.0,97.0,1,hateful,0.98,97.0,1.0
1,8,,0.000149,0.0,0.0,0.0,0,non-hateful,0.5,0.0,0.0
