In [58]:
from paspailleur import pattern_structures as PS
import caspailleur as csp

from tqdm import tqdm
import pandas as pd
import string

from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from src.preprocessing import load_semeval_taskb


In [38]:
# Load the SemEval task-B data through the project loader. The meaning of the
# 'full' argument is defined in src.preprocessing (presumably "all splits" — TODO confirm).
df = load_semeval_taskb('full')

# Build the tokenizer once and reuse it for every row, instead of constructing
# a fresh TweetTokenizer for each tweet inside the lambda.
tweet_tokenizer = TweetTokenizer()

# Store each tweet as one string of its tokens joined by single spaces.
df['tokens'] = df.text.apply(lambda text: ' '.join(tweet_tokenizer.tokenize(text)))
df.head()

Unnamed: 0,example_id,label_id,text,split,tokens
0,train_1,1,sweet united nations video. just in time for c...,train,sweet united nations video . just in time for ...
1,train_2,1,@mrdahl87 we are rumored to have talked to erv...,train,@mrdahl87 we are rumored to have talked to erv...
2,train_3,1,hey there! nice to see you minnesota/nd winter...,train,hey there ! nice to see you minnesota / nd win...
3,train_4,0,3 episodes left i'm dying over here,train,3 episodes left i'm dying over here
4,train_5,2,i can't breathe! was chosen as the most notabl...,train,i can't breathe ! was chosen as the most notab...


In [59]:
# Individual ASCII punctuation characters; merged into the default stop-word set
# of get_stable_extents.
PUNCT = set(string.punctuation)

def get_stable_extents(
        sentences,
        stopwords=set(stopwords.words('english')) | PUNCT,
        min_support=0.005,
        n_stable_extents=1000,
    ):
    """Mine stable extents from the n-gram attributes of ``sentences``.

    The sentences are preprocessed with ``PS.NgramPS``. Every binary attribute
    yielded by ``iter_bin_attributes`` is kept unless all tokens of its
    description belong to ``stopwords``. The extents of the kept attributes are
    then handed to caspailleur's Sofia-based routine.

    Parameters
    ----------
    sentences
        Iterable accepted by ``NgramPS.preprocess_data``; the notebook passes a
        Series of space-joined token strings.
    stopwords
        Set of tokens considered uninformative. The default (NLTK English stop
        words plus ``PUNCT``) is built once, when the function is defined.
    min_support
        Forwarded unchanged to ``iter_bin_attributes``.
        NOTE(review): presumably a fraction of the sentences — confirm in paspailleur.
    n_stable_extents
        Forwarded unchanged to ``list_stable_extents_via_sofia``.

    Returns
    -------
    Whatever ``csp.mine_equivalence_classes.list_stable_extents_via_sofia``
    returns for the kept attribute extents.
    """
    ngram_ps = PS.NgramPS()
    preprocessed = list(ngram_ps.preprocess_data(sentences))

    informative_extents = []
    for ngram, extent in ngram_ps.iter_bin_attributes(preprocessed, min_support=min_support):
        # Skip n-grams made up exclusively of stop words / punctuation.
        if set(ngram) <= stopwords:
            continue
        informative_extents.append(extent)

    return csp.mine_equivalence_classes.list_stable_extents_via_sofia(
        informative_extents,
        n_stable_extents=n_stable_extents,
        use_tqdm=True,
        n_attributes=len(informative_extents),
    )

def get_pattern_df(sentences, stable_extents):
    """Build a DataFrame describing the pattern behind each stable extent.

    Parameters
    ----------
    sentences
        Iterable accepted by ``NgramPS.preprocess_data``; should be the same
        sentences the extents were mined from, since extents index into it.
    stable_extents
        Extents as returned by ``get_stable_extents``. Each must provide
        ``.count()`` and be convertible by ``csp.base_functions.bas2isets``.

    Returns
    -------
    pandas.DataFrame
        One row per extent, ordered by ascending ``ext.count()``, with columns
        ``extents`` (output of ``bas2isets``), ``intents`` (``ps.intent`` of the
        extent), ``intents_verb`` (each n-gram of the intent joined by spaces),
        ``support`` (extent size), ``delta_measure``
        (``csp.indices.delta_stability_index``) and ``interestingness``
        (``support * delta_measure``).
    """
    ps = PS.NgramPS()

    # Preprocess the sentences once. The original redid this for every extent
    # inside the comprehension, which repeated the same work for each extent
    # and would exhaust a one-shot iterable after the first extent.
    data = list(ps.preprocess_data(sentences))

    # Ascending by extent size; this ordered list is also what is passed to
    # delta_stability_index below.
    stable_extents = sorted(stable_extents, key=lambda ext: ext.count())
    stable_extents_isets = list(csp.base_functions.bas2isets(stable_extents))
    stable_intents = [ps.intent(data, extent) for extent in tqdm(stable_extents_isets)]

    patterns_df = pd.DataFrame()
    patterns_df['extents'] = stable_extents_isets
    patterns_df['intents'] = stable_intents
    # Human-readable form: every n-gram becomes a space-joined string.
    patterns_df['intents_verb'] = [[' '.join(ngram) for ngram in intent] for intent in stable_intents]
    patterns_df['support'] = [len(extent) for extent in stable_extents_isets]
    patterns_df['delta_measure'] = list(csp.indices.delta_stability_index(stable_extents))
    patterns_df['interestingness'] = patterns_df['support'] * patterns_df['delta_measure']

    return patterns_df

In [60]:
# Module-level pattern-structure instance; later cells use it for preprocessing
# and for extent look-ups.
ps = PS.NgramPS()

# Stable extents of the whole corpus first, then of each label_id (0..3) subset.
full = get_stable_extents(df.tokens)
label0, label1, label2, label3 = (
    get_stable_extents(df[df.label_id == label_id].tokens)
    for label_id in (0, 1, 2, 3)
)

  0%|          | 0/246 [00:00<?, ?it/s]

  0%|          | 0/269 [00:00<?, ?it/s]

  0%|          | 0/384 [00:00<?, ?it/s]

  0%|          | 0/339 [00:00<?, ?it/s]

  0%|          | 0/355 [00:00<?, ?it/s]

In [90]:
# Pattern table for the whole corpus, then one per label_id subset. Each table
# is built from the same sentences its extents were mined from.
full_patterns = get_pattern_df(df.tokens, full)
label0_patterns, label1_patterns, label2_patterns, label3_patterns = (
    get_pattern_df(df[df.label_id == label_id].tokens, label_extents)
    for label_id, label_extents in enumerate((label0, label1, label2, label3))
)

100%|██████████| 909/909 [00:15<00:00, 59.16it/s]
100%|██████████| 655/655 [00:05<00:00, 119.46it/s]
100%|██████████| 693/693 [00:03<00:00, 211.80it/s]
100%|██████████| 697/697 [00:00<00:00, 890.17it/s] 
100%|██████████| 517/517 [00:00<00:00, 1448.17it/s]


In [91]:
# Preprocessed token strings of the whole corpus, materialised once so the
# precision cells can compute extents against it.
full_sentences = [*ps.preprocess_data(df.tokens)]

In [92]:
# Top 30 patterns of the full corpus by interestingness (support * delta_measure).
(
    full_patterns
    .sort_values(by='interestingness', ascending=False)
    .head(30)
)

Unnamed: 0,extents,intents,intents_verb,support,delta_measure,interestingness
908,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",(),[],4601,4007,18436207
907,"(1, 4098, 10, 4108, 13, 2061, 4114, 21, 4119, ...","((...,))",[...],594,558,331452
905,"(3586, 3, 515, 1540, 518, 2567, 4105, 11, 1035...","((i'm,))",[i'm],230,194,44620
904,"(6, 2566, 1545, 2571, 1037, 3086, 1041, 4115, ...","((like,))",[like],209,179,37411
906,"(3074, 1541, 3591, 2574, 3598, 4113, 1554, 257...","((love,))",[love],246,146,35916
903,"(1025, 4099, 7, 2056, 9, 10, 2058, 2060, 3082,...","((day,))",[day],191,164,31324
902,"(3584, 2052, 520, 10, 523, 525, 2574, 16, 3602...","((get,))",[get],190,142,26980
901,"(1029, 6, 4101, 527, 1552, 531, 20, 4115, 2072...","((..,))",[..],172,148,25456
900,"(4098, 515, 2565, 4103, 4104, 4107, 2060, 530,...","((one,))",[one],152,124,18848
899,"(1029, 2574, 4112, 4115, 2581, 2073, 4122, 412...","((people,))",[people],134,120,16080


In [93]:
# Top 30 label-0 patterns by interestingness (support * delta_measure).
(
    label0_patterns
    .sort_values(by='interestingness', ascending=False)
    .head(30)
)

Unnamed: 0,extents,intents,intents_verb,support,delta_measure,interestingness
654,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",(),[],2389,2061,4923729
653,"(2052, 7, 9, 2061, 2063, 18, 2067, 2068, 2069,...","((...,))",[...],328,309,101352
652,"(0, 513, 5, 521, 523, 1041, 25, 1049, 1564, 31...","((i'm,))",[i'm],104,85,8840
651,"(1536, 527, 16, 1041, 1048, 1058, 1575, 560, 5...","((like,))",[like],101,84,8484
650,"(516, 9, 17, 1044, 1045, 1046, 1560, 28, 1055,...","((get,))",[get],94,75,7050
649,"(2053, 1032, 1566, 2086, 39, 2089, 45, 2102, 1...","((one,))",[one],83,66,5478
648,"(1032, 2057, 1546, 17, 1553, 19, 2069, 2073, 1...","((..,))",[..],80,64,5120
647,"(2, 1411, 4, 1538, 8, 776, 1032, 1802, 2057, 1...","((day,))",[day],67,56,3752
646,"(768, 1153, 1799, 136, 904, 2313, 909, 2061, 1...","((love,))",[love],63,47,2961
644,"(1667, 519, 1928, 2316, 272, 402, 1555, 1302, ...","((go,))",[go],59,45,2655


In [94]:
# Top 30 label-1 patterns by interestingness (support * delta_measure).
(
    label1_patterns
    .sort_values(by='interestingness', ascending=False)
    .head(30)
)

Unnamed: 0,extents,intents,intents_verb,support,delta_measure,interestingness
692,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",(),[],1547,1361,2105467
691,"(1, 1026, 1538, 4, 517, 518, 1030, 520, 10, 17...","((...,))",[...],186,170,31620
690,"(1536, 515, 1028, 518, 1542, 9, 1035, 524, 525...","((love,))",[love],176,94,16544
689,"(4, 519, 1031, 1032, 1543, 11, 1544, 13, 1529,...","((day,))",[day],103,82,8446
687,"(1027, 1541, 12, 16, 1040, 532, 538, 545, 35, ...","((i'm,))",[i'm],97,83,8051
685,"(515, 1028, 518, 525, 529, 24, 1055, 33, 553, ...","((i, love))",[i love],82,74,6068
688,"(5, 1029, 520, 1036, 1041, 532, 1050, 1063, 41...","((great,))",[great],101,57,5757
686,"(1537, 3, 516, 1027, 1029, 1541, 535, 1049, 10...","((like,))",[like],82,70,5740
684,"(1408, 897, 642, 387, 1537, 645, 134, 519, 395...","((fun,))",[fun],71,55,3905
683,"(3, 1527, 645, 1157, 263, 8, 519, 394, 903, 14...","((..,))",[..],67,58,3886


In [95]:
# Top 30 label-2 patterns by interestingness (support * delta_measure).
(
    label2_patterns
    .sort_values(by='interestingness', ascending=False)
    .head(30)
)

Unnamed: 0,extents,intents,intents_verb,support,delta_measure,interestingness
696,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",(),[],401,359,143959
695,"(386, 3, 259, 136, 9, 11, 139, 142, 143, 29, 2...","((...,))",[...],42,38,1596
694,"(134, 135, 136, 144, 279, 28, 294, 168, 296, 3...","((get,))",[get],25,16,400
693,"(128, 384, 393, 274, 19, 147, 155, 28, 35, 169...","((people,))",[people],22,18,396
692,"(194, 99, 195, 292, 122, 234, 11, 139, 365, 39...","((i'm,))",[i'm],16,12,192
691,"(128, 5, 166, 7, 74, 202, 236, 13, 173, 206, 1...","((..,))",[..],15,12,180
690,"(161, 130, 391, 137, 143, 254, 117, 54, 245, 3...","((would,))",[would],15,12,180
689,"(353, 195, 165, 103, 359, 206, 240, 368, 210, ...","((like,))",[like],13,11,143
687,"(195, 388, 296, 265, 234, 396, 205, 334, 211, ...","((one,))",[one],13,10,130
686,"(34, 323, 354, 133, 377, 140, 173, 300, 209, 2...","((someone,))",[someone],13,10,130


In [96]:
# Top 30 label-3 patterns by interestingness (support * delta_measure).
(
    label3_patterns
    .sort_values(by='interestingness', ascending=False)
    .head(30)
)

Unnamed: 0,extents,intents,intents_verb,support,delta_measure,interestingness
516,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",(),[],264,226,59664
515,"(1, 258, 262, 7, 135, 139, 13, 142, 15, 29, 32...","((...,))",[...],38,34,1292
513,"(257, 263, 202, 12, 174, 206, 144, 240, 20, 18...","((like,))",[like],13,11,143
514,"(197, 72, 170, 108, 240, 178, 147, 21, 249, 15...","((i'm,))",[i'm],13,10,130
512,"(126, 259, 69, 165, 200, 174, 177, 178, 83, 12...","((one,))",[one],12,9,108
511,"(256, 130, 135, 136, 13, 176, 85, 247, 217, 61...","((know,))",[know],11,8,88
510,"(99, 203, 112, 144, 50, 149, 214, 119, 215, 124)","((get,))",[get],10,8,80
508,"(259, 68, 199, 263, 73, 110, 85, 216, 153, 221)","((..,))",[..],10,8,80
509,"(100, 164, 260, 199, 72, 47, 15, 175, 149, 57)","((say,))",[say],10,6,60
507,"(72, 202, 240, 209, 241, 149, 85, 126, 127)","((people,))",[people],9,6,54


In [103]:
# Precision of a label-0 pattern: its support within label 0 divided by the
# size of the same intent's extent over the whole corpus (`full_sentences`).
label0_patterns['precision'] = label0_patterns['support'] / [
    len(list(ps.extent(full_sentences, intent)))
    for intent in tqdm(label0_patterns['intents'])
]
# Alternative view: rank by ['precision', 'support'] descending instead of thresholding.
# Shown here: patterns with precision above 0.8, largest support first.
(
    label0_patterns
    .loc[label0_patterns['precision'] > .8]
    .sort_values(by='support', ascending=False)
    .head(30)
)

100%|██████████| 655/655 [00:04<00:00, 144.24it/s]


Unnamed: 0,extents,intents,intents_verb,support,delta_measure,interestingness,precision
632,"(640, 1281, 1798, 903, 524, 1293, 270, 1932, 4...","((via,))",[via],50,33,1650,0.819672
624,"(128, 1280, 386, 1793, 1925, 647, 1927, 778, 1...","((@rbrnetwork1,))",[@rbrnetwork1],45,33,1485,1.0
599,"(1030, 1801, 399, 15, 2193, 2334, 1439, 164, 1...","((check,))",[check],29,15,435,0.90625
595,"(904, 12, 1805, 1679, 2064, 1691, 927, 1570, 4...","((️,))",[️],27,17,459,0.818182
574,"(1794, 1168, 401, 280, 1181, 2334, 1959, 1576,...","((follow,))",[follow],24,18,432,0.857143
549,"(896, 2056, 2061, 808, 2351, 176, 819, 1461, 2...","((#the,))",[#the],19,11,209,1.0
543,"(416, 803, 36, 293, 774, 1127, 1253, 2086, 221...","((2015,))",[2015],18,14,252,0.857143
539,"(1507, 1574, 1798, 232, 524, 1101, 270, 1199, ...","((.,), (via, @reuters))","[., via @reuters]",17,14,238,1.0
526,"(2137, 1315, 166, 230, 2361, 426, 940, 337, 17...","((would, be))",[would be],17,13,221,0.85
525,"(801, 354, 2086, 2056, 555, 140, 2124, 1166, 5...","((#is,))",[#is],17,7,119,0.944444


In [102]:
# Precision of a label-1 pattern: its support within label 1 divided by the
# size of the same intent's extent over the whole corpus (`full_sentences`).
label1_patterns['precision'] = label1_patterns['support'] / [
    len(list(ps.extent(full_sentences, intent)))
    for intent in tqdm(label1_patterns['intents'])
]
# Alternative view: rank by ['precision', 'support'] descending instead of thresholding.
# Shown here: patterns with precision above 0.8, largest support first.
(
    label1_patterns
    .loc[label1_patterns['precision'] > .8]
    .sort_values(by='support', ascending=False)
    .head(30)
)

100%|██████████| 693/693 [00:05<00:00, 137.50it/s]


Unnamed: 0,extents,intents,intents_verb,support,delta_measure,interestingness,precision
685,"(515, 1028, 518, 525, 529, 24, 1055, 33, 553, ...","((i, love))",[i love],82,74,6068,0.82
684,"(1408, 897, 642, 387, 1537, 645, 134, 519, 395...","((fun,))",[fun],71,55,3905,0.835294
671,"(640, 1029, 520, 144, 272, 400, 913, 150, 790,...","((a, great))",[a great],44,29,1276,0.916667
656,"(257, 899, 1415, 908, 13, 525, 271, 1045, 288,...","((yay,))",[yay],32,24,768,0.941176
648,"(518, 396, 529, 1425, 1426, 663, 665, 33, 165,...","((waking, up))",[waking up],27,11,297,0.964286
634,"(394, 524, 414, 1446, 1064, 555, 1329, 946, 12...","((just, love))",[just love],24,6,144,0.96
619,"(790, 921, 1436, 1070, 1326, 699, 1084, 1088, ...","((great,), (day,))","[great, day]",21,6,126,0.913043
618,"(523, 914, 1173, 1303, 1444, 1191, 683, 1197, ...","((glad,))",[glad],21,10,210,0.807692
609,"(1217, 386, 612, 804, 326, 488, 1418, 459, 525...","((monday,))",[monday],18,16,288,0.818182
608,"(611, 1476, 1064, 617, 394, 555, 524, 237, 111...","((i, just, love))",[i just love],18,14,252,0.947368


In [104]:
# Precision of a label-2 pattern: its support within label 2 divided by the
# size of the same intent's extent over the whole corpus (`full_sentences`).
label2_patterns['precision'] = label2_patterns['support'] / [
    len(list(ps.extent(full_sentences, intent)))
    for intent in tqdm(label2_patterns['intents'])
]
# Alternative view: rank by ['precision', 'support'] descending instead of thresholding.
# Shown here: patterns with precision above 0.8, largest support first.
(
    label2_patterns
    .loc[label2_patterns['precision'] > .8]
    .sort_values(by='support', ascending=False)
    .head(30)
)

100%|██████████| 697/697 [00:07<00:00, 94.83it/s] 


Unnamed: 0,extents,intents,intents_verb,support,delta_measure,interestingness,precision
539,"(96, 36, 278)","((killed, by))",[killed by],3,1,3,1.0
497,"(239, 166, 159)","((#hypocrites,))",[#hypocrites],3,2,6,1.0
411,"(194, 85, 7)","((friends,), (to,), (me,), (i,), (my,))","[friends, to, me, i, my]",3,1,3,1.0
451,"(17, 172, 317)","((only, way), (if,))","[only way, if]",3,2,6,1.0
459,"(318, 274, 30)","((the, irony))",[the irony],3,2,6,1.0
484,"(400, 358, 391)","((anti,))",[anti],3,2,6,1.0
491,"(374, 252, 14)","((.,), (article,), (on,))","[., article, on]",3,2,6,1.0
469,"(129, 371, 292)","((., yet), (is,), (of,))","[. yet, is, of]",3,1,3,1.0
508,"(113, 299, 241)","((to,), (parking,))","[to, parking]",3,1,3,1.0
519,"(248, 10, 180)","((i,), (followed,), (.,), (a,))","[i, followed, ., a]",3,2,6,1.0


In [105]:
# Precision of a label-3 pattern: its support within label 3 divided by the
# size of the same intent's extent over the whole corpus (`full_sentences`).
label3_patterns['precision'] = label3_patterns['support'] / [
    len(list(ps.extent(full_sentences, intent)))
    for intent in tqdm(label3_patterns['intents'])
]
# Alternative view: rank by ['precision', 'support'] descending instead of thresholding.
# Shown here: patterns with precision above 0.8, largest support first.
(
    label3_patterns
    .loc[label3_patterns['precision'] > .8]
    .sort_values(by='support', ascending=False)
    .head(30)
)

100%|██████████| 517/517 [00:05<00:00, 101.57it/s]


Unnamed: 0,extents,intents,intents_verb,support,delta_measure,interestingness,precision
450,"(164, 242, 100)","((wonder, what), (?,), (the,))","[wonder what, ?, the]",3,1,3,1.0
437,"(256, 217, 13)","((.,), (know, ?))","[., know ?]",3,2,6,1.0
261,"(8, 64)","((isis,), (is,))","[isis, is]",2,1,2,1.0
248,"(99, 149)","((break,), (.,), (that,), (get,))","[break, ., that, get]",2,1,2,1.0
251,"(202, 78)","((those,), (all,), (is,), (i,), (my,), (pic,),...","[those, all, is, i, my, pic, and]",2,1,2,1.0
254,"(155, 68)","((fan, .), (be,))","[fan ., be]",2,1,2,1.0
255,"(160, 17)","((i,), (trending,))","[i, trending]",2,1,2,1.0
258,"(100, 63)","((don't,), (...,), (girls,))","[don't, ..., girls]",2,1,2,1.0
259,"(73, 238)","((human,), (day,))","[human, day]",2,1,2,1.0
260,"(116, 29)","((...,), (.,), (i,), (much,))","[..., ., i, much]",2,1,2,1.0
