In [2]:
__author__      = 'Yu Cao'

import pandas as pd
import numpy as np
from scipy import stats
import collections
from string import punctuation as Punct


def loadCorpus(corpus_path):
    """Read a corpus CSV and attach a parsed ``count`` column.

    Missing cells are replaced with empty strings, then every value of the
    ``pairs_diff`` column is passed through ``rawToCount``. The results (a
    dict, or ``None`` when ``rawToCount`` rejects the row) are stored in a
    new ``count`` column aligned with the frame's index.

    Parameters:
        corpus_path: anything ``pandas.read_csv`` accepts as its first
            argument.

    Returns:
        The loaded ``pandas.DataFrame`` with the extra ``count`` column.
    """
    corpus = pd.read_csv(corpus_path).fillna('')
    parsed = list(map(rawToCount, corpus['pairs_diff']))
    corpus['count'] = pd.Series(parsed, index=corpus.index)
    return corpus

def isPunct(s):
    """Return True when ``s`` should be treated as punctuation.

    ``s`` counts as punctuation when it is one of the two quote tokens
    (two backticks or two apostrophes) or when ``s in Punct`` holds.
    ``Punct`` is the ``string.punctuation`` string, so that second check is
    a substring test: the empty string and any run of characters that is
    contiguous in ``string.punctuation`` are accepted as well.
    """
    if s in ("``", "''"):
        return True
    return s in Punct

def rawToCount(pairs_diff):
    """Parse one raw ``pairs_diff`` string into a ``{pair: value}`` dict.

    The input is a tab-separated list of comma-separated records. After
    stripping every ``'@'`` character, only records with exactly three
    fields whose first two fields are not punctuation (per ``isPunct``) are
    retained.

    Parameters:
        pairs_diff: the raw string for one example.

    Returns:
        ``None`` when fewer than three records are retained. Otherwise a
        dict keyed by ``'<field0>_<field1>'`` holding the largest
        ``float(field2)`` seen for that key. The last retained record is
        never counted.

    Raises:
        ValueError: if the third field of a counted record is not a valid
            float literal.
    """
    cleaned = pairs_diff.replace('@', '')

    kept = []
    for chunk in cleaned.split('\t'):
        fields = chunk.split(',')
        # Check the length first so isPunct only sees well-formed records.
        if len(fields) != 3:
            continue
        if isPunct(fields[0]) or isPunct(fields[1]):
            continue
        kept.append(fields)

    if len(kept) < 3:
        return None

    count = {}
    # NOTE(review): the final retained record is deliberately left out here,
    # as in the original slice; the reason is not visible in this file.
    for left, right, value in kept[:-1]:
        key = left + '_' + right
        score = float(value)
        if key not in count or count[key] < score:
            count[key] = score
    return count

def vocabBuild(countList, freqThreshold=20):
    """Return the keys that appear in at least ``freqThreshold`` count dicts.

    Parameters:
        countList: iterable of dicts (as produced by ``rawToCount``) or
            ``None`` entries; ``None`` entries are skipped.
        freqThreshold: minimum number of dicts a key must appear in to be
            kept.

    Returns:
        A list of keys ordered from most to least frequent (the order of
        ``Counter.most_common``). Empty when nothing reaches the threshold,
        including when ``countList`` is empty.
    """
    docFreq = collections.Counter()
    for item in countList:
        if item is not None:
            # Each dict contributes at most 1 per key, so this counts how
            # many entries contain the key.
            docFreq.update(item.keys())

    # Filter on the frequency directly instead of locating a cut-off index:
    # the index approach dropped the last key when every key qualified and
    # failed on an empty counter.
    return [key for key, freq in docFreq.most_common() if freq >= freqThreshold]

def compareMeanContrast(df, key):
    """Compare the values stored under ``key`` between label 1 and label -1 rows.

    Parameters:
        df: frame with a ``label`` column and a ``count`` column whose
            entries are dicts or ``None``.
        key: the dict key to look up in each ``count`` entry.

    Returns:
        A 3-tuple ``(mean for label 1, mean for label -1, t-test result)``
        where the t-test result is whatever ``scipy.stats.ttest_ind``
        returns for the two groups. Rows whose ``count`` is ``None`` or
        lacks ``key`` are ignored. A group with no values yields ``nan``
        from ``np.mean``.
    """
    summary = collections.defaultdict(list)
    # Walk the two columns in lockstep rather than looking the label up by a
    # running counter, so the pairing is correct for any index (filtered,
    # shuffled, non-integer), not only the default 0..n-1 one.
    for label, cnt in zip(df['label'], df['count']):
        if cnt is not None and key in cnt:
            summary[label].append(cnt[key])

    positive, negative = summary[1], summary[-1]
    return np.mean(positive), np.mean(negative), stats.ttest_ind(positive, negative)
            
def keyExam(innerVocab, cmp_f, alpha=0.05):
    """Count, and print, the entries that pass a comparison of their two means.

    Parameters:
        innerVocab: mapping from key to a ``(pos, neg, ttest)`` triple,
            where ``ttest[1]`` is the value printed as ``p``.
        cmp_f: two-argument predicate called as ``cmp_f(pos, neg)``.
        alpha: threshold for ``ttest[1]``; entries strictly below it are
            printed and counted as significant. Defaults to 0.05.

    Returns:
        ``(matched, significant)``: how many entries satisfied ``cmp_f`` and
        how many of those also had ``ttest[1] < alpha``.
    """
    matched, significant = 0, 0
    for key, (pos, neg, ttest) in innerVocab.items():
        if not cmp_f(pos, neg):
            continue
        matched += 1
        pvalue = ttest[1]
        if pvalue < alpha:
            significant += 1
            print(key, '{:.4f} {:.4f} p = {:.4f}'.format(pos, neg, pvalue))
    return matched, significant

In [3]:
# main
# Load the training corpus, keep the keys that vocabBuild considers frequent
# enough, and compute the contrast statistics for each of them.
trainSet = loadCorpus('train.csv')
vocab = vocabBuild(trainSet['count'])

# Pre-seed the dict in vocab order, then fill in each key's
# (mean, mean, t-test) triple from compareMeanContrast.
innerVocab = dict.fromkeys(vocab)
for key in innerVocab:
    innerVocab[key] = compareMeanContrast(trainSet, key)

In [8]:
# Report 1: the first five entries of vocab. vocabBuild returns keys in
# Counter.most_common order, so these are the most frequent ones.
print('For the 5 most frequent syntactic category pairs, compare their mean sentiment contrasts of sarcastic and non-sarcastic examples.\n')
for key in vocab[:5]:
    pos, neg, ttest = innerVocab[key]
    # Columns: key, first mean, second mean, ttest[1] (shown as p).
    print(key, '{:.4f} {:.4f} p = {:.4f}'.format(pos, neg, ttest[1]))

# Report 2: keyExam prints one line per key that passes the comparison and
# has ttest[1] below 0.05, then returns a pair of counts (passed, passed and
# below 0.05); the outer print shows that pair after the listing.
print('\nShow syntactic category pairs for which the mean contrast of sarcastic examples is (significantly) greater than that of non-sarcastic examples.\n')
print(keyExam(innerVocab, lambda x, y: x > y))

# Report 3: the complementary comparison. NOTE(review): `<=` also counts
# exact ties here, although the heading says "greater than".
print('\nShow syntactic category pairs for which the mean contrast of non-sarcastic examples is (significantly) greater than that of sarcastic examples.\n')
print(keyExam(innerVocab, lambda x, y: x <= y))

For the 5 most frequent syntactic category pairs, compare their mean sentiment contrasts of sarcastic and non-sarcastic examples.

NP_VP 2.6552 2.6147 p = 0.4386
NP_S 3.5538 3.2461 p = 0.0000
IN_NP 1.2320 1.2574 p = 0.5704
DT_NP 1.7953 1.4163 p = 0.0000
NP_PP 1.6903 1.6642 p = 0.6907

Show syntactic category pairs for which the mean contrast of sarcastic examples is (significantly) greater than that of non-sarcastic examples.

NP_S 3.5538 3.2461 p = 0.0000
DT_NP 1.7953 1.4163 p = 0.0000
DT_NN 1.0951 0.9106 p = 0.0095
VP_PP 1.7949 1.6383 p = 0.0283
NP_SBAR 2.4885 2.2374 p = 0.0056
NP_NP 2.3796 1.9490 p = 0.0000
JJ_NN 2.4393 1.8517 p = 0.0000
VBZ_NP 2.0723 1.7572 p = 0.0010
NN_NN 1.1508 0.9152 p = 0.0100
S_S 3.5325 3.2242 p = 0.0177
JJ_NP 2.6268 1.9669 p = 0.0000
ADVP_S 4.0782 3.7046 p = 0.0276
NP_CC 1.5124 1.2133 p = 0.0448
NP_NN 1.0427 0.7367 p = 0.0336
VBZ_PP 1.9888 1.3570 p = 0.0018
VB_ADJP 2.9002 2.3227 p = 0.0422
NN_S 2.4290 1.8105 p = 0.0293
VBD_PP 1.3060 0.5749 p = 0.0001
NP_ADVP