In [11]:
"""This script generates the features for classification in subtask A. Set the TRAIN flag to True to 
    generate features for the training data, and to False to generate features for test data"""

import json, re
import numpy as np 
import pandas as pd

TRAIN = True

In [12]:
# reading the data
from load import parse_dataset

In [13]:
if TRAIN:
    dataset='../datasets/train/SemEval2018-T3-train-taskA_emoji.txt'
    corpus, _ = parse_dataset(dataset)
    corpus_preprocessed = json.load(open('../extra_resources/train_preprocessed.txt','r'))
else:
    dataset='../datasets/test_TaskA/SemEval2018-T3_input_test_taskA_emoji.txt'
    corpus = parse_dataset(dataset)
    corpus_preprocessed = json.load(open('../extra_resources/test_preprocessed.txt','r'))

In [14]:
# intensity features - 3 binarized features for splitted tweets which show: 
# 1) the intensity of the left half
# 2) the intensity of the right half
# 3) the difference between the polarities of left and right halves 

# to run this, first download stanford corenlp
# install pycorenlp: pip install pycorenlp
# then enter this on terminal, within the diretcory of corenlp 
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "sentiment" -port 9000 -timeout 30000

from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

def chunkIt(seq, n):
    """splits the list into n approximately equal sub-lists. source: goo.gl/VrHKeR"""
    avg = len(seq) / float(n)
    out = []
    last = 0.0
    while last < len(seq):
        out.append(seq[int(last):int(last + avg)])
        last += avg
    return out

feats_1 = []

#for text in corpus:
#    part1, part2 = chunkIt(text, 2)
    
part1, part2 = chunkIt(corpus[0], 2)

output1 = nlp.annotate(part1, properties={
                       'annotators': 'sentiment',
                         'outputFormat': 'json'
                       })
                 #)['sentences'][0]['sentimentValue'])
output2 = int(nlp.annotate(part2, properties={
                       'annotators': 'sentiment',
                         'outputFormat': 'json'
                       })['sentences'][0]['sentimentValue'])
print(output1)
    #leftIntensity = rightIntensity = polarityDiff = 0
    #if output1 in [0,4]:
    #    leftIntensity = 1
    #if output2 in [0,4]:
    #    rightIntensity = 1
    #if (output1>2 and output2<2) or (output1<2 and output2>2):
    #    polarityDiff = 1
    #feats_1.append(np.array([leftIntensity, rightIntensity, polarityDiff]))

{'sentences': [{'index': 0, 'parse': '(ROOT\n  (NP\n    (NP (JJ Sweet) (NNP United) (NNPS Nations))\n    (NP (NN video))\n    (. .)))', 'basicDependencies': [{'dep': 'ROOT', 'governor': 0, 'governorGloss': 'ROOT', 'dependent': 3, 'dependentGloss': 'Nations'}, {'dep': 'amod', 'governor': 3, 'governorGloss': 'Nations', 'dependent': 1, 'dependentGloss': 'Sweet'}, {'dep': 'compound', 'governor': 3, 'governorGloss': 'Nations', 'dependent': 2, 'dependentGloss': 'United'}, {'dep': 'dep', 'governor': 3, 'governorGloss': 'Nations', 'dependent': 4, 'dependentGloss': 'video'}, {'dep': 'punct', 'governor': 3, 'governorGloss': 'Nations', 'dependent': 5, 'dependentGloss': '.'}], 'enhancedDependencies': [{'dep': 'ROOT', 'governor': 0, 'governorGloss': 'ROOT', 'dependent': 3, 'dependentGloss': 'Nations'}, {'dep': 'amod', 'governor': 3, 'governorGloss': 'Nations', 'dependent': 1, 'dependentGloss': 'Sweet'}, {'dep': 'compound', 'governor': 3, 'governorGloss': 'Nations', 'dependent': 2, 'dependentGloss':

In [5]:
# contrast
df = pd.read_csv('../extra_resources/Emoji_Sentiment_Data_v1.0.csv')

df = df[['Emoji', 'Negative', 'Neutral', 'Positive']]
tuples = [tuple(x) for x in df.values]
# tuples are of the form (emoji, negative, neutral, positive)

idx2lb = {0:-1, 1:0, 2:1}
emoji_sentimens = {}
for val in tuples:
    emoji_sentimens[val[0]] = idx2lb[np.argmax(np.array(val[1:]))]

def extractEmoticon(tweet):
    """returns all the emoticons in tweet"""
    return re.findall(r'[\U0001f600-\U0001f650]', ' '.join(tweet))

twts = [extractEmoticon(twt[0]) for twt in corpus_preprocessed]
twts = [[emoji_sentimens[emoji] for emoji in twt] for twt in twts]

def extractHashtag(tweet):
    t = tweet.split(' ')
    text = []
    hashtagText = []
    oneHashtag = []
    flag = 0
    for w in t:
        if w == "<hashtag>":
            flag = 1
            continue
        if flag == 1:
            if w == "</hashtag>":
                hashtagText.append(oneHashtag)
                oneHashtag = []
                flag = 0
            else:
                oneHashtag.append(w)
        else:
            text.append(w)
    return text, hashtagText

txt = [extractHashtag(tweet) for tweet in corpus_preprocessed]

assert len(txt) == len(twts)
txt = [(txt[i][0], txt[i][1], twts[i]) for i in range(len(twts))]

# 0: very negative
# 1: negative 
# 2: neutral 
# 3: positive 
# 4: very positive 
    
def sentiment(txt):
    """compute sentiment for text"""
    txt = ' '.join(txt)
    if not len(txt): return 2
    output = int(nlp.annotate(txt, properties={
                              'annotators': 'tokenize,ssplit,pos,depparse,parse,sentiment',
                                'outputFormat': 'json'
                              })['sentences'][0]['sentimentValue'])
    return output

def contrast(twt):
    """search for emotion contrast in hastag, emoticon and tweet text"""
    contrast = 0 # contrast flag
    txt_sentiment = sentiment(twt[0])
    htag_sentiment = [sentiment(h) for hash_segments in twt[1] for h in hash_segments]
    emoji_sentiment = twt[2]

    if (txt_sentiment in {2,3,4}) and (set(htag_sentiment) & {0,1}):
        contrast = 1
    elif (txt_sentiment in {0,1}) and (set(htag_sentiment) & {3,4}): # maybe later try adding 2
        contrast = 1
    elif (txt_sentiment in {2,3,4}) and (set(emoji_sentiment) & {-1}):
        contrast = 1
    elif (txt_sentiment in {0,1}) and (set(emoji_sentiment) & {1}):
        contrast = 1
    elif {-1,1} in set(emoji_sentiment):
        contrast = 1
    elif ({0,4} in set(htag_sentiment)) or ({0,3} in set(htag_sentiment)) or ({1,4} in set(htag_sentiment)):
        contrast = 1
    elif (set(htag_sentiment) & {0,1}) and (set(emoji_sentiment) & {1}):
        contrast = 1
    elif (set(htag_sentiment) & {3,4}) and (set(emoji_sentiment) & {-1}):
        contrast = 1
    return contrast


contrast_feats = [np.array([contrast(twt)]) for twt in txt]

In [6]:
# feats = [feats[i].append(contrast_feats[i]) for i in range(len(feats))]

In [7]:
# ekphrasis-based features (extracted from pre-processed data)

tags =  ['<allcaps>', '<annoyed>', '<censored>', '<date>', '<elongated>', '<emphasis>', '<happy>',
         '<hashtag>', '<heart>', '<kiss>', '<laugh>', '<money>', '<number>', '<percent>', '<phone>',
         '<repeated>', '<sad>', '<shocking>', '<surprise>', '<time>', '<tong>', '<url>', '<user>',
         '<wink>']

def tweet_vecs(twt, n=2):
    """extract a feature vector for a single tweet, based on the counts of the annotation tags
        split the tweet to n equal parts and computes the same features for each part"""
    twt = twt.split()
    chunks = chunkIt(twt, n)
    
    scores = []
    
    for chunk in chunks:
        for tag in tags:
            scores.append(sum(1 for t in chunk if t == tag))
    return scores
    
def feats(text):
    """apply the tweet_vecs function on all tweets and return a result in a list"""
    return [tweet_vecs(twt) for twt in text]

ekphrasis_feats = [np.array(v) for v in feats(corpus_preprocessed)]

In [8]:
from ekphrasis.utils.nlp import polarity

polarity_flag = True

polarity_vectors = []
for tweet in corpus_preprocessed:
    chunks = chunkIt(tweet, 2)
    polarity_vectors.append(np.concatenate(((polarity(chunks[0])[1], polarity(chunks[1])[1])), axis=0))

assert len(ekphrasis_feats) == len(polarity_vectors)

if polarity_flag: 
    ekphrasis_feats = [np.concatenate((ekphrasis_feats[i], polarity_vectors[i])) for i in range(len(ekphrasis_feats))]

In [9]:
#from ekphrasis.utils.nlp import polarity

#doc = 'As I was walking though a very bad neighbourhood I noticed there were a lot of nice and friendly people wanting to help me be safe. I was suprised and felt good'
#doc = doc.split()

#print(polarity(doc))

In [10]:
ekphrasis_feats[0].shape

(54,)

In [11]:
# concatenate all the features 
features = np.concatenate((feats_1, contrast_feats, ekphrasis_feats), axis=1)

In [12]:
len(features)

3834

In [13]:
features.shape

(3834, 58)

In [14]:
# save the features in a numpy file 
if TRAIN:
    np.save('train_feats_taskA.npy', features)
else:
    np.save('test_feats_taskA.npy', features)