**Sarcasm Target Detection Codebase**

In [3]:
# Upload 'snippets.xlsx' from the local file system.
from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving snippets.xlsx to snippets.xlsx
User uploaded file "snippets.xlsx" with length 28003 bytes


In [4]:
# Mount google drive to use 'crawl-300d-2M.vec'. 
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [5]:
!pip install empath

Collecting empath
[?25l  Downloading https://files.pythonhosted.org/packages/d2/84/a5de61a99252f60d705d7982b3648db517a704c89fa7629d3d3637a6e604/empath-0.89.tar.gz (57kB)
[K     |█████▊                          | 10kB 9.4MB/s eta 0:00:01[K     |███████████▍                    | 20kB 13.5MB/s eta 0:00:01[K     |█████████████████               | 30kB 10.8MB/s eta 0:00:01[K     |██████████████████████▊         | 40kB 9.1MB/s eta 0:00:01[K     |████████████████████████████▍   | 51kB 4.8MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.0MB/s 
Building wheels for collected packages: empath
  Building wheel for empath (setup.py) ... [?25l[?25hdone
  Created wheel for empath: filename=empath-0.89-cp37-none-any.whl size=57824 sha256=7967213559eb4c0c2d755cba234922fa1ab3e7779932db655b243fda80397895
  Stored in directory: /root/.cache/pip/wheels/84/ea/2f/2bc54d4f9985ce61753ebc5b00cb2df51d855589267c667308
Successfully built empath
Installing collected packages: empath
S

In [6]:
# Import basic maths and processing libraries.
import pandas as pd
import numpy as np

from keras.layers import Dense ,LSTM,concatenate,Input,Flatten
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

from nltk.tag import StanfordPOSTagger
from nltk.tag import StanfordNERTagger
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from empath import Empath

import pickle
import codecs
from collections import deque

In [7]:
# Excel file contains no header column and names gives custom column names
df = pd.read_excel("snippets.xlsx", sheet_name=None, header=None, names=['Snippet','target'])
df = df['Sheet1']
embedding_file = '/gdrive/MyDrive/crawl-300d-2M.vec'

In [8]:
# Removes all non-alpha numeric characters except ',' from a given strring
def clearLine(line):
  cleanedLine = ''
  for letter in line:
    if letter == '?' or letter == '!' or letter == ',' or letter == '"' or letter == '-' or letter == ';' or letter == '.' or ord(letter) > 255 :
      cleanedLine += ' '
    else:
      cleanedLine += letter
  return cleanedLine

In [9]:
# Load pre-trained fasttext word embedding from the file
def loadEmbed():
    print('loading word embeddings...')
    embeddings_index = {}
    f = codecs.open(embedding_file, encoding='utf-8')
    for line in f:
        # Line has the format : Word val1 val2 val3 ..... val300
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('found %s word vectors' % len(embeddings_index))
    return embeddings_index

In [10]:
# Loading pre-trained embeddings
model=loadEmbed()

loading word embeddings...
found 1999996 word vectors


In [11]:
# Stores all distinct words in the dataset.
all_words=[]

# Extract all poosible words in the dataset
for i in range(len(df['Snippet'])):
    line = clearLine(df['Snippet'][i])
    all_words.extend(line.split())

# get all unique words from the dataset
all_words=list(dict.fromkeys(all_words))

# make all words to lowercase
all_words=[x.lower() for x in all_words]

In [12]:
# Stores the vector representations of the words from the dataset.
embeddings={}
for each in all_words:
    # Get vectors for words in the dataset
    if each not in model.keys(): 
        embeddings[each]=model['unk']
    else:
        embeddings[each]=model[each]

# make <pad> as the 0-vector
embeddings['<pad>'] = [0]*300
# make <start> token to start vector
embeddings['<start>'] = model["start"]
# make <end> token to end vector 
embeddings['<end>'] = model["end"]

In [13]:
# Prepare the left context, right context, and candidate word from the dataset.
# The pre-processed data has the data type string
# Pre-Processed Data (ppd)

ppd = {}
ppd['left_context'] = []
ppd['right_context'] = []
ppd['Candidate_word'] = []
ppd['target_status'] = []

for i in range(len(df['Snippet'])):
  # clean the line and split the line into individual words
  line = clearLine(df['Snippet'][i].lower());
  line = line.split()
  # get each individual word in sarcasm target to identify possible sarcasm
  # target words in the sentence
  targetList = df['target'][i].lower().split(",")
  targetWord = []
  for tl in targetList:
    targetWord.extend(tl.split())
  
  # Choosing each word in the sentence as candidate word, extract the left context,
  # right context and the target label.
  for i in range(len(line)):
    word = line[i]
    ppd['Candidate_word'].append(word)
    ppd['left_context'].append(["<start>"] + line[:i])
    ppd['right_context'].append(line[i+1:] + ["<end>"])
    ppd['target_status'].append(int(word in targetWord))

In [14]:
# Embedding Left Context
# Convert each word in left context into vector representation to be used as input 
keras_left_context = []
for i in range(len(ppd['left_context'])):
    one_vector = []
    temp = ppd['left_context'][i]
    for m in temp:
        one_vector.append(embeddings[m])
    one_vector.extend([embeddings['<pad>'] for x in range(78 - len(ppd['left_context'][i]))])
    keras_left_context.append(one_vector)

In [15]:
# Embedding Right Context 
# Convert each word in right context into vector representation to be used as input 
keras_right_context = []
for i in range(len(ppd['right_context'])):
    one_vector = []
    temp = ppd['right_context'][i]
    for m in temp:
        one_vector.append(embeddings[m])
    one_vector.extend([embeddings['<pad>'] for x in range(78 - len(ppd['right_context'][i]))])
    keras_right_context.append(one_vector)

In [16]:
# Embedding Candidate Word
# Convert each candidate word into vector representation to be used as input 
keras_middle = []
for i in range(len(ppd['Candidate_word'])):
    keras_middle.append(embeddings[ppd['Candidate_word'][i]])

labels = ppd['target_status']

In [17]:
#Saving the processed dataset in a pickle file
# f = open(b"Data_fast.pkl","wb")
# pickle.dump(zip(keras_left_context,keras_right_context,keras_middle,ppd['target_status']),f)

Socio-Linguistic Feature Extraction


In [18]:
# create an object of one hot encoder
ohe=OneHotEncoder()
# create an object of label encoder
lb=LabelEncoder()

In [19]:
# Using Stanford POS Tagger API
jar = '/gdrive/MyDrive/features/stanford-postagger-3.9.2.jar'
model = '/gdrive/MyDrive/features/english-left3words-distsim.tagger'
pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')

# Extracting POS Features
POS_snippets=[]
for i in range(len(df['Snippet'])):
    # get pos tags for all words in the sentence. 
    POS_snippets.extend(pos_tagger.tag(clearLine(df['Snippet'][i]).lower().split()))
POS_snippets_type=[x[1] for x in POS_snippets]
POS_snippets_type=lb.fit_transform(POS_snippets_type)	
pos_vec=ohe.fit_transform(np.reshape(POS_snippets_type,(-1, 1)))
pos_vec=pos_vec.todense()

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordPOSTagger, self).__init__(*args, **kwargs)


In [20]:
# Using Stanford NER Tagger API
jar_n = '/gdrive/MyDrive/features/stanford-ner-3.9.2.jar'
model_n = '/gdrive/MyDrive/features/english.all.3class.distsim.crf.ser.gz'
ner_tagger = StanfordNERTagger(model_n, jar_n, encoding='utf8')

# Extracting NER Features
ner_snippets=[]
for i in range(len(df['Snippet'])):
    # get ner tags for all words in the sentence
    ner_snippets.extend(ner_tagger.tag(clearLine(df['Snippet'][i]).lower().split()))
ner_snippets_type=[x[1] for x in ner_snippets]
ner_snippets_type=lb.fit_transform(ner_snippets_type)	
ner_vec=ohe.fit_transform(np.reshape(ner_snippets_type,(-1, 1)))
ner_vec=ner_vec.todense()

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)


In [21]:
# Extracting Empath Features
lexicon = Empath()
empath_vec=[]
for text in ppd['Candidate_word']:
    a=lexicon.analyze(text, normalize=True)
    bv=[]
    for i in a.values():
        bv.append(i)
    empath_vec.append(bv)

Model


In [22]:
# Function to group together candidate-words belonging to the same line.
def compress():
    lengths = []
    for i in range (1,len(ppd['left_context'])):
        # If left context contains only '<start>'
        if len(ppd['left_context'][i]) == 1:
            lengths.append(i)
    lengths.append(len(ppd['left_context']))
    compressor = []
    compressor.append(range(lengths[0]))
    for i in range (1,len(lengths)):
        compressor.append(range(lengths[i-1],lengths[i]))
    return compressor

In [23]:
comp = compress()

In [24]:
# Training Data, Test Data Preparation

def prep (train_indices, test_indices):
    print (len(train_indices), len(test_indices))

    # Ungroup training dataset candidate-words belonging to the same line.
    train_ids = []
    for i in range(len(train_indices)):
        train_ids.extend(comp[train_indices[i]])

    # Ungroup testing dataset candidate-words belonging to the same line.
    test_ids = []
    for i in range(len(test_indices)):
        test_ids.extend(comp[test_indices[i]])

    # Training Data Preparation
    train_left   = []
    train_right  = []
    train_middle = []
    train_labels = []
    train_pos_vec= []
    train_ner_vec= []
    train_empath_vec=[]

    for id in train_ids:
        train_left.append(keras_left_context[id])
        train_right.append(keras_right_context[id])
        train_middle.append(keras_middle[id])
        train_labels.append(labels[id])
        train_pos_vec.append(pos_vec[id])
        train_ner_vec.append(ner_vec[id])
        train_empath_vec.append(empath_vec[id])

    train_left   = np.array(train_left)
    train_right  = np.array(train_right)
    train_middle = np.array(train_middle)
    train_labels = np.array(train_labels)
    train_middle = np.expand_dims(train_middle,axis=1)
    train_pos_vec= np.array(train_pos_vec)
    train_ner_vec= np.array(train_ner_vec)
    train_empath_vec= np.array(train_empath_vec)

    # Testing Data Preparation
    val_left   = []
    val_right  = []
    val_middle = []
    val_labels = []
    val_pos_vec= []
    val_ner_vec= []
    val_empath_vec=[]

    for id in test_ids:
        val_left.append(keras_left_context[id])
        val_right.append(keras_right_context[id])
        val_middle.append(keras_middle[id])
        val_labels.append(labels[id])
        val_pos_vec.append(pos_vec[id])
        val_ner_vec.append(ner_vec[id])
        val_empath_vec.append(empath_vec[id])

    val_left   = np.array(val_left)
    val_right  = np.array(val_right)
    val_middle = np.array(val_middle)
    val_labels = np.array(val_labels)
    val_middle = np.expand_dims(val_middle, axis=1)
    val_pos_vec=np.array(val_pos_vec)
    val_ner_vec=np.array(val_ner_vec)
    val_empath_vec=np.array(val_empath_vec)

    # Below part only for TD lstm
    if mode == 'TD':	    
      train_left = np.concatenate((train_left, train_middle), axis=1)
      train_right = np.concatenate((train_middle, train_right), axis=1)
      val_left = np.concatenate((val_left, val_middle), axis=1)
      val_right = np.concatenate((val_middle, val_right), axis=1)

    return(train_left,train_right,train_middle,train_pos_vec,train_ner_vec,train_empath_vec,train_labels,val_left,val_right,val_middle,val_pos_vec,val_ner_vec,val_empath_vec,val_labels)

In [25]:
def de_comp(arr, test_indices):
    arr = deque(arr)
    fin = []
    for i in test_indices:
        temp = []
        for j in range(len(comp[i])):
            temp.append(arr.popleft())
        fin.append(temp)
    return (fin)    

In [26]:
def accuracy (pred, labels, test_indices):
    pred = pred[0]
    num_sent = len(comp)
    num_words = len(pred)
    threshold = 0
    cnt = 0
    # Threshold calculation for binary classification problem
    for a,b in zip (pred,labels):
        if b==1.0:
            threshold+=a
            cnt+=1
    threshold = threshold.item()/cnt

    pred_th = []
    for x in pred:
        if (x<=threshold):
            pred_th.append(0)
        else :
            pred_th.append(1)

    pred_th = np.array(pred_th)
    print ("Number of Test sentences : {}".format(len(test_indices)))
    error = pred_th-labels
    error_d  = de_comp(error,test_indices)
    labels_d = de_comp(labels,test_indices)
    pred_d   = de_comp(pred_th,test_indices)
    em_cnt = 0
    ds_cnt = 0
    mic_f1 = 0

    for err in error_d:
        if (sum(err)==0):
           em_cnt += 1
        ds_cnt += float(len(err)-sum(np.abs(err)))/len(err)

    for lab, pre in zip(labels_d,pred_d):

        tp = 0
        fp = 0
        fn = 0
        tn = 0
              
        for i,j in zip(lab,pre):
            if (int(i) == 1)  and (int(j) ==0) :
                fn += 1
            elif (int(i) == 0)  and (int(j) ==1) :
                fp += 1
            elif (int(i) == 0)  and (int(j) ==0) :
                tn += 1
            elif (int(i) == 1) and (int(j) ==1):
                tp += 1
        try : 
            mic_f1 += float(2*tp) / (2*tp + fn +fp)
        except :
            pass

    TP=0
    TN=0
    FP=0
    FN=0
    for a,b in zip(labels, pred_th):

        if int(a)==0 and b==0:
            TN+=1
        if int(a)==1 and b==1:
            TP+=1
        if int(a)==0 and b==1:
            FP+=1
        if int(a)==1 and b==0:
            FN+=1 
    print ("TP = {}, TN = {},FP = {}, FN = {}".format(TP,TN,FP,FN))
    F1 = float(2*TP)/(2*TP + FP+ FN)
    EM = float(em_cnt)/len(test_indices)
    DS = float(ds_cnt)/len(test_indices)
    uF1= float(mic_f1)/len(test_indices)
    print ("EM Accuracy : {}".format(EM))
    print ("DS Accuracy : {}".format(DS))
    print ("Micro F1    : {}".format(uF1))
    print ("Macro F1 Score = {}".format(F1))
    return (pred_d, labels_d)

In [27]:
# Tuned-Hyper Parameters
embed_size = 1024
hidden_size = 32
num_epochs=30
layer_size = 16
batch_size = 64

# 'Uni' : Unidirectional LSTM  |  'Bi' : Bidirectional LSTM  | 'TD' : Target-dependent LSTM
mode = 'Uni' 
# Use speech features while concatenating in dense layer if True else ignore
augmentation = False

In [28]:
# Define the model and run the model for given epochs
def model(train_l,train_r,train_m,train_pos,train_ner,train_empath,train_labels,test_l,test_r,test_m,test_pos,test_ner,test_empath,test_labels,test_indices):
    # Create placeholder for each input type to define the model
    x=Input(shape=(78, 300))
    y=Input(shape=(78, 300))
    z=Input(shape=(1, 300))
    z1=Input(shape=([1,34]))
    z2=Input(shape=([1,2]))
    # z3=Input(shape=([64]))
    z4=Input(shape=([194]))

    if mode == 'Bi':
      # Define birectional layer on LSTM and use concat as the merge mode.
    	left_out=Bidirectional(LSTM(hidden_size//2,return_sequences=False),input_shape=(train_l.shape[1:]))(x)      
    	middle = Bidirectional(LSTM(hidden_size//2,return_sequences=False),input_shape=(train_m.shape[1:]))(z)
    	right_out=Bidirectional(LSTM(hidden_size//2,return_sequences=False),input_shape=(train_r.shape[1:]))(y)

    else:
      # Define LSTM units
    	left_out  = LSTM(hidden_size,return_sequences=False)(x)
    	middle    = LSTM(hidden_size,return_sequences=False)(z)
    	right_out = LSTM(hidden_size,return_sequences=False)(y)

    pos_dense=Dense(32,activation='relu')(z1)
    ner_dense=Dense(16,activation='relu')(z2)
    # liwc_dense=Dense(64,activation='relu')(z3)
    empath_dense=Dense(64,activation='relu')(z4)

    if mode == 'TD' and augmentation == False :
      out=concatenate([left_out,right_out],axis=-1)

    if mode == 'TD' and augmentation == True :
    	out=concatenate([left_out,right_out,pos_dense,ner_dense,empath_dense],axis=-1)

    if mode != 'TD' and augmentation == False :
      out=concatenate([left_out,middle,right_out],axis=-1)

    if mode != 'TD' and augmentation == True :
    	out=concatenate([left_out,middle,right_out,pos_dense,ner_dense,empath_dense],axis=-1)

    out=Dense(layer_size, activation='relu')(out)
    output=Dense(1, activation='sigmoid')(out)
    model = Model(inputs=[x,y,z,z1,z2,z4], outputs=output)
    model.compile(optimizer=Adam(lr=10e-5),loss='binary_crossentropy',metrics=['accuracy'])
    print ("Starting Epochs")
    for i in range(num_epochs):
        model.fit([train_l,train_r,train_m,train_pos,train_ner,train_empath],train_labels,batch_size=batch_size, epochs=1,verbose=0)
        print('***************************************************************')
        print ("predicting_ Epoch : {}".format(i))
        pred_val=[]
        pred_val.append(model.predict([test_l,test_r,test_m,test_pos,test_ner,test_empath]))
        pre_d, lab_d = accuracy (pred_val, test_labels,test_indices)

        # with open('Tweets Aug-{}.csv'.format(i), mode='w') as file:
        #     file_writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        #     for a,b in zip (pre_d,lab_d):
        #         file_writer.writerow([a,b])
    return model

In [29]:
# Dataset division for 3-fold cross validation
indices = list(range(len(comp)))
np.random.shuffle(indices)
bins = []
bins.append(indices[:int(0.33*len(indices))])
bins.append(indices[int(0.33*len(indices)):int(0.66*len(indices))])
bins.append(indices[int(0.66*len(indices)):])

In [30]:
for i in range (3):
    print ("Fold {}".format(i+1))
    print (len(bins[0] + bins[1]),len(bins[2]))
    train_left,train_right,train_middle,pos_vec_train,ner_vec_train,empath_vec_train,train_labels,val_left,val_right,val_middle,pos_vec_val,ner_vec_val,empath_vec_val,val_labels = prep (bins[i%3] + bins[(i+1)%3], bins[(i+2)%3])
    Sar_model=model(train_left,train_right,train_middle,pos_vec_train,ner_vec_train,empath_vec_train,train_labels,val_left,val_right,val_middle,pos_vec_val,ner_vec_val,empath_vec_val,val_labels,bins[(i+2)%3])
    Sar_model.save_weights("Bert Tweets Aug.h5")
    print("Saved model to disk")

Fold 1
147 77
147 77
Starting Epochs
***************************************************************
predicting_ Epoch : 0
Number of Test sentences : 77
TP = 103, TN = 862,FP = 963, FN = 123
EM Accuracy : 0.025974025974025976
DS Accuracy : 0.46161238374129027
Micro F1    : 0.18036747172988668
Macro F1 Score = 0.15944272445820434
***************************************************************
predicting_ Epoch : 1
Number of Test sentences : 77
TP = 117, TN = 1274,FP = 551, FN = 109
EM Accuracy : 0.03896103896103896
DS Accuracy : 0.5077537325953293
Micro F1    : 0.20269052062279017
Macro F1 Score = 0.26174496644295303
***************************************************************
predicting_ Epoch : 2
Number of Test sentences : 77
TP = 98, TN = 1056,FP = 769, FN = 128
EM Accuracy : 0.012987012987012988
DS Accuracy : 0.5125240043287679
Micro F1    : 0.18477120700496544
Macro F1 Score = 0.17932296431838976
***************************************************************
predicting_ Epoch :