In [None]:
%pip install pandas
%pip install stanfordnlp
%pip install senticnet
%pip install sentistrength
%pip install nltk
%pip install spacy
%pip install sklearn
%pip install numpy
# run this in the terminal
# python -m spacy download en_core_web_sm

In [1]:
import json
import pandas as pd
from stanfordcorenlp import StanfordCoreNLP
import requests
from senticnet.senticnet import SenticNet
from sentistrength import PySentiStr
import nltk
import spacy
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [2]:
nltk.download('punkt')
# stanfordNLP = StanfordCoreNLP("http://localhost", port=8000, timeout=30000)
spacyNLP = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# download the zip file from the link https://drive.google.com/file/d/1yvCpB2URy0iFjQPn3RmidNOryTlo6vHG/view?usp=share_link
# extract the zip file and place the folder in the same directory as this file then cd into the folder
# run the following command in the terminal to start the server
# java -mx4g -cp "*" edu.stanford.stanfordNLP.pipeline.StanfordCoreNLPServer -port {8000 or any port} -timeout 30000
# can speed it up by replace 4g with 8g (it represents the ram being used in gigs)
def lemmatize(text):
    # perform lemmatization
    lemmas = []
    output = stanfordNLP.annotate(text, properties={'annotators': 'tokenize,lemma', 'outputFormat': 'json'})
    output_dict = json.loads(output)
    tokens = output_dict['sentences'][0]['tokens']
    for token in tokens:
        lemmas.append(token['lemma'])
   
    return lemmas  

#### Converting the given JSON file into actual JSON format for easier readbility

In [None]:
writeFile = open("Sarcasm_Headlines.json", "w")
writeFile.write("{ \"headlines\": [")
with open("Sarcasm_Headlines_Dataset.json") as readFile:
  for item in readFile:
    writeFile.write(item + ",")
# removed the final comma manually
writeFile.write("]}")
readFile.close()
writeFile.close()

# Preprocessing Stage

#### Reading the dataset and removing all article links as our goal is to analyze the headlines for sarcasm

In [None]:
dataset = json.load(open("Sarcasm_Headlines.json"))
df = pd.DataFrame(dataset["headlines"])
df.drop(["article_link"], axis = 1, inplace = True)
df.head()

#### lemmatizing the dataset

In [40]:
def lemmatizeDataset():
    for index, row in df.iterrows():
        sentence = row['headline']
        row['headline'] = lemmatize(sentence)

lemmatizeDataset()
df.head()

NameError: name 'lemmatize' is not defined

In [3]:
idioms = []
with open("idioms.txt") as file:
    for line in file:
        idioms.append(line.strip())

#### writing to a csv file to avoid having to perform pre-processing again

In [None]:
df.to_csv('lemmatized.csv', index=False)

In [41]:
df = pd.read_csv("lemmatized.csv")
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


# Module 1 => Concept Level and Common Sense Knowledge
### ConceptNet
ConceptNet is a semantic network consisting of common-sense knowledge and concepts, represented<br> in the form of nodes (words or
short phrases) and labeled edges (relationships) between them.

In [5]:

# set the API endpoint and parameters
endpoint = 'http://api.conceptnet.io/c/en/'
params = {
    'filter': 'core',
    'limit': 1000
}
def conceptNet(sentence):
    # send a GET request to the API endpoint
    response = requests.get(endpoint + sentence, params=params)

    # parse the JSON response
    data = json.loads(response.text)
    edges = data['edges']
    edges.sort(key=lambda x: x['weight'], reverse=True)

    return edges

# Module 2 => Sentiment Score
### SentiStrength
SentiStrength is a sentiment lexicon that uses linguistic information and rules to detect<br>
sentiment strength in English text. SentiStrength provides positive and negative sentiment<br>
scores for each word. Both scores are integers from 1 to 5, where 1 signifies weak sentiment<br>
and 5 signifies strong sentiment.
<br>
polarity = positiveSentiment - negativeSentiment

### SenticNet
SenticNet is a resource for opinion mining that aims to create a collection of commonly<br> 
used common-sense concepts  with positive and negative sentiment scores. The sentiment <br>
score for each word is scaled from -1 to 1, where -1 signifies strongly negative sentiment,<br>
0 signifies neutral sentiment and 1 signifies strong positive sentiment.
<br> sentiment = score * 5 (in-order to keep it with sentiStrength)

### Rules of w_score (sentiment score) selection:
- if word belongs to SentiStrength || SenticNet => pick the score whichever exists
- if word belongs to SentiStrength && SenticNet => avg score of the lexicons
- else get the concepts from concept net to expand the meaning => select top 5 ranked and calculate the avg sentiment score

### Final Calculation
sum_pos_score = sum of all positive sentiment scores<br>
sum_neg_score = sum of all negative sentiment scores<br>
if sum_pos_score && sum_neg_score > 0, there is a contradiction in the sentence

In [6]:
sn = SenticNet()
def senticNetScore(word):
    try:
        polarityValue = sn.polarity_value(word)
        return float(polarityValue) * 5
    except KeyError:
        return None

In [7]:
senti = PySentiStr()
# got the jar file and data folder from the author (also reverse engineered the pysenti package to extract the jar file)
senti.setSentiStrengthPath('D:/Sarcasm_Detection-Feature_Selection/SentiStrengthCom.jar')
senti.setSentiStrengthLanguageFolderPath('D:/Sarcasm_Detection-Feature_Selection/SentStrength_Data')
def sentiStrengthScore(word):
    result = senti.getSentiment(word)
    return result

In [8]:
def wScore(word):
    senticNet = senticNetScore(word)
    sentiStrength = sentiStrengthScore(word)[0]
    if senticNet == None and sentiStrength == None:
        expansion = conceptNet(word)
        if len(expansion) == 0:
            return 0
        else:
            score = 0
            expansion = expansion[:5]
            for edge in expansion:
                score += wScore(edge['end']['label'])
            return score / 5
    elif senticNet == None:
        return sentiStrength
    elif sentiStrength == None:
        return senticNet
    else:
        return (senticNet + sentiStrength) / 2

In [9]:
def positiveScore(results):
    score = 0
    for result in results:
        if result > 0:
            score += result
    return score
def negativeScore(results):
    score = 0
    for result in results:
        if result < 0:
            score += result
    return score

# Module 3 => Sentence Coherence
Checking the coreference between subjects or objects of a sentence
<br> for two subjects w1 and w2, sentence is coherent if
- if w1 is antecedent of w2
- if w1 and w2 are identical pronouns
- if w1 and w2 are identical subjects
- w2 starts with the word "the" (Definite Noun Phrase)
- w2 starts with "this", "that", "these", "those" (Demonstrative Noun Phrases)
- if w1 and w2 are proper nouns

In [10]:
def extractSubject(sentence):
    doc = spacyNLP(sentence)
    subject = None
    for token in doc:
        if token.dep_ == "nsubj":
            subject = token.text
    return subject

In [11]:
def hasAntecedents(text):
    doc = spacyNLP(text)
    antecedents = []
    for token in doc:
        if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
            for mention in doc.ents:
                if mention.start <= token.i < mention.end:
                    antecedents.append(mention.text)
    return True if len(antecedents) > 0 else False


In [12]:
pronounLemmatizer = WordNetLemmatizer()
def identicalPronouns(w1, w2):
    lemma1 = pronounLemmatizer.lemmatize(w1, 'n')
    lemma2 = pronounLemmatizer.lemmatize(w2, 'n')
    if lemma1 == lemma2:
        return True
    else:
        return False
    

In [13]:
def identicalSubjects(w1,w2):
    cleanedSubject1 = re.sub(r'[^a-zA-Z]', '', w1)
    cleanedSubject2 = re.sub(r'[^a-zA-Z]', '', w2)
    if cleanedSubject1 == cleanedSubject2:
        return True
    else:
        return False

In [14]:
def definiteNounPhraseFeature(text,w2):
    doc = nltk.word_tokenize(text)
    for i in range(len(doc)):
        if i-1 >= 0 and doc[i] == w2:
            if doc[i-1] == 'the':
                return True
    return False

In [15]:
def demonstrativeNounPhraseFeature(text,w2):
    doc = nltk.word_tokenize(text)
    for i in range(len(doc)):
        if doc[i] == w2:
            if i-1 >= 0 and doc[i-1] == 'this' or doc[i-1] == 'that' or doc[i-1] == 'these' or doc[i-1] == 'those':
                return True
    return False

In [16]:
def properNameFeature(w1,w2):
    taggedWords = nltk.pos_tag([w1,w2])
    proper = False
    for word, tag in taggedWords:
        if tag in ['NNP', 'NNPS']:
            proper = True
        else:
            proper = False
            break
    return proper

# Module 4 => Creation of Feature Vector

### Creating N-gram Feature Spaces
- Baseline 1 => unigram space
- Baseline 2 => unigram, bigram and trigram space

In [17]:
def createFeatureSpaces():
    sentences = df["headline"]
    vectorizer1 = CountVectorizer(ngram_range=(1,1))
    vectorizer2 = CountVectorizer(ngram_range=(1,3))
    featureSpace1 = vectorizer1.fit_transform(sentences)
    featureSpace2 = vectorizer2.fit_transform(sentences)
    return (vectorizer1, featureSpace1), (vectorizer2, featureSpace2)

### Binary Features

In [42]:
df["CONTRA"] = np.zeros(len(df))
df["CONTRA_PLUS_COHER"] = np.zeros(len(df))
df["pos_low"] = np.zeros(len(df))
df["pos_med"] = np.zeros(len(df))
df["pos_high"] = np.zeros(len(df))
df["neg_low"] = np.zeros(len(df))
df["neg_med"] = np.zeros(len(df))
df["neg_high"] = np.zeros(len(df))
df["emo_low"] = np.zeros(len(df))
df["emo_med"] = np.zeros(len(df))
df["emo_high"] = np.zeros(len(df))
df["rep_punc_low"] = np.zeros(len(df))
df["rep_punc_med"] = np.zeros(len(df))
df["rep_punc_high"] = np.zeros(len(df))
df["rep_seq_low"] = np.zeros(len(df))
df["rep_seq_med"] = np.zeros(len(df))
df["rep_seq_high"] = np.zeros(len(df))
df["cap_low"] = np.zeros(len(df))
df["cap_med"] = np.zeros(len(df))
df["cap_high"] = np.zeros(len(df))
df["slang_low"] = np.zeros(len(df))
df["slang_med"] = np.zeros(len(df))
df["slang_high"] = np.zeros(len(df))
df["exclaim_low"] = np.zeros(len(df))
df["exclaim_med"] = np.zeros(len(df))
df["exclaim_high"] = np.zeros(len(df))
df["idioms_low"] = np.zeros(len(df))
df["idioms_med"] = np.zeros(len(df))
df["idioms_high"] = np.zeros(len(df))
boosterAndSlangs = ["Lit", "Fleek", "Slay", "Woke", "Stan", "Chill", "On fleek", "Squad", "Bae", "AF", "Savage", "GOAT", "Lit AF", "Yas", "Gucci", "Thirsty", "Mood", "Extra", "Clap back", "Shook", "Lowkey", "Highkey", "Basic", "Lituation", "Snatched", "Throwing shade", "Swag", "Tea", "Glow up", "Fam", "Turnt", "Litty", "Dope", "Hundo P", "Gassed", "FOMO", "Trill", "No cap", "Blessed", "Fire", "Wavy", "Sus", "Tight", "Meme", "Shade", "Receipts", "Slay queen", "Cray", "Thick", "Litmas", "Litmus", "Queen", "Bad", "No chill", "Sorry not sorry", "Real talk", "Dank", "Ship", "Ratchet", "Yolo", "Fierce", "Legendary", "Drama", "Stuntin", "Lit fam", "Flame", "Finna", "Swole", "Squad goals", "Kween", "Salty", "Slaying", "Bounce", "Swerve", "Bussin", "Hype", "Finesse", "Bless up", "Crushin it", "Yaas", "Fleeky", "Fuego", "Cringy", "Dead", "Curve", "Baller", "Wig snatched", "Keep it 100", "Hater", "My bad"]

### Contradiction Feature: <br>
<emsp>We use two binary features Contra and Contra_Coher<br>
<emsp>Contra if headline has one sentence and contradiction in sentiment score occur
<br>
<emsp>Contra_Coher if headline has more than one sentence, contradiction of polarity and the headline is judged coherent<br>

### Sentiment Feature <br>
<emsp>Calculates the +ve and -ve score of the headline and then classify it as low/med/high

### Punctuations and Symbol Features <br>
<emsp>We use 7 indicators<br><br>
    <emsp><emsp>1. Number of emoticons <br>
    <emsp><emsp>2. Number of repetitive sequence of punctuations<br>
    <emsp><emsp>3. Number of repetitive sequence of characters<br>
    <emsp><emsp>4. Number of capitalized word<br>
    <emsp><emsp>5. Number of slang and booster words<br>
    <emsp><emsp>6. Number of exclamation marks<br>
    <emsp><emsp>7. Number of idioms<br>


In [19]:
def remove_symbols(line):
    return ''.join(ch for ch in line if ch.isalnum() or ch == " ")
def calculate_scores(sentence):
    print("Sentence: ",sentence)
    score=[]
    results = []
    for word in nltk.word_tokenize(sentence):
        results.append(wScore(word))
    positiveSum = positiveScore(results)
    negativeSum = negativeScore(results)
    score.append(positiveSum)
    score.append(negativeSum)
    print("positiveScore: ",positiveSum)
    print("negativeScore: ",negativeSum)
    return score

def isContradiction(scores):
    if scores[0]!=0 and scores[1]!=0:
        return True
    return False

def checkCoherence(sentence):
    tokens = nltk.sent_tokenize(sentence)
    if len(tokens) > 1:
        if hasAntecedents(sentence):
            return True
        w1 = extractSubject(tokens[0])
        w2 = extractSubject(tokens[1])
        if identicalPronouns(w1,w2) or identicalSubjects(w1,w2) or definiteNounPhraseFeature(tokens[1],w2) or demonstrativeNounPhraseFeature(tokens[1],w2) or properNameFeature(w1,w2):
            return True   
    return False

def countEmoticons(headline):
    return len(re.findall(r'[^\w\s,]', headline))

def countRepititivePunctuations(headline):
    return len(re.findall(r'([\W_]){2,}', headline))

def countRepititiveSequences(headline):
    return len(re.findall(r'(\S)\1{1,}', headline))

def countCapitalLetters(headline):
    return len(re.findall(r'[A-Z]', headline))

def countBoostersAndSlangs(headline):
    numSlangsBoosters = 0
    for word in headline.split():
        if word.lower() in boosterAndSlangs:
            numSlangsBoosters += 1
    return numSlangsBoosters

def countIdioms(headline):
    numIdioms = 0
    for word in headline.split():
        if word.lower() in idioms:
            numIdioms += 1
    return numIdioms


In [43]:
def assignSentimentFeature(headline,scores):
    positiveScore = scores[0]
    negativeScore = scores[1]
    if positiveScore <= -1:
        df.loc[df["headline"] == headline, "pos_low"] = 1
    elif positiveScore >= 0 and positiveScore <= 1:
        df.loc[df["headline"] == headline, "pos_med"] = 1
    elif positiveScore >= 2:
        df.loc[df["headline"] == headline, "pos_high"] = 1
    if negativeScore >= 1:
        df.loc[df["headline"] == headline, "neg_low"] = 1
    elif negativeScore >= 0 and negativeScore <= 1:
        df.loc[df["headline"] == headline, "neg_med"] = 1
    elif negativeScore <= -2:
        df.loc[df["headline"] == headline, "neg_high"] = 1
def punctuationAndSpecialSymbolFeature(headline):
    numberOfEmoticons = countEmoticons(headline)
    if numberOfEmoticons == 0:
        df.loc[df["headline"] == headline, "emo_low"] = 1
    elif numberOfEmoticons >= 1 and numberOfEmoticons <= 3:
        df.loc[df["headline"] == headline, "emo_med"] = 1
    elif numberOfEmoticons >= 4:
        df.loc[df["headline"] == headline, "emo_high"] = 1
    numberOfPunctuations = countRepititivePunctuations(headline)
    if numberOfPunctuations == 0:
        df.loc[df["headline"] == headline, "rep_punc_low"] = 1
    elif numberOfPunctuations >= 1 and numberOfPunctuations <= 3:
        df.loc[df["headline"] == headline, "rep_punc_med"] = 1
    elif numberOfPunctuations >= 4:
        df.loc[df["headline"] == headline, "rep_punc_high"] = 1
    numberOfRepetitiveSequences = countRepititiveSequences(headline)
    if numberOfRepetitiveSequences == 0:
        df.loc[df["headline"] == headline, "rep_seq_low"] = 1
    elif numberOfRepetitiveSequences >= 1 and numberOfRepetitiveSequences <= 3:
        df.loc[df["headline"] == headline, "rep_seq_med"] = 1
    elif numberOfRepetitiveSequences >= 4:
        df.loc[df["headline"] == headline, "rep_seq_high"] = 1
    numberOfCapitalLetters = countCapitalLetters(headline)
    if numberOfCapitalLetters == 0:
        df.loc[df["headline"] == headline, "cap_low"] = 1
    elif numberOfCapitalLetters >= 1 and numberOfCapitalLetters <= 3:
        df.loc[df["headline"] == headline, "cap_med"] = 1
    elif numberOfCapitalLetters >= 4:
        df.loc[df["headline"] == headline, "cap_high"] = 1
    numberOfBoostersAndSlangs = countBoostersAndSlangs(headline)
    if numberOfBoostersAndSlangs == 0:
        df.loc[df["headline"] == headline, "slang_low"] = 1
    elif numberOfBoostersAndSlangs >= 1 and numberOfBoostersAndSlangs <= 3:
        df.loc[df["headline"] == headline, "slang_med"] = 1
    elif numberOfBoostersAndSlangs >= 4:
        df.loc[df["headline"] == headline, "slang_high"] = 1
    numberOfIdioms = countIdioms(headline)
    if numberOfIdioms == 0:
        df.loc[df["headline"] == headline, "idiom_low"] = 1
    elif numberOfIdioms >= 1 and numberOfIdioms <= 3:
        df.loc[df["headline"] == headline, "idiom_med"] = 1
    elif numberOfIdioms >= 4:
        df.loc[df["headline"] == headline, "idiom_high"] = 1
    
def contradictionFeature():
    for headline in df["headline"]:
        text = remove_symbols(headline)
        sentences = nltk.sent_tokenize(text)
        scores = calculate_scores(text)
        assignSentimentFeature(headline,scores)
        punctuationAndSpecialSymbolFeature(headline)
        if len(sentences) > 1:
            print("CONTRA_PLUS_COHER")
            if isContradiction(scores) and checkCoherence(text):
                df.loc[df["headline"] == headline, "CONTRA_PLUS_COHER"] = 1
            else:
                df.loc[df["headline"] == headline, "CONTRA_PLUS_COHER"] = 0
        else:
            print("CONTRA")
            if isContradiction(scores):
                df.loc[df["headline"] == headline, "CONTRA"] = 1
            else:
                df.loc[df["headline"] == headline, "CONTRA"] = 0

In [44]:
listDFs = []
for i in range(0,26000,1000):
    temp = df[i:i+1000]
    listDFs.append(temp)
listDFs.append(df[26000:])
df = listDFs[4]

contradictionFeature()
print(df.head())
df.to_csv("tempContra/df5.csv")


Sentence:  trump announces hes a very sad man
positiveScore:  2.25
negativeScore:  -3.775
CONTRA
Sentence:  billy eichner boogied with obama and ellen got all the details
positiveScore:  0
negativeScore:  0
CONTRA
Sentence:  dead civilians and the language of war
positiveScore:  0
negativeScore:  -9.225000000000001
CONTRA
Sentence:  these vintage ads prove we had no idea what the future would actually look like
positiveScore:  2.545
negativeScore:  -1.6
CONTRA
Sentence:  former mugabe deputy to be sworn in as president
positiveScore:  0
negativeScore:  0
CONTRA
Sentence:  donald trump cancels press event with black pastors after finding out theyre not endorsing him
positiveScore:  0
negativeScore:  -1
CONTRA
Sentence:  kentucky police stop using punisher logo after realizing what it means
positiveScore:  1.7374999999999998
negativeScore:  -2.0250000000000004
CONTRA
Sentence:  billionaire ceo donates rats ass to worlds poor
positiveScore:  2.0524999999999998
negativeScore:  -2.05
CONTRA

In [46]:
length=5
counter = 0
for i in range(length):
    df = pd.read_csv("tempContra/df"+str(i+1)+".csv")
    print(df.shape)
    sarcastic = df["is_sarcastic"]
    results1 = df["CONTRA"]
    results2 = df["CONTRA_PLUS_COHER"]
    for i in range(len(sarcastic)):
        print(sarcastic[i])
        if sarcastic[i] == 1:
            if sarcastic[i] == results1[i] or sarcastic[i] == results2[i]:
                counter += 1
        else:
            if sarcastic[i] == results1[i] and sarcastic[i] == results2[i]:
                counter += 1
                sarcastic = df["is_sarcastic"]
print((counter/(length*1000)) * 100)

(1000, 33)
0
0
1
1
0
0
0
0
1
0
0
0
0
0
0
1
1
1
0
0
1
1
0
0
0
0
0
1
1
0
0
0
0
0
0
0
1
0
0
1
0
0
1
0
0
1
0
1
1
1
0
0
1
0
0
1
1
1
0
0
1
1
1
1
1
1
0
0
1
0
1
0
1
0
0
0
1
0
1
0
0
0
1
0
1
1
1
0
0
0
0
1
1
0
1
1
0
0
0
0
0
0
1
0
0
0
1
0
1
0
0
1
0
1
1
0
0
1
0
0
1
1
1
1
0
0
0
0
1
0
0
0
0
1
0
1
0
0
0
0
0
0
1
1
0
0
0
1
0
1
0
0
0
0
1
1
1
1
1
0
0
0
1
1
0
1
0
0
0
0
1
0
1
0
0
0
0
0
0
0
1
1
1
0
0
1
1
0
1
0
0
1
1
1
0
1
1
0
1
0
1
1
1
0
1
1
1
0
0
0
1
1
0
0
0
1
0
0
1
1
0
0
0
1
0
1
0
1
0
0
1
1
0
0
0
0
0
1
0
1
0
0
0
0
1
0
0
1
1
0
0
1
0
0
0
0
1
1
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
1
0
1
1
0
0
1
1
0
0
0
1
1
0
0
0
0
0
1
1
1
0
1
1
0
0
1
0
1
1
1
0
1
1
0
0
1
0
0
0
1
0
0
0
1
0
0
0
1
0
0
0
0
1
0
1
0
1
0
1
1
1
0
0
0
1
1
0
1
1
1
1
0
1
1
1
1
1
0
0
0
0
1
0
0
0
0
1
0
0
1
0
1
0
1
0
0
0
1
0
1
0
0
0
0
0
0
1
1
1
0
1
0
1
0
1
1
0
1
1
1
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
1
0
1
1
0
0
0
1
0
0
1
1
1
0
0
1
0
1
0
0
0
0
1
0
1
1
1
0
0
1
1
1
1
0
1
1
0
0
0
0
0
0
1
0
0
1
0
1
1
1
0
1
0
0
1
1
1
0
1
1
0
0
1
0
0
0
1
1
1
1
1
0
0
1
1
0
0
0