In [None]:
%pip install pandas
%pip install stanfordnlp
%pip install senticnet
%pip install sentistrength
%pip install nltk
%pip install spacy
# run this in the terminal
# python -m spacy download en_core_web_sm

In [None]:
import json
import os
import re

import nltk
import pandas as pd
import requests
import spacy
from nltk.stem import WordNetLemmatizer
from senticnet.senticnet import SenticNet
from sentistrength import PySentiStr
# from stanfordcorenlp import StanfordCoreNLP


In [None]:
# Fetch every NLTK resource the notebook relies on, not just the tokenizer:
#   punkt                       -> nltk.word_tokenize
#   wordnet                     -> WordNetLemmatizer
#   averaged_perceptron_tagger  -> nltk.pos_tag
# NOTE(review): recent NLTK releases may ask for differently named resources
# (e.g. 'punkt_tab'); add them here if a LookupError names one.
for resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)
# stanfordNLP = StanfordCoreNLP("http://localhost", port=8000, timeout=30000)
spacyNLP = spacy.load("en_core_web_sm")

In [None]:
# download the zip file from the link https://drive.google.com/file/d/1yvCpB2URy0iFjQPn3RmidNOryTlo6vHG/view?usp=share_link
# extract the zip file and place the folder in the same directory as this file then cd into the folder
# run the following command in the terminal to start the server
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port {8000 or any port} -timeout 30000
# you can speed it up by replacing 4g with 8g (the value is the amount of RAM, in gigabytes, given to the server)
def lemmatize(text):
    """Return the lemma of every token in ``text``.

    The text is sent to the CoreNLP client bound to the global name
    ``stanfordNLP`` (its creation is currently commented out earlier in the
    notebook, so it must be defined before this function is called).

    Args:
        text: The string to annotate.

    Returns:
        A flat list with the ``lemma`` of each token, covering every sentence
        in the response. An empty list is returned when the response contains
        no sentences.
    """
    output = stanfordNLP.annotate(text, properties={'annotators': 'tokenize,lemma', 'outputFormat': 'json'})
    output_dict = json.loads(output)
    # Walk all sentences: reading only sentences[0] dropped everything after
    # the first sentence and raised IndexError on an empty result.
    return [
        token['lemma']
        for sentence in output_dict.get('sentences', [])
        for token in sentence['tokens']
    ]

#### Converting the given JSON file into actual JSON format for easier readability

In [None]:
# The source file holds one JSON object per line. Wrap those objects in a
# single {"headlines": [...]} document so json.load can read it in one go.
with open("Sarcasm_Headlines_Dataset.json") as readFile:
    # Strip line breaks and skip blank lines so no empty element is emitted.
    records = [line.strip() for line in readFile if line.strip()]

with open("Sarcasm_Headlines.json", "w") as writeFile:
    writeFile.write("{ \"headlines\": [")
    # Joining puts commas only between records, so there is no trailing comma
    # to remove by hand.
    writeFile.write(",\n".join(records))
    writeFile.write("]}")

# Preprocessing Stage

#### Reading the dataset and removing all article links as our goal is to analyze the headlines for sarcasm

In [None]:
# Load the converted JSON document; the context manager closes the file
# handle that a bare json.load(open(...)) would leave open.
with open("Sarcasm_Headlines.json") as datasetFile:
    dataset = json.load(datasetFile)

df = pd.DataFrame(dataset["headlines"])
# Drop the article links without mutating in place, so re-running is safe.
df = df.drop(["article_link"], axis=1)
df.head()

#### lemmatizing the dataset

In [None]:
def lemmatizeDataset():
    """Replace every entry of ``df['headline']`` with its list of lemmas.

    Operates on the global DataFrame ``df`` and calls ``lemmatize`` once per
    headline. The column is reassigned as a whole because rows yielded by
    ``DataFrame.iterrows()`` are copies: writing to them left ``df`` unchanged.
    """
    df['headline'] = df['headline'].apply(lemmatize)

# Run the lemmatization over the global DataFrame, then preview the first rows.
lemmatizeDataset()
df.head()

#### writing to a csv file to avoid having to perform pre-processing again

In [None]:
# Write the current DataFrame to lemmatized.csv, leaving out the index column.
df.to_csv('lemmatized.csv', index=False)

In [None]:
# Reload the DataFrame from lemmatized.csv and preview the first rows.
df = pd.read_csv("lemmatized.csv")
df.head()

# Module 1 => Concept Level and Common Sense Knowledge
### ConceptNet
ConceptNet is a semantic network consisting of common-sense knowledge and concepts, represented<br> in the form of nodes (words or
short phrases) and labeled edges (relationships) between them.

In [None]:

# set the API endpoint and parameters
endpoint = 'http://api.conceptnet.io/c/en/'
params = {
    'filter': 'core',
    'limit': 1000
}
def conceptNet(sentence):
    """Look up a term in ConceptNet and return its edges, heaviest first.

    Args:
        sentence: The word or short phrase to look up. It is lower-cased and
            its whitespace-separated parts are joined with underscores before
            being appended to ``endpoint``.

    Returns:
        The list found under the ``edges`` key of the JSON response, sorted by
        descending ``weight``. An empty list is returned when the normalised
        term is empty or the response has no ``edges`` key.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the API does not answer within 30 seconds.
    """
    term = '_'.join(sentence.lower().split())
    if not term:
        # An empty term would query the bare language node, not a concept.
        return []

    # send a GET request to the API endpoint
    response = requests.get(endpoint + term, params=params, timeout=30)
    response.raise_for_status()

    # parse the JSON response; error payloads carry no 'edges' key
    edges = response.json().get('edges', [])
    return sorted(edges, key=lambda edge: edge.get('weight', 0), reverse=True)

# Module 2 => Sentiment Score
### SentiStrength
SentiStrength is a sentiment lexicon that uses linguistic information and rules to detect<br>
sentiment strength in English text. SentiStrength provides positive and negative sentiment<br>
scores for each word. Both scores are integers from 1 to 5, where 1 signifies weak sentiment<br>
and 5 signifies strong sentiment.
<br>
polarity = positiveSentiment - negativeSentiment

### SenticNet
SenticNet is a resource for opinion mining that aims to create a collection of commonly<br> 
used common-sense concepts  with positive and negative sentiment scores. The sentiment <br>
score for each word is scaled from -1 to 1, where -1 signifies strongly negative sentiment,<br>
0 signifies neutral sentiment and 1 signifies strong positive sentiment.
<br> sentiment = score * 5 (in order to keep it on the same scale as SentiStrength)

### Rules of w_score (sentiment score) selection:
- if word belongs to SentiStrength || SenticNet => pick the score whichever exists
- if word belongs to SentiStrength && SenticNet => avg score of the lexicons
- else get the concepts from concept net to expand the meaning => select top 5 ranked and calculate the avg sentiment score

### Final Calculation
sum_pos_score = sum of all positive sentiment scores<br>
sum_neg_score = sum of all negative sentiment scores<br>
if both sum_pos_score and sum_neg_score are non-zero, there is a contradiction in the sentence

In [None]:
# Shared SenticNet instance that senticNetScore queries for polarity values.
sn = SenticNet()
def senticNetScore(word):
    """Return the SenticNet polarity of ``word`` multiplied by 5.

    The polarity value is read from the global ``sn`` instance. ``None`` is
    returned when the lookup raises ``KeyError``.
    """
    try:
        rawPolarity = sn.polarity_value(word)
    except KeyError:
        return None
    return 5 * float(rawPolarity)

In [None]:
senti = PySentiStr()
# got the jar file and data folder from the author (also reverse engineered the pysenti package to extract the jar file)
# The locations can be overridden with the SENTISTRENGTH_JAR and
# SENTISTRENGTH_DATA environment variables, so the notebook also runs on
# machines where the default absolute paths below do not exist.
senti.setSentiStrengthPath(os.environ.get(
    'SENTISTRENGTH_JAR',
    'C:/Users/pd/OneDrive/Desktop/IR project/Sarcasm_Detection-Feature_Selection/SentiStrengthCom.jar'))
senti.setSentiStrengthLanguageFolderPath(os.environ.get(
    'SENTISTRENGTH_DATA',
    'C:/Users/pd/OneDrive/Desktop/IR project/Sarcasm_Detection-Feature_Selection/SentStrength_Data'))
def sentiStrengthScore(word):
    """Return whatever the global ``senti`` object's ``getSentiment`` yields for ``word``."""
    return senti.getSentiment(word)

In [None]:
def wScore(word, expand=True):
    """Return the sentiment score of ``word``.

    Selection rules:
      * both lexicon scores present -> their average;
      * only one present            -> that score;
      * neither present             -> the average score of the top-ranked
        (at most 5) ConceptNet edges, each scored by the ``label`` of its
        ``end`` node.

    Args:
        word: The token to score. An empty token scores 0.
        expand: Whether ConceptNet may be consulted when neither lexicon has
            the word. Related concepts are scored with ``expand=False`` so the
            expansion is exactly one level deep. Without that limit an edge
            whose end label is the word itself recursed without end.

    Returns:
        The numeric score, or 0 when nothing is known about the word.
    """
    if not word:
        return 0

    senticNet = senticNetScore(word)
    # NOTE(review): only the first element of the SentiStrength result is
    # used. If the wrapper reports a number (e.g. 0) rather than None for
    # words it does not know, the "is None" branches below never trigger for
    # SentiStrength. Confirm against the PySentiStr return value.
    sentiStrength = sentiStrengthScore(word)[0]

    if senticNet is None and sentiStrength is None:
        if not expand:
            return 0
        expansion = conceptNet(word)[:5]
        if not expansion:
            return 0
        total = sum(wScore(edge['end']['label'], expand=False) for edge in expansion)
        # Average over the edges actually used, which may be fewer than 5.
        return total / len(expansion)
    if senticNet is None:
        return sentiStrength
    if sentiStrength is None:
        return senticNet
    return (senticNet + sentiStrength) / 2

In [None]:
def positiveScore(results):
    """Add up the entries of ``results`` that are greater than zero (0 if there are none)."""
    return sum(value for value in results if value > 0)
def negativeScore(results):
    """Add up the entries of ``results`` that are less than zero (0 if there are none)."""
    return sum(value for value in results if value < 0)

# Module 3 => Sentence Coherence
Checking the coreference between subjects or objects of a sentence
<br> for two subjects w1 and w2, the sentence is coherent if any of the following holds:
- w1 is an antecedent of w2
- w1 and w2 are identical pronouns
- w1 and w2 are identical subjects
- w2 starts with the word "the" (Definite Noun Phrase)
- w2 starts with "this", "that", "these", "those" (Demonstrative Noun Phrase)
- w1 and w2 are proper nouns

In [None]:
def extractSubject(sentence):
    """Return the text of the last token spaCy labels ``nsubj``, or ``None`` if there is none."""
    subjectTexts = [tok.text for tok in spacyNLP(sentence) if tok.dep_ == "nsubj"]
    return subjectTexts[-1] if subjectTexts else None

In [None]:
def hasAntecedents(text):
    """Collect the named entities that cover a verb's nominal subject.

    For each token tagged ``nsubj`` whose head is a ``VERB``, every entity in
    ``doc.ents`` whose span contains that token contributes its text.

    Returns:
        A list of entity texts (empty when nothing matches).
    """
    parsed = spacyNLP(text)
    return [
        entity.text
        for tok in parsed
        if tok.dep_ == "nsubj" and tok.head.pos_ == "VERB"
        for entity in parsed.ents
        if entity.start <= tok.i < entity.end
    ]


In [None]:
# Shared WordNet lemmatizer used by identicalPronouns.
pronounLemmatizer = WordNetLemmatizer()
def identicalPronouns(w1, w2):
    """Report whether ``w1`` and ``w2`` have the same noun lemma according to ``pronounLemmatizer``."""
    firstLemma, secondLemma = (pronounLemmatizer.lemmatize(word, 'n') for word in (w1, w2))
    return firstLemma == secondLemma
    

In [None]:
def identicalSubjects(w1,w2):
    """Compare two subjects after removing every character that is not an ASCII letter."""
    lettersOnly = [re.sub(r'[^a-zA-Z]', '', subject) for subject in (w1, w2)]
    return lettersOnly[0] == lettersOnly[1]

In [None]:
def definiteNounPhraseFeature(text,w2):
    """Report whether some occurrence of ``w2`` in ``text`` directly follows the token 'the'."""
    tokens = nltk.word_tokenize(text)
    # Pair each token with its predecessor; the first token has none.
    return any(
        previous == 'the' and current == w2
        for previous, current in zip(tokens, tokens[1:])
    )

In [None]:
def demonstrativeNounPhraseFeature(text,w2):
    """Report whether some occurrence of ``w2`` directly follows a demonstrative.

    Args:
        text: The sentence to inspect; it is tokenized with ``nltk.word_tokenize``.
        w2: The token to look for.

    Returns:
        True if ``w2`` appears immediately after 'this', 'that', 'these' or
        'those'; otherwise False.
    """
    demonstratives = ('this', 'that', 'these', 'those')
    doc = nltk.word_tokenize(text)
    for i in range(1, len(doc)):
        # Starting at 1 ensures doc[i-1] is a real predecessor. The old
        # and/or chain evaluated doc[-1] for i == 0, so a sentence ending in a
        # demonstrative matched a w2 at the very start.
        if doc[i] == w2 and doc[i-1] in demonstratives:
            return True
    return False

In [None]:
def properNameFeature(w1,w2):
    """Report whether ``nltk.pos_tag`` labels both ``w1`` and ``w2`` as 'NNP' or 'NNPS'."""
    tags = [tag for _, tag in nltk.pos_tag([w1, w2])]
    return bool(tags) and all(tag in ('NNP', 'NNPS') for tag in tags)

# Module 4 Creation of Feature Vector

We need to do the following feature classification on each headline
- Contradiction Feature: <br>
&emsp;We use two binary features Contra and Contra_Coher<br>
&emsp;Contra if headline has one sentence and contradiction in sentiment score occurs
<br>
&emsp;Contra_Coher if headline has more than one sentence, contradiction of polarity and the headline is judged coherent<br>
- Sentiment Feature <br>
&emsp;Calculates the +ve and -ve score of the headline and then classifies it as low/med/high
- Punctuation <br>
&emsp;We use 7 indicators<br><br>
    &emsp;&emsp;1. Number of emoticons <br>
    &emsp;&emsp;2. Number of repetitive sequences of punctuation<br>
    &emsp;&emsp;3. Number of repetitive sequences of characters<br>
    &emsp;&emsp;4. Number of capitalized words<br>
    &emsp;&emsp;5. Number of slang and booster words<br>
    &emsp;&emsp;6. Number of exclamation marks<br>
    &emsp;&emsp;7. Number of idioms<br>

In [None]:
# Reload the saved headlines, discard the label column and keep the headline
# column as the input for feature extraction.
df = pd.read_csv('lemmatized.csv').drop(columns='is_sarcastic')
sentences = df['headline']

In [None]:
def remove_symbols(line):
    """Return ``line`` with every non-alphanumeric character removed."""
    return ''.join(filter(str.isalnum, line))

def calculate_scores(sentence):
    """Score each word and return the positive and negative totals.

    Args:
        sentence: An iterable of words; each one is passed to ``wScore``.

    Returns:
        A two-element list ``[positiveScore(results), negativeScore(results)]``
        where ``results`` holds the per-word scores.
    """
    # Build the lists directly: assigning to results[i] / score[0] on empty
    # lists raised IndexError on every call.
    results = [wScore(word) for word in sentence]
    return [positiveScore(results), negativeScore(results)]

def isContradiction(scores):
    """Report whether both ``scores[0]`` and ``scores[1]`` are non-zero."""
    return bool(scores[0] != 0 and scores[1] != 0)

def checkCoherence(sentence):
    """Decide whether the sentences of a headline are coherent.

    Args:
        sentence: A list of sentence strings in their original order.

    Returns:
        True if ``hasAntecedents`` finds at least one antecedent in the joined
        text, or if for any pair of consecutive sentences one of the subject
        features holds (identical pronouns, identical subjects, definite or
        demonstrative noun phrase, proper names). Otherwise False.
    """
    # hasAntecedents returns a list, so test for emptiness; the previous
    # "is not None" check was always true.
    if hasAntecedents(".".join(sentence)):
        return True

    # range(len(sentence) - 1): subtracting 1 from the range object itself
    # raised TypeError.
    for i in range(len(sentence) - 1):
        s1 = sentence[i]
        s2 = sentence[i+1]
        w1 = extractSubject(s1)
        w2 = extractSubject(s2)

        # extractSubject returns None when no subject is found; every feature
        # needs the second subject.
        if w2 is None:
            continue
        if definiteNounPhraseFeature(s2,w2) or demonstrativeNounPhraseFeature(s2,w2):
            return True

        # The remaining features compare the two subjects with each other.
        if w1 is None:
            continue
        if identicalPronouns(w1,w2) or identicalSubjects(w1,w2) or properNameFeature(w1,w2):
            return True

    return False
        

def _sentimentLevel(score):
    """Turn a sentiment score into one-hot ``(low, medium, high)`` flags.

    NOTE(review): the thresholds are kept exactly as originally written.
    Scores in [-1, 0) and (1, 2) match no bucket and give (0, 0, 0). Confirm
    the intended ranges against the reference method (section 4.4.3).
    """
    if score < -1:
        return (1, 0, 0)
    if 0 <= score <= 1:
        return (0, 1, 0)
    if score >= 2:
        return (0, 0, 1)
    return (0, 0, 0)


def _headlineFeatures(headline):
    """Build the contradiction and sentiment features for a single headline.

    A headline containing '.' is treated as multi-sentence. Sentiment scores
    are computed over all words of the headline, with symbols stripped and
    empty tokens skipped.
    """
    multiSentence = '.' in headline
    segments = [segment for segment in headline.split('.') if segment.strip()]
    tokens = [remove_symbols(word) for segment in segments for word in segment.split(' ')]
    tokens = [token for token in tokens if token]

    scores = calculate_scores(tokens)
    contradiction = isContradiction(scores)

    features = {
        # Contra: single-sentence headline with a sentiment contradiction.
        'contra': int(contradiction and not multiSentence),
        # Contra_Coher: multi-sentence headline that is contradictory and coherent.
        'contra_coher': int(multiSentence and contradiction and checkCoherence(segments)),
    }
    ## implementing 4.4.3
    features['pos_low'], features['pos_medium'], features['pos_high'] = _sentimentLevel(scores[0])
    features['neg_low'], features['neg_medium'], features['neg_high'] = _sentimentLevel(scores[1])
    return features


# Collect one feature record per headline and attach the columns once.
# Assigning df[col] = value inside the loop overwrote the whole column on
# every iteration, so every row ended up with the last headline's values.
featureRows = [_headlineFeatures(sentence) for sentence in sentences]
featureFrame = pd.DataFrame(featureRows, index=sentences.index)
for column in featureFrame.columns:
    df[column] = featureFrame[column]
df
