# Installing Required Modules

In [1]:
%pip install stanfordnlp
%pip install senticnet
%pip install nltk
%pip install spacy



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing the Relevant Modules

In [3]:
# Install stanza if not already installed
!pip install stanza



In [4]:
import pandas as pd
from sklearn.model_selection import cross_val_predict
import requests
from senticnet.senticnet import SenticNet
from textblob import TextBlob
from sklearn.metrics import accuracy_score
import nltk
import spacy
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')
nltk.download('punkt')
spacyNLP = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Import and download the English model
import stanza
stanza.download('en')  # Download the English model only once
!java --version

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources


openjdk 11.0.24 2024-07-16
OpenJDK Runtime Environment (build 11.0.24+8-post-Ubuntu-1ubuntu322.04)
OpenJDK 64-Bit Server VM (build 11.0.24+8-post-Ubuntu-1ubuntu322.04, mixed mode, sharing)


In [6]:
# Initialize the Stanza NLP pipeline
nlp = stanza.Pipeline('en', processors='tokenize,lemma')

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| lemma     | combined_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!


In [7]:
def lemmatize(text):
    # Perform lemmatization using Stanza
    lemmas = []
    doc = nlp(text)
    for sentence in doc.sentences:
        for word in sentence.words:
            lemmas.append(word.lemma)
    return lemmas


In [8]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/Sarcasm_Headlines_Dataset.csv")

# If "article_link" exists in the CSV and isn't needed, drop it
if "article_link" in df.columns:
    df.drop(["article_link"], axis=1, inplace=True)

df.head()


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


#### lemmatizing the dataset

In [9]:
idioms = []
with open("/content/drive/MyDrive/idioms.txt") as file:
    for line in file:
        idioms.append(line.strip())

In [10]:
def lemmatizeDataset():
    for index, row in df.iterrows():
        sentence = row['headline']
        row['headline'] = lemmatize(sentence)

lemmatizeDataset()
df.head()

KeyboardInterrupt: 

In [None]:
lem_df = df
df.head()

### ConceptNet
ConceptNet is a semantic network consisting of common-sense knowledge and concepts, represented<br> in the form of nodes (words or
short phrases) and labeled edges (relationships) between them.

In [None]:
# set the API endpoint and parameters
endpoint = 'http://api.conceptnet.io/c/en/'
params = {
    'filter': 'core',
    'limit': 1000
}
def conceptNet(sentence):
    # send a GET request to the API endpoint
    response = requests.get(endpoint + sentence, params=params)

    # parse the JSON response
    data = json.loads(response.text)
    edges = data['edges']
    edges.sort(key=lambda x: x['weight'], reverse=True)

    return edges

### SenticNet
SenticNet is a resource for opinion mining that aims to create a collection of commonly<br>
used common-sense concepts  with positive and negative sentiment scores. The sentiment <br>
score for each word is scaled from -1 to 1, where -1 signifies strongly negative sentiment,<br>
0 signifies neutral sentiment and 1 signifies strong positive sentiment.
<br> sentiment = score * 5 (in-order to keep it with sentiStrength)

### Rules of w_score (sentiment score) selection:
- if word belongs to SentiStrength || SenticNet => pick the score whichever exists
- if word belongs to SentiStrength && SenticNet => avg score of the lexicons
- else get the concepts from concept net to expand the meaning => select top 5 ranked and calculate the avg sentiment score

### Final Calculation
sum_pos_score = sum of all positive sentiment scores<br>
sum_neg_score = sum of all negative sentiment scores<br>
if sum_pos_score && sum_neg_score > 0, there is a contradiction in the sentence

In [None]:
sn = SenticNet()
def senticNetScore(word):
    try:
        polarityValue = sn.polarity_value(word)
        return float(polarityValue) * 5
    except KeyError:
        return None

In [None]:
# Initialize TextBlob (replacing PySentiStr)

def sentiStrengthScore(word):
    result = lambda text: TextBlob(text).sentiment.polarity(word)
    return result

In [None]:
def wScore(word):
    senticNet = senticNetScore(word)
    sentiStrength = TextBlob(word).sentiment.polarity
    if senticNet == None and sentiStrength == None:
        expansion = conceptNet(word)
        if len(expansion) == 0:
            return 0
        else:
            score = 0
            expansion = expansion[:5]
            for edge in expansion:
                score += wScore(edge['end']['label'])
            return score / 5
    elif senticNet == None:
        return sentiStrength
    elif sentiStrength == None:
        return senticNet
    else:
        return (senticNet + sentiStrength) / 2

In [None]:
def positiveScore(results):
    score = 0
    for result in results:
        if result > 0:
            score += result
    return score
def negativeScore(results):
    score = 0
    for result in results:
        if result < 0:
            score += result
    return score

Checking the coreference between subjects or objects of a sentence
<br> for two subjects w1 and w2, sentence is coherent if
- if w1 is antecedent of w2
- if w1 and w2 are identical pronouns
- if w1 and w2 are identical subjects
- w2 starts with the word "the" (Definite Noun Phrase)
- w2 starts with "this", "that", "these", "those" (Demonstrative Noun Phrases)
- if w1 and w2 are proper nouns

In [None]:
def extractSubject(sentence):
    doc = spacyNLP(sentence)
    subject = None
    for token in doc:
        if token.dep_ == "nsubj":
            subject = token.text
    return subject

In [None]:
def hasAntecedents(text):
    doc = spacyNLP(text)
    antecedents = []
    for token in doc:
        if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
            for mention in doc.ents:
                if mention.start <= token.i < mention.end:
                    antecedents.append(mention.text)
    return True if len(antecedents) > 0 else False


In [None]:
pronounLemmatizer = WordNetLemmatizer()
def identicalPronouns(w1, w2):
    lemma1 = pronounLemmatizer.lemmatize(w1, 'n')
    lemma2 = pronounLemmatizer.lemmatize(w2, 'n')
    if lemma1 == lemma2:
        return True
    else:
        return False


In [None]:
def identicalSubjects(w1,w2):
    cleanedSubject1 = re.sub(r'[^a-zA-Z]', '', w1)
    cleanedSubject2 = re.sub(r'[^a-zA-Z]', '', w2)
    if cleanedSubject1 == cleanedSubject2:
        return True
    else:
        return False

In [None]:
def definiteNounPhraseFeature(text,w2):
    doc = nltk.word_tokenize(text)
    for i in range(len(doc)):
        if i-1 >= 0 and doc[i] == w2:
            if doc[i-1] == 'the':
                return True
    return False

In [None]:
def demonstrativeNounPhraseFeature(text,w2):
    doc = nltk.word_tokenize(text)
    for i in range(len(doc)):
        if doc[i] == w2:
            if i-1 >= 0 and doc[i-1] == 'this' or doc[i-1] == 'that' or doc[i-1] == 'these' or doc[i-1] == 'those':
                return True
    return False

In [None]:
def properNameFeature(w1,w2):
    taggedWords = nltk.pos_tag([w1,w2])
    proper = False
    for word, tag in taggedWords:
        if tag in ['NNP', 'NNPS']:
            proper = True
        else:
            proper = False
            break
    return proper

### Binary Features

In [None]:
df["CONTRA"] = np.zeros(len(df))
df["CONTRA_PLUS_COHER"] = np.zeros(len(df))
df["pos_low"] = np.zeros(len(df))
df["pos_med"] = np.zeros(len(df))
df["pos_high"] = np.zeros(len(df))
df["neg_low"] = np.zeros(len(df))
df["neg_med"] = np.zeros(len(df))
df["neg_high"] = np.zeros(len(df))
df["emo_low"] = np.zeros(len(df))
df["emo_med"] = np.zeros(len(df))
df["emo_high"] = np.zeros(len(df))
df["rep_punc_low"] = np.zeros(len(df))
df["rep_punc_med"] = np.zeros(len(df))
df["rep_punc_high"] = np.zeros(len(df))
df["rep_seq_low"] = np.zeros(len(df))
df["rep_seq_med"] = np.zeros(len(df))
df["rep_seq_high"] = np.zeros(len(df))
df["cap_low"] = np.zeros(len(df))
df["cap_med"] = np.zeros(len(df))
df["cap_high"] = np.zeros(len(df))
df["slang_low"] = np.zeros(len(df))
df["slang_med"] = np.zeros(len(df))
df["slang_high"] = np.zeros(len(df))
df["exclaim_low"] = np.zeros(len(df))
df["exclaim_med"] = np.zeros(len(df))
df["exclaim_high"] = np.zeros(len(df))
df["idioms_low"] = np.zeros(len(df))
df["idioms_med"] = np.zeros(len(df))
df["idioms_high"] = np.zeros(len(df))
boosterAndSlangs = ["Lit", "Fleek", "Slay", "Woke", "Stan", "Chill", "On fleek", "Squad", "Bae", "AF", "Savage", "GOAT", "Lit AF", "Yas", "Gucci", "Thirsty", "Mood", "Extra", "Clap back", "Shook", "Lowkey", "Highkey", "Basic", "Lituation", "Snatched", "Throwing shade", "Swag", "Tea", "Glow up", "Fam", "Turnt", "Litty", "Dope", "Hundo P", "Gassed", "FOMO", "Trill", "No cap", "Blessed", "Fire", "Wavy", "Sus", "Tight", "Meme", "Shade", "Receipts", "Slay queen", "Cray", "Thick", "Litmas", "Litmus", "Queen", "Bad", "No chill", "Sorry not sorry", "Real talk", "Dank", "Ship", "Ratchet", "Yolo", "Fierce", "Legendary", "Drama", "Stuntin", "Lit fam", "Flame", "Finna", "Swole", "Squad goals", "Kween", "Salty", "Slaying", "Bounce", "Swerve", "Bussin", "Hype", "Finesse", "Bless up", "Crushin it", "Yaas", "Fleeky", "Fuego", "Cringy", "Dead", "Curve", "Baller", "Wig snatched", "Keep it 100", "Hater", "My bad"]

<emsp>Contra if headline has one sentence and contradiction in sentiment score occur
<br>
<emsp>Contra_Coher if headline has more than one sentence, contradiction of polarity and the headline is judged coherent<br>

### Sentiment Feature <br>
<emsp>Calculates the +ve and -ve score of the headline and then classify it as low/med/high

### Punctuations and Symbol Features <br>
<emsp>We use 7 indicators<br><br>
    <emsp><emsp>1. Number of emoticons <br>
    <emsp><emsp>2. Number of repetitive sequence of punctuations<br>
    <emsp><emsp>3. Number of repetitive sequence of characters<br>
    <emsp><emsp>4. Number of capitalized word<br>
    <emsp><emsp>5. Number of slang and booster words<br>
    <emsp><emsp>6. Number of exclamation marks<br>
    <emsp><emsp>7. Number of idioms<br>


In [None]:
def remove_symbols(line):
    return ''.join(ch for ch in line if ch.isalnum() or ch == " ")

def calculate_scores(sentence):
    print("Sentence: ",sentence)
    score=[]
    results = []
    for word in nltk.word_tokenize(sentence):
        results.append(wScore(word))
    positiveSum = positiveScore(results)
    negativeSum = negativeScore(results)
    score.append(positiveSum)
    score.append(negativeSum)
    print("positiveScore: ",positiveSum)
    print("negativeScore: ",negativeSum)
    return score

def isContradiction(scores):
    if scores[0]!=0 and scores[1]!=0:
        return True
    return False

def checkCoherence(sentence):
    tokens = nltk.sent_tokenize(sentence)
    if len(tokens) > 1:
        if hasAntecedents(sentence):
            return True
        w1 = extractSubject(tokens[0])
        w2 = extractSubject(tokens[1])
        if identicalPronouns(w1,w2) or identicalSubjects(w1,w2) or definiteNounPhraseFeature(tokens[1],w2) or demonstrativeNounPhraseFeature(tokens[1],w2) or properNameFeature(w1,w2):
            return True
    return False

def countEmoticons(headline):
    return len(re.findall(r'[^\w\s,]', headline))

def countRepititivePunctuations(headline):
    return len(re.findall(r'([\W_]){2,}', headline))

def countRepititiveSequences(headline):
    return len(re.findall(r'(\S)\1{1,}', headline))

def countCapitalLetters(headline):
    return len(re.findall(r'[A-Z]', headline))

def countBoostersAndSlangs(headline):
    numSlangsBoosters = 0
    for word in headline.split():
        if word.lower() in boosterAndSlangs:
            numSlangsBoosters += 1
    return numSlangsBoosters

def countIdioms(headline):
    numIdioms = 0
    for word in headline.split():
        if word.lower() in idioms:
            numIdioms += 1
    return numIdioms


In [None]:
def assignSentimentFeature(headline,scores):
    positiveScore = scores[0]
    negativeScore = scores[1]
    if positiveScore <= -1:
        df.loc[df["headline"] == headline, "pos_low"] = 1
    elif positiveScore >= 0 and positiveScore <= 1:
        df.loc[df["headline"] == headline, "pos_med"] = 1
    elif positiveScore >= 2:
        df.loc[df["headline"] == headline, "pos_high"] = 1
    if negativeScore >= 1:
        df.loc[df["headline"] == headline, "neg_low"] = 1
    elif negativeScore >= 0 and negativeScore <= 1:
        df.loc[df["headline"] == headline, "neg_med"] = 1
    elif negativeScore <= -2:
        df.loc[df["headline"] == headline, "neg_high"] = 1

def punctuationAndSpecialSymbolFeature(headline):
    numberOfEmoticons = countEmoticons(headline)
    if numberOfEmoticons == 0:
        df.loc[df["headline"] == headline, "emo_low"] = 1
    elif numberOfEmoticons >= 1 and numberOfEmoticons <= 3:
        df.loc[df["headline"] == headline, "emo_med"] = 1
    elif numberOfEmoticons >= 4:
        df.loc[df["headline"] == headline, "emo_high"] = 1
    numberOfPunctuations = countRepititivePunctuations(headline)
    if numberOfPunctuations == 0:
        df.loc[df["headline"] == headline, "rep_punc_low"] = 1
    elif numberOfPunctuations >= 1 and numberOfPunctuations <= 3:
        df.loc[df["headline"] == headline, "rep_punc_med"] = 1
    elif numberOfPunctuations >= 4:
        df.loc[df["headline"] == headline, "rep_punc_high"] = 1
    numberOfRepetitiveSequences = countRepititiveSequences(headline)
    if numberOfRepetitiveSequences == 0:
        df.loc[df["headline"] == headline, "rep_seq_low"] = 1
    elif numberOfRepetitiveSequences >= 1 and numberOfRepetitiveSequences <= 3:
        df.loc[df["headline"] == headline, "rep_seq_med"] = 1
    elif numberOfRepetitiveSequences >= 4:
        df.loc[df["headline"] == headline, "rep_seq_high"] = 1
    numberOfCapitalLetters = countCapitalLetters(headline)
    if numberOfCapitalLetters == 0:
        df.loc[df["headline"] == headline, "cap_low"] = 1
    elif numberOfCapitalLetters >= 1 and numberOfCapitalLetters <= 3:
        df.loc[df["headline"] == headline, "cap_med"] = 1
    elif numberOfCapitalLetters >= 4:
        df.loc[df["headline"] == headline, "cap_high"] = 1
    numberOfBoostersAndSlangs = countBoostersAndSlangs(headline)
    if numberOfBoostersAndSlangs == 0:
        df.loc[df["headline"] == headline, "slang_low"] = 1
    elif numberOfBoostersAndSlangs >= 1 and numberOfBoostersAndSlangs <= 3:
        df.loc[df["headline"] == headline, "slang_med"] = 1
    elif numberOfBoostersAndSlangs >= 4:
        df.loc[df["headline"] == headline, "slang_high"] = 1
    numberOfIdioms = countIdioms(headline)
    if numberOfIdioms == 0:
        df.loc[df["headline"] == headline, "idiom_low"] = 1
    elif numberOfIdioms >= 1 and numberOfIdioms <= 3:
        df.loc[df["headline"] == headline, "idiom_med"] = 1
    elif numberOfIdioms >= 4:
        df.loc[df["headline"] == headline, "idiom_high"] = 1

def contradictionFeature():
    for headline in df["headline"]:
        text = remove_symbols(headline)
        sentences = nltk.sent_tokenize(text)
        scores = calculate_scores(text)
        assignSentimentFeature(headline,scores)
        punctuationAndSpecialSymbolFeature(headline)
        if len(sentences) > 1:
            print("CONTRA_PLUS_COHER")
            if isContradiction(scores) and checkCoherence(text):
                df.loc[df["headline"] == headline, "CONTRA_PLUS_COHER"] = 1
            else:
                df.loc[df["headline"] == headline, "CONTRA_PLUS_COHER"] = 0
        else:
            print("CONTRA")
            if isContradiction(scores):
                df.loc[df["headline"] == headline, "CONTRA"] = 1
            else:
                df.loc[df["headline"] == headline, "CONTRA"] = 0

In [None]:
listDFs = []
for i in range(0,26000,1000):
    temp = df[i:i+1000]
    listDFs.append(temp)
listDFs.append(df[26000:])


In [None]:
rangee= 0
temp = []
for i in range(0,26,1):
    df = listDFs[rangee + i]
    contradictionFeature()
    temp.append(df)

fin_df = pd.concat(temp,ignore_index = True)

In [None]:
fin_df.head()

In [None]:
feature_set_df = fin_df

feature_set_df.shape

In [None]:
df = lem_df
featureSet = feature_set_df
featureSet.drop(["headline"],axis=1,inplace=True)

# N-Grams SVC Prediction

In [None]:
ngramX = df["headline"]  # Features
y = df['is_sarcastic']
ngramRange = (1, 3)
vectorizer = CountVectorizer(ngram_range=ngramRange)
X_vectorized = vectorizer.fit_transform(ngramX)
ngramSVM = SVC(kernel='linear')
ngramSVM.fit(X_vectorized, y)
ngramPredictions = cross_val_predict(ngramSVM, X_vectorized, y, cv=10,n_jobs=4)

# Feature Space SVC Prediction

In [None]:
featureSetX = featureSet.drop('is_sarcastic', axis=1)  # Features
featureVectorizer = CountVectorizer()
featureX = featureVectorizer.fit_transform(featureSetX)
featureSetSVM = SVC(kernel='linear')
featureSetSVM.fit(featureSetX, y)
featureSetPredictions = cross_val_predict(featureSetSVM, featureSetX, y, cv=10,n_jobs=4)

# Final Evaluation

In [None]:
def calculate_metrics(true_labels, predicted_labels):
    true_positives = sum((true == 1 and pred == 1) for true, pred in zip(true_labels, predicted_labels))
    false_positives = sum((true == 0 and pred == 1) for true, pred in zip(true_labels, predicted_labels))
    false_negatives = sum((true == 1 and pred == 0) for true, pred in zip(true_labels, predicted_labels))

    recall = true_positives / (true_positives + false_negatives)
    precision = true_positives / (true_positives + false_positives)
    f1_score = 2 * (precision * recall) / (precision + recall)

    return recall, precision, f1_score

In [None]:
definitionResults=[]
for index, row in featureSet.iterrows():
    if row['CONTRA'] == 1 or row['CONTRA_PLUS_COHER'] == 1:
        definitionResults.append(1)
    else:
        definitionResults.append(0)
recall, precision, f1_score = calculate_metrics(y, definitionResults)
definitionAccuracy = accuracy_score(y, definitionResults)
print("Definition Accuracy: ",definitionAccuracy)
print("Definition Recall: ",recall)
print("Definition Precision: ",precision)
print("Definition F1 Score: ",f1_score)

In [None]:
# Only N-grams
ngramAccuracyScore = accuracy_score(y, ngramPredictions)
ngramRecall, ngramPrecision, ngramF1Score = calculate_metrics(y, ngramPredictions)
print("N-gram Recall: ", ngramRecall)
print("N-gram Precision: ", ngramPrecision)
print("N-gram F1 Score: ", ngramF1Score)
print("N-gram Accuracy Score: ", ngramAccuracyScore)

In [None]:
# Only Feature Set
featureSetAccuracyScore = accuracy_score(y, featureSetPredictions)
featureSetRecall, featureSetPrecision, featureSetF1Score = calculate_metrics(y, featureSetPredictions)
print("Feature Set Recall: ", featureSetRecall)
print("Feature Set Precision: ", featureSetPrecision)
print("Feature Set F1 Score: ", featureSetF1Score)
print("Feature Set Accuracy Score: ", featureSetAccuracyScore)

In [None]:
# N-grams + Feature Set
finalResults = []
for i in range(len(ngramPredictions)):
    if ngramPredictions[i] == featureSetPredictions[i]:
        finalResults.append(ngramPredictions[i])
    else:
        margin_ngrams = ngramSVM.decision_function(X_vectorized[i])
        margin_features = featureSetSVM.decision_function([featureSetX.iloc[i]])  # Remove the wrapping of [featureX[i]] in square brackets
        if abs(margin_ngrams) > abs(margin_features):
            finalResults.append(ngramPredictions[i])
        else:
            finalResults.append(featureSetPredictions[i])
accuracy = accuracy_score(y, finalResults)
recall, precision, f1_score = calculate_metrics(y, finalResults)
print("N-grams + Feature Set Recall: ", recall)
print("N-grams + Feature Set Precision: ", precision)
print("N-grams + Feature Set F1 Score: ", f1_score)
print("N-grams + Feature Set Accuracy Score: ", accuracy)

# Results

<table>
  <tr>
    <th>Number</th>
    <th>Method</th>
    <th>Precision</th>
    <th>Recall</th>
    <th>F-Score</th>
    <th>Accuracy</th>
  </tr>
  <tr>
    <td>1.</td>
    <td>Contradiction in Sentiment Score</td>
    <td>49.14%</td>
    <td>56.14%</td>
    <td>52.41%</td>
    <td>55.25%</td>
  </tr>
  <tr>
    <td>2.</td>
    <td>Uni-gram, bi-gram and tri-gram features</td>
    <td>84.69%</td>
    <td>82.39%</td>
    <td>83.53%</td>
    <td>85.73%</td>
  </tr>
  <tr>
    <td>3.</td>
    <td>Proposed Features</td>
    <td>53.85%</td>
    <td>59.31%</td>
    <td>56.45%</td>
    <td>59.83%</td>
  </tr>
  <tr>
    <td>4.</td>
    <td>N-Grams and Proposed Features Combined</td>
    <td>85.06%</td>
    <td>76.51%</td>
    <td>80.56%</td>
    <td>83.79%</td>
  </tr>
</table>